prediction-results-analysis.Rmd

title: "Job power prediction result analysis"
author: "Danilo Carastan-Santos"
date: "2024-05-15"
output:
  rmdformats::readthedown
import pandas as pd
import seaborn as sns

import os

# Directory whose entries are all read as CSV result files (mean job power).
RESULTS_PATH = "../user-power-predictions/data/total_power_mean_predictions_users_allmethods_mean/"
# One prediction column per method; rows missing any of them are discarded.
PRED_COLS = ["hist_pred_total_power_mean",
            "LinearRegression_total_power_mean_watts",
            "RandomForestRegressor_total_power_mean_watts",
            "LinearSVR_total_power_mean_watts",
            "SGDRegressor_total_power_mean_watts"]


# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the concatenated frame (and of the exported CSV) reproducible across runs.
result_filenames = sorted(os.listdir(RESULTS_PATH))

# os.path.join() does not depend on RESULTS_PATH ending with a path separator.
df_all_results = pd.concat(
    [pd.read_csv(os.path.join(RESULTS_PATH, filename), low_memory=False)
     for filename in result_filenames]
)

df_all_results = df_all_results.dropna(subset=PRED_COLS)
# Exported without the index; the R code later reads this file back to plot
# the distribution of mean job power.
df_all_results.to_csv('/tmp/allresults-mean.csv', index=False)


from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Distinct users, in order of first appearance in the results.
lst_users = df_all_results["user_id"].drop_duplicates().to_list()

df_results_user_group = df_all_results.groupby("user_id")

# Output statistic name -> prediction column it is computed from.
_MAPE_SOURCES = {
    "hist_mape": "hist_pred_total_power_mean",
    "LinearRegression_mape": "LinearRegression_total_power_mean_watts",
    "RandomForestRegressor_mape": "RandomForestRegressor_total_power_mean_watts",
    "LinearSVR_mape": "LinearSVR_total_power_mean_watts",
    "SGDRegressor_mape": "SGDRegressor_total_power_mean_watts",
}

# One row per user: the MAPE of every prediction method against the
# measured mean power of that user's jobs.
lst_stats_per_user = []
for user in lst_users:
    results_user = df_results_user_group.get_group(user)
    measured = results_user["total_power_mean_watts"]
    res = {"user_id": user}
    for stat_name, pred_col in _MAPE_SOURCES.items():
        res[stat_name] = mean_absolute_percentage_error(measured, results_user[pred_col])
    lst_stats_per_user.append(res)

df_stats_per_user = pd.DataFrame(lst_stats_per_user)
df_stats_per_user

# Per-method MAPE columns of the per-user statistics table.
COLS = [
    "hist_mape",
    "LinearRegression_mape",
    "RandomForestRegressor_mape",
    "LinearSVR_mape",
    "SGDRegressor_mape",
]
# Summary statistics of the per-user MAPE of each method.
df_stats_per_user.loc[:, COLS].describe()

# Long format (user_id, variable, value): one row per user and method,
# which is the layout the boxplot consumes.
df_stats_per_user_pivot = df_stats_per_user.melt(id_vars="user_id")
df_stats_per_user_pivot
import matplotlib.pyplot as plt

# Size presets; only MEDIUM_SIZE is used by the settings below.
TINY_SIZE = 2
SMALL_SIZE = 5
MEDIUM_SIZE = 20
BIGGER_SIZE = 50
FIG_WIDTH = 40
FIG_HEIGHT = 10

# Global matplotlib defaults for this figure.
plt.rcParams.update({
    "font.size": 20,                    # default text size
    "axes.titlesize": MEDIUM_SIZE,      # axes title
    "axes.labelsize": MEDIUM_SIZE,      # x and y labels
    "xtick.labelsize": MEDIUM_SIZE,     # x tick labels
    "ytick.labelsize": MEDIUM_SIZE,     # y tick labels
    "legend.fontsize": MEDIUM_SIZE,     # legend
    "figure.titlesize": MEDIUM_SIZE,    # figure title
    "figure.figsize": (8, 4),
})

# Horizontal boxplot: one box per prediction method, outliers hidden.
method_labels = ["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"]
g = sns.boxplot(
    data=df_stats_per_user_pivot,
    x="value",
    y="variable",
    showfliers=False,
)
# Replace the raw column names on the y axis with readable method names.
plt.yticks(ticks=[0, 1, 2, 3, 4], labels=method_labels, rotation=0)

g.set_ylabel("Prediction Method")
g.set_xlabel("Mean Absolute Percentage Error (MAPE)     ")
plt.tight_layout(pad=0)
plt.savefig("./fig3a-pred-mape-mean-power.svg")
plt.savefig("./fig3a-pred-mape-mean-power.pdf")
import pandas as pd
import seaborn as sns

import os

# Directory whose entries are all read as CSV result files (max job power).
RESULTS_PATH = "../user-power-predictions/data/total_power_mean_predictions_users_allmethods_max/"

# One prediction column per method; rows missing any of them are discarded.
PRED_COLS = ["hist_pred_total_power_max",
            "LinearRegression_total_power_max_watts",
            "RandomForestRegressor_total_power_max_watts",
            "LinearSVR_total_power_max_watts",
            "SGDRegressor_total_power_max_watts"]


# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the concatenated frame (and of the exported CSV) reproducible across runs.
result_filenames = sorted(os.listdir(RESULTS_PATH))

# os.path.join() does not depend on RESULTS_PATH ending with a path separator.
df_all_results = pd.concat(
    [pd.read_csv(os.path.join(RESULTS_PATH, filename), low_memory=False)
     for filename in result_filenames]
)

df_all_results = df_all_results.dropna(subset=PRED_COLS)
# Exported without the index; the R code later reads this file back to plot
# the distribution of max job power.
df_all_results.to_csv('/tmp/allresults-max.csv', index=False)
#df_all_results


from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Distinct users, in order of first appearance in the results.
lst_users = df_all_results["user_id"].drop_duplicates().to_list()

df_results_user_group = df_all_results.groupby("user_id")

# Output statistic name -> prediction column it is computed from.
_MAPE_SOURCES = {
    "hist_mape": "hist_pred_total_power_max",
    "LinearRegression_mape": "LinearRegression_total_power_max_watts",
    "RandomForestRegressor_mape": "RandomForestRegressor_total_power_max_watts",
    "LinearSVR_mape": "LinearSVR_total_power_max_watts",
    "SGDRegressor_mape": "SGDRegressor_total_power_max_watts",
}

# One row per user: the MAPE of every prediction method against the
# measured max power of that user's jobs.
lst_stats_per_user = []
for user in lst_users:
    results_user = df_results_user_group.get_group(user)
    measured = results_user["total_power_max_watts"]
    res = {"user_id": user}
    for stat_name, pred_col in _MAPE_SOURCES.items():
        res[stat_name] = mean_absolute_percentage_error(measured, results_user[pred_col])
    lst_stats_per_user.append(res)

df_stats_per_user = pd.DataFrame(lst_stats_per_user)

# Per-method MAPE columns of the per-user statistics table.
COLS = [
    "hist_mape",
    "LinearRegression_mape",
    "RandomForestRegressor_mape",
    "LinearSVR_mape",
    "SGDRegressor_mape",
]
# Summary statistics of the per-user MAPE of each method.
df_stats_per_user.loc[:, COLS].describe()

# Long format (user_id, variable, value): one row per user and method,
# which is the layout the boxplot consumes.
df_stats_per_user_pivot = df_stats_per_user.melt(id_vars="user_id")
df_stats_per_user_pivot
import matplotlib.pyplot as plt

# Size presets; only MEDIUM_SIZE is used by the settings below.
TINY_SIZE = 2
SMALL_SIZE = 5
MEDIUM_SIZE = 20
BIGGER_SIZE = 50
FIG_WIDTH = 40
FIG_HEIGHT = 10


plt.rc('font', size=20)          # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE)     # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)     # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)    # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)    # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE)  # fontsize of the figure title
plt.rc('figure', figsize=(8,4))

# Start from a fresh figure: sns.boxplot() and plt.yticks() act on pyplot's
# current axes, so if the figure of the previous plot were still open the
# boxes below would be drawn on top of it and saved into fig3b as well.
plt.figure()

# Horizontal boxplot: one box per prediction method, outliers hidden.
g = sns.boxplot(y="variable", x="value", data=df_stats_per_user_pivot, showfliers=False)
# Replace the raw column names on the y axis with readable method names.
plt.yticks(ticks=[0,1,2,3,4],labels=["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"],rotation=0)
g.set_ylabel("Prediction Method")
g.set_xlabel("Mean Absolute Percentage Error (MAPE)")
plt.tight_layout(pad=0)
plt.savefig("./fig3b-pred-mape-max-power.svg")
plt.savefig("./fig3b-pred-mape-max-power.pdf")
# Clear all Python memory: drop every name bound in this module's namespace
# (data frames, plot objects, constants, imported modules) so the objects they
# referenced can be reclaimed. Presumably done to lower memory use before the
# R code that follows re-reads the exported CSV files -- TODO confirm.
import sys
# Empties the namespace dict of the module currently executing; this removes
# `sys` itself and every earlier import along with the data.
sys.modules[__name__].__dict__.clear()
# Imported only now: a `gc` imported before clear() would have been removed
# from the namespace together with everything else.
import gc
# Run a collection pass immediately instead of waiting for the next automatic one.
gc.collect()
library(tidyverse)

# Distribution of mean job power: histogram of total_power_mean_watts over
# the jobs exported to /tmp/allresults-mean.csv.
data_mean <- read_csv('/tmp/allresults-mean.csv')
ggplot(data = data_mean, mapping = aes(x = total_power_mean_watts)) +
  geom_histogram() +
  # plain numbers on the y axis
  scale_y_continuous(labels = scales::label_number()) +
  theme_bw(base_size = 20) +
  labs(x = 'Total power (W)', y = 'Number of jobs')
# ggsave() without a plot argument saves the last plot; same size for both formats.
ggsave('./fig2a-distrib-job-power-mean.pdf', width = 6, height = 3)
ggsave('./fig2a-distrib-job-power-mean.svg', width = 6, height = 3)
# Release the data once the figure is written.
rm(data_mean)

# Distribution of max job power: histogram of total_power_max_watts over
# the jobs exported to /tmp/allresults-max.csv.
data_max <- read_csv('/tmp/allresults-max.csv')
ggplot(data = data_max, mapping = aes(x = total_power_max_watts)) +
  geom_histogram() +
  # plain numbers on the y axis
  scale_y_continuous(labels = scales::label_number()) +
  theme_bw(base_size = 20) +
  labs(x = 'Total power (W)', y = 'Number of jobs')
# ggsave() without a plot argument saves the last plot; same size for both formats.
ggsave('./fig2b-distrib-job-power-max.pdf', width = 6, height = 3)
ggsave('./fig2b-distrib-job-power-max.svg', width = 6, height = 3)
# Release the data once the figure is written.
rm(data_max)