Skip to content
Snippets Groups Projects
Commit c8c63a6f authored by danilo.carastandossantos's avatar danilo.carastandossantos
Browse files

consolidated prediction artifacts

parent 17fd6eeb
Branches
No related tags found
No related merge requests found
artifact-overview.pdf
......@@ -377,125 +377,118 @@ The experimental workflow consists of three parts, (i) preprocessing of the orig
==== Step 1
#tododanilo[in the script source-code: change the output filenames of step 1
from a_0_filter12_singlenode.csv and a_0_filter12_multinode.csv
to 22-0X_filter12_singlenode.csv and 22-0X_filter12_multinode.csv]
#fullbox(footer:[Memory: 128 Go. Time (sequential): 18:00:00])[
#fullbox(footer:[#emph-overhead[Memory: 128 Go. Time (sequential): 18:00:00]])[
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-01_jobs.parquet \
-p m100-data/22-01_power_total.parquet
-j ./m100-data/22-01_jobs.parquet \
-p ./m100-data/22-01_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-02_jobs.parquet \
-p m100-data/22-02_power_total.parquet
-j ./m100-data/22-02_jobs.parquet \
-p ./m100-data/22-02_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-03_jobs.parquet \
-p m100-data/22-03_power_total.parquet
-j ./m100-data/22-03_jobs.parquet \
-p ./m100-data/22-03_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-04_jobs.parquet \
-p m100-data/22-04_power_total.parquet
-j ./m100-data/22-04_jobs.parquet \
-p ./m100-data/22-04_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-05_jobs.parquet \
-p m100-data/22-05_power_total.parquet
-j ./m100-data/22-05_jobs.parquet \
-p ./m100-data/22-05_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-06_jobs.parquet \
-p m100-data/22-06_power_total.parquet
-j ./m100-data/22-06_jobs.parquet \
-p ./m100-data/22-06_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-07_jobs.parquet \
-p m100-data/22-07_power_total.parquet
-j ./m100-data/22-07_jobs.parquet \
-p ./m100-data/22-07_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-08_jobs.parquet \
-p m100-data/22-08_power_total.parquet
-j ./m100-data/22-08_jobs.parquet \
-p ./m100-data/22-08_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_1.py \
-j ../m100-data/22-09_jobs.parquet \
-p m100-data/22-09_power_total.parquet
-j ./m100-data/22-09_jobs.parquet \
-p ./m100-data/22-09_power_total.parquet
```
]
=== Step 2
#tododanilo[in the script source-code: change the output filenames of step 2
from a_0_filter123_singlenode.csv and a_0_filter123_multinode.csv
to 22-0X_filter123_singlenode.csv and 22-0X_filter123_multinode.csv]
#fullbox(footer:[Memory: 128 Go. Time (sequential): 66:00:00])[
#fullbox(footer:[#emph-overhead[Memory: 128 Go. Time (sequential): 66:00:00]])[
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-01_filter12_singlenode.csv \
-jm ./m100-data/22-01_filter12_multinode.csv
-p m100-data/22-01_power_total.parquet
-p ./m100-data/22-01_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-02_filter12_singlenode.csv \
-jm ./m100-data/22-02_filter12_multinode.csv
-p m100-data/22-02_power_total.parquet
-p ../m100-data/22-02_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-03_filter12_singlenode.csv \
-jm ./m100-data/22-03_filter12_multinode.csv
-p m100-data/22-03_power_total.parquet
-p ./m100-data/22-03_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-04_filter12_singlenode.csv \
-jm ./m100-data/22-04_filter12_multinode.csv
-p m100-data/22-04_power_total.parquet
-p ./m100-data/22-04_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-05_filter12_singlenode.csv \
-jm ./m100-data/22-05_filter12_multinode.csv
-p m100-data/22-05_power_total.parquet
-p ./m100-data/22-05_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-06_filter12_singlenode.csv \
-jm ./m100-data/22-06_filter12_multinode.csv
-p m100-data/22-06_power_total.parquet
-p ./m100-data/22-06_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-07_filter12_singlenode.csv \
-jm ./m100-data/22-07_filter12_multinode.csv
-p m100-data/22-07_power_total.parquet
-p ./m100-data/22-07_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-08_filter12_singlenode.csv \
-jm ./m100-data/22-08_filter12_multinode.csv
-p m100-data/22-08_power_total.parquet
-p ./m100-data/22-08_power_total.parquet
```
```python
./scripts-py/expe_energumen/m100_pred_preprocess_2.py \
-js ./m100-data/22-09_filter12_singlenode.csv \
-jm ./m100-data/22-09_filter12_multinode.csv
-p m100-data/22-09_power_total.parquet
-p ./m100-data/22-09_power_total.parquet
```
]
......@@ -508,52 +501,50 @@ find . -name '*filter123*' | tar -zcvf exadata_job_energy_profiles.tar.gz --file
=== Compute power metrics and add job information
#tododanilo[Script source-code: change -d (dir path) and pass the path to the necessary files]
#fullbox(footer: [Disk: 32 Go.])[
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-01
-d ./m100-data/22-01
```
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-02
-d ./m100-data/22-02
```
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-03
-d ./m100-data/22-03
```
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-04
-d ./m100-data/22-04
```
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-05
-d ./m100-data/22-05
```
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-06
-d ./m100-data/22-06
```
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-07
-d ./m100-data/22-07
```
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-08
-d ./m100-data/22-08
```
``` python
./scripts-py/expe_energumen/m100_pred_jobs_extract_power_metrics.py \
-d ./data/year_month=22-09
-d ./m100-data/22-09
```
]
......@@ -561,8 +552,6 @@ find . -name '*filter123*' | tar -zcvf exadata_job_energy_profiles.tar.gz --file
This will output the `filter123_all_jobs_aggmetrics.csv.gz` needed for the prediction script
#tododanilo[check if /m100-data/ path is correct and also the path of the output]
#fullbox(footer: [Disk: 82 Mo.])[
``` python
......@@ -572,7 +561,7 @@ This will output the `filter123_all_jobs_aggmetrics.csv.gz` needed for the predi
== Predicting Job mean and maximum power consumption
#fullbox(footer:[Memory: 128 Go. Time (sequential): 72:00:00])[
#fullbox(footer:[#emph-overhead[Memory: 128 Go. Time (sequential): 72:00:00]])[
```
mkdir ./m100-data/total_power_mean_predictions_users_allmethods_mean
mkdir ./m100-data/total_power_mean_predictions_users_allmethods_max
......@@ -617,7 +606,10 @@ Output from the previous section
- `m100-data/power_pred_users_allmethods_mean.tar.gz`, the jobs mean power predictions.
- `m100-data/power_pred_users_allmethods_max.tar.gz`, the jobs maximum power predictions.
#tododanilo[Add notebook that makes plots]
=== Reproducing the paper's plots
Please refer to this #link("./notebooks/m100_process_prediction_results.ipynb")[Notebook] for
the scripts to reproduce the paper's plots, notably Figures 2 and 3.
== Job scheduling with power prediction <sec-sched>
......
%% Cell type:markdown id: tags:
## Processing the mean power prediction results (script `run_prediction_per_user_allmethods_mean.py`)
%% Cell type:code id: tags:
``` python
# Load every per-user mean-power prediction result file and keep only the
# rows for which all prediction methods produced a value.
import pandas as pd
import seaborn as sns
import os

RESULTS_PATH = "../m100-data/total_power_mean_predictions_users_allmethods_mean/"
# One column per prediction method; a row is dropped if any of them is NaN.
PRED_COLS = [
    "hist_pred_total_power_mean",
    "LinearRegression_total_power_mean_watts",
    "RandomForestRegressor_total_power_mean_watts",
    "LinearSVR_total_power_mean_watts",
    "SGDRegressor_total_power_mean_watts",
]

frames = []
for filename in os.listdir(RESULTS_PATH):
    frames.append(pd.read_csv(RESULTS_PATH + filename, low_memory=False))
df_all_results = pd.concat(frames)
df_all_results = df_all_results.dropna(subset=PRED_COLS)
df_all_results
```
%% Cell type:code id: tags:
``` python
# Compute, for every user, the MAPE of each prediction method against the
# measured job mean power. Produces one row per user in df_stats_per_user.
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Output column name -> prediction column to score against the ground truth.
method_cols = {
    "hist_mape": "hist_pred_total_power_mean",
    "LinearRegression_mape": "LinearRegression_total_power_mean_watts",
    "RandomForestRegressor_mape": "RandomForestRegressor_total_power_mean_watts",
    "LinearSVR_mape": "LinearSVR_total_power_mean_watts",
    "SGDRegressor_mape": "SGDRegressor_total_power_mean_watts",
}

# Keep first-appearance order of users (same order drop_duplicates yields).
lst_users = df_all_results["user_id"].drop_duplicates().to_list()
grouped = df_all_results.groupby("user_id")

lst_stats_per_user = []
for user in lst_users:
    results_user = grouped.get_group(user)
    truth = results_user["total_power_mean_watts"]
    row = {"user_id": user}
    for mape_name, pred_col in method_cols.items():
        row[mape_name] = mean_absolute_percentage_error(truth, results_user[pred_col])
    lst_stats_per_user.append(row)

df_stats_per_user = pd.DataFrame(lst_stats_per_user)
df_stats_per_user
```
%% Cell type:code id: tags:
``` python
# Summary statistics (count/mean/std/quantiles) of the per-user MAPE
# for each prediction method.
mape_cols = ["hist_mape", "LinearRegression_mape", "RandomForestRegressor_mape",
             "LinearSVR_mape", "SGDRegressor_mape"]
df_stats_per_user[mape_cols].describe()
```
%% Cell type:code id: tags:
``` python
# Reshape the per-user MAPE table to long format (one row per user x method)
# for the boxplot below. melt keeps every non-id column automatically, so the
# previously declared (and unused) COLS list has been removed.
df_stats_per_user_pivot = pd.melt(df_stats_per_user, id_vars="user_id")
df_stats_per_user_pivot
```
%% Cell type:markdown id: tags:
### Figure 3 A
%% Cell type:code id: tags:
``` python
# Figure 3 A: per-user MAPE distribution of each prediction method for the
# MEAN power predictions (horizontal boxplots, outliers hidden for readability).
# Cleanup: removed commented-out vertical-boxplot code and unused size
# constants; the hardcoded font size 20 now uses MEDIUM_SIZE for consistency.
import matplotlib.pyplot as plt

MEDIUM_SIZE = 20
plt.rc('font', size=MEDIUM_SIZE)        # default text size
plt.rc('axes', titlesize=MEDIUM_SIZE)   # axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)   # x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)  # x tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)  # y tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)  # legend
plt.rc('figure', titlesize=MEDIUM_SIZE) # figure title

g = sns.boxplot(y="variable", x="value", data=df_stats_per_user_pivot, showfliers=False)
plt.yticks(ticks=[0, 1, 2, 3, 4],
           labels=["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"],
           rotation=0)
g.set_ylabel("Prediction Method")
g.set_xlabel("Mean Absolute Percentage Error (MAPE) ")
```
%% Cell type:markdown id: tags:
## Processing the max power prediction results (script `run_prediction_per_user_allmethods_max.py`)
%% Cell type:code id: tags:
``` python
# Load every per-user MAX-power prediction result file and keep only rows
# where all prediction methods produced a value.
# FIX: RESULTS_PATH was "./m100-data/..." here while the analogous mean-power
# cell uses "../m100-data/..."; this notebook lives in ./notebooks/, so the
# data directory sits one level up.
import pandas as pd
import seaborn as sns
import os

RESULTS_PATH = "../m100-data/total_power_mean_predictions_users_allmethods_max/"
PRED_COLS = ["hist_pred_total_power_max",
             "LinearRegression_total_power_max_watts",
             "RandomForestRegressor_total_power_max_watts",
             "LinearSVR_total_power_max_watts",
             "SGDRegressor_total_power_max_watts"]
result_filenames = os.listdir(RESULTS_PATH)
df_all_results = pd.concat([pd.read_csv(RESULTS_PATH + filename, low_memory=False)
                            for filename in result_filenames])
df_all_results = df_all_results.dropna(subset=PRED_COLS)
df_all_results
```
%% Cell type:code id: tags:
``` python
# Compute, for every user, the MAPE of each prediction method against the
# measured job MAX power. Produces one row per user in df_stats_per_user.
from sklearn.metrics import mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

# Output column name -> prediction column to score against the ground truth.
method_cols = {
    "hist_mape": "hist_pred_total_power_max",
    "LinearRegression_mape": "LinearRegression_total_power_max_watts",
    "RandomForestRegressor_mape": "RandomForestRegressor_total_power_max_watts",
    "LinearSVR_mape": "LinearSVR_total_power_max_watts",
    "SGDRegressor_mape": "SGDRegressor_total_power_max_watts",
}

# Keep first-appearance order of users (same order drop_duplicates yields).
lst_users = df_all_results["user_id"].drop_duplicates().to_list()
grouped = df_all_results.groupby("user_id")

lst_stats_per_user = []
for user in lst_users:
    results_user = grouped.get_group(user)
    truth = results_user["total_power_max_watts"]
    row = {"user_id": user}
    for mape_name, pred_col in method_cols.items():
        row[mape_name] = mean_absolute_percentage_error(truth, results_user[pred_col])
    lst_stats_per_user.append(row)

df_stats_per_user = pd.DataFrame(lst_stats_per_user)
df_stats_per_user
```
%% Cell type:code id: tags:
``` python
# Summary statistics (count/mean/std/quantiles) of the per-user MAPE
# for each prediction method (max power).
mape_cols = ["hist_mape", "LinearRegression_mape", "RandomForestRegressor_mape",
             "LinearSVR_mape", "SGDRegressor_mape"]
df_stats_per_user[mape_cols].describe()
```
%% Cell type:code id: tags:
``` python
# Reshape the per-user MAPE table to long format (one row per user x method)
# for the boxplot below. melt keeps every non-id column automatically, so the
# previously declared (and unused) COLS list has been removed.
df_stats_per_user_pivot = pd.melt(df_stats_per_user, id_vars="user_id")
df_stats_per_user_pivot
```
%% Cell type:markdown id: tags:
### Figure 3 B
%% Cell type:code id: tags:
``` python
# Figure 3 B: per-user MAPE distribution of each prediction method for the
# MAX power predictions (horizontal boxplots, outliers hidden for readability).
# Cleanup: removed commented-out vertical-boxplot code and unused size
# constants; the hardcoded font size 20 now uses MEDIUM_SIZE for consistency.
import matplotlib.pyplot as plt

MEDIUM_SIZE = 20
plt.rc('font', size=MEDIUM_SIZE)        # default text size
plt.rc('axes', titlesize=MEDIUM_SIZE)   # axes title
plt.rc('axes', labelsize=MEDIUM_SIZE)   # x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE)  # x tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE)  # y tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE)  # legend
plt.rc('figure', titlesize=MEDIUM_SIZE) # figure title

g = sns.boxplot(y="variable", x="value", data=df_stats_per_user_pivot, showfliers=False)
plt.yticks(ticks=[0, 1, 2, 3, 4],
           labels=["History", "LinearRegression", "RandomForest", "LinearSVR", "SGDRegressor"],
           rotation=0)
g.set_ylabel("Prediction Method")
g.set_xlabel("Mean Absolute Percentage Error (MAPE)")
%% Cell type:markdown id: tags:
## Getting the actual mean and max power distributions
%% Cell type:markdown id: tags:
### Mean (Figure 2 A)
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import seaborn as sns
TINY_SIZE = 2
SMALL_SIZE = 5
MEDIUM_SIZE = 20
BIGGER_SIZE = 50
FIG_WIDTH = 40
FIG_HEIGHT = 10
plt.clf()
plt.rc('figure', figsize=(8, 6))
plt.rc('font', size=MEDIUM_SIZE) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE) # fontsize of the figure title
g = sns.histplot(x="total_power_mean_watts", data=df_all_results, bins=25, fill=False)
#g.ax.set_yscale('log')
g.set_xlabel("Total Power (watts)")
g.set_ylabel("Number of Jobs")
plt.xticks(ticks=[0,250,500,750,1000,1250,1500], rotation=30)
```
%% Cell type:markdown id: tags:
### Max (Figure 2 B)
%% Cell type:code id: tags:
``` python
import matplotlib.pyplot as plt
import seaborn as sns
TINY_SIZE = 2
SMALL_SIZE = 5
MEDIUM_SIZE = 20
BIGGER_SIZE = 50
FIG_WIDTH = 40
FIG_HEIGHT = 10
plt.clf()
plt.rc('figure', figsize=(8, 6))
plt.rc('font', size=MEDIUM_SIZE) # controls default text sizes
plt.rc('axes', titlesize=MEDIUM_SIZE) # fontsize of the axes title
plt.rc('axes', labelsize=MEDIUM_SIZE) # fontsize of the x and y labels
plt.rc('xtick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('ytick', labelsize=MEDIUM_SIZE) # fontsize of the tick labels
plt.rc('legend', fontsize=MEDIUM_SIZE) # legend fontsize
plt.rc('figure', titlesize=MEDIUM_SIZE) # fontsize of the figure title
#g = sns.displot(x="total_power_max_watts", data=df_all_results)
g = sns.histplot(x="total_power_max_watts", data=df_all_results, bins=25, fill=False)
#g.ax.set_yscale('log')
g.set_xlabel("Total Power (watts)")
g.set_ylabel("Number of Jobs")
plt.xticks(ticks=[0,250,500,750,1000,1250,1500,1750,2000], rotation=30)
```
......@@ -8,10 +8,10 @@ from scipy.stats import iqr
Read input data
"""
def read_data(rootdir):
df_jobs_single = pd.read_csv(rootdir+"/plugin=job_table/metric=job_info_marconi100/a_0_filter123_singlenode.csv")
df_jobs_multi = pd.read_csv(rootdir+"/plugin=job_table/metric=job_info_marconi100/a_0_filter123_multinode.csv")
df_power_single = pd.read_csv(rootdir+"/plugin=ipmi_pub/metric=total_power/a_0_filter123_singlenode.csv")
df_power_multi = pd.read_csv(rootdir+"/plugin=ipmi_pub/metric=total_power/a_0_filter123_multinode.csv")
df_jobs_single = pd.read_csv(rootdir+"_filter123_singlenode.csv")
df_jobs_multi = pd.read_csv(rootdir+"_filter123_multinode.csv")
df_power_single = pd.read_csv(rootdir+"_filter123_singlenode.csv")
df_power_multi = pd.read_csv(rootdir+"_filter123_multinode.csv")
df_jobs = pd.concat([df_jobs_single, df_jobs_multi]).reset_index(drop=True)
df_power = pd.concat([df_power_single, df_power_multi]).reset_index(drop=True)
df_power['node'] = pd.to_numeric(df_power['node'])
......@@ -37,7 +37,7 @@ def calculate_agg_metrics(df_jobs, df_power):
Save results
"""
def save_results(df_jobs_aggmetrics, rootdir):
df_jobs_aggmetrics.to_csv(rootdir+"/plugin=job_table/metric=job_info_marconi100/a_0_filter123_aggmetrics.csv")
df_jobs_aggmetrics.to_csv(rootdir+"_filter123_aggmetrics.csv")
"""
Run workflow
......
......@@ -9,7 +9,7 @@ Read job files spread in the months folders
def read_jobifles(rootdir):
#DATASET_PATH = "/home/dancarastan/Documentos/exadata_job_energy_profiles/"
jobfiles_list = glob.glob(rootdir+"*"+"/plugin=job_table"+"/metric=job_info_marconi100"+"/a_0_filter123_aggmetrics.csv")
jobfiles_list = glob.glob(rootdir+"*"+"_filter123_aggmetrics.csv")
#print(len(jobfiles_list))
df_jobs = pd.concat([pd.read_csv(jobfile) for jobfile in jobfiles_list]).reset_index(drop=True)
......
......@@ -105,11 +105,10 @@ def filter2_multi(df_jobs, df_power):
"""
Save intermediate results to csv
"""
def save_results(df_jobs_single, df_jobs_multi, jobfile, metricfile):
jobfile_out = jobfile.rstrip("a_0.parquet")
metric = metricfile.split("/")[-2]
df_jobs_single.to_csv(jobfile_out+metric+"_filter12_singlenode.csv", index=False)
df_jobs_multi.to_csv(jobfile_out+metric+"_filter12_multinode.csv", index=False)
def save_results(df_jobs_single, df_jobs_multi, jobfile, metricfile):
jobfile_out = jobfile.rstrip("jobs.parquet")
df_jobs_single.to_csv(jobfile_out+"_filter12_singlenode.csv", index=False)
df_jobs_multi.to_csv(jobfile_out+"_filter12_multinode.csv", index=False)
"""
Run workflow
......
......@@ -183,11 +183,10 @@ def filter3_1_multi(df_jobs_multi, df_total_power):
Save results to csv
"""
def save_results(df_exclusive_jobs_single, df_exclusive_jobs_multi, df_total_power_exclusive_single, df_total_power_exclusive_multi, jobfile_single, metricfile):
metric = metricfile.split("/")[-2]
jobfile_out = jobfile_single.rstrip(metric+"_filter12_singlenode.csv")
metricfile_out = metricfile.rstrip("a_0.parquet")
df_exclusive_jobs_single.to_csv(jobfile_out+metric+"_filter123_singlenode.csv", index=False)
df_exclusive_jobs_multi.to_csv(jobfile_out+metric+"_filter123_multinode.csv", index=False)
jobfile_out = jobfile_single.rstrip("_filter12_singlenode.csv")
metricfile_out = metricfile.rstrip("power_total.parquet")
df_exclusive_jobs_single.to_csv(jobfile_out+"_filter123_singlenode.csv", index=False)
df_exclusive_jobs_multi.to_csv(jobfile_out+"_filter123_multinode.csv", index=False)
df_total_power_exclusive_single.to_csv(metricfile_out+"a_0_filter123_singlenode.csv", index=False)
df_total_power_exclusive_multi.to_csv(metricfile_out+"a_0_filter123_multinode.csv", index=False)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment