diff --git a/artifact-overview.typ b/artifact-overview.typ index 59feaa9ac0888982d21afb0074ffed7610a3c541..ffd167d3358ac1d3794b5997c8538bca51fb582f 100644 --- a/artifact-overview.typ +++ b/artifact-overview.typ @@ -6,7 +6,7 @@ #let artifact-code-git-repo-irit = "https://gitlab.irit.fr/sepia-pub/open-science/artifact-europar24-lightweight-power-pred-sched" #let artifact-code-sh-permalink = "https://archive.softwareheritage.org/swh:1:rev:5a15139dadde8d923703ece93745fa250b1a0c53;origin=https://framagit.org/batsim/artifact-europar24-lightweight-power-pred-sched.git;visit=swh:1:snp:968650e57128ea88b02a858279a7054f62f0a0b0" #let artifact-code-git-commit = "5a15139dadde8d923703ece93745fa250b1a0c53" -#let zenodo-doi = "10.5281/zenodo.11173631" +#let zenodo-doi = "10.5281/zenodo.13961003" #let zenodo-url = "https://doi.org/" + zenodo-doi #set page( @@ -74,7 +74,7 @@ *Article.* Light-weight prediction for improving energy consumption in HPC platforms\ *Quick links*. #set list(marker: none, body-indent: 3.5mm) - - Preprint PDF on HAL. #url("https://hal.science/hal-04566184") + // - Preprint PDF on HAL. #url("https://hal.science/hal-04566184") - Artifact data on Zenodo. #url(zenodo-url) - Artifact Nix binary cache. #url("https://lightweight-pred-sched-europar24.cachix.org") - Artifact code Git repository. #link(artifact-code-git-repo-irit)[IRIT], #link(artifact-code-git-repo)[Framagit] @@ -435,6 +435,8 @@ nix develop .#py-scripts --command m100-pred-merge-jobfiles -d ./m100-data/ == Predicting Job mean and maximum power consumption +#text(fill: red)[*Warning: This section will be completed after the article submission.*] + #fullbox(footer:[#emph-overhead[Memory: 128 Go. Time (sequential): 72:00:00.]])[ ```sh mkdir ./m100-data/total_power_mean_predictions_users_allmethods_mean @@ -461,11 +463,14 @@ The expected output data has been stored on #link(zenodo-url)[Zenodo]. 
./m100-data/total_power_mean_predictions_users_allmethods_mean tar -cvzf ./m100-data/power_pred_users_allmethods_mean.tar.gz \ ./m100-data/total_power_mean_predictions_users_allmethods_max + tar -cvzf ./m100-data/power_pred_users_allmethods_std.tar.gz \ + ./m100-data/total_power_std_predictions_users_allmethods_max ``` #filehashes(( "fdcc47998a7e998abde325162833b23e", "power_pred_users_allmethods_max.tar.gz", "954f782a75c9a5b21c53a95c0218e220", "power_pred_users_allmethods_mean.tar.gz", + "7335fce385f20d891efd85a91a4fd9d7", "power_pred_users_allmethods_std.tar.gz" )) ] @@ -480,6 +485,8 @@ The following command populates the `./user-power-predictions/data` by extractin tar xf ./user-power-predictions/*mean.tar.gz --directory ./user-power-predictions/data nix develop .#merge-m100-power-predictions --command \ tar xf ./user-power-predictions/*max.tar.gz --directory ./user-power-predictions/data + nix develop .#merge-m100-power-predictions --command \ + tar xf ./user-power-predictions/*std.tar.gz --directory ./user-power-predictions/data nix develop .#merge-m100-power-predictions --command \ gunzip ./user-power-predictions/data/*/*.gz ``` @@ -548,7 +555,7 @@ Required input files. 
``` #filehashes(( "e1b4475f55938ad6de4ca500bddc7908", "expe-sched/workload-params.json", - "3a7e7d8183dcb733d6b49d86b2ab3b14", "expe-sched/simu-instances.json", + "a597d3940c3ebbe5c2bdab53718daa54", "expe-sched/simu-instances.json", )) ] @@ -563,6 +570,8 @@ To make things more convenient for the generation of simulation inputs, all the tar xf ./user-power-predictions/*mean.tar.gz --directory ./user-power-predictions/tmp nix develop .#merge-m100-power-predictions --command \ tar xf ./user-power-predictions/*max.tar.gz --directory ./user-power-predictions/tmp + nix develop .#merge-m100-power-predictions --command \ + tar xf ./user-power-predictions/*std.tar.gz --directory ./user-power-predictions/tmp nix develop .#merge-m100-power-predictions --command \ gunzip ./user-power-predictions/tmp/*/*.gz nix develop .#merge-m100-power-predictions --command \ @@ -572,7 +581,7 @@ To make things more convenient for the generation of simulation inputs, all the ``` #filehashes(( - "86a056a9d61cca59b80adf95fa8bff22", "./m100-data/22-job-power-estimations.csv", + "f91d0e57a6d8fa4678ae788a3a8c1de2", "m100-data/22-job-power-estimations.csv", )) ] @@ -590,7 +599,7 @@ Similarly, Marconi100 job traces are also merged into a single file. ``` #filehashes(( - "c7d00104663b13e2992ec10749d6a162", "m100-data/22-jobs-with-prediction.csv" + "90d9d606bb4c70146a2cb222b9e67bd2", "m100-data/22-jobs-with-prediction.csv" )) ] @@ -641,7 +650,8 @@ In particular to populate the `/tmp/wlds` directory you can *download file* `wor ``` #filehashes(( - "2f31cf5a3ca6b2f46a2d426c9558f351", "expe-sched/simu-campaign-agg-result.csv" + "35432a7728f00b3b591f162186851ec5", "expe-sched/simu-campaign-exec-state.json", + "3eca005f2030a76595cb74d8b16e1f81", "expe-sched/simu-campaign-agg-result.csv" )) ] @@ -659,11 +669,13 @@ Required input files. 
``` #filehashes(( - "660144ea7340a7accf4eb8c7c2a7a3fa", "notebooks/fig4-sched-mean-power-distribution.svg", - "df07dec01ea5dd176ef406b26638d180", "notebooks/fig5-sched-mtt-diff-distribution.svg", - "e00304f9f2fd1819b72ca8b6b802db9c", "notebooks/simulation-output-analysis.html", + "5b9a3f5b79b72f10a619c6794a9a4645", "notebooks/proportion-above-powercap.svg", + "e77a0130dde9c1a71e82362662919251", "notebooks/sched-algo-comparison.svg", + "1521efb0253e0811ec9d52afc4a24cd5", "notebooks/sched-mean-power-distribution.svg", + "87c68a77d3b91970a62e3217f30dd20d", "notebooks/sched-mtt-distribution.svg", + "541ff9cee830292066331b030177cb07", "notebooks/simulation-output-analysis.html", ), fill: (x, y) => { - if y == 3 { red.lighten(80%) } + if y == 5 { red.lighten(80%) } }, ) diff --git a/flake.lock b/flake.lock index 1e44deed22560a0c6b9b84b9f4049d8fd1e7d4c4..fd5c1e749939bd6fbe8e30962351b380bbb4c695 100644 --- a/flake.lock +++ b/flake.lock @@ -100,17 +100,18 @@ ] }, "locked": { - "lastModified": 1712244708, - "narHash": "sha256-MQppCw+g2QVFQrmdjz009Jjt8fuiQhldp5kVrMHHyv8=", - "ref": "refs/heads/main", - "rev": "659660c35650e9f46ec47e8c0743d75649e68d7b", - "revCount": 4, + "lastModified": 1729529134, + "narHash": "sha256-fRF5XCnkB+TQIg1kB4Xcdby9VTn++PEn7g2fDREjJJs=", + "ref": "TPDS", + "rev": "1d7b8b0346fe21fdfa866cb76adcb839b3d51a7a", + "revCount": 7, "type": "git", - "url": "https://framagit.org/batsim/easy-powercap.git?tag=europar24" + "url": "https://gitlab.irit.fr/sepia-pub/batsim/easy-powercap.git" }, "original": { + "ref": "TPDS", "type": "git", - "url": "https://framagit.org/batsim/easy-powercap.git?tag=europar24" + "url": "https://gitlab.irit.fr/sepia-pub/batsim/easy-powercap.git" } }, "flake-parts": { diff --git a/flake.nix b/flake.nix index 80340f5ec05447d4618fcd329c34085ffac34b2a..029ad8c13383ea8944ffb7512e4ece500d38ad05 100644 --- a/flake.nix +++ b/flake.nix @@ -28,7 +28,7 @@ inputs.flake-utils.follows = "flake-utils"; }; easy-powercap-flake = { - url = 
"git+https://framagit.org/batsim/easy-powercap.git?tag=europar24"; + url = "git+https://gitlab.irit.fr/sepia-pub/batsim/easy-powercap.git?ref=TPDS"; inputs.nixpkgs.follows = "nixpkgs"; inputs.nur-kapack.follows = "nur-kapack"; inputs.batprotocol-flake.follows = "batprotocol-flake"; diff --git a/notebooks/simulation-output-analysis.Rmd b/notebooks/simulation-output-analysis.Rmd index d1f658c020e3395a6ebebfb28401641790347e93..c8e084d7d7d2877893f79892d3630257797473bd 100644 --- a/notebooks/simulation-output-analysis.Rmd +++ b/notebooks/simulation-output-analysis.Rmd @@ -4,32 +4,39 @@ author: "Millian Poquet" date: "2024-05-09" params: simulation_aggregated_output: "../expe-sched/simu-campaign-agg-result.csv" + simulation_output: "/tmp/simout/" output: rmdformats::readthedown --- # Introduction -This notebook analyzes the results of the resource management experimental campaign of the article "Light-weight prediction for improving energy consumption in HPC platforms" (sections 6.4 and 6.5) published at Euro-Par 2024. For full context of this experiment please refer to the article preprint, which is available on [[hal long-term open-access link]](https://hal.science/hal-04566184). +This notebook analyzes the results of the resource management experimental campaign of the article "Scheduling with lightweight predictions in power-constrained HPC platforms" (sections 6.E and 6.F) published at IEEE Transactions on Parallel and Distributed Systems. For full context of this experiment please refer to the article preprint. -**Résumé of the experiment.** 30 different 1-day workloads have been extracted at random points in time from [the Marconi 100 trace](https://gitlab.com/ecs-lab/exadata). 
Each workload has been replayed in simulation thanks to the Batsim simulator [[thesis link]](https://hal.science/tel-01757245v2) [[long-term software heritage code permalink]](https://archive.softwareheritage.org/swh:1:rev:ee797ccebbb95410479663ee0547e752112fc83e;origin=https://framagit.org/batsim/batsim.git;visit=swh:1:snp:ec2c0ac3c1fb85d35cc35049f314234cc34124cd). A constraint is set on the power that the whole platform can use during the first 3 hours of the simulation. This is implemented on our EASY backfilling implementation that supports a powercap [[gitlab code link]](https://gitlab.irit.fr/sepia-pub/batsim/easy-powercap) [[software heritage long-term code permalink]](https://archive.softwareheritage.org/swh:1:rev:659660c35650e9f46ec47e8c0743d75649e68d7b;origin=https://framagit.org/batsim/easy-powercap.git;visit=swh:1:snp:03d429f7d797ff3acbea78c29236f22f04984c95), which uses a prediction of the jobs' power consumption to take its decisions. +**Résumé of the experiment.** 30 different 1-day workloads have been extracted at random points in time from [the Marconi 100 trace](https://gitlab.com/ecs-lab/exadata). Each workload has been replayed in simulation thanks to the Batsim simulator [[thesis link]](https://hal.science/tel-01757245v2) [[long-term software heritage code permalink]](https://archive.softwareheritage.org/swh:1:rev:ee797ccebbb95410479663ee0547e752112fc83e;origin=https://framagit.org/batsim/batsim.git;visit=swh:1:snp:ec2c0ac3c1fb85d35cc35049f314234cc34124cd). A constraint is set on the power that the whole platform can use during the first 3 hours of the simulation. 
This is implemented on our EASY backfilling implementation that supports a powercap [[gitlab code link]](https://gitlab.irit.fr/sepia-pub/batsim/easy-powercap/-/tree/TPDS?ref=TPDS) [[software heritage long-term code permalink]](https://archive.softwareheritage.org/swh:1:rev:659660c35650e9f46ec47e8c0743d75649e68d7b;origin=https://framagit.org/batsim/easy-powercap.git;visit=swh:1:snp:03d429f7d797ff3acbea78c29236f22f04984c95), which uses a prediction of the jobs' power consumption to take its decisions. Besides the EASY backfilling, we proposed a greedy knapsack in the same git repository (linked to Section 6.F of the article). -The goal of this notebook is to determine the impact of the job power predictor on the schedules resulting from this scheduling algorithm execution. The notebook takes an aggregation of all the simulation executions as input. The notebook outputs image files that are Figures 4 and 5 of the article, and also provides additional analyses (images + short text analysis) that could not fit in the article page limit. +The goal of this notebook is to determine the impact of the job power predictor on the schedules resulting from this scheduling algorithm execution. The notebook takes an aggregation of all the simulation executions as input. The notebook outputs image files that are Figures 4, 5, 6, and 7 of the article, and also provides additional analyses (images + short text analysis) that could not fit in the article page limit. -## Power predictor naming difference w.r.t. article +# Part one: Predictors analysis + +On this part, we focus on comparing the different predictors. All experiments here use EASY backfilling with FCFS order. + +### Power predictor naming difference w.r.t. article - `upper_bound` is the predictor named `naive` in the article. It assumes that all the nodes allocated to the job are at full power during the whole job execution. 
This is an upper bound on the job power consumption that can be used safely from the scheduler point of view. - `real_mean` is the predictor that uses the real mean power of each job (perfect oracle, unfeasible in practice but shows the best we would get with a perfect predictor) - `real_max` is the predictor that uses the real maximum power of each job (perfect oracle, unfeasible in practice but shows the best we would get with a perfect predictor) - `mean` is the history-based (light-weight) mean job power predictor described in section 4.1 of the article - `max` is the history-based (light-weight) maximum job power predictor described in section 4.1 of the article - `zero` assumes that all jobs consume 0 W. This is strictly equivalent to EASY backfilling without powercap support, and is used as baseline for scheduling metrics. +- `gaussian` is the history-based (light-weight) calculated using the mean and standard deviation (std). We have gaussian 68 (mean + std), gaussian 95 (mean + 2 \* std), and gaussian 99 (mean + 3 \* std) -## Code to read and prepare data +### Code to read and prepare data ```{r, echo = TRUE} set.seed(1) suppressMessages(library(tidyverse)) suppressMessages(library(viridis)) library(knitr) +library(gridExtra) # data extracted from the analysis of the M100 real trace from 2022-01 to 2022-09 nb_nodes = 980 @@ -46,16 +53,21 @@ data = read_csv(params$simulation_aggregated_output, show_col_types = FALSE) %>% start_dt_s = as.factor(start_dt_s), job_power_estimation_field = as.factor(job_power_estimation_field) ) +data = data %>% filter(algo_name == 'easypower') +data = data %>% filter(order == 'fcfs') data$predictor_name = factor(data$predictor_name, - levels=c('upper_bound', 'max', 'real_max', 'real_mean', 'mean', 'zero')) + levels=c('zero', 'upper_bound', 'max', 'real_max', 'gaussian_99', 'real_gaussian_99', 'gaussian_95', 'real_gaussian_95', 'gaussian_68', 'real_gaussian_68', 'mean', 'real_mean')) data = data %>% mutate( predictor_metrics = 
ifelse(predictor_name %in% c('real_max', 'max'), 'max', ifelse(predictor_name %in% c('real_mean', 'mean'), 'mean', + ifelse(predictor_name %in% c('real_gaussian_68', 'gaussian_68'), 'gaussian_68', + ifelse(predictor_name %in% c('real_gaussian_95', 'gaussian_95'), 'gaussian_95', + ifelse(predictor_name %in% c('real_gaussian_99', 'gaussian_99'), 'gaussian_99', 'naive' - )), - predictor_method = ifelse(predictor_name %in% c('mean', 'max'), 'predicted', 'real') + ))))), + predictor_method = ifelse(predictor_name %in% c('mean', 'max', 'gaussian_68', 'gaussian_95', 'gaussian_99'), 'predicted', 'real') ) -data$predictor_metrics = factor(data$predictor_metrics, levels=c('naive', 'max', 'mean')) +data$predictor_metrics = factor(data$predictor_metrics, levels=c('naive', 'max', 'gaussian_99', 'gaussian_95', 'gaussian_68', 'mean')) data$predictor_method = factor(data$predictor_method, levels=c('predicted', 'real')) # compute scheduling metrics against their matching EASY baseline @@ -76,10 +88,10 @@ data_nz = inner_join(data_nz, data_z_joinable, by='start_dt_s') %>% mutate( ) ``` -# Consistency checks +## Consistency checks This section inspects the simulation data to make sure the values are consistent with our expectations on the algorithm. -## During the constrained time window, is the utilization proportional to the powercap value for each (predictor, workload)? **it should** +### During the constrained time window, is the utilization proportional to the powercap value for each (predictor, workload)? 
**it should** ```{r, dev="jpeg", fig.width=10, fig.height=16} data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_utilization / nb_nodes, color=predictor_name)) + @@ -88,7 +100,7 @@ data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_utilization / nb_n geom_abline(slope=1) + theme_bw() + theme(legend.position='top', legend.title=element_blank()) + - guides(color = guide_legend(nrow = 1)) + + guides(color = guide_legend(nrow = 2)) + scale_x_continuous(breaks=seq(0.1,0.7,0.2), labels = scales::percent) + scale_y_continuous(breaks=seq(0,1,0.2), labels = scales::percent) + scale_color_viridis(discrete=TRUE) + @@ -103,7 +115,7 @@ data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_utilization / nb_n **Conclusion**: Yes, almost perfectly proportional for all (workload, predictor) before the utilization becomes saturated. On 5/30 workloads the `max` predictor slightly jumps from one linear trend to another. This is consistent with the first-fit policy of the scheduling algorithm, here we think that EASY becomes able to execute a job whose power consumption is over estimated by `max`, and that EASY then cannot backfill smaller jobs since it thinks that there is not enough available power. -## During the constrained time window, is the utilization roughly proportional to the powercap value for each predictor regardless of the workload? +### During the constrained time window, is the utilization roughly proportional to the powercap value for each predictor regardless of the workload? 
```{r, dev="jpeg", fig.width=10} data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_utilization / nb_nodes, color=predictor_name)) + @@ -112,7 +124,7 @@ data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_utilization / nb_n geom_abline(slope=1) + theme_bw() + theme(legend.position='top', legend.title=element_blank()) + - guides(color = guide_legend(nrow = 1)) + + guides(color = guide_legend(nrow = 2)) + scale_x_continuous(breaks=seq(0,0.7,0.1), labels = scales::percent) + scale_y_continuous(breaks=seq(0,1,0.2), labels = scales::percent) + scale_color_viridis(discrete=TRUE) + @@ -125,11 +137,11 @@ data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_utilization / nb_n **Conclusion**: Yes this is roughly proportional. For some workloads the utilization is saturated while using the `real_mean`/`mean` predictor for powercap > 55 %. -# Per-workload analysis +## Per-workload analysis This section analyzes how the algorithm behaves on each workload. We believe that this is the most important analysis section of this notebook, as scheduling results must be looked at for each workload to make sense. -## During the constrained time window, how much power is consumed on average for each (predictor, workload)? +### During the constrained time window, how much power is consumed on average for each (predictor, workload)? 
```{r, dev="jpeg", fig.width=10, fig.height=16} data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_power/max_dynamic_power, color=predictor_name)) + @@ -138,7 +150,7 @@ data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_power/max_dynamic_ geom_abline(slope=1) + theme_bw() + theme(legend.position='top', legend.title=element_blank()) + - guides(color = guide_legend(nrow = 1)) + + guides(color = guide_legend(nrow = 2)) + scale_x_continuous(breaks=seq(0.1,0.7,0.2), labels = scales::percent) + scale_y_continuous(breaks=seq(0,1,0.2), labels = scales::percent) + scale_color_viridis(discrete=TRUE) + @@ -154,20 +166,19 @@ data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_power/max_dynamic_ - This plot has the same shape as the corresponding utilization plot, which is expected. The main difference is that the maximum mean power consumption is around 70 % while the utilization goes up to 100 %. -- The `real_mean` > `real_max` > `upper_bound` predictor order in terms of mean power consumption holds for all workloads. -- The `mean` > `max` > `upper_bound` predictor order in terms of mean power consumption holds for all workloads. +- The `real_mean` > `gaussians` > `real_max` > `upper_bound` predictor order in terms of mean power consumption holds for all workloads. +- The `mean` > `gaussians` > `max` > `upper_bound` predictor order in terms of mean power consumption holds for all workloads. - Using the `mean` history-based predictor instead of the real value `real_mean` (which cannot be used in practice as it is unknown at decision taking time, but which represents a perfect oracle estimator without error) has almost no impact on the power used during the constrained time window. - Using the `max` history-based predictor instead of the real value `real_max` (which cannot be used in practice as it is unknown at decision taking time, but which represents a perfect oracle estimator without error) decreases the mean power consumption. 
The decrease is very small on some workloads (*e.g.*, almost no impact on workload 19389030), but quite strong on other workloads (*e.g.*, on workload 10061708 while using powercap=70%, the mean power consumption moves from ~50 % with `real_max` to ~30 % with `max`). -## How is the scheduling performance (as measured by mean turnaround time) impacted by each predictor, for all workloads? +### How is the scheduling performance (as measured by mean turnaround time) impacted by each predictor, for all workloads? ```{r, dev="jpeg", fig.width=10, fig.height=16} data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_turnaround_time_minus_zero, color=predictor_name)) + geom_point() + - #geom_smooth(method = "lm", se = FALSE) + geom_hline(yintercept=0) + theme_bw() + theme(legend.position='top', legend.title=element_blank()) + - guides(color = guide_legend(nrow = 1)) + + guides(color = guide_legend(nrow = 2)) + scale_x_continuous(breaks=seq(0,0.7,0.2), labels = scales::percent) + scale_y_continuous() + scale_color_viridis(discrete=TRUE) + @@ -193,7 +204,7 @@ data_nz %>% geom_hline(yintercept=0) + theme_bw() + theme(legend.position='top', legend.title=element_blank()) + - guides(color = guide_legend(nrow = 1)) + + guides(color = guide_legend(nrow = 2)) + scale_x_continuous(breaks=seq(0,0.7,0.2), labels = scales::percent) + scale_y_continuous() + scale_color_viridis(discrete=TRUE) + @@ -207,21 +218,20 @@ data_nz %>% **Conclusions**: The scheduling performance degradation is clearly linear to the powercap value on some workloads (*e.g.*, 3079185 and 7934521), has a linear trend but with noise on most workloads (*e.g.*, 17539280), and is not linear on some workloads (*e.g.*,19389030 for predictors that are not `upper_bound`). Additionnally: -- The `mean` > `max` > `upper_bound` predictor order in terms of scheduling performance holds for most workloads. 
-- The `real_mean` > `real_max` > `upper_bound` predictor order in terms of scheduling performance holds for most workloads. +- The `mean` > `gaussians` > `max` > `upper_bound` predictor order in terms of scheduling performance holds for most workloads. +- The `real_mean` > `gaussians` > `real_max` > `upper_bound` predictor order in terms of scheduling performance holds for most workloads. - The mean turnaround time difference metrics spans on the same range of values for most workloads, which means **this metrics can be aggregated over all workloads** without a per-workload normalization step. - Similarly to the mean power consumption during the constrained time window metrics, using `mean` instead of `real_mean` seems to have very little impact on the mean turnaround time metrics on most workloads. - Using `max` instead of `real_max` has a small impact (small performance degradation) on the mean turnaround time metrics on most workloads. -# Analysis aggregating all workloads together +## Analysis aggregating all workloads together While we think that per-workload analysis is the most relevant, it obviously cannot fit in the 1.5-page window dedicated to the analysis in the article, as per-workload view of the data takes a lot of place. This section aggregates the result seen previously in smaller figures that can fit in the paper, and does additional analysis on the whole dataset. -## During the constrained time window, how far is the mean power compared to the powercap value for each predictor? +### During the constrained time window, how far is the mean power compared to the powercap value for each predictor? 
```{r, fig.width=10, fig.height=6} data_nz %>% - #filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>% mutate(powercap_label = sprintf("pcap=%g", powercap_dynamic_value_ratio)) %>% ggplot() + geom_hline(aes(yintercept=powercap_dynamic_value_ratio)) + @@ -249,7 +259,7 @@ data_nz %>% The final version seen in the article (Figure 4) is very similar, but for the sake of font readibility only half of the powercap values are shown. ```{r, fig.width=8, fig.height=4} -powercap_ratios_values_to_show = seq(0.1, 0.7, 0.1) +powercap_ratios_values_to_show = c(0.4, 0.5, 0.6, 0.7) scale=0.9 width_scale=0.3 data_nz %>% @@ -274,7 +284,7 @@ data_nz %>% x="Job power estimator" ) + scale_fill_grey(start=0.8, end=1) -ggsave("./fig4-sched-mean-power-distribution.svg", width=8*scale, height=4*scale) +ggsave("./sched-mean-power-distribution.svg", width=8*scale, height=4*scale) ``` Here is the code that produces the summarized power underutilization values seen in Section 6.5 of the article. @@ -283,6 +293,21 @@ The **average** value has been used in the article. ```{r} t = data_nz %>% + filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>% + mutate(power_underutilization_ratio = (powercap_dynamic_watts - mean_power)/powercap_dynamic_watts) %>% + group_by(predictor_name) %>% + summarize( + average_power_underutilization_ratio = mean(power_underutilization_ratio), + median_power_underutilization_ratio = median(power_underutilization_ratio), + ) +knitr::kable(t) +``` + +Same table as before, but ignoring 0.7 + +```{r} +t = data_nz %>% + filter(powercap_dynamic_value_ratio %in% c(0.4, 0.5, 0.6)) %>% mutate(power_underutilization_ratio = (powercap_dynamic_watts - mean_power)/powercap_dynamic_watts) %>% group_by(predictor_name) %>% summarize( @@ -292,7 +317,7 @@ t = data_nz %>% knitr::kable(t) ``` -## How is the scheduling performance degraded by each predictor? +### How is the scheduling performance degraded by each predictor? 
Very similarly to the previous plot, here is how Figure 5 of the article is produced. ```{r, fig.width=8, fig.height=4} @@ -317,7 +342,7 @@ data_nz %>% x="Job power estimator" ) + scale_fill_grey(start=0.8, end=1) -ggsave("./fig5-sched-mtt-diff-distribution.svg", width=8*scale, height=4*scale) +ggsave("./sched-mtt-distribution.svg", width=8*scale, height=4*scale) ``` Here is the code that produces the summarized scheduling performance degradation values seen in Section 6.5 of the article. @@ -326,6 +351,7 @@ The `average_mtt_increase_ratio` value (**average** of normalized mean turnaroun ```{r} t = data_nz %>% + filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>% filter(start_dt_s != outlier_workload_start_dt_s) %>% group_by(predictor_name) %>% summarize( @@ -336,15 +362,20 @@ t = data_nz %>% knitr::kable(t) ``` -## How much energy is consumed during the time window compared to the energy that should be used by being at the powercap value for the whole window duration? +### How much energy is consumed during the time window compared to the energy that should be used by being at the powercap value for the whole window duration? ```{r, fig.height = 4} -data_nz %>% ggplot() + +data_nz %>% + filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>% + ggplot() + geom_hline(yintercept=0) + geom_violin(aes(x=predictor_name, y=energy_from_powercap / 1e9)) + geom_jitter(aes(x=predictor_name, y=energy_from_powercap / 1e9), alpha=0.1) + geom_boxplot(aes(x=predictor_name, y=energy_from_powercap / 1e9), width=0.025, outlier.shape=NA) + theme_bw() + + theme( + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), + ) + labs( x="Power predictor", y="Distribution of the energy consumed (GJ)" @@ -353,21 +384,28 @@ data_nz %>% ggplot() + **Conclusions**: Energy values are consistent with the previous power plots. 
We can see that only the `mean` and `real_mean` used more energy than what the powercap enables on the analyzed workloads. We can see that `mean` frequently leads to more energy being used than what the powercap enables. -## How is the powercap exceeded during the time window for each predictor? -Whether the powercap has been exceeded or not has been computed for each second of each simulation. +### How is the powercap exceeded during the time window for each predictor? +Whether the powercap has been exceeded or not has been computed for each second of each simulation. The output is presented as Figure 6 in the article. ```{r, fig.height = 4} -data_nz %>% ggplot() + +data_nz %>% + filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>% + filter(predictor_metrics != 'naive') %>% + ggplot() + geom_hline(yintercept=0) + - geom_violin(aes(x=predictor_name, y=nb_seconds_above_powercap/constrained_time_window_duration_seconds)) + - geom_jitter(aes(x=predictor_name, y=nb_seconds_above_powercap/constrained_time_window_duration_seconds), alpha=0.1) + - geom_boxplot(aes(x=predictor_name, y=nb_seconds_above_powercap/constrained_time_window_duration_seconds), width=0.025, outlier.shape=NA) + + geom_jitter(aes(x=predictor_metrics, y=nb_seconds_above_powercap/constrained_time_window_duration_seconds), alpha=0.1, height=0) + + geom_boxplot(aes(x=predictor_metrics, y=nb_seconds_above_powercap/constrained_time_window_duration_seconds), width=0.1, outlier.shape=NA) + theme_bw() + + theme( + axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1), + ) + + facet_wrap(vars(predictor_method), nrow=1) + labs( - x="Power predictor", - y="Proportion of time above powercap" + x="Job power estimator", + y="Proportion of time \n above powercap" ) + scale_y_continuous(labels = scales::percent) +ggsave("./proportion-above-powercap.svg", width=8*scale, height=3*scale) ``` **Conclusions**: Only `real_mean` and `mean` exceed the powercap on the analyzed workloads. 
They both exceed the powercap frequently, but `mean` breaks the powercap more frequently than `real_mean`. @@ -380,6 +418,7 @@ The **average** and **median** values have been used in the article. ```{r} t = data_nz %>% + filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>% mutate(powercap_break = pmax(max_power_from_powercap, 0)) %>% mutate(powercap_break_ratio = powercap_break / powercap_dynamic_watts) %>% group_by(predictor_name) %>% @@ -394,11 +433,13 @@ Similarly, here is the code that computes in how many cases the powercap is exce ```{r} nb_simus = data_nz %>% + filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>% group_by(predictor_name) %>% summarize( total_count = n() ) breaks = data_nz %>% + filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>% mutate(powercap_break = pmax(max_power_from_powercap, 0)) %>% filter(powercap_break > 0) %>% group_by(predictor_name) %>% @@ -411,3 +452,182 @@ knitr::kable(t) ``` **Erratum note**: The first submitted version of the article states that `mean` breaks the powercap in 38 % of instances. The computation was wrong, `mean` breaks the powercap in 95 % of instances, and `real_mean` breaks the powercap 94 % of instances. + +# Part two: Other scheduling algorithms with power prediction + +Now, the results comparing different scheduling algorithms using Gaussian 99 power verification. + +### Scheduling algorithms names +- Easy FCFS: The same EASY backfilling as before, that is, EASY backfilling sorting the waiting queue by FCFS; +- Easy SAF: EASY backfilling with power capping but sorting the the waiting queue by Smallest Area First (SAF). The area is calculated by multiplying the job walltime by the number of resources demanded; +- Knapsack Wait.: Knapsack using power as weight and waiting time as profit. The knapsack is applied during the power capping time window. 
Outside this window, it applies Easy FCFS; +- Knapsack Stretch: Knapsack using power as weight and Stretch as profit. Stretch is waiting time plus expected execution time divided by expected execution time. The knapsack is applied during the power capping time window. Outside this window, it applies Easy FCFS; + +## Code to read and prepare data + +```{r, echo = TRUE} +# read input data, fix types, reorder predictor and split predictors in categories +data = read_csv(params$simulation_aggregated_output, show_col_types = FALSE) %>% mutate( + start_dt_s = as.factor(start_dt_s), + job_power_estimation_field = as.factor(job_power_estimation_field) +) + +data = data %>% filter(predictor_name == 'zero' | predictor_name == 'gaussian_99' | predictor_name == 'real_gaussian_99') +data = data %>% filter(powercap_dynamic_value_ratio == 0.5) + +data = data %>% mutate( + algo_adjusted_name = ifelse(algo_name == 'easypower' & order == 'fcfs', 'Easy FCFS', + ifelse(algo_name == 'easypower' & order == 'saf', 'Easy SAF', + ifelse(algo_name == 'easypower' & order == 'turnaround_saf', 'Easy Turn.+SAF', + ifelse(algo_name == 'knapsack_greedy' & type_knapsack == 'waiting_time', 'Knapsack Wait.', 'Knapsack Stretch' + )))), + predictor_method = ifelse(predictor_name %in% c('gaussian_68', 'gaussian_95', 'gaussian_99'), 'predicted', 'real') +) + +data$algo_adjusted_name = factor(data$algo_adjusted_name, levels=c('Easy FCFS', 'Easy SAF', 'Easy Turn.+SAF', 'Knapsack Wait.', 'Knapsack Stretch')) +data$predictor_method = factor(data$predictor_method, levels=c('predicted', 'real')) + +data = data %>% filter(algo_adjusted_name != 'Easy Turn.+SAF') + +# compute scheduling metrics against their matching EASY baseline +data_nz = data %>% filter(predictor_name != 'zero') +data_z = data %>% filter(predictor_name == 'zero' & + powercap_dynamic_value_ratio == max(data$powercap_dynamic_value_ratio)) + +data_z_joinable = data_z %>% transmute( + start_dt_s = start_dt_s, + zero_mean_utilization = 
mean_utilization, + zero_max_utilization = max_utilization, + zero_mean_turnaround_time = mean_turnaround_time, + zero_max_turnaround_time = max_turnaround_time, + zero_mean_waiting_time = mean_waiting_time, + zero_max_waiting_time = max_waiting_time, +) + +data_nz = inner_join(data_nz, data_z_joinable, by='start_dt_s') %>% mutate( + mean_turnaround_time_minus_zero = mean_turnaround_time - zero_mean_turnaround_time, +) %>% mutate( + mean_turnaround_time_increase_ratio = mean_turnaround_time_minus_zero / zero_mean_turnaround_time +) %>% mutate( + max_turnaround_time_minus_zero = max_turnaround_time - zero_max_turnaround_time +)%>% mutate( + mean_waiting_time_minus_zero = mean_waiting_time - zero_mean_waiting_time +)%>% mutate( + max_waiting_time_minus_zero = max_waiting_time - zero_max_waiting_time +) +``` + +## Create graphs +This code generates the figure 7. + +```{r, fig.width=8, fig.height=6.7} +scale = 0.8 +p1 = data_nz %>% + ggplot() + + geom_hline(aes(yintercept=powercap_dynamic_value_ratio), linewidth=width_scale) + + geom_boxplot(aes(y=mean_power/max_dynamic_power, x=algo_adjusted_name, fill=predictor_method), linewidth=width_scale, outlier.size=width_scale) + + theme_bw() + + theme( + legend.position=c(0.7, 0.15), + legend.direction='horizontal', + legend.title=element_blank(), + legend.background=element_rect(color='black'), + plot.title = element_text(hjust=0.5), + axis.text.x = element_text(angle = 45, vjust = 0.8, hjust=0.8), + plot.margin = margin(0, 0.1, 0, 0.1, "cm") + ) + + expand_limits(x=0) + + labs( + y="Mean power consumption", + x="", + title = "(a)" + ) + + scale_fill_grey(start=0.8, end=1) + +p2 = data_nz %>% + ggplot() + + geom_boxplot(aes(y=-mean_turnaround_time_minus_zero, x=algo_adjusted_name, fill=predictor_method), linewidth=width_scale, outlier.size=width_scale) + + theme_bw() + + theme( + legend.position="none", + plot.title = element_text(hjust=0.5), + axis.text.x = element_text(angle = 45, vjust = 0.8, hjust=0.8), + plot.margin = 
margin(0, 0.1, 0, 0.1, "cm") + ) + + labs( + y="Mean turnaround improvement (s)", + x="", + title = "(b)" + ) + + scale_fill_grey(start=0.8, end=1) + +p3 = data_nz %>% + ggplot() + + geom_boxplot(aes(y=max_turnaround_time_minus_zero, x=algo_adjusted_name, fill=predictor_method), linewidth=width_scale, outlier.size=width_scale) + + theme_bw() + + theme( + legend.position="none", + plot.title = element_text(hjust=0.5), + axis.text.x = element_text(angle = 45, vjust = 0.8, hjust=0.8), + plot.margin = margin(0, 0.1, 0, 0.1, "cm") + ) + + labs( + y="Max turnaround increase (s)", + x="", + title = "(c)" + ) + + scale_fill_grey(start=0.8, end=1) + +p4 = data_nz %>% + ggplot() + + geom_boxplot(aes(y=perc_jobs_inside_window * 100, x=algo_adjusted_name, fill=predictor_method), linewidth=width_scale, outlier.size=width_scale) + + theme_bw() + + theme( + legend.position="none", + plot.title = element_text(hjust=0.5), + axis.text.x = element_text(angle = 45, vjust = 0.8, hjust=0.8), + plot.margin = margin(0, 0.1, 0, 0.1, "cm") + ) + + labs( + y="Jobs inside window (%)", + x="", + title = "(d)" + ) + + scale_fill_grey(start=0.8, end=1) + +layout <- matrix(c(1, 2, 3, 4), ncol = 2, nrow=2, byrow = TRUE) +g <- arrangeGrob(p1, p2, p3, p4, layout_matrix = layout, bottom = "Scheduling") +ggsave(file="./sched-algo-comparison.svg", g) +``` + +## Mean power consumption + +```{r, fig.height=4} +p1 +``` + +**Conclusions**: We can see that the two knapsack are closer to the power capping over the experiments. However, they are more sensible to the variations of prediction. Regarding the real values, they have the best usage of the power. + +## Mean turnaround improvement + +```{r, fig.height=4} +p2 +``` + +**Conclusions**: This graph shows the mean improvement compared to the baseline (EASY FCFS without power capping). EASY FCFS with power capping is worst than the baseline, but EASY SAF and both knapsack are better. 
EASY SAF is the best one in this metric, but we will see in the next graph its drawback. + +## Max turnaround increase + +```{r, fig.height=4} +p3 +``` + +**Conclusions**: Here we see the main drawback of EASY SAF. Its worst job regarding turnaround has a very high waiting time. On the other hand, both knapsacks are balanced, having similar values to EASY FCFS. + +## Jobs inside window + +```{r, fig.height=4} +p4 +``` + +**Conclusions**: In this graph, we can see that EASY SAF and both knapsacks increase the number of jobs inside the power capping window, having better usage of the energy. However, this can explain why they are close to violating the capping (more jobs, more uncertainty). \ No newline at end of file diff --git a/scripts-py/expe_energumen/m100_agg_power_predictions.py b/scripts-py/expe_energumen/m100_agg_power_predictions.py index 421064d83fe8e79acf7ff06e77478c52def50f80..e5afba2e883e6737c7bdf391edca3bfa8a44c619 100644 --- a/scripts-py/expe_energumen/m100_agg_power_predictions.py +++ b/scripts-py/expe_energumen/m100_agg_power_predictions.py @@ -7,8 +7,14 @@ import pandas as pd FILENAME_PARSE_REGEX = '''.*/filter123_user_(\d+)_total_power_mean_pred\.csv$''' -def read_aggregate_one_dir(dir, estimated_metrics): +def read_aggregate_one_dir(dir, estimated_metrics, estimated=True): full_df = None + field_to_get = f'hist_pred_total_power_{estimated_metrics}' + if not estimated: + field_to_get = f'total_power_{estimated_metrics}_watts' + field_to_save = f'{estimated_metrics}_power_estimation' + if not estimated: + field_to_save = f'{estimated_metrics}_power_real' r = re.compile(FILENAME_PARSE_REGEX) for filename in glob.glob(f'{dir}/*.csv'): m = r.match(filename) @@ -18,8 +24,8 @@ def read_aggregate_one_dir(dir, estimated_metrics): df = pd.read_csv(filename, low_memory=False) df['user_id'] = user_id - df[f'{estimated_metrics}_power_estimation'] = df[f'hist_pred_total_power_{estimated_metrics}'] - df = df[['job_id', 'user_id',
f'{estimated_metrics}_power_estimation']] + df[field_to_save] = df[field_to_get] + df = df[['job_id', 'user_id', field_to_save]] if full_df is None: full_df = df @@ -32,7 +38,9 @@ def read_aggregate_one_dir(dir, estimated_metrics): def read_aggregate_root_dir(root_dir): full_df_mean = read_aggregate_one_dir(f'{root_dir}/total_power_mean_predictions_users_allmethods_mean', 'mean') full_df_max = read_aggregate_one_dir(f'{root_dir}/total_power_mean_predictions_users_allmethods_max', 'max') - return full_df_mean.merge(full_df_max) + full_df_std = read_aggregate_one_dir(f'{root_dir}/total_power_mean_predictions_users_allmethods_std', 'std') + full_df_std_real = read_aggregate_one_dir(f'{root_dir}/total_power_mean_predictions_users_allmethods_std', 'std', False) + return full_df_mean.merge(full_df_max).merge(full_df_std).merge(full_df_std_real) def agg_all_files(): parser = argparse.ArgumentParser() diff --git a/scripts-py/expe_energumen/m100_compute_gantt_power_consumption.py b/scripts-py/expe_energumen/m100_compute_gantt_power_consumption.py index e3d02a807c81fd9894b462ef0e2ca9c04d58fde4..1a6d112245d85cc07630741768e172b7524f644c 100644 --- a/scripts-py/expe_energumen/m100_compute_gantt_power_consumption.py +++ b/scripts-py/expe_energumen/m100_compute_gantt_power_consumption.py @@ -9,6 +9,7 @@ import pandas as pd def main(): datetime_parser = lambda f: datetime.datetime.strptime(f, '%Y-%m-%d %H:%M:%S') parser = argparse.ArgumentParser() + parser.add_argument("input_schedule", help='path to the Batsim schedule output CSV file') parser.add_argument("input_jobs_gantt", help='path to the Batsim jobs output CSV file') parser.add_argument("input_batsim_workload", help='path to the Batsim workload JSON file') parser.add_argument("input_workload_root_path", help="filepath to the location of the root directory of the generated workloads") @@ -21,7 +22,7 @@ def main(): with open(args.input_batsim_workload) as f: batsim_workload = json.load(f) batsim_out_jobs = 
pd.read_csv(args.input_jobs_gantt) - + batsim_out_schedule = pd.read_csv(args.input_schedule) # determine which jobs are in the computation window jobs_in_window_mask = batsim_out_jobs['starting_time'] < args.end jobs_in_window = batsim_out_jobs[jobs_in_window_mask].copy().reset_index() @@ -42,6 +43,9 @@ def main(): utilization_during_window = np.zeros(window_nb_values) assert window_nb_values == len(platform_power_values_during_window) + perc_jobs_inside = len(jobs_in_window) / batsim_out_schedule["nb_jobs"][0] + jobs_inside = len(jobs_in_window) + total_jobs = int(batsim_out_schedule["nb_jobs"][0]) # compute the power consumed of all jobs in [0, end]. # - use input data when the job is scheduled # - complete with zeros @@ -107,6 +111,11 @@ def main(): mean_turnaround_time = batsim_out_jobs['turnaround_time'].mean() mean_slowdown = batsim_out_jobs['stretch'].mean() + max_waiting_time = batsim_out_jobs['waiting_time'].max() + max_turnaround_time = batsim_out_jobs['turnaround_time'].max() + quantiles_waiting = np.quantile(batsim_out_jobs['waiting_time'], [0.01, 0.1, 0.5, 0.9, 0.99]) + quantiles_turnaround = np.quantile(batsim_out_jobs['turnaround_time'], [0.01, 0.1, 0.5, 0.9, 0.99]) + # other infrastructure metrics mean_utilization = utilization_during_window.mean() max_utilization = utilization_during_window.max() @@ -132,8 +141,27 @@ def main(): 'mean_turnaround_time': mean_turnaround_time, 'mean_slowdown': mean_slowdown, + 'max_waiting_time': max_waiting_time, + 'max_turnaround_time': max_turnaround_time, + 'mean_utilization': mean_utilization, 'max_utilization': max_utilization, + + 'waiting_p1': quantiles_waiting[0], + 'waiting_p10': quantiles_waiting[1], + 'waiting_p50': quantiles_waiting[2], + 'waiting_p90': quantiles_waiting[3], + 'waiting_p99': quantiles_waiting[4], + + 'turnaround_p1': quantiles_turnaround[0], + 'turnaround_p10': quantiles_turnaround[1], + 'turnaround_p50': quantiles_turnaround[2], + 'turnaround_p90': quantiles_turnaround[3], + 
'turnaround_p99': quantiles_turnaround[4], + + 'jobs_inside_window': jobs_inside, + 'perc_jobs_inside_window': perc_jobs_inside, + 'total_jobs': total_jobs, } print(json.dumps(metrics, sort_keys=True, allow_nan=False)) diff --git a/scripts-py/expe_energumen/m100_generate_batsim_workload.py b/scripts-py/expe_energumen/m100_generate_batsim_workload.py index d6ec821c531efd976137e8caf4ab71a83f09c9b1..bb13d699e68c9a700c48f6849c3c3ff781c978a2 100644 --- a/scripts-py/expe_energumen/m100_generate_batsim_workload.py +++ b/scripts-py/expe_energumen/m100_generate_batsim_workload.py @@ -160,6 +160,8 @@ def main(): 'zero_power_estimation': float(row['zero_power_estimation']), 'mean_power_estimation': float(row['mean_power_estimation'] * row['num_nodes']), 'max_power_estimation': float(row['max_power_estimation'] * row['num_nodes']), + 'std_power_estimation': float(row['std_power_estimation'] * row['num_nodes']), + 'std_power_real': float(row['std_power_real'] * row['num_nodes']), 'upper_bound_power_estimation': float(row['upper_bound_power_estimation']), 'job_details_filepath': job_profile_dir_suffix, } diff --git a/scripts-py/expe_energumen/m100_generate_expe_params.py b/scripts-py/expe_energumen/m100_generate_expe_params.py index 743b523fb179a93ebc4e50da07614208335303f8..4a08918f8ee6ada934680bfe8aac1a46b788dba1 100644 --- a/scripts-py/expe_energumen/m100_generate_expe_params.py +++ b/scripts-py/expe_energumen/m100_generate_expe_params.py @@ -23,7 +23,7 @@ def main(): 'powercap_dynamic_watts': int(i * 0.01 * max_dynamic_power), 'normal_dynamic_watts': max_dynamic_power, 'idle_watts': min_power_per_node, - } for i in range(10,71,5)] + } for i in range(10,91,10)] powercap_durations = [ {'powercap_end_time_seconds': 60*60*3}, @@ -31,12 +31,18 @@ def main(): algo_name = 'easypower' predictors = [ - {'algo_name': algo_name, 'predictor_name': 'zero', 'job_power_estimation_field': 'zero_power_estimation'}, - {'algo_name': algo_name, 'predictor_name': 'mean', 
'job_power_estimation_field': 'mean_power_estimation'}, - {'algo_name': algo_name, 'predictor_name': 'max', 'job_power_estimation_field': 'max_power_estimation'}, - {'algo_name': algo_name, 'predictor_name': 'upper_bound', 'job_power_estimation_field': 'upper_bound_power_estimation'}, - {'algo_name': algo_name, 'predictor_name': 'real_mean', 'job_power_estimation_field': 'real_mean_power_estimation'}, - {'algo_name': algo_name, 'predictor_name': 'real_max', 'job_power_estimation_field': 'real_max_power_estimation'}, + {'algo_name': algo_name, 'predictor_name': 'zero', 'job_power_estimation_field': 'zero_power_estimation', "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'mean', 'job_power_estimation_field': 'mean_power_estimation', "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'max', 'job_power_estimation_field': 'max_power_estimation', "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'upper_bound', 'job_power_estimation_field': 'upper_bound_power_estimation', "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'real_mean', 'job_power_estimation_field': 'real_mean_power_estimation', "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'real_max', 'job_power_estimation_field': 'real_max_power_estimation', "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'gaussian_68', 'job_power_estimation_field': 'gaussian', 'sigma_times': 1, "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'gaussian_95', 'job_power_estimation_field': 'gaussian', 'sigma_times': 2, "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'gaussian_99', 'job_power_estimation_field': 'gaussian', 'sigma_times': 3, "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'real_gaussian_68', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 1, "order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'real_gaussian_95', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 2, 
"order": "fcfs"}, + {'algo_name': algo_name, 'predictor_name': 'real_gaussian_99', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 3, "order": "fcfs"}, ] platforms = [ @@ -54,6 +60,41 @@ def main(): instance_hash = sha1(encoded_without_hash).hexdigest() instances[instance_hash] = instance nb_instances += 1 + + predictors = [ + {'algo_name': 'easypower', 'predictor_name': 'gaussian_68', 'job_power_estimation_field': 'gaussian', 'sigma_times': 1, "order": "saf"}, + {'algo_name': 'easypower', 'predictor_name': 'gaussian_95', 'job_power_estimation_field': 'gaussian', 'sigma_times': 2, "order": "saf"}, + {'algo_name': 'easypower', 'predictor_name': 'gaussian_99', 'job_power_estimation_field': 'gaussian', 'sigma_times': 3, "order": "saf"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'gaussian_68', 'job_power_estimation_field': 'gaussian', 'sigma_times': 1, "type_knapsack": "waiting_time"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'gaussian_95', 'job_power_estimation_field': 'gaussian', 'sigma_times': 2, "type_knapsack": "waiting_time"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'gaussian_99', 'job_power_estimation_field': 'gaussian', 'sigma_times': 3, "type_knapsack": "waiting_time"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'gaussian_68', 'job_power_estimation_field': 'gaussian', 'sigma_times': 1, "type_knapsack": "waiting_time_ratio"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'gaussian_95', 'job_power_estimation_field': 'gaussian', 'sigma_times': 2, "type_knapsack": "waiting_time_ratio"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'gaussian_99', 'job_power_estimation_field': 'gaussian', 'sigma_times': 3, "type_knapsack": "waiting_time_ratio"}, + {'algo_name': 'easypower', 'predictor_name': 'real_gaussian_68', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 1, "order": "saf"}, + {'algo_name': 'easypower', 'predictor_name': 'real_gaussian_95', 'job_power_estimation_field': 
'real_gaussian', 'sigma_times': 2, "order": "saf"}, + {'algo_name': 'easypower', 'predictor_name': 'real_gaussian_99', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 3, "order": "saf"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'real_gaussian_68', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 1, "type_knapsack": "waiting_time"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'real_gaussian_95', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 2, "type_knapsack": "waiting_time"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'real_gaussian_99', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 3, "type_knapsack": "waiting_time"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'real_gaussian_68', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 1, "type_knapsack": "waiting_time_ratio"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'real_gaussian_95', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 2, "type_knapsack": "waiting_time_ratio"}, + {'algo_name': 'knapsack_greedy', 'predictor_name': 'real_gaussian_99', 'job_power_estimation_field': 'real_gaussian', 'sigma_times': 3, "type_knapsack": "waiting_time_ratio"}, + ] + + powercaps = [{ + 'powercap_dynamic_value_ratio': f'{50 * 0.01:.2f}', + 'powercap_dynamic_watts': int(50 * 0.01 * max_dynamic_power), + 'normal_dynamic_watts': max_dynamic_power, + 'idle_watts': min_power_per_node, + }] + for instance_t in itertools.product(predictors, powercaps, powercap_durations, platforms, wl_params): + instance = reduce(lambda a,b: {**a, **b}, instance_t) + encoded_without_hash = json.dumps(instance, sort_keys=True).encode('utf-8') + instance_hash = sha1(encoded_without_hash).hexdigest() + instances[instance_hash] = instance + nb_instances += 1 + assert nb_instances == len(instances), 'collision: two instances have the same hash' f = sys.stdout diff --git 
a/scripts-py/expe_energumen/m100_run_batsim_instances.py b/scripts-py/expe_energumen/m100_run_batsim_instances.py index 2cd2b4cba7c791f8507cfeb1afdd991a33f672a6..7b5560ce1a1e01d84c1908356de63fcbe70d5e09 100644 --- a/scripts-py/expe_energumen/m100_run_batsim_instances.py +++ b/scripts-py/expe_energumen/m100_run_batsim_instances.py @@ -61,6 +61,7 @@ def manage_batsim_instance(instance_hash, instance, output_dir, workloads_dir): 'm100-compute-gantt-power-consumption', '--powercap_watts', f'{instance["powercap_dynamic_watts"]}', '-o', f'{instance_dir}/', + f'{instance_dir}/schedule.csv', f'{instance_dir}/jobs.csv', f'{workloads_dir}/wload_delay_{instance["start_dt_s"]}.json', f'{workloads_dir}',