Commit 5768c146 authored by Millian Poquet

clean files not used in final europar24 artifacts

parent a0ac9342
# Reproduction prerequisites
Install Nix, with the `nix` command and `flakes` experimental features enabled (typically by adding `experimental-features = nix-command flakes` to your Nix configuration).
TODO: explain how this is done currently and link to doc for later on
# Reproduce M100 data analysis
Unless otherwise specified, all commands should be run from the root of this Git repository.
## download M100 traces
TODO
## extract M100 traces
TODO
## aggregate 2022 power traces
### filtering note
28249 measures on node 155 are odd: total_power=p0_power=p1_power=0 W.
For reference, the second-lowest total_power value observed on any node is 240 W.
Filtering: we therefore removed all measures with total_power=0; this concerns exactly those 28249 measures, all on node 155.
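For illustration, here is a minimal pandas sketch of the filter described above. The input file name and the column names (`node`, `total_power`) are assumptions for the example, not necessarily the actual schema of the traces:
```py
import pandas as pd

# Hypothetical illustration of the filter: drop the total_power=0 measures,
# all of which belong to node 155. File and column names are assumed.
df = pd.read_csv('m100-data/22-01_power.csv')     # hypothetical file name
zero_power = df['total_power'] == 0               # assumed column name
assert (df.loc[zero_power, 'node'] == 155).all()  # sanity check: only node 155 is affected
df = df[~zero_power]
```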
```sh
nix develop .#aggregate-power-m100-months --command m100-agg-power-months ./m100-data/ ./m100-data/22-agg_ 22-01 22-02 22-03 22-04 22-05 22-06 22-07 22-08 22-09
```
### plot ecdf of power values
```sh
nix develop .#analyze-m100-agg-power --command Rscript ./scripts-r/visualize-m100-power-ecdf.R ./m100-data/22-agg_power_total.csv ./m100-data/viz-22-ecdf_power_total.pdf total
```
### plot power distribution of each node (computationally heavy)
```sh
nix develop .#analyze-m100-agg-power --command Rscript ./scripts-r/visualize-m100-power-distrib.R ./m100-data/22-agg_power_total.csv ./m100-data/viz-22-distrib_power_total.pdf total
```
### plot power ecdf of each node
```sh
nix develop .#analyze-m100-agg-power --command Rscript ./scripts-r/visualize-m100-power-ecdf.R ./m100-data/22-agg_power_total.csv /tmp/meh.pdf total
```
### compute quantiles on aggregated 2022 power traces
```sh
nix develop .#analyze-m100-agg-power --command Rscript ./scripts-r/m100-per-machine-model.R ./m100-data/22-agg_power_total.csv ./m100-data/22-powermodel_total.csv
```
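The aggregated file is a histogram with one row per `(node, power)` pair and an occurrence count `nbocc`; the quantile script expands it and computes 11 per-node percentiles (p0 to p100 in steps of 10). Below is a rough pandas sketch of the same computation, given as an illustration only; `scripts-r/m100-per-machine-model.R` (shown further down) is the reference implementation:
```py
import numpy as np
import pandas as pd

# Sketch of the per-node percentile computation (the R script is the reference).
agg = pd.read_csv('m100-data/22-agg_power_total.csv')  # columns: node, power, nbocc
agg = agg[(agg['node'] != 155) | (agg['power'] > 0)]   # same filter as in the filtering note

probs = [i / 10 for i in range(11)]  # deciles: p0, p10, ..., p100

def node_quantiles(group):
    # expand the (power, nbocc) histogram, then take the percentiles
    values = np.repeat(group['power'].to_numpy(), group['nbocc'].to_numpy())
    return pd.Series(np.quantile(values, probs), index=[f'p{i * 10}' for i in range(11)])

powermodel = agg.groupby('node').apply(node_quantiles).reset_index()
powermodel.to_csv('m100-data/22-powermodel_total.csv', index=False)
```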
# generate workload
## power estimation of each job
This assumes that you have a dump produced by Danilo's scripts.
```sh
nix develop .#use-python-scripts --command m100-agg-danilo-estimations danilo-dump ./m100-data/22-job-power-estimations.csv
```
## put all useful job information into a single file
```sh
nix develop .#use-python-scripts --command m100-agg-jobs-info ./m100-data/ ./m100-data/22-jobs.csv 22-01 22-02 22-03 22-04 22-05 22-06 22-07 22-08 22-09
```
## merge power estimations with job information into a single file
```sh
nix develop .#use-python-scripts --command m100-join-usable-jobs-info ./m100-data/22-job-power-estimations.csv ./m100-data/22-jobs.csv ./m100-data/22-jobs-with-prediction.csv
#expected output:
#total computation area (all jobs): 21347607003
#total computation area (filtered jobs): 12691524790
#filtered computation area accounts for 0.5945174458297104 of all jobs computation area
```
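For reference, the ratio reported above can be recomputed from the two job files. This is only a sketch under assumptions: "computation area" is taken here to mean number of allocated nodes times job duration, and the column names `nb_nodes` and `run_time_s` are hypothetical:
```py
import pandas as pd

# Hypothetical recomputation of the printed ratio, assuming
# computation area = allocated nodes * job duration (both column names assumed).
all_jobs = pd.read_csv('m100-data/22-jobs.csv')
filtered_jobs = pd.read_csv('m100-data/22-jobs-with-prediction.csv')

def area(jobs):
    return (jobs['nb_nodes'] * jobs['run_time_s']).sum()

print(f'filtered computation area ratio: {area(filtered_jobs) / area(all_jobs)}')
```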
# generate SimGrid platform
```sh
nix develop .#use-python-scripts --command m100-generate-sg-platform ./m100-data/22-powermodel_total.csv 1000 -o m100-platform.xml
```
#!/usr/bin/env Rscript
library(tidyverse)
per_socket = read_csv('all-power-measures-socket.csv')
quantile(per_socket$avg_watts_per_minute, seq(0,1,0.01))
per_socket %>% ggplot(aes(x=avg_watts_per_minute)) +
geom_histogram(bins=100) +
theme_bw()
per_socket %>% filter(avg_watts_per_minute < 300) %>% ggplot(aes(x=socket_id, y=avg_watts_per_minute)) +
geom_boxplot() +
theme_bw() +
coord_flip()
ggsave('./all-measures-boxplots.pdf', width=16, height=27)
ggsave('./all-measures-boxplots.png', width=16, height=27, dpi=150)
p = per_socket %>% filter(avg_watts_per_minute < 300) %>% ggplot(aes(x=socket_id, y=avg_watts_per_minute)) +
geom_violin(size=0.5) +
theme_bw() +
coord_flip()
ggsave('./all-measures-violin.pdf', p, width=16, height=48)
rm(per_socket)
per_job = read_csv('all-power-measures-job.csv') %>% mutate(job_id = as.factor(job_id))
p = per_job %>% ggplot(aes(x=job_id, y=avg_watts_per_minute)) +
geom_boxplot() +
theme_bw() +
coord_flip()
ggsave('/tmp/raw-measures-per-job.png', p, width=16, height=32)
wrong_measures = per_job %>% filter(avg_watts_per_minute > 200) %>% mutate(job_id = as.factor(job_id))
wrong_measures %>% ggplot(aes(y=job_id, x=avg_watts_per_minute)) +
geom_point() +
theme_bw()
summ_wrong = wrong_measures %>% group_by(job_id) %>% summarize(nb_wrong_measures=n()) %>% arrange(desc(nb_wrong_measures))
summ_all = per_job %>% group_by(job_id) %>% summarize(nb_measures=n())
ggplot(summ_wrong, aes(x=nb_wrong_measures)) +
geom_histogram() +
theme_bw() +
labs(x="Nombre d'erreurs de mesure (moyenne de consommation d'une socket sur une minute > 200 W) par job", y="Nombre d'occurrences")
ggsave('./wrong-measures-per-job-hist.png', width=8, height=4.5)
left_join(summ_wrong, summ_all) %>% mutate(wrong_proportion = nb_wrong_measures / nb_measures) %>% arrange(desc(wrong_proportion))
jobs = read_csv('preprocessed_dataset_multisocket_filter1234.csv') %>%
mutate(job_id=as.factor(job_id)) %>%
group_by(job_id) %>%
summarize(nb_allocated_sockets = n()) %>%
ungroup()
wrong_jobs = left_join(summ_wrong, jobs)
ggplot(wrong_jobs, aes(x=nb_allocated_sockets, y=nb_wrong_measures)) +
geom_point() +
geom_smooth(method='lm') +
theme_bw() +
labs(x="Number of sockets allocated to each job", y="Number of wrong measures (per-minute average of power consumption > 200 W)")
ggsave('./corr-nbwrongmeasures-nbsocketsalloc-per-job.png', width=16, height=9)
#!/usr/bin/env python3
import pandas as pd
dahu_dir_path = './geo-data/dahu'
sleep_files = [
'2146534_dahu-19.grenoble.grid5000.fr_1660215573',
'2146496_dahu-28.grenoble.grid5000.fr_1660215387',
'2146514_dahu-31.grenoble.grid5000.fr_1660215441',
'2146520_dahu-26.grenoble.grid5000.fr_1660215497',
'2146508_dahu-5.grenoble.grid5000.fr_1660215433',
'2146543_dahu-11.grenoble.grid5000.fr_1660215585',
'2146525_dahu-21.grenoble.grid5000.fr_1660215544',
'2146521_dahu-23.grenoble.grid5000.fr_1660215525',
'2146485_dahu-27.grenoble.grid5000.fr_1660215168',
'2146537_dahu-17.grenoble.grid5000.fr_1660215585',
]
df_list = []
for sleep_file in sleep_files:
    filepath = dahu_dir_path + '/' + sleep_file
    df = pd.read_csv(filepath, delimiter=' ')
    df['expe_id'] = sleep_file
    df_list.append(df)
all_idle_expes_df = pd.concat(df_list, axis='index', ignore_index=True)
power_df_list = []
average_power_consumption_whole_single_expe = []
for index, expe in all_idle_expes_df.iterrows():
    power_filepath = f"{dahu_dir_path}/{expe['expe_id']}_mojitos/{expe['hostname']}_{expe['fullname']}_{expe['startTime']}"
    df = pd.read_csv(power_filepath, delimiter=' ')
    df.rename(columns={'#timestamp': 'timestamp'}, inplace=True)
    df['sum_energy'] = df['package-00'] + df['dram0'] + df['package-11'] + df['dram1']
    df['previous_timestamp'] = df['timestamp'].shift(1)
    new_df = df.iloc[1:].copy()
    new_df['hostname'] = expe['hostname'].replace('.grenoble.grid5000.fr', '')
    new_df['C0only'] = expe['C0only']
    new_df['fmax'] = expe['fmax']
    new_df['avg_power'] = new_df['sum_energy'] / (new_df['timestamp'] - new_df['previous_timestamp'])
    power_df_list.append(new_df)
    average_whole_expe = sum(df['sum_energy']) / (df['timestamp'].iloc[-1] - df['timestamp'].iloc[0])
    average_power_consumption_whole_single_expe.append(average_whole_expe)
all_idle_power_df = pd.concat(power_df_list, axis='index', ignore_index=True)
avg_per_expe_df = pd.DataFrame(data=average_power_consumption_whole_single_expe, columns=['average_power_whole_experiment'])
all_idle_power_df.to_csv('./geo-idle-power-data.csv', index=False)
#!/usr/bin/env Rscript
library(tidyverse)
idle_data = read_csv('geo-idle-power-data.csv') %>% mutate(
fmax = as.factor(sprintf("%1.1f GHz", fmax / 1e6)),
c0only_label = ifelse(C0only, 'disable deep sleep', 'enable deep sleep'),
avg_power = avg_power / 1e6 # µW -> W
)
idle_data %>% filter(avg_power < 1e10) %>% ggplot(aes(y=avg_power, x=fmax)) +
geom_violin() +
geom_boxplot(width=0.05) +
facet_grid(rows=vars(c0only_label)) +
scale_y_continuous(limits=c(0,NA)) +
labs(x="Fréquence des processeurs", y="Puissance électrique d'une machine idle selon RAPL (p00+dram0+p11+dram1) en W") +
theme_bw()
ggsave('./idle-power-distribution-c0only.png', width=16, height=9)
#!/usr/bin/env Rscript
library(tidyverse)
max_power_data = read_csv('estimated-max-power.csv')
ggplot(max_power_data, aes(x=q.999)) +
geom_histogram() +
scale_x_continuous(limits=c(0,NA)) +
theme_bw() +
labs(x="quantile(0.999) de la consommation électrique d'une socket observée par RAPL dans tous les jobs filtrés (W)", y="Nombre d'occurrences")
ggsave('./hist-q999.pdf', width=8, height=4.5)
ggplot(max_power_data, aes(x=q1)) +
geom_histogram() +
scale_x_continuous(limits=c(0,NA)) +
theme_bw() +
labs(x="quantile(1)=max de la consommation électrique d'une socket observée par RAPL dans tous les jobs filtrés (W)", y="Nombre d'occurrences")
ggsave('./hist-q1.pdf', width=8, height=4.5)
summary(max_power_data)
#!/usr/bin/env python3
import json
import math
import os
import re
import sys
import pandas as pd
from collections import OrderedDict
def generate_power_traces_per_socket(csv_filename, json_filename):
    with open(json_filename) as f:
        power_data = json.load(f)
    jobs = pd.read_csv(csv_filename)
    power_data_df_per_socket = {}
    power_data_df_per_job = {}
    for index, job in jobs.iterrows():
        job_id = str(job['job_id'])
        job_host = 'dahu' + str(job['host_oar'])
        job_socket = str(job['processor'])
        socket_id = job_host + '_' + job_socket
        json_power_profile = power_data[job_id][job_host][job_socket]['energy_profile']
        df_job_power_profile = pd.read_json(json_power_profile)
        cores_colname = 'pp0_package' + job_socket
        ram_colname = 'DRAM_package' + job_socket
        power_column = df_job_power_profile[[cores_colname, ram_colname]].rename(columns={cores_colname: 'cores_power', ram_colname: 'ram_power'})
        power_column['avg_watts_per_minute'] = power_column['cores_power'] + power_column['ram_power']
        if socket_id in power_data_df_per_socket:
            power_data_df_per_socket[socket_id] = pd.concat([power_data_df_per_socket[socket_id], power_column], ignore_index=True)
        else:
            power_data_df_per_socket[socket_id] = power_column
        if job_id in power_data_df_per_job:
            power_data_df_per_job[job_id] = pd.concat([power_data_df_per_job[job_id], power_column], ignore_index=True)
        else:
            power_data_df_per_job[job_id] = power_column

    # estimate maximum power (quantiles of the per-socket total power)
    estimated_max_power = {socket_id: (p['avg_watts_per_minute'].quantile(1), p['avg_watts_per_minute'].quantile(0.999), p['avg_watts_per_minute'].quantile(0.99)) for socket_id, p in power_data_df_per_socket.items()}
    estimated_max_power_df = pd.DataFrame.from_dict(estimated_max_power, orient='index')
    estimated_max_power_df.to_csv('./estimated-max-power.csv', index_label='socket_id', header=['q1', 'q.999', 'q.99'])

    # generate power traces
    df_list = []
    for socket_id, power_df in power_data_df_per_socket.items():
        socket_df = pd.DataFrame(data=power_df)
        socket_df['socket_id'] = socket_id
        df_list.append(socket_df)
    all_power_measures = pd.concat(df_list, axis='index')
    all_power_measures.to_csv('./all-power-measures-socket.csv', index=False)

    df_list = []
    for job_id, power_df in power_data_df_per_job.items():
        socket_df = pd.DataFrame(data=power_df)
        socket_df['job_id'] = job_id
        df_list.append(socket_df)
    all_power_measures = pd.concat(df_list, axis='index')
    all_power_measures.to_csv('./all-power-measures-job.csv', index=False)
# power_min, power_max in W.
# host_speed in SimGrid host speed (usually amount of floating-point operations per second)
# measure_period in seconds (the time between two power measures ; expected to be the same for all measures)
def generate_batsim_profiles(csv_filename, json_filename, output_directory, power_min=17.185949, power_max=144.767806, host_speed=1e9, measure_period=60):
    with open(json_filename) as f:
        power_data = json.load(f)
    jobs = pd.read_csv(csv_filename)

    # job id (str) -> trace filename (str)
    trace_filenames_per_job = {}
    for index, job in jobs.iterrows():
        job_id = str(job['job_id'])
        job_host = 'dahu' + str(job['host_oar'])
        job_socket = str(job['processor'])
        socket_id = job_host + '_' + job_socket
        json_power_profile = power_data[job_id][job_host][job_socket]['energy_profile']
        df_job_power_profile = pd.read_json(json_power_profile)
        cores_colname = 'pp0_package' + job_socket
        ram_colname = 'DRAM_package' + job_socket
        power_column = df_job_power_profile[[cores_colname, ram_colname]].rename(columns={cores_colname: 'cores_power', ram_colname: 'ram_power'})
        power_column['avg_watts_per_minute'] = power_column['cores_power'] + power_column['ram_power']
        # put "outliers" in [power_min, power_max]
        power_column['avg_watts_per_minute'] = power_column['avg_watts_per_minute'].apply(lambda x: max(min(power_max, x), power_min))
        # generate usage from power
        offset = power_min
        interval_length = power_max - power_min
        power_column['usage'] = power_column['avg_watts_per_minute'].apply(lambda x: (x-offset)/interval_length)
        # generate amounts of floating-point operations to do
        flop_amount_to_do_per_measure = host_speed * measure_period
        if job_id in trace_filenames_per_job:
            trace_filenames_per_job[job_id][socket_id] = power_column['usage']
        else:
            d = OrderedDict()
            d[socket_id] = power_column['usage']
            trace_filenames_per_job[job_id] = d

    # generate Batsim-compatible trace files for each traced job
    for job_id, trace_dict in trace_filenames_per_job.items():
        job_dir = f'{output_directory}/jobs/{job_id}'
        os.makedirs(job_dir, exist_ok=True)
        host_id = 0
        profile_filenames = []
        for socket_id, col in trace_dict.items():
            filename = f"{job_id}_{socket_id}.txt"
            with open(f'{job_dir}/{filename}', 'w') as f:
                for index, usage in col.items():
                    print(f"{host_id} m_usage {usage:g} {flop_amount_to_do_per_measure:g}", file=f)
            profile_filenames.append(filename)
            host_id += 1
        with open(f'{job_dir}/{job_id}.txt', 'w') as f:
            for filename in profile_filenames:
                print(filename, file=f)

    # generate a Batsim workload that contains all profiles
    bat_profiles = { job_id: {'type': 'usage_trace', 'trace': f'./jobs/{job_id}/{job_id}.txt'} for job_id, _ in trace_filenames_per_job.items() }

    walltime_re = re.compile(r'(\d+):(\d+):(\d+)')
    # submittime, walltime, nb_sockets_used, job_id, profile_id, extra_data
    minimum_submission_time = jobs['submission_time_oar'].min()
    job_tuples = []
    for job_id in trace_filenames_per_job:
        jobs_rows = jobs.loc[jobs['job_id'] == int(job_id)]
        first_jobs_row = jobs_rows.loc[jobs_rows.index[0]]
        submission_time = float(first_jobs_row['submission_time_oar'] - minimum_submission_time)
        nb_sockets_used = len(jobs_rows)
        walltime = first_jobs_row['walltime_oar']
        if isinstance(walltime, float):
            if math.isfinite(walltime):
                if walltime < 0:
                    raise ValueError(f"walltime should be positive, got '{walltime}'")
                walltime = int(walltime)
            else:
                walltime = int(-1)
        elif isinstance(walltime, str):
            m = walltime_re.match(walltime)
            if m is None:
                raise ValueError(f"walltime should be formatted as X:Y:Z (with X Y and Z as positive integers), got '{walltime}' instead")
            hours, minutes, seconds = [int(x) for x in m.groups()]
            walltime = seconds + 60*minutes + 60*60*hours
        else:
            raise ValueError(f"walltime should either be a floating-point value or a string, got type='{type(walltime)}', value='{walltime}'")
        extra_data = json.dumps({ 'user': first_jobs_row['job_user_oar'] })
        job_id = str(job_id)
        job_tuples.append((submission_time, walltime, nb_sockets_used, job_id, job_id, extra_data))
    job_tuples = sorted(job_tuples)

    bat_jobs = [
        {
            'id': job_tuple[3],
            'subtime': job_tuple[0],
            'walltime': job_tuple[1],
            'res': job_tuple[2],
            'profile': job_tuple[4],
            'extra_data': job_tuple[5]
        } for job_tuple in job_tuples
    ]
    with open(f'{output_directory}/workload.json', 'w') as f:
        json.dump({
            'jobs': bat_jobs,
            'profiles': bat_profiles,
            'nb_res': 74*2, # raw value from https://gricad-doc.univ-grenoble-alpes.fr/hpc/description/#dahu-plateforme-hpcda ; *2 because 2 sockets per network host
            'description': f"automatically generated from GRICAD dahu traces by running {' '.join(sys.argv)}"
        }, f)
csv_filename = 'preprocessed_dataset_multisocket_filter1234.csv'
json_filename = 'preprocessed_dataset_multisocket_filter1234_energyprofiles.json'
output_directory = '/tmp/out'
generate_batsim_profiles(csv_filename, json_filename, output_directory)
<?xml version='1.0'?>
<!DOCTYPE platform SYSTEM "http://simgrid.gforge.inria.fr/simgrid/simgrid.dtd">
<platform version="4.1">
<zone id="AS0" routing="Full">
<cluster id="cluster_compute" prefix="host" suffix="" radical="0-147"
bw="125MBps" lat="50us" bb_bw="2.25GBps" bb_lat="500us"
speed="100.0Mf, 88.95899053627761Mf, 83.67952522255192Mf, 80.57142857142857Mf, 76.21621621621621Mf, 72.49357326478149Mf, 68.78048780487805Mf, 64.6788990825688Mf, 60.775862068965516Mf, 58.62785862785863Mf, 50.088809946714036Mf, 49.21465968586388Mf, 44.97607655502392Mf, 1e-9Mf, 0.1639344262295082f, 0.006599788806758183f">
<!-- real pstates: 0 to 12
off: pstate: 13
consumption: 9.75 W
shutdown: pstate: 14
time: 6.1 s,
consumption: 100.99672131147543 W
boot: pstate: 15
time: 151.52 s,
consumption: 125.1743848996832 W
-->
<prop id="wattage_per_state" value="95.0:190.738, 95.0:171.02, 95.0:165.62, 95.0:160.47, 95.0:155.729, 95.0:151.3, 95.0:146.92, 95.0:142.95, 95.0:138.928, 95.0:135.368, 95.0:132.519, 95.0:128.87, 95.0:125.88, 9.75:9.75, 100.99672131147543:100.99672131147543, 125.1743848996832:125.1743848996832" />
<prop id="wattage_off" value="9.75" />
<!-- OFF : ON->OFF (shutdown) : OFF->ON (booting) -->
<prop id="sleep_pstates" value="13:14:15" />
</cluster>
<cluster id="cluster_master" prefix="master_host" suffix="" radical="0-0"
bw="125MBps" lat="50us" bb_bw="2.25GBps" bb_lat="500us"
speed="100.0Mf, 88.95899053627761Mf, 83.67952522255192Mf, 80.57142857142857Mf, 76.21621621621621Mf, 72.49357326478149Mf, 68.78048780487805Mf, 64.6788990825688Mf, 60.775862068965516Mf, 58.62785862785863Mf, 50.088809946714036Mf, 49.21465968586388Mf, 44.97607655502392Mf, 1e-9Mf, 0.1639344262295082f, 0.006599788806758183f">
<!-- real pstates: 0 to 12
off: pstate: 13
consumption: 9.75 W
shutdown: pstate: 14
time: 6.1 s,
consumption: 100.99672131147543 W
boot: pstate: 15
time: 151.52 s,
consumption: 125.1743848996832 W
-->
<prop id="wattage_per_state" value="95.0:190.738, 95.0:171.02, 95.0:165.62, 95.0:160.47, 95.0:155.729, 95.0:151.3, 95.0:146.92, 95.0:142.95, 95.0:138.928, 95.0:135.368, 95.0:132.519, 95.0:128.87, 95.0:125.88, 9.75:9.75, 100.99672131147543:100.99672131147543, 125.1743848996832:125.1743848996832" />
<prop id="wattage_off" value="9.75" />
<prop id="role" value="master" />
<!-- OFF : ON->OFF (shutdown) : OFF->ON (booting) -->
<prop id="sleep_pstates" value="13:14:15" />
</cluster>
<link id="backbone" bandwidth="1.25GBps" latency="500us" />
<zoneRoute src="cluster_compute" dst="cluster_master" gw_src="hostcluster_compute_router"
gw_dst="master_hostcluster_master_router">
<link_ctn id="backbone" />
</zoneRoute>
</zone>
</platform>
#!/usr/bin/env sh
robin --output-dir ./out \
--batcmd="batsim -p ./platform.xml -w workload/workload.json -e out/ -E" \
--schedcmd="batsched -v sequencer"
#!/usr/bin/env Rscript
# CLI
library(argparse)
parser = ArgumentParser(description='Compute desc. stats for the power consumption profile of each node')
parser$add_argument('input_file', type='character', help='data will be read from this CSV file. Example: /path/to/input-file.csv')
parser$add_argument('output_file', type='character', help='desc. stats will be written there as CSV. Example: /path/to/output-file.csv')
args = parser$parse_args()
print(args)
input_file = args$input_file
output_file = args$output_file
library(tidyverse)
data = read_csv(input_file) %>% arrange(node, power) %>% filter(node != 155 | power > 0)
quantile_df <- function(x, probs = seq(0, 1, by=0.1)) {
tibble(
val = quantile(x, probs, na.rm = TRUE),
quant = sprintf("p%d", as.integer(probs*100))
)
}
data_flat = uncount(data, weights = nbocc)
stats_per_node = data_flat %>% reframe(quantile_df(power), .by = node) %>% pivot_wider(names_from = quant, values_from = val)
write_csv(stats_per_node, output_file)
#!/usr/bin/env Rscript
library(argparse)
parser = ArgumentParser(description='Plot the power consumption distribution of each node')
parser$add_argument('input_file', type='character', help='data will be read from this CSV file. Example: /path/to/input-file.csv')
parser$add_argument('output_file', type='character', help='the output file to plot. Example: /path/to/output-file')
parser$add_argument('name', type='character', help='the name of the power metric that is plotted. Example: p0')
args = parser$parse_args()
print(args)
input_file = args$input_file
output_file = args$output_file
name = args$name
library(tidyverse)
data = read_csv(input_file)
nb_nodes = nrow(data %>% select(node) %>% unique())
p = uncount(data, weights = nbocc) %>% mutate(node = as.factor(node)) %>% ggplot() +
geom_violin(aes(x=node, y=power)) +
theme_bw() + labs(
x = "Node",
y = sprintf("Power (W) in 2022 as stored in ipmi/%s_power", name)
) + coord_flip()
ggsave(output_file, p, width=16, height=as.integer(nb_nodes/4), limitsize=FALSE)
#!/usr/bin/env Rscript
library(argparse)
parser = ArgumentParser(description='Plot the power consumption ECDF of each node')
parser$add_argument('input_file', type='character', help='data will be read from this CSV file. Example: /path/to/input-file.csv')
parser$add_argument('output_file', type='character', help='the output file to plot. Example: /path/to/output-file')
parser$add_argument('name', type='character', help='the name of the power metric that is plotted. Example: p0')
args = parser$parse_args()
print(args)
input_file = args$input_file
output_file = args$output_file
name = args$name
library(tidyverse)
data = read_csv(input_file)
nb_nodes = nrow(data %>% select(node) %>% unique())
cumulated_data = data %>% group_by(node) %>% arrange(power) %>% mutate(
cum_nbocc = cumsum(nbocc),
node = as.factor(node)
)
p = cumulated_data %>% ggplot() +
geom_step(aes(x=power, y=cum_nbocc, colour=node), show.legend = FALSE, direction = 'hv') +
scale_colour_manual(values=rep("#00000020", nb_nodes)) +
theme_bw() + labs(
x = sprintf("Power (W) in 2022 as stored in ipmi/%s_power", name),
y = "Number of occurrences"
)
ggsave(output_file, p, width=16, height=9)
#!/usr/bin/env Rscript
# CLI
library(argparse)
parser = ArgumentParser(description='Plot the proposed power model (per-node power quantiles) and related figures')
parser$add_argument('powermodel_input_file', type='character', help='data will be read from this CSV file. Example: /path/to/input-file.csv')
parser$add_argument('output_prefix', type='character', help='where output plots and data should be put. Example: /path/to/output-prefix')
args = parser$parse_args()
powermodel_input_file = args$powermodel_input_file
output_prefix = args$output_prefix
# start of script
library(tidyverse)
library(viridis)
data = read_csv(powermodel_input_file) %>%
rename_with(~ gsub("p", "", .x, fixed = TRUE))
lazy_nodes = data %>% filter(`90` < 500) %>% mutate(type = 'lazy')
nb_lazy_nodes = nrow(lazy_nodes)
normal_nodes = data %>% filter(`90` >= 500) %>% mutate(type = 'normal')
nb_normal_nodes = nrow(normal_nodes)
data_typed = bind_rows(
lazy_nodes %>% mutate(type_label = sprintf('lazy nodes (power value < 500 W for at least 90 %% of measures). N=%d', nb_lazy_nodes)),
normal_nodes %>% mutate(type_label = sprintf('normal nodes. N=%d', nb_normal_nodes))
) %>% mutate(
type = as.factor(type),
type_label = as.factor(type_label),
)
data_longer = data_typed %>% pivot_longer(cols = 2:12, names_to='percentile', values_to='power') %>%
mutate(percentile = as.integer(percentile))
# figure 1 : distribution of quantiles values among nodes
p = data_longer %>% ggplot() +
geom_violin(aes(factor(percentile, ordered=TRUE), power)) +
theme_bw() +
labs(
x = 'Percentile',
y = 'Power (W)',
title = paste(
'total_power IPMI values from the Marconi 100 trace in 2022 on 980 nodes.',
'Total of 1.13 billion measures. 28249 values have been filtered out (all with power=0 and node=155).',
'Each line shows the distribution of a given percentile value among nodes of a given group (lazy or normal).',
'11 percentiles values have been computed for each node.',
'In particular, percentile=0 represents the minimum value observed on each node and percentile=100 represents the maximum value observed on each node.',
sep='\n'
)
) +
coord_flip() +
facet_wrap(vars(type_label), ncol=1)
ggsave(sprintf("%s22-power-quantile-node-distrib.pdf", output_prefix), p, width=16, height=9)
# figure 2 : scatter plot of min/max power values of nodes
p = data_typed %>% ggplot() +
geom_jitter(aes(x=`0`, y=`100`, shape=type_label, color=type_label), width=3, height=3) +
stat_ellipse(data = data_typed %>% filter(type == 'normal'), aes(x=`0`, y=`100`, color=type_label), level=0.68, type='norm') +
stat_ellipse(data = data_typed %>% filter(type == 'normal'), aes(x=`0`, y=`100`, color=type_label), level=0.95, type='norm') +
stat_ellipse(data = data_typed %>% filter(type == 'normal'), aes(x=`0`, y=`100`, color=type_label), level=0.997, type='norm') +
labs(
x='Minimum observed power value (W)',
y='Maximum observed power value (W)',
title=paste(
'Proposed power model of Marconi 100 nodes.',
'Each point represents a node.',
'Ellipses are 2D gaussian regressions with 1/2/3 sigmas.',
sep='\n'
)
) +
theme_bw() +
scale_color_viridis(discrete=TRUE, end=0.8, direction=-1) +
theme(legend.position = 'top', legend.title = element_blank())
p_unzoomed = p +
expand_limits(x=0, y=0) +
expand_limits(x=max(data_typed$`100`))
ggsave(sprintf("%s22-power-min-max-node-jitter.pdf", output_prefix), p_unzoomed, width=9, height=9)
ggsave(sprintf("%s22-power-min-max-node-jitter-zoomed.pdf", output_prefix), p, width=16, height=9)
#!/usr/bin/env Rscript
library(tidyverse)
library(viridis)
agg_file = './expe1/agg-result.csv'
output_dir = '/tmp'
epoch_m100 = ymd_hms('2022-01-01 00:00:00')
`%notin%` = Negate(`%in%`)
data = read_csv(agg_file) %>% mutate(
start_dt_s = as.factor(start_dt_s),
job_power_estimation_field = as.factor(job_power_estimation_field)
)
data$predictor_name = factor(data$predictor_name, levels=c('upper_bound', 'max', 'real_max', 'real_mean', 'mean', 'zero'))
data = data %>% mutate(
predictor_metrics = ifelse(predictor_name %in% c('real_max', 'max'), 'max',
ifelse(predictor_name %in% c('real_mean', 'mean'), 'mean',
'naive'
)),
predictor_method = ifelse(predictor_name %in% c('mean', 'max'), 'predicted', 'real')
)
data$predictor_metrics = factor(data$predictor_metrics, levels=c('naive', 'max', 'mean'))
data$predictor_method = factor(data$predictor_method, levels=c('predicted', 'real'))
# compute scheduling metrics against their matching EASY baseline
data_nz = data %>% filter(predictor_name != 'zero')
data_z = data %>% filter(predictor_name == 'zero' & powercap_dynamic_value_ratio == max(data$powercap_dynamic_value_ratio))
data_z_joinable = data_z %>% transmute(
start_dt_s = start_dt_s,
zero_mean_utilization = mean_utilization,
zero_max_utilization = max_utilization,
zero_mean_turnaround_time = mean_turnaround_time,
zero_mean_waiting_time = mean_waiting_time,
)
data_nz = inner_join(data_nz, data_z_joinable) %>% mutate(
mean_turnaround_time_minus_zero = mean_turnaround_time - zero_mean_turnaround_time,
mean_waiting_time_minus_zero = mean_waiting_time - zero_mean_waiting_time
) %>% mutate(
mean_turnaround_time_increase_ratio = mean_turnaround_time_minus_zero / zero_mean_turnaround_time
)
# energy diff from powercap, depending on powercap ratio and predictor
data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=energy_from_powercap / 1e9, color=predictor_name)) +
geom_jitter(width=1/100, height=0) +
geom_smooth(method = "lm", se = FALSE) +
theme_bw() +
theme(legend.position='top', legend.title=element_blank()) +
guides(color = guide_legend(nrow = 1)) +
scale_x_continuous(breaks=seq(0,0.7,0.1), labels = scales::percent) +
scale_color_viridis(discrete=TRUE) +
expand_limits(x=0) +
labs(
y="Energy difference from the powercap during the constrained period for each simulation (GJ)",
x="Powercap value (proportion of the maximum dynamic power range). Shown with horizontal jitter."
)
ggsave(sprintf("%s/energy-diff-against-powercap-predictor.pdf", output_dir), width=16, height=9)
# energy surplus from powercap, depending on powercap ratio and predictor
data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=surplus_energy / 1e9, color=predictor_name)) +
geom_jitter(width=1/100, height=0) +
geom_smooth(method = "lm", se = FALSE) +
theme_bw() +
theme(legend.position='top', legend.title=element_blank()) +
guides(color = guide_legend(nrow = 1)) +
scale_x_continuous(breaks=seq(0,0.7,0.1), labels = scales::percent) +
scale_color_viridis(discrete=TRUE) +
expand_limits(x=0) +
labs(
y="Energy surplus from the powercap during the constrained period for each simulation (GJ)",
x="Powercap value (proportion of the maximum dynamic power range). Shown with horizontal jitter."
)
ggsave(sprintf("%s/energy-surplus-against-powercap-predictor.pdf", output_dir), width=16, height=9)
# utilization
data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_utilization / 980, color=predictor_name)) +
geom_jitter(width=1/100, height=0) +
geom_smooth(method = "lm", se = FALSE) +
geom_abline(slope=1) +
theme_bw() +
theme(legend.position='top', legend.title=element_blank()) +
guides(color = guide_legend(nrow = 1)) +
scale_x_continuous(breaks=seq(0,0.7,0.1), labels = scales::percent) +
scale_y_continuous(breaks=seq(0,1,0.2), labels = scales::percent) +
scale_color_viridis(discrete=TRUE) +
expand_limits(x=0) +
labs(
y="Utilization (proportion of nodes)",
x="Powercap value (proportion of the maximum dynamic power range). Shown with horizontal jitter."
)
ggsave(sprintf("%s/utilization-against-powercap-predictor.pdf", output_dir), width=16, height=9)
# mean turnaround time metrics
data_nz %>% ggplot(aes(x=powercap_dynamic_value_ratio, y=mean_turnaround_time_minus_zero, color=predictor_name)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE) +
geom_hline(yintercept=0) +
theme_bw() +
theme(legend.position='top', legend.title=element_blank()) +
guides(color = guide_legend(nrow = 1)) +
scale_x_continuous(breaks=seq(0,0.7,0.1), labels = scales::percent) +
scale_y_continuous() +
scale_color_viridis(discrete=TRUE) +
facet_wrap(vars(start_dt_s), scales='free_y') +
expand_limits(x=0) +
labs(
y="Mean turnaround time difference against EASY without any powercap for each simulation",
x="Powercap value (proportion of the maximum dynamic power range)"
)
ggsave(sprintf("%s/mean-turnaround-time-against-powercap-predictor.pdf", output_dir), width=16, height=9)
max_observed_total_power = 955080
max_power_per_node = 2100.0
min_power_per_node = 240.0
nb_nodes = 980
max_dynamic_power = max_observed_total_power - min_power_per_node * nb_nodes
# distribution of the mean power used for each predictor
powercap_ratios_values_to_show = c(0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7)
width_scale=0.3
data_nz %>%
filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>%
mutate(powercap_label = sprintf("pcap=%g", powercap_dynamic_value_ratio)) %>%
ggplot() +
geom_hline(aes(yintercept=powercap_dynamic_value_ratio), linewidth=width_scale) +
geom_boxplot(aes(y=mean_power/max_dynamic_power, fill=predictor_method, x=predictor_metrics), linewidth=width_scale, outlier.size=width_scale) +
theme_bw() +
theme(
legend.position=c(0.2, 0.9),
legend.direction='horizontal',
legend.title=element_blank(),
legend.background=element_rect(color='black'),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
) +
expand_limits(x=0) +
scale_y_continuous(breaks=seq(0,0.7,0.1)) +
facet_wrap(vars(powercap_label), nrow=1) +
labs(
y="Mean platform power consumption",
x="Job power estimator"
) +
scale_fill_grey(start=0.8, end=1)
scale=0.9
ggsave(sprintf("%s/sched-mean-power-distribution.pdf", output_dir), width=8*scale, height=4*scale)
# distribution of mean turnaround time diff (EASY) for each predictor
outlier_workload_start_dt_s = 18474670 # sched metrics are strongly better than EASY there
width_scale=0.3
data_nz %>%
filter(start_dt_s != outlier_workload_start_dt_s) %>%
filter(powercap_dynamic_value_ratio %in% powercap_ratios_values_to_show) %>%
mutate(powercap_label = sprintf("pcap=%g", powercap_dynamic_value_ratio)) %>%
ggplot() +
geom_boxplot(aes(y=mean_turnaround_time_minus_zero, fill=predictor_method, x=predictor_metrics), linewidth=width_scale, outlier.size=width_scale) +
theme_bw() +
theme(
legend.position=c(0.16, 0.12),
legend.direction='horizontal',
legend.background=element_rect(color='black'),
legend.title=element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
) +
facet_wrap(vars(powercap_label), nrow=1) +
labs(
y="Mean turnaround time increase (s)",
x="Job power estimator"
) +
scale_fill_grey(start=0.8, end=1)
scale=0.9
ggsave(sprintf("%s/sched-mtt-distribution.pdf", output_dir), width=8*scale, height=4*scale)
# show all columns...
options(dplyr.width = Inf)
# compute overall power under-utilization compared to the powercap
data_nz %>%
mutate(power_underutilization_ratio = (powercap_dynamic_watts - mean_power)/powercap_dynamic_watts) %>%
group_by(predictor_name) %>%
summarize(
min_power_underutilization_ratio = min(power_underutilization_ratio),
average_power_underutilization_ratio = mean(power_underutilization_ratio)
)
# compute overall mean turnaround time increase
data_nz %>%
filter(start_dt_s != outlier_workload_start_dt_s) %>%
group_by(predictor_name) %>%
summarize(
average_mtt_increase = mean(mean_turnaround_time_minus_zero),
average_mtt_increase_ratio = mean(mean_turnaround_time_increase_ratio)
)
# compute overall powercap breaks
data_nz %>%
mutate(powercap_break = pmax(max_power_from_powercap, 0)) %>%
mutate(powercap_break_ratio = powercap_break / powercap_dynamic_watts) %>%
group_by(predictor_name) %>%
summarize(
mean_max_power_from_powercap = mean(max_power_from_powercap),
min_max_power_from_powercap = min(max_power_from_powercap),
max_max_power_from_powercap = max(max_power_from_powercap),
)
# enable comparison of mean power values between predictors for all (workload, powercap) tuples
data_p_mean = data %>% pivot_wider(names_from = predictor_name, values_from = mean_power) %>%
replace_na(list(max=0,upper_bound=0,zero=0,real_max=0,mean=0,real_mean=0))
against_zero = data_p_mean %>%
group_by(start_dt_s, powercap_dynamic_value_ratio) %>%
summarize(
upper_bound_below_zero = sum(upper_bound) <= sum(zero),
real_mean_below_zero = sum(real_mean) <= sum(zero),
mean_below_zero = sum(mean) <= sum(zero),
real_max_below_zero = sum(real_max) <= sum(zero),
max_below_zero = sum(max) <= sum(zero),
) %>% mutate(
all_below_zero = upper_bound_below_zero & real_mean_below_zero & mean_below_zero & real_max_below_zero & max_below_zero
)
instances_where_some_mean_power_is_NOT_below_EASY = against_zero %>% filter(!all_below_zero)
print(sprintf("number of occurrences where EASY consumes less power than any other EASY+powercap predictors: %d/%d", nrow(instances_where_some_mean_power_is_NOT_below_EASY), nrow(against_zero)))
print(instances_where_some_mean_power_is_NOT_below_EASY)
against_upper_bound = data_p_mean %>%
group_by(start_dt_s, powercap_dynamic_value_ratio) %>%
summarize(
zero_above_upper_bound = sum(zero) >= sum(upper_bound),
real_mean_above_upper_bound = sum(real_mean) >= sum(upper_bound),
mean_above_upper_bound = sum(mean) >= sum(upper_bound),
real_max_above_upper_bound = sum(real_max) >= sum(upper_bound),
max_above_upper_bound = sum(max) >= sum(upper_bound),
) %>% mutate(
all_above_upper_bound = zero_above_upper_bound & real_mean_above_upper_bound & mean_above_upper_bound & real_max_above_upper_bound & max_above_upper_bound
)
instances_where_some_mean_power_is_NOT_above_upper_bound = against_upper_bound %>% filter(!all_above_upper_bound)
print(sprintf("number of occurrences where upper_bound consumes more power than any other EASY+powercap predictors: %d/%d", nrow(instances_where_some_mean_power_is_NOT_above_upper_bound), nrow(against_zero)))
print(instances_where_some_mean_power_is_NOT_above_upper_bound)
against_mean = data_p_mean %>%
group_by(start_dt_s, powercap_dynamic_value_ratio) %>%
summarize(
real_mean_below_mean = sum(real_mean) <= sum(mean),
real_max_below_mean = sum(real_max) <= sum(mean),
max_below_mean = sum(max) <= sum(mean),
) %>% mutate(
all_max_below_mean = real_max_below_mean & max_below_mean,
all_below_mean = real_mean_below_mean & real_max_below_mean & max_below_mean
)
instances_where_some_max_power_is_NOT_below_mean = against_mean %>% filter(!all_max_below_mean)
print(sprintf("number of occurrences where mean consumes more power than max/real_max: %d/%d", nrow(instances_where_some_max_power_is_NOT_below_mean), nrow(against_mean)))
data_p_mtt = data_nz %>% filter(start_dt_s != outlier_workload_start_dt_s) %>%
pivot_wider(names_from = predictor_name, values_from = mean_turnaround_time_minus_zero) %>%
replace_na(list(max=0,upper_bound=0,zero=0,real_max=0,mean=0,real_mean=0))
against_upper_bound = data_p_mtt %>%
group_by(start_dt_s, powercap_dynamic_value_ratio) %>%
summarize(
real_mean_below_upper_bound = sum(real_mean) <= sum(upper_bound),
mean_below_upper_bound = sum(mean) <= sum(upper_bound),
real_max_below_upper_bound = sum(real_max) <= sum(upper_bound),
max_below_upper_bound = sum(max) <= sum(upper_bound),
) %>% mutate(
all_below_upper_bound = real_mean_below_upper_bound & mean_below_upper_bound & real_max_below_upper_bound & max_below_upper_bound
)
against_mean = data_p_mtt %>%
group_by(start_dt_s, powercap_dynamic_value_ratio) %>%
summarize(
real_mean_above_mean = sum(real_mean) >= sum(mean),
upper_bound_above_mean = sum(upper_bound) >= sum(mean),
real_max_above_mean = sum(real_max) >= sum(mean),
max_above_mean = sum(max) >= sum(mean),
) %>% mutate(
all_above_mean = real_mean_above_mean & upper_bound_above_mean & real_max_above_mean & max_above_mean
)
data_nz %>% filter(start_dt_s != 18474670) %>% ggplot(aes(y=energy_from_powercap / 1e9, x=mean_utilization / 980, color=predictor_name)) +
geom_point() +
stat_ellipse() +
theme_bw() +
scale_color_viridis(discrete=TRUE)
ggsave(sprintf("%s/meh1.pdf", output_dir), width=16, height=9)
data_nz %>% ggplot() +
geom_violin(aes(x=predictor_name, y=energy_from_powercap / 1e9)) +
geom_jitter(aes(x=predictor_name, y=energy_from_powercap / 1e9), alpha=0.1) +
geom_boxplot(aes(x=predictor_name, y=energy_from_powercap / 1e9), width=0.025, outlier.shape=NA) +
theme_bw() +
labs(
x="Power predictor",
y="Distribution of the energy consumed during the constrained period for each simulation (GJ).\nComputed as the integral of the dynamic power minus the dynamic powercap value."
)
ggsave(sprintf("%s/meh2.pdf", output_dir), width=16, height=9)
data_nz %>% ggplot() +
geom_violin(aes(x=predictor_name, y=surplus_energy / 1e9)) +
geom_jitter(aes(x=predictor_name, y=surplus_energy / 1e9), alpha=0.1) +
geom_boxplot(aes(x=predictor_name, y=surplus_energy / 1e9), width=0.025, outlier.shape=NA) +
theme_bw() +
labs(
x="Power predictor",
y="Distribution of the surplus energy consumed during the constrained period for each simulation (GJ).\nComputed as the integral of the dynamic power minus the dynamic powercap value, only keeping positive values."
)
ggsave(sprintf("%s/meh3.pdf", output_dir), width=16, height=9)
data_nz %>% ggplot() +
geom_violin(aes(x=predictor_name, y=unused_energy / 1e9)) +
geom_jitter(aes(x=predictor_name, y=unused_energy / 1e9), alpha=0.1) +
geom_boxplot(aes(x=predictor_name, y=unused_energy / 1e9), width=0.025, outlier.shape=NA) +
theme_bw() +
labs(
x="Power predictor",
y="Distribution of the unused energy consumed during the constrained period for each simulation (GJ).\nComputed as the integral of the dynamic power minus the dynamic powercap value, only keeping negative values."
)
ggsave(sprintf("%s/meh4.pdf", output_dir), width=16, height=9)