Skip to content
Snippets Groups Projects
Commit d216708b authored by Maël Madon's avatar Maël Madon
Browse files

change the way the normalized distance is calculated

parent d07005f3
No related branches found
No related tags found
No related merge requests found
......@@ -49,8 +49,20 @@ def open_and_compare(file1, file2):
out1 = out1[out1.success == 1]
out2 = out2[out2.success == 1]
runtimes1 = out1.finish_time - out1.starting_time
runtimes2 = out2.finish_time - out2.starting_time
if not np.allclose(runtimes1, runtimes2, atol=1):
warnings.warn(f"Some jobs in {file1} and {file2} don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.")
return out1, out2
def sum_execution_times(out):
    """Return the sum of all job execution times from a clean batsim output.

    Args:
        out: table of jobs exposing ``starting_time`` and ``finish_time``
            columns (presumably a pandas DataFrame -- confirm against the
            caller).

    Returns:
        The sum over all jobs of ``finish_time - starting_time``, as a plain
        Python float (0.0 when there is no job).
    """
    runtimes = out.finish_time - out.starting_time
    # Coerce the NumPy scalar to a built-in float so that callers always get
    # the same type, whatever the dtype of the time columns.
    return float(runtimes.sum())
def euclidean_distance(s1, s2):
"""Returns the Euclidean distance between two series s1 and s2"""
......@@ -64,15 +76,6 @@ def lateness_distance(s1, s2):
return float(np.sum([y-x for x, y in zip(s1, s2)]))
def normalized_euclidian_distance(s1, s2):
    """Return a normalized Euclidean distance between the series s1 and s2.

    The value computed is the squared Euclidean distance divided by the
    product of the l2 norms of the two series:
    ``euclidean_distance(s1, s2)**2 / (l2_norm(s1) * l2_norm(s2))``.

    Returns:
        The normalized distance as a float, or None if one of the vectors is
        the null vector (the ratio is then undefined).
    """
    n1, n2 = l2_norm(s1), l2_norm(s2)
    if n1 == 0 or n2 == 0:
        # A zero norm would make the denominator zero: report "undefined"
        return None
    eucl_dist = euclidean_distance(s1, s2)
    return float(eucl_dist**2 / (n1 * n2))
def l2_norm(s):
"""Return the l2 norm of the series s"""
......@@ -87,13 +90,20 @@ def distances(file1, file2, euclidean=True, lateness=False, norm_eucl=False,
out1, out2 = open_and_compare(file1, file2)
sum_runtimes1 = sum_execution_times(out1)
sum_runtimes2 = sum_execution_times(out2)
dist = {}
for f in field:
dist[f] = {}
eucl = euclidean_distance(out1[f], out2[f])
if euclidean:
dist[f]["euclidean"] = euclidean_distance(out1[f], out2[f])
dist[f]["euclidean"] = eucl
if norm_eucl:
dist[f]["normalized_euclidean"] = normalized_euclidian_distance(out1[f], out2[f])
if sum_runtimes1 == 0 or sum_runtimes2 == 0:
dist[f]["normalized_euclidean"] = None
else:
dist[f]["normalized_euclidean"] = eucl / np.sqrt(sum_runtimes1 * sum_runtimes2)
if lateness:
dist[f]["lateness"] = lateness_distance(out1[f], out2[f])
......
%% Cell type:markdown id: tags:
# Example of distances
This notebook calculates distances between several `_jobs.csv` files as an example.
%% Cell type:code id: tags:
``` python
# Initialization
import pandas as pd
from evalys.jobset import JobSet
from evalys.visu.gantt import plot_gantt
# Paths to the example job CSV files used by the cells below
three_jobs = "test/input/3jobs.csv"
three_jobs_w_session = "test/input/3jobs_w_sessions.csv"
three_jobs_zero = "test/input/3jobs_zeros.csv"
mc_10days_a60 = "test/input/mc_10days_a60_jobs.csv"
mc_10days_m60 = "test/input/mc_10days_m60_jobs.csv"
mc_10days_rigid = "test/input/mc_10days_rigid_jobs.csv"
```
%% Cell type:markdown id: tags:
Visualize the useful columns of a `jobs.csv` file:
%% Cell type:code id: tags:
``` python
# Load the raw jobs table
out1 = pd.read_csv(mc_10days_a60)

# Restrict the table to the columns displayed below
desired_cols = ["job_id", "submission_time", "starting_time", "finish_time", "success"]
select = out1.loc[:, desired_cols]

# Clean job_id: keep the part before the first ':' and store it as an integer
select.job_id = (
    select.job_id
    .astype(str)
    .str.split(':', expand=True)[0]
    .astype(int)
)

select
```
%% Output
job_id submission_time starting_time finish_time success
0 1216 55532.0 55532.0 55894.0 1
1 247 9327.0 9327.0 66429.0 1
2 1242 56876.0 56876.0 66496.0 1
3 1434 66504.0 66504.0 67496.0 1
4 1438 66506.0 66506.0 69764.0 1
.. ... ... ... ... ...
319 18943 643942.0 643942.0 954600.0 1
320 18945 643943.0 643943.0 958295.0 1
321 21862 753752.0 753752.0 960982.0 1
322 18944 643942.0 643942.0 961174.0 1
323 21995 760446.0 760446.0 1031099.0 1
[324 rows x 5 columns]
%% Cell type:markdown id: tags:
## With mock files
Visualize our two mock files 3jobs and 3jobs_w_sessions:
%% Cell type:code id: tags:
``` python
# One plot_gantt call per mock file, titled with the variable name,
# so that the two schedules can be compared
plot_gantt(JobSet.from_csv(three_jobs), title="three_jobs")
plot_gantt(JobSet.from_csv(three_jobs_w_session), title="three_jobs_w_session")
```
%% Output
/home/mael/.local/lib/python3.10/site-packages/evalys/visu/core.py:62: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
self.fig.show()
/home/mael/.local/lib/python3.10/site-packages/evalys/visu/core.py:62: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
self.fig.show()
%% Cell type:code id: tags:
``` python
!python3 distance_batsim_output.py {three_jobs} {three_jobs_w_session} --all
```
%% Output
/home/mael/ownCloud/workspace/batmen-tools/distance_batsim_output.py:56: UserWarning: Some jobs in test/input/3jobs.csv and test/input/3jobs_w_sessions.csv don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.
warnings.warn(f"Some jobs in {file1} and {file2} don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.")
{
"submission_time": {
"euclidean": 0.0,
"normalized_euclidean": 0.0,
"lateness": 0.0
},
"starting_time": {
"euclidean": 0.0,
"normalized_euclidean": 0.0,
"lateness": 0.0
},
"finish_time": {
"euclidean": 28.284271247461902,
"normalized_euclidean": 0.08000000000000002,
"normalized_euclidean": 0.4040610178208843,
"lateness": 0.0
}
}
%% Cell type:code id: tags:
``` python
!python3 distance_batsim_output.py {three_jobs} {three_jobs_zero} --all
```
%% Output
/home/mael/ownCloud/workspace/batmen-tools/distance_batsim_output.py:56: UserWarning: Some jobs in test/input/3jobs.csv and test/input/3jobs_zeros.csv don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.
warnings.warn(f"Some jobs in {file1} and {file2} don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.")
/home/mael/ownCloud/workspace/batmen-tools/distance_batsim_output.py:114: RuntimeWarning: divide by zero encountered in double_scalars
dist[f]["normalized_euclidean"] = eucl / np.sqrt(sum_runtimes1 * sum_runtimes2)
{
"submission_time": {
"euclidean": 50.0,
"normalized_euclidean": null,
"normalized_euclidean": Infinity,
"lateness": -70.0
},
"starting_time": {
"euclidean": 50.0,
"normalized_euclidean": null,
"normalized_euclidean": Infinity,
"lateness": -70.0
},
"finish_time": {
"euclidean": 100.0,
"normalized_euclidean": null,
"normalized_euclidean": Infinity,
"lateness": -140.0
}
}
%% Cell type:code id: tags:
``` python
!python3 distance_batsim_output.py {mc_10days_a60} {mc_10days_m60} --all
```
%% Output
{
"submission_time": {
"euclidean": 172.1394783308001,
"normalized_euclidean": 2.739806304756353e-10,
"normalized_euclidean": 1.2942984329407551e-05,
"lateness": 2254.0
},
"starting_time": {
"euclidean": 172.1394783308001,
"normalized_euclidean": 2.739806304756353e-10,
"normalized_euclidean": 1.2942984329407551e-05,
"lateness": 2254.0
},
"finish_time": {
"euclidean": 172.1394783308001,
"normalized_euclidean": 2.3852456580198333e-10,
"normalized_euclidean": 1.2942984329407551e-05,
"lateness": 2254.0
}
}
%% Cell type:code id: tags:
``` python
!python3 distance_batsim_output.py {mc_10days_a60} {mc_10days_rigid} --all
```
%% Output
{
"submission_time": {
"euclidean": 241.15762480170517,
"normalized_euclidean": 5.377222025784075e-10,
"normalized_euclidean": 1.813216587526566e-05,
"lateness": 3311.0
},
"starting_time": {
"euclidean": 241.15762480170517,
"normalized_euclidean": 5.377222025784075e-10,
"normalized_euclidean": 1.813216587526566e-05,
"lateness": 3311.0
},
"finish_time": {
"euclidean": 255.14897609044016,
"normalized_euclidean": 5.240304046302593e-10,
"normalized_euclidean": 1.91841479662116e-05,
"lateness": 3633.0
}
}
%% Cell type:code id: tags:
``` python
```
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment