Skip to content
Snippets Groups Projects
Commit 221240c1 authored by Maël Madon's avatar Maël Madon
Browse files

Progress on the distance tool. Should maybe add a 'similarity' test as well.

parent c12f9c44
No related branches found
No related tags found
No related merge requests found
......@@ -3,24 +3,131 @@
"""Compute a set of distances between two batsim outputs (_jobs.csv)"""
import pandas as pd
import numpy as np
import argparse
import json
import warnings
def clean_and_select(df):
    """Select the columns used for comparison and normalise the job ids.

    Keeps only ``job_id``, ``submission_time``, ``starting_time``,
    ``finish_time`` and ``success``. Everything after the first ``:`` in a
    job id (the session suffix, if present) is dropped and the remainder is
    converted to ``int``.

    Args:
        df: DataFrame holding at least the five columns listed above.

    Returns:
        A new DataFrame restricted to those columns and sorted by ``job_id``.
        ``df`` itself is not modified.

    Raises:
        KeyError: if one of the desired columns is missing from ``df``.
        ValueError: if a cleaned job id cannot be converted to ``int``.
    """
    # Select
    desired_cols = ["job_id", "submission_time",
                    "starting_time", "finish_time", "success"]
    # Explicit copy: the assignment below must write into our own frame,
    # never into (or warn about) a slice of the caller's data.
    select = df.loc[:, desired_cols].copy()

    # Clean job_id (remove the sessions, if present)
    job_ids = select["job_id"].astype(str).str.split(':', expand=True)[0]
    select["job_id"] = job_ids.astype(int)

    return select.sort_values(by="job_id")
def open_and_compare(file1, file2):
    """Open two _jobs.csv files and return them as comparable DataFrames.

    Both files are read with pandas, reduced by ``clean_and_select`` and
    ordered by ``job_id`` so that row *i* of each result describes the same
    job.

    Args:
        file1: Path (or file-like object) of the first _jobs.csv file.
        file2: Path (or file-like object) of the second _jobs.csv file.

    Returns:
        A tuple ``(out1, out2)`` of cleaned DataFrames. When the ``success``
        columns differ, both frames only contain the jobs that were
        successful in both files.

    Raises:
        KeyError: if the two files do not contain the same job ids.

    Warns:
        UserWarning: if some jobs do not have the same success status.
    """
    out1 = clean_and_select(pd.read_csv(file1))
    out2 = clean_and_select(pd.read_csv(file2))

    # ``Series.equals`` and element-wise comparisons are index-sensitive, so
    # order both frames by job_id and renumber their rows: the same job then
    # sits at the same label in both frames whatever the row order on disk.
    out1 = out1.sort_values(by="job_id").reset_index(drop=True)
    out2 = out2.sort_values(by="job_id").reset_index(drop=True)

    if not out1.job_id.equals(out2.job_id):
        raise KeyError(
            f"{file1} and {file2} cannot be compared: they don't have the same job_ids")

    if not out1.success.equals(out2.success):
        warnings.warn(f"Some jobs in {file1} and {file2} don't have the same success status. Comparing only the jobs that were successful in both.")
        # Compare the two ``success`` columns with each other (not a column
        # with the whole frame) and keep the jobs that succeeded in both.
        both_ok = (out1.success == 1) & (out2.success == 1)
        out1 = out1.loc[both_ok, :]
        out2 = out2.loc[both_ok, :]
    # NOTE(review): when the success columns are identical, jobs that failed
    # in both files are kept -- confirm this is the intended behaviour.
    return out1, out2
def euclidean_distance(s1, s2):
    """Return the Euclidean distance between two series s1 and s2.

    Values are paired by position (any pandas index is ignored).

    Args:
        s1: Sequence of numbers (list, numpy array, pandas Series...).
        s2: Sequence of numbers with the same length as ``s1``.

    Returns:
        The distance as a builtin ``float`` (``0.0`` for empty inputs).

    Raises:
        ValueError: if ``s1`` and ``s2`` do not have the same length.
    """
    a = np.asarray(s1, dtype=float)
    b = np.asarray(s2, dtype=float)
    # A distance between series of different lengths is undefined; fail
    # loudly instead of silently ignoring the surplus values.
    if a.shape != b.shape:
        raise ValueError(
            f"series must have the same length, got {a.shape} and {b.shape}")
    return float(np.sqrt(np.sum((a - b) ** 2)))
def lateness_distance(s1, s2):
    """Return the 'lateness' of s2 compared to s1.

    The lateness is the sum of ``y - x`` over the values ``x`` of ``s1`` and
    ``y`` of ``s2``, paired by position: it is positive when the values of
    ``s2`` are, in total, greater than those of ``s1``.

    Args:
        s1: Sequence of numbers used as the reference.
        s2: Sequence of numbers with the same length as ``s1``.

    Returns:
        The lateness as a builtin ``float`` (``0.0`` for empty inputs), so the
        result can always be serialised with ``json``.

    Raises:
        ValueError: if ``s1`` and ``s2`` do not have the same length.
    """
    a = np.asarray(s1, dtype=float)
    b = np.asarray(s2, dtype=float)
    # Fail loudly on mismatched lengths instead of ignoring surplus values.
    if a.shape != b.shape:
        raise ValueError(
            f"series must have the same length, got {a.shape} and {b.shape}")
    return float(np.sum(b - a))
def distances(file1, file2, euclidean=True, lateness=False,
              field=("finish_time",)):
    """Compute a set of distances between two batsim outputs.

    The two files must contain the same job_ids (see ``open_and_compare``).

    Args:
        file1: The first _jobs.csv file.
        file2: The second _jobs.csv file.
        euclidean: Whether to compute the Euclidean distance.
        lateness: Whether to compute the lateness of ``file2`` compared to
            ``file1``.
        field: Name, or iterable of names, of the columns on which the
            distances are computed.

    Returns:
        A dict mapping each field name to a dict ``{distance name: value}``
        holding the requested distances.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(field, str):
        field = [field]

    out1, out2 = open_and_compare(file1, file2)

    dist = {}
    for f in field:
        dist[f] = {}
        if euclidean:
            dist[f]["euclidean"] = euclidean_distance(out1[f], out2[f])
        if lateness:
            dist[f]["lateness"] = lateness_distance(out1[f], out2[f])
    return dist
def pretty_print(dist):
    """Print the dictionary ``dist`` in a readable form.

    When ``dist`` holds exactly one field with exactly one distance, only
    that value is printed. In every other case the whole dictionary is
    printed as indented JSON.

    Args:
        dist: Dict mapping a field name to a dict ``{distance name: value}``.
    """
    def _to_builtin(obj):
        # numpy scalars (e.g. int64) are not JSON serialisable as such.
        if isinstance(obj, np.generic):
            return obj.item()
        raise TypeError(
            f"Object of type {type(obj).__name__} is not JSON serializable")

    if len(dist) == 1:
        (val,) = dist.values()
        if len(val) == 1:
            (single,) = val.values()
            print(single)
            return

    # Several fields and/or several distances: always print something.
    print(json.dumps(dist, indent=4, default=_to_builtin))
def main(argv=None):
    """Program entry point if called with CLI.

    Parses the command line, computes the requested distances between the two
    given _jobs.csv files and prints them.

    Args:
        argv: Optional list of command-line arguments. When ``None``
            (default), argparse reads them from ``sys.argv``.
    """
    parser = argparse.ArgumentParser(
        # Trailing space after "batsim": adjacent literals are concatenated
        # as-is, the help text used to read "batsimoutputs".
        description="Computes and prints a set of distances between two batsim "
                    "outputs, if they have the same job_ids. "
                    "Default: euclidean distance on finish_time.")
    parser.add_argument('file1', type=str,
                        help='The first _jobs.csv file')
    parser.add_argument('file2', type=str,
                        help='The second _jobs.csv file')
    # ``choices`` makes argparse reject unknown values with a clear message
    # instead of silently ignoring them or failing later with a traceback.
    parser.add_argument("--type", nargs='+', default=['euclidean'],
                        choices=["euclidean", "lateness"],
                        help="Type of distance to use. Available values are "
                             "{euclidean, lateness}")
    parser.add_argument("--field", nargs='+', default=['finish_time'],
                        choices=["submission_time", "starting_time",
                                 "finish_time"],
                        help="The field to use to compute the distance. "
                             "Available values are {submission_time, starting_time, "
                             "finish_time}")
    parser.add_argument("--all", action="store_true",
                        help="Print all available distances on all available fields")

    args = parser.parse_args(argv)

    # --all overrides whatever was given through --type and --field.
    if args.all:
        args.type = ["euclidean", "lateness"]
        args.field = ["submission_time", "starting_time", "finish_time"]

    dist = distances(file1=args.file1, file2=args.file2,
                     euclidean="euclidean" in args.type,
                     lateness="lateness" in args.type,
                     field=list(args.field))

    pretty_print(dist)
# Run the command-line interface only when executed as a script, not when
# the module is imported.
if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment