#!/usr/bin/env python3
"""Compute a set of distances between two batsim outputs (_jobs.csv)"""

import argparse
import json
import warnings

import numpy as np
import pandas as pd

# Time columns of a _jobs.csv file on which a distance can be computed.
TIME_FIELDS = ("submission_time", "starting_time", "finish_time")
# Names of the distances this module knows how to compute.
DISTANCE_TYPES = ("euclidean", "lateness")


def clean_and_select(df):
    """Select only the desired columns from the dataframe and clean the job_ids.

    Args:
        df: dataframe read from a _jobs.csv file. It must contain the
            columns job_id, submission_time, starting_time, finish_time and
            success.

    Returns:
        A new dataframe (the input is left untouched) restricted to those
        columns, with job_id converted to int, sorted by job_id and indexed
        0..n-1.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    wanted = ["job_id", *TIME_FIELDS, "success"]
    # Work on a copy: assigning into a .loc slice may write into (or warn
    # about) a view of the caller's dataframe.
    selected = df.loc[:, wanted].copy()

    # Clean job_id: drop the session suffix ("12:s3" -> 12), if present.
    selected["job_id"] = (
        selected["job_id"].astype(str).str.split(":").str[0].astype(int)
    )

    # A stable sort keeps the result deterministic when ids repeat. The index
    # is rebuilt because pandas comparisons (equals, eq, boolean masks) are
    # label-based: without it, two files listing the same jobs in a different
    # row order would never compare equal.
    ordered = selected.sort_values(by="job_id", kind="mergesort")
    return ordered.reset_index(drop=True)


def open_and_compare(file1, file2):
    """Open file1 and file2, two _jobs.csv files, and make them comparable.

    Checks that the job_id columns match and returns the cleaned pandas
    dataframes, restricted to the jobs that were successful in both files.

    Args:
        file1: path (or file-like object) of the first _jobs.csv file.
        file2: path (or file-like object) of the second _jobs.csv file.

    Returns:
        A tuple (out1, out2) of dataframes holding the same jobs, in the
        same order.

    Raises:
        KeyError: if the two files do not contain the same job_ids.
    """
    out1 = clean_and_select(pd.read_csv(file1))
    out2 = clean_and_select(pd.read_csv(file2))

    if not out1.job_id.equals(out2.job_id):
        raise KeyError(
            f"{file1} and {file2} cannot be compared: "
            "they don't have the same job_ids")

    # Both frames now share the same 0..n-1 index, so element-wise
    # comparisons below line up job by job.
    if out1.success.ne(out2.success).any():
        warnings.warn(
            f"Some jobs in {file1} and {file2} don't have the same success "
            "status. Comparing only the jobs that were successful in both.")

    both_successful = (out1.success == 1) & (out2.success == 1)
    return out1[both_successful], out2[both_successful]


def _paired_arrays(s1, s2):
    """Convert s1 and s2 to float arrays, checking that they pair up."""
    # Iterators have no length; materialise them so NumPy sees a sequence.
    if not hasattr(s1, "__len__"):
        s1 = list(s1)
    if not hasattr(s2, "__len__"):
        s2 = list(s2)

    # np.asarray takes the values positionally, ignoring any pandas index.
    first = np.asarray(s1, dtype=float)
    second = np.asarray(s2, dtype=float)
    if first.shape != second.shape:
        raise ValueError(
            "cannot compute a distance between series of different lengths "
            f"({first.shape} vs {second.shape})")
    return first, second


def euclidean_distance(s1, s2):
    """Return the Euclidean distance between two series s1 and s2.

    Values are paired by position. The result is a plain Python float
    (0.0 for empty series).

    Raises:
        ValueError: if s1 and s2 do not have the same length.
    """
    first, second = _paired_arrays(s1, s2)
    delta = first - second
    return float(np.sqrt(np.dot(delta, delta)))


def lateness_distance(s1, s2):
    """Return the 'lateness' of s2 compared to s1.

    This is the sum of (s2 - s1), values being paired by position: it is
    positive when s2 is globally later than s1. The result is a plain Python
    float so that it can always be serialised to JSON.

    Raises:
        ValueError: if s1 and s2 do not have the same length.
    """
    first, second = _paired_arrays(s1, s2)
    return float(np.sum(second - first))


def distances(file1, file2, euclidean=True, lateness=False,
              field=("finish_time",)):
    """Compute a set of distances between two batsim outputs.

    Args:
        file1: path (or file-like object) of the first _jobs.csv file.
        file2: path (or file-like object) of the second _jobs.csv file.
        euclidean: whether to compute the Euclidean distance.
        lateness: whether to compute the lateness of file2 compared to file1.
        field: name, or iterable of names, of the columns on which the
            distances are computed.

    Returns:
        A dictionary {field: {distance_name: value}}.

    Raises:
        KeyError: if the two files do not have the same job_ids, or if a
            requested field is not a selected column.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(field, str):
        field = [field]

    out1, out2 = open_and_compare(file1, file2)

    dist = {}
    for name in field:
        per_field = {}
        if euclidean:
            per_field["euclidean"] = euclidean_distance(out1[name], out2[name])
        if lateness:
            per_field["lateness"] = lateness_distance(out1[name], out2[name])
        dist[name] = per_field

    return dist


def pretty_print(dist):
    """Print the dictionary dist in a readable way.

    When dist holds exactly one distance on exactly one field, only the bare
    value is printed; in every other case the whole dictionary is printed as
    indented JSON.
    """
    if len(dist) == 1:
        (per_field,) = dist.values()
        if len(per_field) == 1:
            (value,) = per_field.values()
            print(value)
            return

    print(json.dumps(dist, indent=4))


def main(argv=None):
    """Program entry point if called with CLI.

    Args:
        argv: list of command-line arguments; defaults to sys.argv[1:].
    """
    parser = argparse.ArgumentParser(
        description="Computes and prints a set of distances between two "
                    "batsim outputs, if they have the same job_ids. "
                    "Default: euclidean distance on finish_time.")
    parser.add_argument("file1", type=str,
                        help="The first _jobs.csv file")
    parser.add_argument("file2", type=str,
                        help="The second _jobs.csv file")

    # `choices` makes argparse reject unknown values with a usage message
    # instead of silently ignoring them or failing later with a KeyError.
    parser.add_argument("--type", nargs="+", default=["euclidean"],
                        choices=DISTANCE_TYPES,
                        help="Type of distance to use. Available values are "
                             "{euclidean, lateness}")
    parser.add_argument("--field", nargs="+", default=["finish_time"],
                        choices=TIME_FIELDS,
                        help="The field to use to compute the distance. "
                             "Available values are {submission_time, "
                             "starting_time, finish_time}")
    parser.add_argument("--all", action="store_true",
                        help="Print all available distances on all available "
                             "fields")

    args = parser.parse_args(argv)

    if args.all:
        args.type = list(DISTANCE_TYPES)
        args.field = list(TIME_FIELDS)

    dist = distances(file1=args.file1, file2=args.file2,
                     euclidean="euclidean" in args.type,
                     lateness="lateness" in args.type,
                     field=list(args.field))
    pretty_print(dist)


if __name__ == "__main__":
    main()