#!/usr/bin/env python3
"""Compute a set of distances between two batsim outputs (_jobs.csv)"""

import argparse
import json
import warnings

import numpy as np
import pandas as pd

# Values accepted on the command line for --type and --field.
_DISTANCE_TYPES = ("euclidean", "lateness", "normalized_euclidean")
_TIME_FIELDS = ("submission_time", "starting_time", "finish_time")


def clean_and_select(df):
    """Select only the desired columns from the dataframe and clean the job_ids.

    Keeps job_id, submission_time, starting_time, finish_time and success,
    strips everything after the first ':' in job_id (the session part, if
    present), converts job_id to int, sorts the rows by job_id and returns
    the result indexed by job_id. The input dataframe is not modified.

    Raises KeyError if a desired column is missing and ValueError if a
    cleaned job_id cannot be converted to int.
    """
    desired_cols = ["job_id", "submission_time", "starting_time",
                    "finish_time", "success"]
    # Work on an explicit copy so the assignment below never targets a view
    # of the caller's dataframe.
    select = df.loc[:, desired_cols].copy()

    # Clean job_id (remove the sessions, if present)
    select["job_id"] = (
        select["job_id"].astype(str).str.split(":", n=1).str[0].astype(int)
    )

    # sort_values returns a new frame: the result must be kept. A stable
    # sort keeps rows sharing a cleaned job_id in their original file order.
    select = select.sort_values(by="job_id", kind="mergesort")
    return select.set_index("job_id")


def open_and_compare(file1, file2):
    """Open file1 and file2, two _jobs.csv files, and return them cleaned.

    Both files are read with pandas and passed through clean_and_select.
    If the success columns differ, a warning is emitted and only the jobs
    that were successful in both files are kept.

    Raises KeyError if the two files do not have the same job_ids.
    """
    out1 = clean_and_select(pd.read_csv(file1))
    out2 = clean_and_select(pd.read_csv(file2))

    if not out1.index.equals(out2.index):
        raise KeyError(
            f"{file1} and {file2} cannot be compared: they don't have the same job_ids")

    if not out1.success.equals(out2.success):
        warnings.warn(
            f"Some jobs in {file1} and {file2} don't have the same success status. \n"
            "Comparing only the jobs that were successful in both.")
        # Same status in both AND successful <=> successful in both.
        both_ok = (out1.success == 1) & (out2.success == 1)
        out1 = out1.loc[both_ok, :]
        out2 = out2.loc[both_ok, :]

    return out1, out2


def _paired_arrays(s1, s2):
    """Convert s1 and s2 to float arrays and check that their shapes match."""
    a1 = np.asarray(s1, dtype=float)
    a2 = np.asarray(s2, dtype=float)
    if a1.shape != a2.shape:
        raise ValueError(
            f"cannot compare series of different shapes: {a1.shape} and {a2.shape}")
    return a1, a2


def euclidean_distance(s1, s2):
    """Returns the Euclidean distance between two series s1 and s2.

    Values are paired by position. Raises ValueError if s1 and s2 do not
    have the same length.
    """
    a1, a2 = _paired_arrays(s1, s2)
    delta = a1 - a2
    return float(np.sqrt(np.sum(delta * delta)))


def lateness_distance(s1, s2):
    """Returns the 'lateness' of s2 compared to s1: the sum of (s2 - s1).

    Values are paired by position. Raises ValueError if s1 and s2 do not
    have the same length.
    """
    a1, a2 = _paired_arrays(s1, s2)
    return float(np.sum(a2 - a1))


def normalized_euclidian_distance(s1, s2):
    """Return the squared Euclidean distance divided by the product of the
    l2 norms of s1 and s2, or None if one of the vectors is the null vector
    (undefined)."""
    n1, n2 = l2_norm(s1), l2_norm(s2)
    if n1 == 0 or n2 == 0:
        return None

    eucl_dist = euclidean_distance(s1, s2)
    return float(eucl_dist**2 / (n1 * n2))


def l2_norm(s):
    """Return the l2 norm of the series s"""
    values = np.asarray(s, dtype=float)
    return float(np.sqrt(np.sum(values * values)))


def distances(file1, file2, euclidean=True, lateness=False, norm_eucl=False,
              field=None):
    """Computes and returns a set of distances between two batsim outputs,
    if they have the same job_ids.

    file1, file2: the two _jobs.csv files (anything pandas.read_csv accepts).
    euclidean, lateness, norm_eucl: which distances to compute.
    field: column name or list of column names to compute the distances on;
        defaults to ["finish_time"].

    Returns the bare value when exactly one distance on one field was
    requested, otherwise a dictionary {field: {distance_name: value}}.
    Raises KeyError if the files do not have the same job_ids.
    """
    if field is None:
        field = ["finish_time"]
    elif isinstance(field, str):
        # A bare string would otherwise be iterated character by character.
        field = [field]

    out1, out2 = open_and_compare(file1, file2)

    dist = {}
    for f in field:
        per_field = {}
        if euclidean:
            per_field["euclidean"] = euclidean_distance(out1[f], out2[f])
        if norm_eucl:
            per_field["normalized_euclidean"] = normalized_euclidian_distance(
                out1[f], out2[f])
        if lateness:
            per_field["lateness"] = lateness_distance(out1[f], out2[f])
        dist[f] = per_field

    # If only one value, returns it directly
    if len(dist) == 1:
        val = next(iter(dist.values()))
        if len(val) == 1:
            return next(iter(val.values()))

    # Otherwise returns the dictionary
    return dist


def pretty_print(dist):
    """Nice printing of dist: dictionaries as indented JSON, anything else
    as is."""
    if isinstance(dist, dict):
        print(json.dumps(dist, indent=4))
    else:
        print(dist)


def main():
    """Program entry point if called with CLI"""
    parser = argparse.ArgumentParser(
        description="Computes and prints a set of distances \nbetween two batsim "
                    "outputs, if they have the same job_ids. "
                    "Default: euclidean distance on finish_time.")
    parser.add_argument('file1', type=str, help='The first _jobs.csv file')
    parser.add_argument('file2', type=str, help='The second _jobs.csv file')
    # `choices` makes argparse reject unknown values instead of silently
    # producing an empty result.
    parser.add_argument("--type", nargs='+', default=['euclidean'],
                        choices=_DISTANCE_TYPES,
                        help="Type of distance to use. Available values are "
                             "{euclidean, lateness, normalized_euclidean}")
    parser.add_argument("--field", nargs='+', default=['finish_time'],
                        choices=_TIME_FIELDS,
                        help="The field to use to compute the distance. "
                             "Available values are {submission_time, starting_time, "
                             "finish_time}")
    parser.add_argument("--all", action="store_true",
                        help="Print all available distances on all available fields")

    args = parser.parse_args()

    if args.all:
        args.type = list(_DISTANCE_TYPES)
        args.field = list(_TIME_FIELDS)

    dist = distances(file1=args.file1, file2=args.file2,
                     euclidean="euclidean" in args.type,
                     lateness="lateness" in args.type,
                     norm_eucl="normalized_euclidean" in args.type,
                     field=list(args.field))

    pretty_print(dist)


if __name__ == "__main__":
    main()