diff --git a/distance_batsim_output.py b/distance_batsim_output.py index 65597ecfce600a58707815ed2182b78a3e2cf725..6682789bc5cdca2f48cb1fca4dcbcb05afe8d0ca 100755 --- a/distance_batsim_output.py +++ b/distance_batsim_output.py @@ -7,6 +7,7 @@ import numpy as np import argparse import json import warnings +from scipy.stats import entropy def clean_and_select(df): @@ -64,19 +65,32 @@ def lateness_distance(s1, s2): return np.sum([y-x for x, y in zip(s1, s2)]) +def normalized_euclidian_distance(s1, s2): + """Return the squared euclidean distance between s1 and s2 divided by the product of their l2 norms""" -def distances(file1, file2, euclidean=True, lateness=False, + eucl_dist = euclidean_distance(s1, s2) + return eucl_dist**2 / (l2_norm(s1) * l2_norm(s2)) + +def l2_norm(s): + """Return the l2 norm of the series s""" + + return np.sqrt(np.sum([x * x for x in s])) + + +def distances(file1, file2, euclidean=True, lateness=False, norm_eucl=False, field=["finish_time"]): """Computes and returns a set of distances between two batsim outputs, if they have the same job_ids.""" out1, out2 = open_and_compare(file1, file2) - dist = {} + dist = {} for f in field: dist[f] = {} - if euclidean: + if euclidean: dist[f]["euclidean"] = euclidean_distance(out1[f], out2[f]) + if norm_eucl: + dist[f]["normalized_euclidean"] = normalized_euclidian_distance(out1[f], out2[f]) if lateness: dist[f]["lateness"] = lateness_distance(out1[f], out2[f]) @@ -108,7 +122,7 @@ def main(): parser.add_argument("--type", nargs='+', default=['euclidean'], help="Type of distance to use. Available values are " - "{euclidean, lateness}") + "{euclidean, lateness, normalized_euclidean}") parser.add_argument("--field", nargs='+', default=['finish_time'], help="The field to use to compute the distance. 
" "Available values are {submission_time, starting_time, " @@ -119,12 +133,13 @@ def main(): args = parser.parse_args() if args.all: - args.type = ["euclidean", "lateness"] + args.type = ["euclidean", "lateness", "normalized_euclidean"] args.field = ["submission_time", "starting_time", "finish_time"] dist = distances(file1=args.file1, file2=args.file2, euclidean="euclidean" in args.type, lateness="lateness" in args.type, + norm_eucl="normalized_euclidean" in args.type, field=list(args.field)) pretty_print(dist)