From c2807a1581da55cf8c33466b36757bc41554bf8d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ma=C3=ABl=20Madon?= <mael.madon@irit.fr>
Date: Mon, 14 Nov 2022 14:59:40 +0100
Subject: [PATCH] add a 'normalized euclidean' distance to the set of distances

---
 distance_batsim_output.py | 25 ++++++++++++++++++++-----
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/distance_batsim_output.py b/distance_batsim_output.py
index 65597ec..6682789 100755
--- a/distance_batsim_output.py
+++ b/distance_batsim_output.py
@@ -7,6 +7,7 @@ import numpy as np
 import argparse
 import json
 import warnings
+from scipy.stats import entropy
 
 
 def clean_and_select(df):
@@ -64,19 +65,32 @@ def lateness_distance(s1, s2):
 
     return np.sum([y-x for x, y in zip(s1, s2)])
 
+def normalized_euclidian_distance(s1, s2):
+    """Return euclidean_distance(s1, s2) squared, divided by the product of the l2 norms of s1 and s2"""
 
-def distances(file1, file2, euclidean=True, lateness=False,
+    eucl_dist = euclidean_distance(s1, s2)
+    return eucl_dist**2 / (l2_norm(s1) * l2_norm(s2))
+
+def l2_norm(s):
+    """Return the l2 norm of the series s"""
+
+    return np.sqrt(np.sum([x * x for x in s]))
+
+
+def distances(file1, file2, euclidean=True, lateness=False, norm_eucl=False,
               field=["finish_time"]):
     """Computes and returns a set of distances between two batsim outputs, if 
     they have the same job_ids."""
 
     out1, out2 = open_and_compare(file1, file2)
-    dist = {}
 
+    dist = {}
     for f in field:
         dist[f] = {}
-        if euclidean: 
+        if euclidean:
             dist[f]["euclidean"] = euclidean_distance(out1[f], out2[f])
+        if norm_eucl:
+            dist[f]["normalized_euclidean"] = normalized_euclidian_distance(out1[f], out2[f])
         if lateness:
             dist[f]["lateness"] = lateness_distance(out1[f], out2[f])
 
@@ -108,7 +122,7 @@ def main():
 
     parser.add_argument("--type", nargs='+', default=['euclidean'],
                         help="Type of distance to use. Available values are "
-                        "{euclidean, lateness}")
+                        "{euclidean, lateness, normalized_euclidean}")
     parser.add_argument("--field", nargs='+', default=['finish_time'],
                         help="The field to use to compute the distance. "
                         "Available values are {submission_time, starting_time, "
@@ -119,12 +133,13 @@ def main():
     args = parser.parse_args()
 
     if args.all:
-        args.type = ["euclidean", "lateness"]
+        args.type = ["euclidean", "lateness", "normalized_euclidean"]
         args.field = ["submission_time", "starting_time", "finish_time"]
 
     dist = distances(file1=args.file1, file2=args.file2,
                     euclidean="euclidean" in args.type,
                     lateness="lateness" in args.type,
+                    norm_eucl="normalized_euclidean" in args.type,
                     field=list(args.field))
     pretty_print(dist)
 
-- 
GitLab