Progress on the distance tool. Should maybe add a 'similarity' test as well.

221240c1 · Maël Madon · c12f9c44 · 221240c1
Commit 221240c1 authored 2 years ago by Maël Madon
--- a/distance_batsim_output.py
+++ b/distance_batsim_output.py
@@ -3,24 +3,131 @@
 """Compute a set of distances between two batsim outputs (_jobs.csv)"""

 import pandas as pd
+import numpy as np
+import argparse
+import json
+import warnings
+

 def clean_and_select(df):
    """Select only desired column from the dataframe and clean the job_ids"""

    # Select
-    desired_cols = ["job_id", "submission_time", "starting_time", "finish_time"]
+    # "success" is selected along with the job_id and the three time columns
+    desired_cols = ["job_id", "submission_time",
+                    "starting_time", "finish_time", "success"]
    select = df.loc[:, desired_cols]

-    # Clean job_id
+    # Clean job_id: keep only the part before the first ':' (this removes the
+    # sessions, if present) and convert the result to an integer
    select.job_id = select.job_id.astype(str)
    select["job_id"] = select["job_id"].str.split(':', expand=True)[0]
+    select.job_id = select.job_id.astype(int)
+
+    # Rows are returned ordered by job_id; sort_values keeps the row labels
+    # the dataframe had before sorting
+    return select.sort_values(by="job_id")

-    return select

 def open_and_compare(file1, file2):
-    """Open file1 and file2, two _jobs.csv files. Checks if the job_ids match 
-    and return their pandas Dataframe representation"""
+    """Open two _jobs.csv files and return their comparable rows.
+
+    Both files are read with pandas and cleaned with clean_and_select. Only
+    the jobs that were successful (success == 1) in both files are kept.
+
+    Args:
+        file1: path of the first _jobs.csv file
+        file2: path of the second _jobs.csv file
+
+    Returns:
+        A pair (out1, out2) of cleaned pandas DataFrames holding the same
+        job_ids in the same order.
+
+    Raises:
+        KeyError: if the two files do not contain the same job_ids
+    """

    out1 = pd.read_csv(file1)
    out2 = pd.read_csv(file2)

+    # Series.equals() and boolean masks both take the index into account.
+    # Drop the row labels inherited from the CSV files so that rows are
+    # matched on their position in job_id order, not on their position in
+    # the original files.
+    out1 = clean_and_select(out1).reset_index(drop=True)
+    out2 = clean_and_select(out2).reset_index(drop=True)
+
+    if not out1.job_id.equals(out2.job_id):
+        raise KeyError(
+            f"{file1} and {file2} cannot be compared: they don't have the same job_ids")
+
+    if not out1.success.equals(out2.success):
+        warnings.warn(f"Some jobs in {file1} and {file2} don't have the same success status. Comparing only the jobs that were successful in both.")
+
+    # Keep only the jobs that succeeded in both outputs
+    both_ok = (out1.success == 1) & (out2.success == 1)
+
+    return out1[both_ok], out2[both_ok]
+
+
+def euclidean_distance(s1, s2):
+    """Return the Euclidean distance between the series s1 and s2.
+
+    Elements are paired by position; if one series is longer, its extra
+    elements are ignored.
+    """
+
+    squares = []
+    for a, b in zip(s1, s2):
+        delta = a - b
+        squares.append(delta * delta)
+
+    return np.sqrt(np.sum(squares))
+
+
+def lateness_distance(s1, s2):
+    """Return the 'lateness' of s2 compared to s1.
+
+    This is the sum of the position-wise differences (s2 - s1): positive when
+    the values of s2 are larger overall.
+    """
+
+    delays = [later - earlier for earlier, later in zip(s1, s2)]
+    return np.sum(delays)
+
+
+def distances(file1, file2, euclidean=True, lateness=False,
+              field=("finish_time",)):
+    """Compute a set of distances between two batsim outputs.
+
+    The two files must contain the same job_ids (see open_and_compare).
+
+    Args:
+        file1: path of the first _jobs.csv file
+        file2: path of the second _jobs.csv file
+        euclidean: if True, compute the Euclidean distance
+        lateness: if True, compute the lateness of file2 compared to file1
+        field: name, or iterable of names, of the column(s) on which the
+            distances are computed
+
+    Returns:
+        A dict mapping each field to a dict {distance name: value}.
+    """
+
+    # A lone field name would otherwise be iterated character by character
+    if isinstance(field, str):
+        field = [field]
+
+    out1, out2 = open_and_compare(file1, file2)
+    dist = {}
+
+    for f in field:
+        dist[f] = {}
+        if euclidean:
+            dist[f]["euclidean"] = euclidean_distance(out1[f], out2[f])
+        if lateness:
+            dist[f]["lateness"] = lateness_distance(out1[f], out2[f])
+
+    return dist
+
+
+def pretty_print(dist):
+    """Print the dictionary of distances dist.
+
+    When dist holds a single value (one field, one distance type), only that
+    value is printed. Anything else is printed as indented JSON.
+    """
+
+    if len(dist) == 1:
+        only_field = next(iter(dist.values()))
+        if len(only_field) == 1:
+            print(next(iter(only_field.values())))
+            return
+
+    # default=float: numpy integer scalars (e.g. a lateness computed on
+    # integer columns) are not JSON serializable as they are
+    print(json.dumps(dist, indent=4, default=float))
+
+def main():
+    """Program entry point if called with CLI.
+
+    Parses the command line, computes the requested distances between the
+    two given _jobs.csv files and prints them.
+    """
+
+    available_types = ["euclidean", "lateness"]
+    available_fields = ["submission_time", "starting_time", "finish_time"]
+
+    parser = argparse.ArgumentParser(
+        description="Computes and prints a set of distances between two batsim "
+                    "outputs, if they have the same job_ids. "
+                    "Default: euclidean distance on finish_time.")
+    parser.add_argument('file1', type=str,
+                        help='The first _jobs.csv file')
+    parser.add_argument('file2', type=str,
+                        help='The second _jobs.csv file')
+
+    # choices: reject unknown values up front instead of silently ignoring
+    # an unknown type or failing later on an unknown column
+    parser.add_argument("--type", nargs='+', default=['euclidean'],
+                        choices=available_types,
+                        help="Type of distance to use. Available values are "
+                        "{euclidean, lateness}")
+    parser.add_argument("--field", nargs='+', default=['finish_time'],
+                        choices=available_fields,
+                        help="The field to use to compute the distance. "
+                        "Available values are {submission_time, starting_time, "
+                        "finish_time}")
+    parser.add_argument("--all", action="store_true",
+                        help="Print all available distances on all available fields")
+
+    args = parser.parse_args()
+
+    # --all overrides whatever was given with --type and --field
+    if args.all:
+        args.type = list(available_types)
+        args.field = list(available_fields)
+
+    dist = distances(file1=args.file1, file2=args.file2,
+                     euclidean="euclidean" in args.type,
+                     lateness="lateness" in args.type,
+                     field=list(args.field))
+    pretty_print(dist)
+
+
+# Run the command-line interface only when this file is executed as a script
+if __name__ == "__main__":
+    main()