test: add distance tool for tests and instructions to update it

1ed95d72 · Maël Madon · ba042cf0 · 1ed95d72 · 1ed95d72
Commit 1ed95d72 authored 2 years ago by Maël Madon
--- a/test/README.md
+++ b/test/README.md
+# Batmen tests
+
+### Extra dependencies
+Some tests use a tool from the external library `batmen-tools`, which is developed separately. To update it to the latest version:
+
+```bash
+cd test
+curl -O https://gitlab.irit.fr/sepia-pub/mael/batmen-tools/-/raw/main/distance_batsim_output.py
+```
+
 ### Running tests
 ``` bash
 nix-shell ../default.nix -A test --command 'pytest'

--- a/test/distance_batsim_output.py
+++ b/test/distance_batsim_output.py
+#!/usr/bin/env python3
+
+"""Compute a set of distances between two batsim outputs (_jobs.csv)"""
+
+import pandas as pd
+import numpy as np
+import argparse
+import json
+import warnings
+from scipy.stats import entropy
+
+
+def clean_and_select(df):
+    """Select the columns needed for a comparison and normalise the job ids.
+
+    Only ``job_id``, ``submission_time``, ``starting_time``, ``finish_time``
+    and ``success`` are kept. Each ``job_id`` is converted to a string, cut at
+    the first ``':'`` (the session suffix, if present) and converted to int.
+
+    Args:
+        df: DataFrame containing at least the five columns listed above.
+
+    Returns:
+        A new DataFrame sorted by ``job_id`` and re-indexed from 0. ``df``
+        itself is left untouched.
+
+    Raises:
+        KeyError: if one of the desired columns is missing from ``df``.
+        ValueError: if a cleaned ``job_id`` cannot be converted to int.
+    """
+
+    # Select (explicit copy: the job_id column is rewritten just below)
+    desired_cols = ["job_id", "submission_time",
+                    "starting_time", "finish_time", "success"]
+    select = df.loc[:, desired_cols].copy()
+
+    # Clean job_id (remove the sessions, if present). `.str[0]` rather than
+    # `expand=True` so that an empty frame does not raise.
+    select["job_id"] = (
+        select["job_id"].astype(str).str.split(":", n=1).str[0].astype(int)
+    )
+
+    # Drop the original row labels: pandas `equals` / `eq` take the index into
+    # account, so two outputs listing the same jobs in a different row order
+    # would otherwise never compare equal after sorting.
+    return select.sort_values(by="job_id").reset_index(drop=True)
+
+
+def open_and_compare(file1, file2):
+    """Open two _jobs.csv files and return their comparable cleaned content.
+
+    Both files are read with pandas and passed through `clean_and_select`.
+    If the success columns differ, a warning is emitted and the jobs whose
+    success status differs are dropped. In all cases only the jobs with
+    ``success == 1`` are kept.
+
+    Args:
+        file1: path of the first _jobs.csv file.
+        file2: path of the second _jobs.csv file.
+
+    Returns:
+        A tuple ``(out1, out2)`` of cleaned DataFrames restricted to the jobs
+        that were successful in both files.
+
+    Raises:
+        KeyError: if the two files do not contain the same job_ids.
+    """
+
+    # Re-index positionally after cleaning: `Series.equals` and `Series.eq`
+    # used below take the index labels into account, and the two files do
+    # not necessarily list the jobs in the same row order.
+    out1 = clean_and_select(pd.read_csv(file1)).reset_index(drop=True)
+    out2 = clean_and_select(pd.read_csv(file2)).reset_index(drop=True)
+
+    if not out1.job_id.equals(out2.job_id):
+        raise KeyError(
+            f"{file1} and {file2} cannot be compared: they don't have the same job_ids")
+
+    if not out1.success.equals(out2.success):
+        warnings.warn(f"Some jobs in {file1} and {file2} don't have the same success status. Comparing only the jobs that were successful in both.")
+
+        # Keep the rows on which both outputs agree about the success status
+        same_status = out1.success.eq(out2.success)
+        out1 = out1.loc[same_status, :]
+        out2 = out2.loc[same_status, :]
+
+    out1 = out1[out1.success == 1]
+    out2 = out2[out2.success == 1]
+
+    return out1, out2
+
+
+def euclidean_distance(s1, s2):
+    """Return the Euclidean distance between two series s1 and s2.
+
+    Values are paired by position, not by index label.
+
+    Args:
+        s1: first sequence of numbers.
+        s2: second sequence of numbers, same length as ``s1``.
+
+    Returns:
+        The distance as a float (0.0 for two empty sequences).
+
+    Raises:
+        ValueError: if ``s1`` and ``s2`` do not have the same length.
+    """
+
+    a = np.asarray(s1, dtype=float)
+    b = np.asarray(s2, dtype=float)
+    # Refuse mismatched inputs instead of silently ignoring the extra values
+    if a.shape != b.shape:
+        raise ValueError(
+            f"cannot compare series of different shapes: {a.shape} and {b.shape}")
+
+    delta = a - b
+    return float(np.sqrt(np.sum(delta * delta)))
+
+
+def lateness_distance(s1, s2):
+    """Return the 'lateness' of s2 compared to s1.
+
+    The lateness is the sum of ``y - x`` over the values ``x`` of s1 and
+    ``y`` of s2, paired by position. It is positive when the values of s2
+    are globally larger than those of s1.
+
+    Args:
+        s1: reference sequence of numbers.
+        s2: sequence of numbers compared to ``s1``, same length.
+
+    Returns:
+        The lateness as a float (0.0 for two empty sequences).
+
+    Raises:
+        ValueError: if ``s1`` and ``s2`` do not have the same length.
+    """
+
+    reference = np.asarray(s1, dtype=float)
+    compared = np.asarray(s2, dtype=float)
+    # Refuse mismatched inputs instead of silently ignoring the extra values
+    if reference.shape != compared.shape:
+        raise ValueError(
+            "cannot compare series of different shapes: "
+            f"{reference.shape} and {compared.shape}")
+
+    return float(np.sum(compared - reference))
+
+def normalized_euclidian_distance(s1, s2):
+    """Return the squared Euclidean distance between s1 and s2 divided by the
+    product of their l2 norms.
+
+    Returns None when the l2 norm of s1 or of s2 is zero, the ratio being
+    undefined in that case."""
+
+    norm1 = l2_norm(s1)
+    norm2 = l2_norm(s2)
+
+    # Undefined for a null vector: no division is attempted
+    if 0 in (norm1, norm2):
+        return None
+
+    squared_dist = euclidean_distance(s1, s2) ** 2
+    return float(squared_dist / (norm1 * norm2))
+
+def l2_norm(s):
+    """Return the l2 norm of the series s.
+
+    Args:
+        s: sequence of numbers.
+
+    Returns:
+        The square root of the sum of the squared values, as a float
+        (0.0 for an empty sequence).
+    """
+
+    values = np.asarray(s, dtype=float)
+    return float(np.sqrt(np.sum(values * values)))
+
+
+def distances(file1, file2, euclidean=True, lateness=False, norm_eucl=False,
+              field=("finish_time",)):
+    """Compute a set of distances between two batsim outputs, if they have
+    the same job_ids.
+
+    Args:
+        file1: path of the first _jobs.csv file.
+        file2: path of the second _jobs.csv file.
+        euclidean: if true, compute the "euclidean" distance.
+        lateness: if true, compute the "lateness" distance.
+        norm_eucl: if true, compute the "normalized_euclidean" distance.
+        field: name of the column, or iterable of column names, on which the
+            distances are computed.
+
+    Returns:
+        The single computed value when exactly one distance is requested on
+        exactly one field. Otherwise a dictionary mapping each field to a
+        dictionary ``{distance name: value}``.
+
+    Raises:
+        KeyError: if the two files do not contain the same job_ids.
+    """
+
+    # A bare string would otherwise be iterated character by character
+    if isinstance(field, str):
+        field = [field]
+
+    out1, out2 = open_and_compare(file1, file2)
+
+    dist = {}
+    for f in field:
+        per_type = {}
+        if euclidean:
+            per_type["euclidean"] = euclidean_distance(out1[f], out2[f])
+        if norm_eucl:
+            per_type["normalized_euclidean"] = normalized_euclidian_distance(
+                out1[f], out2[f])
+        if lateness:
+            per_type["lateness"] = lateness_distance(out1[f], out2[f])
+        dist[f] = per_type
+
+    # If only one value, return it directly
+    if len(dist) == 1:
+        (only_field,) = dist.values()
+        if len(only_field) == 1:
+            (value,) = only_field.values()
+            return value
+
+    # Otherwise return the dictionary
+    return dist
+
+
+def pretty_print(dist):
+    """Print dist: as JSON indented by 4 spaces if it is a dictionary,
+    as is otherwise"""
+
+    printable = json.dumps(dist, indent=4) if isinstance(dist, dict) else dist
+    print(printable)
+
+def main():
+    """Program entry point if called with CLI.
+
+    Parses the command line, computes the requested distances between the
+    two given _jobs.csv files and prints the result. With ``--all``, every
+    available distance is computed on every available field.
+    """
+
+    all_types = ["euclidean", "lateness", "normalized_euclidean"]
+    all_fields = ["submission_time", "starting_time", "finish_time"]
+
+    parser = argparse.ArgumentParser(
+        # Each piece ends with a space: adjacent literals are concatenated
+        description="Computes and prints a set of distances between two batsim "
+                    "outputs, if they have the same job_ids. "
+                    "Default: euclidean distance on finish_time.")
+    parser.add_argument('file1', type=str,
+                        help='The first _jobs.csv file')
+    parser.add_argument('file2', type=str,
+                        help='The second _jobs.csv file')
+
+    # `choices` makes a misspelt distance type a usage error instead of
+    # being silently ignored
+    parser.add_argument("--type", nargs='+', default=['euclidean'],
+                        choices=all_types,
+                        help="Type of distance to use. Available values are "
+                        "{euclidean, lateness, normalized_euclidean}")
+    parser.add_argument("--field", nargs='+', default=['finish_time'],
+                        help="The field to use to compute the distance. "
+                        "Available values are {submission_time, starting_time, "
+                        "finish_time}")
+    parser.add_argument("--all", action="store_true",
+                        help="Print all available distances on all available fields")
+
+    args = parser.parse_args()
+
+    if args.all:
+        args.type = list(all_types)
+        args.field = list(all_fields)
+
+    dist = distances(file1=args.file1, file2=args.file2,
+                     euclidean="euclidean" in args.type,
+                     lateness="lateness" in args.type,
+                     norm_eucl="normalized_euclidean" in args.type,
+                     field=list(args.field))
+    pretty_print(dist)
+
+
+if __name__ == "__main__":
+    main()