Skip to content
Snippets Groups Projects
Commit 1ed95d72 authored by Maël Madon's avatar Maël Madon
Browse files

test: add distance tool for tests and instructions to update it

parent ba042cf0
No related branches found
No related tags found
1 merge request!2Feature "replay with feedback" ready and tested
# Batmen tests
### Extra dependencies
Some tests use a tool from the external library `batmen-tools`, developed separately. To update it to the latest version:
```bash
cd test
curl -O https://gitlab.irit.fr/sepia-pub/mael/batmen-tools/-/raw/main/distance_batsim_output.py
```
### Running tests
``` bash
nix-shell ../default.nix -A test --command 'pytest'
......
#!/usr/bin/env python3
"""Compute a set of distances between two batsim outputs (_jobs.csv)"""
import pandas as pd
import numpy as np
import argparse
import json
import warnings
from scipy.stats import entropy
def clean_and_select(df):
    """Select the columns needed for a comparison and normalise the job ids.

    Args:
        df: DataFrame holding the columns ``job_id``, ``submission_time``,
            ``starting_time``, ``finish_time`` and ``success``. It is not
            modified.

    Returns:
        A new DataFrame restricted to those five columns, sorted by
        ``job_id``. Each job id is converted to ``int`` after dropping
        everything from the first ``:`` onwards (the session part, if
        present). Rows keep their original index labels, so after sorting
        the index is generally not ``0..n-1``.

    Raises:
        KeyError: if one of the required columns is missing.
        ValueError: if a job id is not an integer once the part after ``:``
            has been removed.
    """
    desired_cols = ["job_id", "submission_time",
                    "starting_time", "finish_time", "success"]

    # Work on an explicit copy so the column assignment below can never
    # write into (or warn about) the caller's frame.
    select = df.loc[:, desired_cols].copy()

    # Clean job_id: keep only what precedes the first ':'.
    job_ids = select["job_id"].astype(str).str.split(":", n=1).str[0]
    select["job_id"] = job_ids.astype(int)

    return select.sort_values(by="job_id")
def open_and_compare(file1, file2):
    """Load two ``_jobs.csv`` files and return comparable DataFrames.

    Both files are read with pandas, reduced and sorted by
    ``clean_and_select``, then re-indexed ``0..n-1`` so that row *i* of both
    frames describes the same job.

    Args:
        file1: path (or file-like object) of the first ``_jobs.csv``.
        file2: path (or file-like object) of the second ``_jobs.csv``.

    Returns:
        A tuple ``(out1, out2)`` of cleaned DataFrames with identical job
        ids in identical order. If the ``success`` columns differ, a warning
        is emitted and only the jobs successful in both files are kept.

    Raises:
        KeyError: if the two files do not contain the same job ids.
    """
    # sort_values keeps the original row labels. Series.equals, Series.eq
    # and boolean .loc selection are all index-sensitive, so without a
    # fresh positional index two files listing the same jobs in a different
    # row order would be rejected or compared job-against-wrong-job.
    out1 = clean_and_select(pd.read_csv(file1)).reset_index(drop=True)
    out2 = clean_and_select(pd.read_csv(file2)).reset_index(drop=True)

    if not out1.job_id.equals(out2.job_id):
        raise KeyError(
            f"{file1} and {file2} cannot be compared: they don't have the same job_ids")

    if not out1.success.equals(out2.success):
        warnings.warn(
            f"Some jobs in {file1} and {file2} don't have the same success "
            "status. Comparing only the jobs that were successful in both.")
        # First keep the jobs whose status agrees, then only the successes.
        same_status = out1.success.eq(out2.success)
        out1 = out1.loc[same_status, :]
        out2 = out2.loc[same_status, :]
        out1 = out1[out1.success == 1]
        out2 = out2[out2.success == 1]
        # NOTE(review): when the statuses agree everywhere, jobs that failed
        # in both files are kept in the comparison - confirm this is intended.

    return out1, out2
def euclidean_distance(s1, s2):
    """Return the Euclidean distance between two numeric series.

    Values are paired by position, never by pandas index label.

    Args:
        s1: array-like of numbers (list, tuple, ndarray, pandas Series...).
        s2: array-like of numbers with the same length as ``s1``.

    Returns:
        ``sqrt(sum((x - y)**2))`` as a Python ``float`` (``0.0`` for two
        empty inputs).

    Raises:
        ValueError: if the two inputs do not have the same length. Pairing
            them with ``zip`` would silently ignore the extra values.
    """
    a = np.asarray(s1, dtype=float)
    b = np.asarray(s2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            f"cannot compare series of different lengths: {a.shape} vs {b.shape}")
    delta = a - b
    return float(np.sqrt(np.sum(delta * delta)))
def lateness_distance(s1, s2):
    """Return the 'lateness' of s2 compared to s1.

    Values are paired by position, never by pandas index label.

    Args:
        s1: array-like of reference values.
        s2: array-like of values to compare, same length as ``s1``.

    Returns:
        ``sum(y - x)`` over the paired values as a Python ``float``. The
        result is positive when the values of ``s2`` are larger overall,
        and ``0.0`` for two empty inputs.

    Raises:
        ValueError: if the two inputs do not have the same length. Pairing
            them with ``zip`` would silently ignore the extra values.
    """
    a = np.asarray(s1, dtype=float)
    b = np.asarray(s2, dtype=float)
    if a.shape != b.shape:
        raise ValueError(
            f"cannot compare series of different lengths: {a.shape} vs {b.shape}")
    return float(np.sum(b - a))
def normalized_euclidian_distance(s1, s2):
    """Return the squared Euclidean distance scaled by the two L2 norms.

    The value computed is ``euclidean_distance(s1, s2)**2`` divided by
    ``l2_norm(s1) * l2_norm(s2)``.

    Returns:
        The normalized distance as a ``float``, or ``None`` when either
        vector has a zero norm (the quantity is then undefined).
    """
    norm_1 = l2_norm(s1)
    norm_2 = l2_norm(s2)

    # A null vector would put a zero in the denominator.
    if 0 in (norm_1, norm_2):
        return None

    squared_gap = euclidean_distance(s1, s2) ** 2
    return float(squared_gap / (norm_1 * norm_2))
def l2_norm(s):
    """Return the L2 (Euclidean) norm of the series s.

    Args:
        s: array-like of numbers (list, tuple, ndarray, pandas Series...).

    Returns:
        ``sqrt(sum(x**2))`` as a Python ``float``; ``0.0`` for an empty
        input.
    """
    # Vectorised: one NumPy expression instead of a per-element Python loop.
    values = np.asarray(s, dtype=float)
    return float(np.sqrt(np.sum(values * values)))
def distances(file1, file2, euclidean=True, lateness=False, norm_eucl=False,
              field=("finish_time",)):
    """Compute a set of distances between two batsim outputs.

    Args:
        file1: the first ``_jobs.csv`` file.
        file2: the second ``_jobs.csv`` file.
        euclidean: include the Euclidean distance.
        lateness: include the lateness of ``file2`` compared to ``file1``.
        norm_eucl: include the normalized Euclidean distance.
        field: name, or iterable of names, of the column(s) on which the
            distances are computed.

    Returns:
        If exactly one distance on exactly one field was requested, that
        single value. Otherwise a dictionary
        ``{field: {distance_name: value}}`` with the distance names
        ``"euclidean"``, ``"normalized_euclidean"`` and ``"lateness"``.

    Raises:
        KeyError: if the files do not have the same job ids (raised by
            ``open_and_compare``) or if a requested field does not exist.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(field, str):
        field = [field]

    out1, out2 = open_and_compare(file1, file2)

    dist = {}
    for f in field:
        col1, col2 = out1[f], out2[f]
        per_field = {}
        if euclidean:
            per_field["euclidean"] = euclidean_distance(col1, col2)
        if norm_eucl:
            per_field["normalized_euclidean"] = normalized_euclidian_distance(col1, col2)
        if lateness:
            per_field["lateness"] = lateness_distance(col1, col2)
        dist[f] = per_field

    # If only one value was requested, return it directly
    if len(dist) == 1:
        (only_field,) = dist.values()
        if len(only_field) == 1:
            (value,) = only_field.values()
            return value

    # Otherwise return the whole dictionary
    return dist
def pretty_print(dist):
    """Print dist: dictionaries as 4-space indented JSON, anything else as is."""
    rendered = json.dumps(dist, indent=4) if isinstance(dist, dict) else dist
    print(rendered)
def main():
    """Program entry point when called from the command line.

    Parses the command line (two ``_jobs.csv`` paths, ``--type``,
    ``--field`` and ``--all``), computes the requested distances with
    ``distances`` and prints the result with ``pretty_print``.
    """
    distance_types = ["euclidean", "lateness", "normalized_euclidean"]
    fields = ["submission_time", "starting_time", "finish_time"]

    parser = argparse.ArgumentParser(
        description="Computes and prints a set of distances between two batsim "
                    "outputs, if they have the same job_ids. "
                    "Default: euclidean distance on finish_time.")
    parser.add_argument('file1', type=str,
                        help='The first _jobs.csv file')
    parser.add_argument('file2', type=str,
                        help='The second _jobs.csv file')
    # `choices` rejects a misspelt value instead of silently ignoring it
    # and printing an empty result.
    parser.add_argument("--type", nargs='+', default=['euclidean'],
                        choices=distance_types,
                        help="Type of distance to use. Available values are "
                             "{euclidean, lateness, normalized_euclidean}")
    parser.add_argument("--field", nargs='+', default=['finish_time'],
                        choices=fields,
                        help="The field to use to compute the distance. "
                             "Available values are {submission_time, starting_time, "
                             "finish_time}")
    parser.add_argument("--all", action="store_true",
                        help="Print all available distances on all available fields")
    args = parser.parse_args()

    # --all overrides whatever was given to --type and --field.
    if args.all:
        args.type = list(distance_types)
        args.field = list(fields)

    dist = distances(file1=args.file1, file2=args.file2,
                     euclidean="euclidean" in args.type,
                     lateness="lateness" in args.type,
                     norm_eucl="normalized_euclidean" in args.type,
                     field=list(args.field))
    pretty_print(dist)


# Run the command-line interface only when executed as a script.
if __name__ == "__main__":
    main()
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment