Skip to content
Snippets Groups Projects
distance_batsim_output.py 4.77 KiB
#!/usr/bin/env python3

"""Compute a set of distances between two batsim outputs (_jobs.csv)"""

import pandas as pd
import numpy as np
import argparse
import json
import warnings
from scipy.stats import entropy


def clean_and_select(df):
    """Select the columns used for the comparison and normalise the job ids.

    Args:
        df: DataFrame read from a ``_jobs.csv`` file. It must provide the
            columns ``job_id``, ``submission_time``, ``starting_time``,
            ``finish_time`` and ``success``.

    Returns:
        A new DataFrame restricted to those columns, where ``job_id`` has
        been converted to ``int`` (everything from the first ``:`` onwards,
        i.e. the session part, is dropped), sorted by ``job_id`` and
        re-indexed from 0. The input frame is left unmodified.

    Raises:
        KeyError: if one of the required columns is missing.
    """

    # Select
    desired_cols = ["job_id", "submission_time",
                    "starting_time", "finish_time", "success"]
    # Explicit copy: the assignments below must never write through to `df`.
    select = df.loc[:, desired_cols].copy()

    # Clean job_id (remove the sessions, if present). Using `.str[0]` rather
    # than `expand=True` also works when the frame has no rows.
    select["job_id"] = (
        select["job_id"].astype(str).str.split(":").str[0].astype(int)
    )

    # Drop the original row labels: pandas `equals` also compares the index,
    # so two outputs listing the same jobs in a different row order would
    # otherwise never be considered equal after sorting.
    return select.sort_values(by="job_id").reset_index(drop=True)


def open_and_compare(file1, file2):
    """Open two _jobs.csv files and keep the jobs that succeeded in both.

    Args:
        file1: first _jobs.csv file (anything accepted by ``pandas.read_csv``).
        file2: second _jobs.csv file (anything accepted by ``pandas.read_csv``).

    Returns:
        A tuple ``(out1, out2)`` of cleaned DataFrames holding the same
        job_ids in the same order, restricted to the jobs whose ``success``
        value is 1 in both files.

    Raises:
        KeyError: if the two files do not contain the same job_ids.

    Warns:
        UserWarning: if some jobs have a different success status in the
            two files.
    """

    # Re-index after cleaning so both frames are aligned row by row; the
    # comparisons and the boolean mask below rely on matching indexes.
    out1 = clean_and_select(pd.read_csv(file1)).reset_index(drop=True)
    out2 = clean_and_select(pd.read_csv(file2)).reset_index(drop=True)

    if not out1.job_id.equals(out2.job_id):
        raise KeyError(
            f"{file1} and {file2} cannot be compared: they don't have the same job_ids")

    if not out1.success.equals(out2.success):
        warnings.warn(
            f"Some jobs in {file1} and {file2} don't have the same success "
            "status. Comparing only the jobs that were successful in both.")

    # One mask for both frames: a job is kept only if it succeeded in both
    # runs, which keeps the two results the same length and order.
    both_successful = (out1.success == 1) & (out2.success == 1)

    return out1.loc[both_successful, :], out2.loc[both_successful, :]


def euclidean_distance(s1, s2):
    """Compute the Euclidean distance between the series s1 and s2.

    Values are paired positionally; pairing stops at the end of the shorter
    series.
    """

    gaps = [a - b for a, b in zip(s1, s2)]
    sum_of_squares = np.sum([gap * gap for gap in gaps])
    return np.sqrt(sum_of_squares)


def lateness_distance(s1, s2):
    """Sum, over positionally paired values, how much s2 exceeds s1.

    The result is positive when s2 is globally later than s1 and negative
    when it is earlier.
    """

    delays = []
    for reference, candidate in zip(s1, s2):
        delays.append(candidate - reference)
    return np.sum(delays)

def normalized_euclidian_distance(s1, s2):
    """Return the squared Euclidean distance between s1 and s2, divided by
    the product of their l2 norms."""

    distance = euclidean_distance(s1, s2)
    norm_product = l2_norm(s1) * l2_norm(s2)
    squared_distance = distance**2
    return squared_distance / norm_product

def l2_norm(s):
    """Compute the l2 norm of the series s: the square root of the sum of
    its squared values."""

    squares = []
    for value in s:
        squares.append(value * value)
    return np.sqrt(np.sum(squares))


def distances(file1, file2, euclidean=True, lateness=False, norm_eucl=False,
              field=("finish_time",)):
    """Compute a set of distances between two batsim outputs.

    Args:
        file1: first _jobs.csv file.
        file2: second _jobs.csv file.
        euclidean: if true, compute the Euclidean distance.
        lateness: if true, compute the lateness of file2 compared to file1.
        norm_eucl: if true, compute the normalized Euclidean distance.
        field: name of the column to compare, or an iterable of column
            names.

    Returns:
        A dict mapping each requested field to a dict that maps the name of
        every enabled distance ("euclidean", "normalized_euclidean",
        "lateness") to its value.

    Raises:
        KeyError: if the two files don't have the same job_ids, or if a
            requested field is not one of the selected columns.
    """

    out1, out2 = open_and_compare(file1, file2)

    # A bare string would otherwise be iterated character by character.
    fields = [field] if isinstance(field, str) else list(field)

    dist = {}
    for f in fields:
        per_field = {}
        if euclidean:
            per_field["euclidean"] = euclidean_distance(out1[f], out2[f])
        if norm_eucl:
            per_field["normalized_euclidean"] = normalized_euclidian_distance(
                out1[f], out2[f])
        if lateness:
            per_field["lateness"] = lateness_distance(out1[f], out2[f])
        dist[f] = per_field

    return dist


def _json_default(value):
    """Convert numpy scalars (e.g. ``np.int64``) to plain Python numbers."""

    if hasattr(value, "item"):
        return value.item()
    raise TypeError(
        f"Object of type {type(value).__name__} is not JSON serializable")


def pretty_print(dist):
    """Print the dictionary of distances in a readable form.

    When `dist` holds exactly one distance for exactly one field, only the
    bare value is printed. In every other case (several fields, several
    distances, or none) the whole dictionary is printed as indented JSON.

    Args:
        dist: dict mapping a field name to a dict of distance name -> value.
    """

    if len(dist) == 1:
        (per_field,) = dist.values()
        if len(per_field) == 1:
            (only_value,) = per_field.values()
            print(only_value)
            return

    # numpy integer scalars are not JSON serializable by default, hence the
    # explicit conversion hook.
    print(json.dumps(dist, indent=4, default=_json_default))

def main():
    """Program entry point when called from the command line.

    Parses the arguments, computes the requested distances between the two
    given _jobs.csv files and prints them.
    """

    distance_types = ["euclidean", "lateness", "normalized_euclidean"]
    fields = ["submission_time", "starting_time", "finish_time"]

    parser = argparse.ArgumentParser(
        description="Computes and prints a set of distances between two batsim "
                    "outputs, if they have the same job_ids. "
                    "Default: euclidean distance on finish_time.")
    parser.add_argument('file1', type=str,
                        help='The first _jobs.csv file')
    parser.add_argument('file2', type=str,
                        help='The second _jobs.csv file')

    # `choices` rejects misspelled values up front; otherwise an unknown
    # type is silently ignored and an unknown field fails later on.
    parser.add_argument("--type", nargs='+', default=['euclidean'],
                        choices=distance_types,
                        help="Type of distance to use. Available values are "
                        "{euclidean, lateness, normalized_euclidean}")
    parser.add_argument("--field", nargs='+', default=['finish_time'],
                        choices=fields,
                        help="The field to use to compute the distance. "
                        "Available values are {submission_time, starting_time, "
                        "finish_time}")
    parser.add_argument("--all", action="store_true",
                        help="Print all available distances on all available fields")

    args = parser.parse_args()

    if args.all:
        args.type = list(distance_types)
        args.field = list(fields)

    dist = distances(file1=args.file1, file2=args.file2,
                     euclidean="euclidean" in args.type,
                     lateness="lateness" in args.type,
                     norm_eucl="normalized_euclidean" in args.type,
                     field=list(args.field))
    pretty_print(dist)


# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()