From c12f9c444431a95a4cf2d1ffb8fbb93583c3492f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ma=C3=ABl=20Madon?= <mael.madon@irit.fr>
Date: Fri, 11 Nov 2022 10:53:25 +0100
Subject: [PATCH] new tool for computing distance between two jobs.csv files

---
 distance_batsim_output.py | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)
 create mode 100644 distance_batsim_output.py

diff --git a/distance_batsim_output.py b/distance_batsim_output.py
new file mode 100644
index 0000000..883ad18
--- /dev/null
+++ b/distance_batsim_output.py
@@ -0,0 +1,26 @@
+#!/usr/bin/env python3
+
+"""Compute a set of distances between two batsim outputs (_jobs.csv)"""
+
+import pandas as pd
+
+def clean_and_select(df):
+    """Return job_id and the three time columns of df, with job_id cut at the first ':'."""
+
+    # Select: keep only the columns listed in desired_cols
+    desired_cols = ["job_id", "submission_time", "starting_time", "finish_time"]
+    select = df.loc[:, desired_cols]
+
+    # Clean job_id: cast to str, then keep only the text before the first ':'
+    select.job_id = select.job_id.astype(str)
+    select["job_id"] = select["job_id"].str.split(':', expand=True)[0]
+
+    return select
+
+def open_and_compare(file1, file2):
+    """Read file1 and file2, two _jobs.csv files, into pandas DataFrames.
+    NOTE(review): no job_id check or return statement is implemented here yet."""
+
+    out1 = pd.read_csv(file1)
+    out2 = pd.read_csv(file2)
+
-- 
GitLab