test new distance and clean unused code

6ef4c361 · Maël Madon · d216708b · 6ef4c361 · 6ef4c361 · 6ef4c361
Commit 6ef4c361 authored 2 years ago by Maël Madon
--- a/distance_batsim_output.py
+++ b/distance_batsim_output.py
@@ -77,12 +77,6 @@ def lateness_distance(s1, s2):
    return float(np.sum([y-x for x, y in zip(s1, s2)]))
-def l2_norm(s):
-    """Return the l2 norm of the series s"""
-    return float( np.sqrt(np.sum([x * x for x in s])) )
 def distances(file1, file2, euclidean=True, lateness=False, norm_eucl=False,
              field=["finish_time"]):
    """Computes and returns a set of distances between two batsim outputs, if 

--- a/example_distance.ipynb
+++ b/example_distance.ipynb
@@ -285,7 +285,7 @@
  },
  {
   "cell_type": "code",
-   "execution_count": 5,
+   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
@@ -294,22 +294,20 @@
     "text": [
      "/home/mael/ownCloud/workspace/batmen-tools/distance_batsim_output.py:56: UserWarning: Some jobs in test/input/3jobs.csv and test/input/3jobs_zeros.csv don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.\n",
      "  warnings.warn(f\"Some jobs in {file1} and {file2} don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.\")\n",
-      "/home/mael/ownCloud/workspace/batmen-tools/distance_batsim_output.py:114: RuntimeWarning: divide by zero encountered in double_scalars\n",
-      "  dist[f][\"normalized_euclidean\"] = eucl / np.sqrt(sum_runtimes1 * sum_runtimes2)\n",
      "{\n",
      "    \"submission_time\": {\n",
      "        \"euclidean\": 50.0,\n",
-      "        \"normalized_euclidean\": Infinity,\n",
+      "        \"normalized_euclidean\": null,\n",
      "        \"lateness\": -70.0\n",
      "    },\n",
      "    \"starting_time\": {\n",
      "        \"euclidean\": 50.0,\n",
-      "        \"normalized_euclidean\": Infinity,\n",
+      "        \"normalized_euclidean\": null,\n",
      "        \"lateness\": -70.0\n",
      "    },\n",
      "    \"finish_time\": {\n",
      "        \"euclidean\": 100.0,\n",
-      "        \"normalized_euclidean\": Infinity,\n",
+      "        \"normalized_euclidean\": null,\n",
      "        \"lateness\": -140.0\n",
      "    }\n",
      "}\n"

 %% Cell type:markdown id: tags:
 # Example of distances
 This notebook calculates distances between several _jobs.csv files as an example.
 %% Cell type:code id: tags:
 ``` python
 # Initialization
 import pandas as pd
 from evalys.jobset import JobSet
 from evalys.visu.gantt import plot_gantt
 three_jobs =            "test/input/3jobs.csv"
 three_jobs_w_session =  "test/input/3jobs_w_sessions.csv"
 three_jobs_zero =       "test/input/3jobs_zeros.csv"
 mc_10days_a60 =         "test/input/mc_10days_a60_jobs.csv"
 mc_10days_m60 =         "test/input/mc_10days_m60_jobs.csv"
 mc_10days_rigid =       "test/input/mc_10days_rigid_jobs.csv"
 ```
 %% Cell type:markdown id: tags:
 Visualize the useful columns of a jobs.csv:
 %% Cell type:code id: tags:
 ``` python
 out1 = pd.read_csv(mc_10days_a60)
 # Select
 desired_cols = ["job_id", "submission_time", "starting_time", "finish_time", "success"]
 select = out1.loc[:, desired_cols]
 # Clean job_id
 select.job_id = select.job_id.astype(str)
 select.job_id = select.job_id.str.split(':', expand=True)[0]
 select.job_id = select.job_id.astype(int)
 select
 ```
 %% Output
         job_id  submission_time  starting_time  finish_time  success
    0      1216          55532.0        55532.0      55894.0        1
    1       247           9327.0         9327.0      66429.0        1
    2      1242          56876.0        56876.0      66496.0        1
    3      1434          66504.0        66504.0      67496.0        1
    4      1438          66506.0        66506.0      69764.0        1
    ..      ...              ...            ...          ...      ...
    319   18943         643942.0       643942.0     954600.0        1
    320   18945         643943.0       643943.0     958295.0        1
    321   21862         753752.0       753752.0     960982.0        1
    322   18944         643942.0       643942.0     961174.0        1
    323   21995         760446.0       760446.0    1031099.0        1
    [324 rows x 5 columns]
 %% Cell type:markdown id: tags:
 ## With mock files
 Visualize our two mock files 3jobs and 3jobs_w_sessions:
 %% Cell type:code id: tags:
 ``` python
 plot_gantt(JobSet.from_csv(three_jobs), title="three_jobs")
 plot_gantt(JobSet.from_csv(three_jobs_w_session), title="three_jobs_w_session")
 ```
 %% Output
    /home/mael/.local/lib/python3.10/site-packages/evalys/visu/core.py:62: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
      self.fig.show()
    /home/mael/.local/lib/python3.10/site-packages/evalys/visu/core.py:62: UserWarning: Matplotlib is currently using module://matplotlib_inline.backend_inline, which is a non-GUI backend, so cannot show the figure.
      self.fig.show()
 %% Cell type:code id: tags:
 ``` python
 !python3 distance_batsim_output.py {three_jobs} {three_jobs_w_session} --all
 ```
 %% Output
    /home/mael/ownCloud/workspace/batmen-tools/distance_batsim_output.py:56: UserWarning: Some jobs in test/input/3jobs.csv and test/input/3jobs_w_sessions.csv don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.
      warnings.warn(f"Some jobs in {file1} and {file2} don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.")
    {
        "submission_time": {
            "euclidean": 0.0,
            "normalized_euclidean": 0.0,
            "lateness": 0.0
        },
        "starting_time": {
            "euclidean": 0.0,
            "normalized_euclidean": 0.0,
            "lateness": 0.0
        },
        "finish_time": {
            "euclidean": 28.284271247461902,
            "normalized_euclidean": 0.4040610178208843,
            "lateness": 0.0
        }
    }
 %% Cell type:code id: tags:
 ``` python
 !python3 distance_batsim_output.py {three_jobs} {three_jobs_zero} --all
 ```
 %% Output
    /home/mael/ownCloud/workspace/batmen-tools/distance_batsim_output.py:56: UserWarning: Some jobs in test/input/3jobs.csv and test/input/3jobs_zeros.csv don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.
      warnings.warn(f"Some jobs in {file1} and {file2} don't have the same runtime (+/- 1sec). It is unusual, as runtime are normally an input of the simulation.")
-    /home/mael/ownCloud/workspace/batmen-tools/distance_batsim_output.py:114: RuntimeWarning: divide by zero encountered in double_scalars
-      dist[f]["normalized_euclidean"] = eucl / np.sqrt(sum_runtimes1 * sum_runtimes2)
    {
        "submission_time": {
            "euclidean": 50.0,
-            "normalized_euclidean": Infinity,
+            "normalized_euclidean": null,
            "lateness": -70.0
        },
        "starting_time": {
            "euclidean": 50.0,
-            "normalized_euclidean": Infinity,
+            "normalized_euclidean": null,
            "lateness": -70.0
        },
        "finish_time": {
            "euclidean": 100.0,
-            "normalized_euclidean": Infinity,
+            "normalized_euclidean": null,
            "lateness": -140.0
        }
    }
 %% Cell type:code id: tags:
 ``` python
 !python3 distance_batsim_output.py {mc_10days_a60} {mc_10days_m60} --all
 ```
 %% Output
    {
        "submission_time": {
            "euclidean": 172.1394783308001,
            "normalized_euclidean": 1.2942984329407551e-05,
            "lateness": 2254.0
        },
        "starting_time": {
            "euclidean": 172.1394783308001,
            "normalized_euclidean": 1.2942984329407551e-05,
            "lateness": 2254.0
        },
        "finish_time": {
            "euclidean": 172.1394783308001,
            "normalized_euclidean": 1.2942984329407551e-05,
            "lateness": 2254.0
        }
    }
 %% Cell type:code id: tags:
 ``` python
 !python3 distance_batsim_output.py {mc_10days_a60} {mc_10days_rigid} --all
 ```
 %% Output
    {
        "submission_time": {
            "euclidean": 241.15762480170517,
            "normalized_euclidean": 1.813216587526566e-05,
            "lateness": 3311.0
        },
        "starting_time": {
            "euclidean": 241.15762480170517,
            "normalized_euclidean": 1.813216587526566e-05,
            "lateness": 3311.0
        },
        "finish_time": {
            "euclidean": 255.14897609044016,
            "normalized_euclidean": 1.91841479662116e-05,
            "lateness": 3633.0
        }
    }
 %% Cell type:code id: tags:
 ``` python
 ```

--- a/test/test_distance.py
+++ b/test/test_distance.py
@@ -19,17 +19,6 @@ def test_lateness_distance():
    assert lateness_distance(s1, s2) == - lateness_distance(s2, s1) == 6
    assert lateness_distance(empty, empty) == 0
-def test_l2_norm():
-    assert l2_norm(empty) == 0
-    assert l2_norm(s1) == np.sqrt(10*10 + 14*14 + 500*500)
-    assert l2_norm(u) == 2
-    assert l2_norm(v) == 1
-def test_normalized_euclidean_distance():
-    assert normalized_euclidian_distance(u, v) == .5
 ####### Integration tests #######
 three_jobs = "test/input/3jobs.csv"
@@ -40,7 +29,8 @@ mc_10days_a60 = "test/input/mc_10days_a60_jobs.csv"
 def test_cleaning():
    # Clean session info:
-    distances(three_jobs_w_session, three_jobs_zero)
+    with pytest.warns(UserWarning):
+        distances(three_jobs_w_session, three_jobs_zero)
    # Clean unsuccessful jobs:
    with pytest.warns(UserWarning):
@@ -63,24 +53,26 @@ def test_some_distances():
        euclidean=False, lateness=True, field=fin)      == 0 
    # Eucl distance
-    assert distances(three_jobs, three_jobs_zero, field=sub)    == 50
+    with pytest.warns(UserWarning):
-    assert distances(three_jobs_zero, three_jobs, field=start)  == 50
+        assert distances(three_jobs, three_jobs_zero, field=sub)    == 50
-    assert distances(three_jobs, three_jobs_zero, field=fin)    == 100
+        assert distances(three_jobs_zero, three_jobs, field=start)  == 50
+        assert distances(three_jobs, three_jobs_zero, field=fin)    == 100
-    assert distances(three_jobs, three_jobs_w_session, field=sub)   == 0
+        assert distances(three_jobs, three_jobs_w_session, field=sub)   == 0
-    assert distances(three_jobs, three_jobs_w_session, field=start) == 0
+        assert distances(three_jobs, three_jobs_w_session, field=start) == 0
-    assert distances(three_jobs, three_jobs_w_session, field=fin)   == 20 * np.sqrt(2)
+        assert distances(three_jobs, three_jobs_w_session, field=fin)   == 20 * np.sqrt(2)
    # Normalized eucl distance
-    assert distances(three_jobs, three_jobs_zero, 
+    with pytest.warns(UserWarning):
-        euclidean=False, norm_eucl=True, field=sub)           == None
+        assert distances(three_jobs, three_jobs_zero, 
-    assert distances(three_jobs, three_jobs_w_session,
+            euclidean=False, norm_eucl=True, field=sub)           == None
-        euclidean=False, norm_eucl=True, field=sub)           == 0
+        assert distances(three_jobs, three_jobs_w_session,
+            euclidean=False, norm_eucl=True, field=sub)           == 0
-    norm_dis_A_B = distances(three_jobs, three_jobs_w_session,
-                        euclidean=False, norm_eucl=True, field=fin)
+        norm_dis_A_B = distances(three_jobs, three_jobs_w_session,
-    norm_dis_B_A = distances(three_jobs, three_jobs_w_session,
+                            euclidean=False, norm_eucl=True, field=fin)
+        norm_dis_B_A = distances(three_jobs, three_jobs_w_session,
                        euclidean=False, norm_eucl=True, field=fin)
-    expected = 800 / (100*100)
+    expected = 20 * np.sqrt(2) / 70     # eucl / sum_runtime
    assert norm_dis_A_B == norm_dis_B_A
    assert norm_dis_B_A - expected  < 1e-8