From f7dbc8c87c4cdeaa2b43e191317234b3b351ddfa Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ma=C3=ABl=20Madon?= <mael.madon@irit.fr>
Date: Thu, 3 Nov 2022 11:59:24 +0100
Subject: [PATCH] remove job filters from swf_to_batsim_split_by_user

---
 swf_to_batsim_split_by_user.py | 102 +++++++++------------------------
 1 file changed, 28 insertions(+), 74 deletions(-)

diff --git a/swf_to_batsim_split_by_user.py b/swf_to_batsim_split_by_user.py
index 6154f4c..57a86a8 100755
--- a/swf_to_batsim_split_by_user.py
+++ b/swf_to_batsim_split_by_user.py
@@ -3,7 +3,6 @@
 """
 Transforms a SWF to a Batsim workload with computation-only jobs. 
 Split by user. Do not include the jobs profiles in the output JSON.
-(optional) keeps only a given set of partition.
 """
 
 import argparse
@@ -17,49 +16,41 @@ from swf import SwfField
 
 
 def generate_workload(input_swf, output_folder,
-                      partitions_to_select=None,
                       start_time=None,
                       job_walltime_factor=2,
                       given_walltime_only=False,
                       job_grain=1,
                       indent=None,
-                      keep_only=None,
-                      verbose=False,
                       quiet=False,
                       job_size_function_string='1*nb_res'):
     """Generate a Batsim workload from a SWF trace."""
-    print(f"Input file = {input_swf}")
-    print(f"output folder = {output_folder}")
+    if not quiet:
+        print(f"Input file = {input_swf}")
+        print(f"output folder = {output_folder}")
 
     element = '([-+]?\d+(?:\.\d+)?)'
     r = re.compile('\s*' + (element + '\s+') * 17 + element + '\s*')
 
-    current_id = 0
     version = 0
 
-    # A dictionary of users, each entry containing the jobs and the profiles
-    # used by this user
+    # A dictionary of users, each entry containing its jobs
     users = {}
-    # Let a job be a tuple (job_id, nb_res, run_time, submit_time, profile,
-    # walltime)
-    jobs = []
+
 
     # Some job counters
-    not_selected = {"nb": 0, "coreh": 0}
-    selected = {"nb": 0, "coreh": 0}
     not_valid = 0
     not_line_match_format = 0
 
     minimum_observed_submit_time = float('inf')
 
     # Let's loop over the lines of the input file
-    i = 1
+    i = 0
     with open(input_swf, 'r') as swf:
         for line in swf:
             i += 1
-            if i % 100000 == 0:
-                print("Processing swf line", i)
-
+            if not quiet and i % 10000 == 0:
+                print(f"\r\033[KProcessing swf line {i}...", end="")
+            
             res = r.match(line)
 
             if res:
@@ -72,7 +63,6 @@ def generate_workload(input_swf, output_folder,
                 walltime = max(job_walltime_factor * run_time,
                             float(res.group(SwfField.REQUESTED_TIME.value)))
                 user_id = str(res.group(SwfField.USER_ID.value))
-                partition_id = int(res.group(SwfField.PARTITION_ID.value))
 
                 # nb_res may be changed by calling a user-given function
                 nb_res = eval(job_size_function_string)
@@ -80,39 +70,28 @@ def generate_workload(input_swf, output_folder,
                 if given_walltime_only:
                     walltime = float(res.group(SwfField.REQUESTED_TIME.value))
 
-                # Select jobs to keep
                 is_valid_job = (nb_res > 0 and walltime >
                                 run_time and run_time > 0 and submit_time >= 0)
-                select_partition = ((partitions_to_select is None) or
-                                    (partition_id in partitions_to_select))
-                use_job = select_partition and (
-                    (keep_only is None) or eval(keep_only))
 
                 if not is_valid_job:
                     not_valid += 1
-                if not use_job:
-                    not_selected["nb"] += 1
-                    not_selected["coreh"] += run_time * nb_res
-
+                
                 else:
-                    selected["nb"] += 1
-                    selected["coreh"] += run_time * nb_res
-
-                    if not(users.__contains__(user_id)):
-                        users[user_id] = {}
-                        users[user_id]["jobs"] = []
+                    if user_id not in users:
+                        users[user_id] = []
 
                     profile = int(((run_time // job_grain) + 1) * job_grain)
 
-                    job = (current_id, nb_res, run_time,
+                    job = (job_id, nb_res, run_time,
                         submit_time, profile, walltime)
-                    current_id = current_id + 1
                     minimum_observed_submit_time = min(minimum_observed_submit_time,
                                                     submit_time)
-                    users[user_id]["jobs"].append(job)
+                    users[user_id].append(job)
 
             else:
                 not_line_match_format += 1
+        if not quiet:
+            print("\nEnd parsing.")
 
     # Create a json file per user
     if not os.path.exists(output_folder):
@@ -123,8 +102,7 @@ def generate_workload(input_swf, output_folder,
     else:
         translate_submit_times = start_time
 
-    for user_id in users:
-        jobs = users[user_id]["jobs"]
+    for user_id, jobs in users.items():
         # Export JSON
         # Let's generate a list of dictionaries for the jobs
         djobs = list()
@@ -135,17 +113,14 @@ def generate_workload(input_swf, output_folder,
                           'res': nb_res,
                           'profile': str(profile)})
 
-        biggest_job = max([nb_res for (job_id, nb_res, run_time, submit_time,
-                                       profile, walltime) in jobs])
-
-        platform_size = biggest_job
+        biggest_job = max([nb_res for (_, nb_res, _, _, _, _) in jobs])
 
         data = {
             'version': version,
             'command': ' '.join(sys.argv[:]),
             'date': datetime.datetime.now().isoformat(' '),
             'description': 'Workload for user {}'.format(user_id),
-            'nb_res': platform_size,
+            'nb_res': biggest_job,
             'jobs': djobs}
 
         try:
@@ -157,26 +132,18 @@ def generate_workload(input_swf, output_folder,
             if not quiet:
                 print('user {}:'.format(user_id))
                 print('   {} jobs had been created'.format(len(djobs)))
-                if keep_only:
-                    print('   {} jobs have been removed by keep_only'.format(
-                        len(jobs) - len(djobs)))
+
 
         except IOError:
             print('Cannot write file', output_json)
 
-    print('-------------------\nEnd parsing')
-    print('Total {} jobs and {} users have been created.'.format(
-        selected["nb"], len(users)))
-    print(
-        'Total number of core-hours: {:.0f}'.format(selected["coreh"] / 3600))
-    print('{} valid jobs were not selected (keep_only) for {:.0f} core-hour'.format(
-        not_selected["nb"], not_selected["coreh"] / 3600))
-    print("Jobs not selected: {:.1f}% in number, {:.1f}% in core-hour"
-          .format(not_selected["nb"] / (not_selected["nb"]+selected["nb"]) * 100,
-                  not_selected["coreh"] / (selected["coreh"]+not_selected["coreh"]) * 100))
-    print('{} out of {} lines in the file did not match the swf format'.format(
-        not_line_match_format, i))
-    print('{} jobs were not valid'.format(not_valid))
+    if not quiet:
+        print('-------------------')
+        print('Total {} jobs and {} users have been created.'.format(
+            sum([len(jobs) for jobs in users.values()]), len(users)))
+        print('{} out of {} lines in the file did not match the swf format'.format(
+            not_line_match_format, i))
+        print('{} jobs were not valid'.format(not_valid))
 
 
 def main():
@@ -192,10 +159,6 @@ def main():
     parser.add_argument('output_folder', type=str,
                         help='The output folder for the JSON files')
 
-    parser.add_argument('-sp', '--partitions_to_select',
-                        type=int, nargs='+', default=None,
-                        help='List of partitions to only consider in the input trace. The jobs running in the other partitions will be discarded.')
-
     parser.add_argument('--start_time', type=int, default=None,
                         help='If set, the submit times will be translated towards zero by this value. Otherwise, the first job sets the zero.')
 
@@ -215,27 +178,18 @@ def main():
                         help='Selects the level of detail we want for jobs. This parameter is used to group jobs that have close running time')
     parser.add_argument('-i', '--indent', type=int, default=None,
                         help='If set to a non-negative integer, then JSON array elements and object members will be pretty-printed with that indent level. An indent level of 0, or negative, will only insert newlines. The default value (None) selects the most compact representation.')
-    parser.add_argument('--keep_only',
-                        type=str,
-                        default=None,
-                        help='If set, this parameter is evaluated to choose which jobs should be kept')
 
-    group = parser.add_mutually_exclusive_group()
-    group.add_argument("-v", "--verbose", action="store_true")
-    group.add_argument("-q", "--quiet", action="store_true")
+    parser.add_argument("-q", "--quiet", action="store_true")
 
     args = parser.parse_args()
 
     generate_workload(input_swf=args.input_swf,
                       output_folder=args.output_folder,
-                      partitions_to_select=args.partitions_to_select,
                       start_time=args.start_time,
                       job_walltime_factor=args.job_walltime_factor,
                       given_walltime_only=args.given_walltime_only,
                       job_grain=args.job_grain,
                       indent=args.indent,
-                      keep_only=args.keep_only,
-                      verbose=args.verbose,
                       quiet=args.quiet,
                       job_size_function_string=args.job_size_function)
 
-- 
GitLab