# m100_agg_power_predictions.py

#!/usr/bin/env python3
import argparse
import glob
import os
import re
import pandas as pd

# Pattern for the path of a per-user prediction CSV; group 1 captures the
# numeric user id. The leading `.*/` means the path must contain a forward
# slash before the file name.
# Raw string: `\d` and `\.` are regex escapes, not Python string escapes.
FILENAME_PARSE_REGEX = r'.*/filter123_user_(\d+)_total_power_mean_pred\.csv$'

def read_aggregate_one_dir(dir, estimated_metrics):
    """Load every per-user prediction CSV of one directory into a DataFrame.

    Each ``*.csv`` file directly inside ``dir`` must have a path matching
    ``FILENAME_PARSE_REGEX``; the user id is taken from the file name.
    From each file only ``job_id`` and the column
    ``hist_pred_total_power_<estimated_metrics>`` are kept, the latter
    renamed to ``<estimated_metrics>_power_estimation``.

    Args:
        dir: Directory containing the per-user CSV files.
        estimated_metrics: Metric suffix (this file calls it with ``'mean'``
            and ``'max'``) used to select the source column and to name the
            output column.

    Returns:
        A DataFrame with columns ``job_id``, ``user_id`` (as the string
        captured from the file name) and
        ``<estimated_metrics>_power_estimation``, or ``None`` when the
        directory contains no ``*.csv`` file. Each file's original row index
        is preserved, so index values may repeat across users.

    Raises:
        RuntimeError: If a ``*.csv`` file path does not match
            ``FILENAME_PARSE_REGEX``.
    """
    filename_re = re.compile(FILENAME_PARSE_REGEX)
    source_column = f'hist_pred_total_power_{estimated_metrics}'
    target_column = f'{estimated_metrics}_power_estimation'

    frames = []
    # Sorted so the row order of the result does not depend on the order in
    # which the filesystem happens to enumerate the files.
    for filename in sorted(glob.glob(f'{dir}/*.csv')):
        m = filename_re.match(filename)
        if m is None:
            raise RuntimeError(f'Unexpected file encountered: {filename}')

        df = pd.read_csv(filename, low_memory=False)
        df['user_id'] = m.group(1)
        df[target_column] = df[source_column]
        frames.append(df[['job_id', 'user_id', target_column]])

    if not frames:
        return None

    # Concatenate once: calling pd.concat inside the loop would re-copy all
    # previously accumulated rows for every additional file.
    return pd.concat(frames)

def read_aggregate_root_dir(root_dir):
    """Build the combined mean/max power-estimation table for ``root_dir``.

    Reads the ``total_power_mean_predictions_users_allmethods_mean`` and
    ``total_power_mean_predictions_users_allmethods_max`` sub-directories of
    ``root_dir`` with ``read_aggregate_one_dir`` and inner-joins the two
    results on ``job_id`` and ``user_id``.

    Args:
        root_dir: Directory holding the two prediction sub-directories.

    Returns:
        A DataFrame with columns ``job_id``, ``user_id``,
        ``mean_power_estimation`` and ``max_power_estimation``. Rows present
        in only one of the two inputs are dropped (inner join).

    Raises:
        RuntimeError: If either sub-directory yields no prediction CSV file.
    """
    frames = {}
    for metric in ('mean', 'max'):
        metric_dir = f'{root_dir}/total_power_mean_predictions_users_allmethods_{metric}'
        frame = read_aggregate_one_dir(metric_dir, metric)
        if frame is None:
            # Fail with an explicit message rather than an AttributeError
            # on None when the merge is attempted.
            raise RuntimeError(f'No prediction CSV files found in: {metric_dir}')
        frames[metric] = frame

    # job_id and user_id are the only columns the two frames share, so this
    # is the same join the default (on=None) merge would perform.
    return frames['mean'].merge(frames['max'], on=['job_id', 'user_id'])

def agg_all_files():
    """Command-line entry point: aggregate predictions into one CSV file.

    Takes two positional command-line arguments, ``input_root_dir`` and
    ``output_file``. The table returned by ``read_aggregate_root_dir`` for
    the input directory is sorted by ``job_id`` and written to the output
    file as CSV without the index column.
    """
    cli = argparse.ArgumentParser()
    for positional in ("input_root_dir", "output_file"):
        cli.add_argument(positional)
    options = cli.parse_args()

    predictions = read_aggregate_root_dir(options.input_root_dir)
    ordered = predictions.sort_values(by=['job_id'])
    ordered.to_csv(options.output_file, index=False)