diff --git a/README.md b/README.md index b5ddf39978468ea88725e76bcf6491f3faeaf45c..818f918bc92ad736d8389d72b2276fea79b59277 100644 --- a/README.md +++ b/README.md @@ -210,6 +210,7 @@ oarsub -I -l host=4,walltime=2 Reserve 4 hosts (GPU) (1 server + 3 clients) for 2 hours: ```bash oarsub -I -t exotic -p "gpu_count>0" -l {"cluster='drac'"}/host=4 # grenoble +oarsub -I -p "gpu_count>0" -l {"cluster='chifflot'"}/host=4 # lille ``` **Remark**: for now only 2 clusters, `chifflot` in Lille and `drac` in Grenoble are available for testing in more than 3 GPU nodes, maximum is 8 (`chifflot`) or 12 (`drac`) nodes. diff --git a/Run/kill.py b/Run/kill.py index 1617b260dd6f37f7f798a75138a55b7893e9d2ec..220af52dc3c7d83a274632edeaf28d9e3df263b8 100644 --- a/Run/kill.py +++ b/Run/kill.py @@ -1,66 +1,76 @@ +# python3 kill.py --config ./config_instances_lille.json --user mdo --serverkey "python3 /home/mdo/eflwr/Flower_v1/server_1.py" --clientkey "/home/mdo/eflwr/Flower_v1/client_1.py" +#!/usr/bin/env python3 + import os import subprocess import json +import argparse from pathlib import Path -# Determine script directory -current_dir = Path(__file__).resolve().parent -parent_dir = current_dir.parent - -# Path to the config.json file -config_path = os.path.join(current_dir, "config.json") - -# Read the configuration -with open(config_path, "r") as file: - config = json.load(file) - -# Get server and client details -server_ip = config["server"]["ip"] -server_args = " ".join(config["server"]["args"]) -server_keyword = "server.py" +def parse_args(): + parser = argparse.ArgumentParser(description="Kill background jobs by keyword and user") + parser.add_argument("--config", required=True, help="Path to the config JSON file") + parser.add_argument("--user", required=True, help="Username to filter processes") + parser.add_argument("--serverkey", required=True, help="Keyword to identify server process") + parser.add_argument("--clientkey", required=True, help="Keyword to identify client processes") + return parser.parse_args() -client_details = [ - {"ip": client["ip"], "keyword": f"{client['command']} {' '.join(client['args'])}"} - for client in config["clients"] -] - -def check_and_kill_process(host, keyword): +def check_and_kill_process(host, keyword, user): """ - Check for processes matching the keyword on the host and kill them. + Check for processes matching the keyword on the host and kill them (only for the given user). """ try: - print(f"========== Checking processes on {host} ==========") - - # Command to grep processes - grep_command = f"oarsh {host} pgrep -fa '{keyword}'" - - # Get process details + print(f"\n========== Checking processes on {host} for user '{user}' ==========") + + current_pid = str(os.getpid()) + + # Command to find processes by user and keyword + grep_command = f"oarsh {host} pgrep -u {user} -fa '{keyword}'" + result = subprocess.run(grep_command, shell=True, text=True, capture_output=True) - + if result.returncode == 0 and result.stdout.strip(): - # Extract process details (PID and process name) process_lines = result.stdout.strip().split("\n") for line in process_lines: pid, process_name = line.split(" ", 1) - print(f"Found process on {host}: PID={pid}, Process={process_name}") + + if pid == current_pid: + print(f"Skipping current script process (PID={pid})") + continue + + print(f"Found process: PID={pid}, Process={process_name}") - # Kill the process kill_command = f"oarsh {host} kill -9 {pid}" subprocess.run(kill_command, shell=True) print(f"Killed process {pid} on {host}") else: - print(f"No matching processes found on {host}.") + print("No matching processes found.") except Exception as e: - print(f"An error occurred on {host}: {e}") + print(f"Error on host {host}: {e}") + +def main(): + args = parse_args() + + # Load config + config_path = Path(args.config).resolve() + with open(config_path, "r") as file: + config = json.load(file)["instances"]["1"] + + # Get server IP + server_ip = config["server"]["ip"] + + # Get client list + client_details = [client["ip"] for client in config["clients"]] + + # Kill server processes + check_and_kill_process(server_ip, args.serverkey, args.user) -# Check and kill processes on the server -check_and_kill_process(server_ip, server_keyword) + # Kill client processes + for client_ip in client_details: + if client_ip: + check_and_kill_process(client_ip, args.clientkey, args.user) -# Check and kill processes on each client -for client in client_details: - client_ip = client["ip"] - client_keyword = "client" - if client_ip: - check_and_kill_process(client_ip, client_keyword) + print("\n========== Process Management Completed ==========") -print("========== Process Management Completed ==========") +if __name__ == "__main__": + main()