From a806e49812221b0147b65fca7e9b56b7bb2640ce Mon Sep 17 00:00:00 2001
From: Pierre LOTTE <pierrelotte.dev@gmail.com>
Date: Tue, 24 Sep 2024 08:41:13 +0200
Subject: [PATCH] Add F1 measure with threshold selection from ROC curve

---
 results/base.py |   2 +-
 results/roc.py  | 127 +++++++++++++++++++++++++-----------------------
 2 files changed, 66 insertions(+), 63 deletions(-)

diff --git a/results/base.py b/results/base.py
index 1790ed5..5f87b56 100644
--- a/results/base.py
+++ b/results/base.py
@@ -11,7 +11,7 @@ class BaseResults():
         self.path = path
         self.algos = algos
         self.configs = config_names
-        self.result = {}
+        self.result = {"roc": {}, "f1": {}}
 
     def compute(self, auto_split=False):
         """
diff --git a/results/roc.py b/results/roc.py
index 142bc70..4b92030 100644
--- a/results/roc.py
+++ b/results/roc.py
@@ -8,7 +8,7 @@ import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
 
-from sklearn.metrics import roc_auc_score, roc_curve
+from sklearn.metrics import roc_auc_score, roc_curve, f1_score
 from sklearn.preprocessing import Normalizer
 
 from .base import BaseResults
@@ -24,7 +24,6 @@ class ROCResults(BaseResults):
             """
             return subprocess.check_output(cmd, shell=True).decode("utf-8").split()
 
-        norm = Normalizer()
 
         for config in self.configs:
             config_name = config.split("/")[-1][:-5] if "." in config else config.split("/")[-1]
@@ -32,79 +31,32 @@
             # Compute the score for the full_dataset
             for algo in self.algos:
-                y_pred = np.loadtxt(f"{self.path}/{config_name}/results_{algo}/anomaly_scores_dataset_{algo}.ts")
-                y_pred= np.nan_to_num(y_pred)
+                y_pred_path = f"{self.path}/{config_name}/results_{algo}/anomaly_scores_dataset_{algo}.ts"
+                roc, f1 = self.__compute_score(labels, y_pred_path)
 
-                if len(y_pred) != len(labels):
-                    y_pred = self.__vote_for_score(y_pred, len(labels))
-
-                score = roc_auc_score(labels, y_pred)
-                fpr, tpr, _ = roc_curve(labels, y_pred)
-
-                if algo in self.result:
-                    self.result[algo][config_name] = {"classic": round(score, 4)}
+                if algo in self.result["roc"]:
+                    self.result["roc"][algo][config_name] = {"classic": round(roc, 4)}
+                    self.result["f1"][algo][config_name] = {"classic": round(f1, 4)}
                 else:
-                    self.result[algo] = {config_name: {"classic": round(score, 4)}}
-
-                plt.rcParams["figure.figsize"] = (10, 10)
-                plt.plot(fpr, tpr)
-                plt.plot(np.linspace(0,1,10), np.linspace(0,1,10), linestyle="--", label="ROC=0.5")
-                plt.xlabel("False Positive Rate")
-                plt.ylabel("True Positive Rate")
-                plt.legend()
-                plt.savefig(f"{self.path}/{config_name}/roc_auc_{algo}.png")
-                plt.clf()
-                plt.rcParams["figure.figsize"] = (20, 10)
+                    self.result["roc"][algo] = {config_name: {"classic": round(roc, 4)}}
+                    self.result["f1"][algo] = {config_name: {"classic": round(f1, 4)}}
 
                 # Compute results for automatically splitted dataset
                 if auto_split:
                     files = __exec(f"find -L {self.path}/{config_name}/results_{algo} -regex '^.*dataset[_0-9]+_auto_split_{algo}.ts'")
-                    result = np.zeros(len(labels))
-                    for file in files:
-                        y_pred = np.loadtxt(file)
-                        y_pred = np.nan_to_num(y_pred)
-
-                        if len(y_pred) != len(labels):
-                            y_pred = self.__vote_for_score(y_pred, len(labels))
-
-                        y_pred = norm.fit_transform(y_pred.reshape(1,-1)).reshape(-1)
-                        result = np.maximum(result, y_pred)
-
-                    score = roc_auc_score(labels, result)
-                    self.result[algo][f"{config_name}"]["auto_split"] = round(score, 4)
+                    roc, f1 = self.__compute_score(labels, files, local=True)
+                    self.result["roc"][algo][f"{config_name}"]["auto_split"] = round(roc, 4)
+                    self.result["f1"][algo][f"{config_name}"]["auto_split"] = round(f1, 4)
 
                 # Compute results for splitted dataset
                 files = __exec(f"find -L {self.path}/{config_name}/results_{algo} -regex '^.*dataset[_0-9]+_{algo}.ts'")
-                result = np.zeros(len(labels))
-                for file in files:
-                    y_pred = np.loadtxt(file)
-                    y_pred = np.nan_to_num(y_pred)
-
-                    if len(y_pred != len(labels)):
-                        y_pred = self.__vote_for_score(y_pred, len(labels))
-
-                    y_pred = norm.fit_transform(y_pred.reshape(1,-1)).reshape(-1)
-                    result = np.maximum(result, y_pred)
-                score = roc_auc_score(labels, result)
-                self.result[algo][f"{config_name}"]["split"] = round(score, 4)
+                roc, f1 = self.__compute_score(labels, files, local=True)
+                self.result["roc"][algo][f"{config_name}"]["split"] = round(roc, 4)
+                self.result["f1"][algo][f"{config_name}"]["split"] = round(f1, 4)
 
         print(json.dumps(self.result))
-        # for algo, algo_res in self.result.items():
-        #     l_classic = []
-        #     l_auto_split = []
-        #     l_split = []
-        #     for details in algo_res.values():
-        #         l_classic.append(details["classic"])
-        #         l_auto_split.append(details["auto_split"])
-        #         l_split.append(details["split"])
-        #     print(f"{algo}:")
-        #     print(f"\tClassic: {np.mean(l_classic)} ({np.std(l_classic)})")
-        #     print(f"\tAuto split: {np.mean(l_auto_split)} ({np.std(l_auto_split)})")
-        #     print(f"\tSplit: {np.mean(l_split)} ({np.std(l_split)})")
-
-
     def __vote_for_score(self, scores, length):
         """
         Compute the score for each point of the dataset instead of a per window basis.
@@ -120,3 +72,54 @@ class ROCResults(BaseResults):
             results[idx] = np.mean(scores[start:end])
 
         return results
+
+    def __compute_score(self, labels, y_pred_path, local=False):
+        """
+        Compute the ROC AUC and F1 score of the given predictions.
+        """
+        result = np.zeros(len(labels))
+
+        # If local is set to true, we have a list of paths to local score files.
+        # We must first retrieve all the scores and aggregate them.
+        if local:
+            norm = Normalizer()
+
+            if len(y_pred_path) == 0:
+                return 0, 0
+
+            for path in y_pred_path:
+                y_pred = np.loadtxt(path)
+                y_pred = np.nan_to_num(y_pred)
+
+                if len(y_pred) != len(labels):
+                    y_pred = self.__vote_for_score(y_pred, len(labels))
+
+                y_pred = norm.fit_transform(y_pred.reshape(1,-1)).reshape(-1)
+                result = np.maximum(result, y_pred)
+        # Otherwise, we have a single score file: read it and compute the score
+        # for each instant.
+        else:
+            y_pred = np.loadtxt(y_pred_path)
+            result = np.nan_to_num(y_pred)
+
+            if len(y_pred) != len(labels):
+                result = self.__vote_for_score(result, len(labels))
+
+        # Once the correct anomaly scores have been computed, we can compute the metrics
+        roc = roc_auc_score(labels, result)
+        fpr, tpr, thresh = roc_curve(labels, result)
+
+        # Select the threshold whose ROC point lies closest to the ideal corner (FPR=0, TPR=1)
+        closest_dist = np.inf
+        closest_thresh = np.inf
+        best_couple = (0, 0)
+        for f, t, th in zip(fpr, tpr, thresh):
+            dist = np.sqrt((f - 0)**2 + (t - 1)**2)
+            if dist < closest_dist:
+                closest_dist = dist
+                closest_thresh = th
+                best_couple = (f, t)
+
+        binary_labels = (result > closest_thresh).astype(int)
+        f1 = f1_score(labels, binary_labels)
+
+        return roc, f1
--
GitLab
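
For reference, the threshold selection added in __compute_score can be exercised on its own: pick the ROC operating point closest to the ideal corner (FPR=0, TPR=1), binarize the scores at that threshold, and compute F1. A minimal standalone sketch, not part of the patch, using made-up labels and scores purely for illustration:

    import numpy as np
    from sklearn.metrics import roc_curve, f1_score

    # Toy data, for illustration only
    labels = np.array([0, 0, 1, 1, 0, 1])
    scores = np.array([0.1, 0.4, 0.35, 0.8, 0.2, 0.7])

    fpr, tpr, thresh = roc_curve(labels, scores)
    # Distance of each ROC point to the ideal corner (0, 1); the closest one wins
    dist = np.sqrt(fpr ** 2 + (tpr - 1) ** 2)
    best_thresh = thresh[np.argmin(dist)]

    f1 = f1_score(labels, (scores > best_thresh).astype(int))
    print(best_thresh, f1)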