Commit a806e498 authored by Pierre LOTTE

Add F1 measure with threshold selection from ROC curve

parent e656158b
@@ -11,7 +11,7 @@ class BaseResults():
self.path = path
self.algos = algos
self.configs = config_names
self.result = {}
self.result = {"roc": {}, "f1": {}}
def compute(self, auto_split=False):
"""
@@ -8,7 +8,7 @@ import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.metrics import roc_auc_score, roc_curve, f1_score
from sklearn.preprocessing import Normalizer
from .base import BaseResults
@@ -24,7 +24,6 @@ class ROCResults(BaseResults):
"""
return subprocess.check_output(cmd, shell=True).decode("utf-8").split()
norm = Normalizer()
for config in self.configs:
config_name = config.split("/")[-1][:-5] if "." in config else config.split("/")[-1]
@@ -32,79 +31,32 @@ class ROCResults(BaseResults):
# Compute the score for the full_dataset
for algo in self.algos:
y_pred = np.loadtxt(f"{self.path}/{config_name}/results_{algo}/anomaly_scores_dataset_{algo}.ts")
y_pred = np.nan_to_num(y_pred)
y_pred_path = f"{self.path}/{config_name}/results_{algo}/anomaly_scores_dataset_{algo}.ts"
roc, f1 = self.__compute_score(labels, y_pred_path)
if len(y_pred) != len(labels):
y_pred = self.__vote_for_score(y_pred, len(labels))
score = roc_auc_score(labels, y_pred)
fpr, tpr, _ = roc_curve(labels, y_pred)
if algo in self.result:
self.result[algo][config_name] = {"classic": round(score, 4)}
if algo in self.result["roc"]:
self.result["roc"][algo][config_name] = {"classic": round(roc, 4)}
self.result["f1"][algo][config_name] = {"classic": round(f1, 4)}
else:
self.result[algo] = {config_name: {"classic": round(score, 4)}}
plt.rcParams["figure.figsize"] = (10, 10)
plt.plot(fpr, tpr)
plt.plot(np.linspace(0,1,10), np.linspace(0,1,10), linestyle="--", label="ROC=0.5")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend()
plt.savefig(f"{self.path}/{config_name}/roc_auc_{algo}.png")
plt.clf()
plt.rcParams["figure.figsize"] = (20, 10)
self.result["roc"][algo] = {config_name: {"classic": round(roc, 4)}}
self.result["f1"][algo] = {config_name: {"classic": round(f1, 4)}}
# Compute results for the automatically split dataset
if auto_split:
files = __exec(f"find -L {self.path}/{config_name}/results_{algo} -regex '^.*dataset[_0-9]+_auto_split_{algo}.ts'")
result = np.zeros(len(labels))
for file in files:
y_pred = np.loadtxt(file)
y_pred = np.nan_to_num(y_pred)
if len(y_pred) != len(labels):
y_pred = self.__vote_for_score(y_pred, len(labels))
y_pred = norm.fit_transform(y_pred.reshape(1,-1)).reshape(-1)
result = np.maximum(result, y_pred)
score = roc_auc_score(labels, result)
self.result[algo][f"{config_name}"]["auto_split"] = round(score, 4)
roc, f1 = self.__compute_score(labels, files, local=True)
self.result["roc"][algo][f"{config_name}"]["auto_split"] = round(roc, 4)
self.result["f1"][algo][f"{config_name}"]["auto_split"] = round(f1, 4)
# Compute results for the split dataset
files = __exec(f"find -L {self.path}/{config_name}/results_{algo} -regex '^.*dataset[_0-9]+_{algo}.ts'")
result = np.zeros(len(labels))
for file in files:
y_pred = np.loadtxt(file)
y_pred = np.nan_to_num(y_pred)
if len(y_pred) != len(labels):
y_pred = self.__vote_for_score(y_pred, len(labels))
y_pred = norm.fit_transform(y_pred.reshape(1,-1)).reshape(-1)
result = np.maximum(result, y_pred)
score = roc_auc_score(labels, result)
self.result[algo][f"{config_name}"]["split"] = round(score, 4)
roc, f1 = self.__compute_score(labels, files, local=True)
self.result["roc"][algo][f"{config_name}"]["split"] = round(roc, 4)
self.result["f1"][algo][f"{config_name}"]["split"] = round(f1, 4)
print(json.dumps(self.result))
# for algo, algo_res in self.result.items():
# l_classic = []
# l_auto_split = []
# l_split = []
# for details in algo_res.values():
# l_classic.append(details["classic"])
# l_auto_split.append(details["auto_split"])
# l_split.append(details["split"])
# print(f"{algo}:")
# print(f"\tClassic: {np.mean(l_classic)} ({np.std(l_classic)})")
# print(f"\tAuto split: {np.mean(l_auto_split)} ({np.std(l_auto_split)})")
# print(f"\tSplit: {np.mean(l_split)} ({np.std(l_split)})")
def __vote_for_score(self, scores, length):
"""
Compute the score for each point of the dataset instead of on a per-window basis.
@@ -120,3 +72,54 @@ class ROCResults(BaseResults):
results[idx] = np.mean(scores[start:end])
return results
def __compute_score(self, labels, y_pred_path, local=False):
"""
This function computes the ROC AUC and F1 score of the given predictions.
"""
result = np.zeros(len(labels))
# If local is set to True, we have a list of paths to local score files.
# We must first retrieve all the scores and aggregate them.
if local:
norm = Normalizer()
if len(y_pred_path) == 0:
return 0, 0
for path in y_pred_path:
y_pred = np.loadtxt(path)
y_pred = np.nan_to_num(y_pred)
if len(y_pred) != len(labels):
y_pred = self.__vote_for_score(y_pred, len(labels))
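# Rescale this split's scores to unit norm, then keep the element-wise maximum across splits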
y_pred = norm.fit_transform(y_pred.reshape(1,-1)).reshape(-1)
result = np.maximum(result, y_pred)
# Otherwise, we simply have one score file; we read it and compute the score
# for each instant.
else:
y_pred = np.loadtxt(y_pred_path)
result = np.nan_to_num(y_pred)
if len(y_pred) != len(labels):
result = self.__vote_for_score(y_pred, len(labels))
# Once the correct anomaly scores have been computed, we can compute the metrics
roc = roc_auc_score(labels, result)
fpr, tpr, thresh = roc_curve(labels, result)
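# Pick the threshold whose (FPR, TPR) point lies closest to the ideal corner (0, 1)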
closest_dist = np.inf
closest_thresh = np.inf
best_couple = (0,0)
for f, t, th in zip(fpr, tpr, thresh):
dist = np.sqrt((f-0)**2+(t-1)**2)
if dist < closest_dist:
closest_dist = dist
closest_thresh = th
best_couple = (f, t)
binary_labels = (result > closest_thresh).astype(int)
f1 = f1_score(labels, binary_labels)
return roc, f1
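
For readers who want the threshold-selection idea in isolation, here is a minimal, self-contained sketch of what the new __compute_score helper does once it has per-timestep scores: compute the ROC AUC, pick the threshold whose (FPR, TPR) point is closest to the ideal corner (0, 1), binarize the anomaly scores with it, and report the F1 score. The function name f1_from_roc_threshold and the synthetic labels/scores are illustrative assumptions, not code from this repository.

import numpy as np
from sklearn.metrics import f1_score, roc_auc_score, roc_curve

def f1_from_roc_threshold(labels, scores):
    """Return (roc_auc, f1), where F1 uses the ROC threshold closest to (0, 1)."""
    roc = roc_auc_score(labels, scores)
    fpr, tpr, thresholds = roc_curve(labels, scores)
    # Distance of every ROC operating point to the perfect-classifier corner (0, 1)
    distances = np.sqrt(fpr ** 2 + (tpr - 1) ** 2)
    best_threshold = thresholds[np.argmin(distances)]
    # Binarize the continuous anomaly scores, mirroring `result > closest_thresh`
    binary = (scores > best_threshold).astype(int)
    return roc, f1_score(labels, binary)

if __name__ == "__main__":
    rng = np.random.default_rng(0)
    labels = rng.integers(0, 2, size=200)          # synthetic ground truth
    scores = 0.6 * labels + 0.7 * rng.random(200)  # noisy synthetic anomaly scores
    print(f1_from_roc_threshold(labels, scores))

Choosing the ROC point nearest to (0, 1) is one common way to pick an operating point; maximizing Youden's J statistic (TPR minus FPR) is a closely related alternative.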