Commit 12ff70b4 authored by Pierre LOTTE

Merge branch 'soft-clustering' into 'master'

Add soft/fuzzy clustering splitting

See merge request !3
parents 336ca4ce 05b6db09
@@ -13,7 +13,7 @@
import numpy as np
import pandas as pd
from generator import DatasetGenerator
-from split import BaseSplitter
+from split import SoftSplitter
from trainers import BaseTrainer
from results import ResultExtractor, ROCResults
@@ -160,8 +160,8 @@ if __name__ == "__main__":
    # Split data
    # =================================================================================================================
-    if args.split and args.task in ["train", "all"]:
-        splitter = BaseSplitter(f"{INPUT_DIR}/{config_name}")
+    if args.split and args.task in ["generate", "train", "all"]:
+        splitter = SoftSplitter(f"{INPUT_DIR}/{config_name}")
        splitter.split_data(method=args.method, merge=args.merge)
    # =================================================================================================================
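With this change the pipeline's split step always goes through the fuzzy splitter. A minimal sketch of driving it directly, mirroring the block above (the path is a placeholder and must contain the dataset.csv / dataset_train.csv / dataset_variables_labels.csv files that BaseSplitter.__init__ expects; SoftSplitter's _split_data ignores the method argument):

    from split import SoftSplitter

    splitter = SoftSplitter("input/my_config")            # placeholder config directory
    splitter.split_data(method="HDBSCAN", merge=False)    # writes dataset_<i>_auto_split*.csv files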
@@ -2,3 +2,4 @@
This module provides the necessary code to split data.
"""
from .base import BaseSplitter
+from .soft import SoftSplitter
@@ -31,31 +31,33 @@ class BaseSplitter:
        self.output_path = f"{path}/splitting"
        os.makedirs(f"{path}/splitting", exist_ok=True)

+        self.w_df = pd.read_csv(f"{self.data_path}/dataset_train.csv", index_col="Timestamp")
+        self.df = pd.read_csv(f"{self.data_path}/dataset.csv", index_col="Timestamp")
+        self.train_df = pd.read_csv(f"{self.data_path}/dataset_train.csv", index_col="Timestamp")
+        self.labels_df = pd.read_csv(f"{self.data_path}/dataset_variables_labels.csv")
+
+        # Remove the categorial variables from the correlation computation
+        self.w_df.drop(columns=self.df.select_dtypes(include="int64").columns, inplace=True)
+        self.w_df.drop(columns=["Timestamp", "is_anomaly"], errors="ignore", inplace=True)
+        self.w_df.drop(columns=self.w_df.columns[self.w_df.nunique() <= 1], inplace=True)
+
    def split_data(self, method="HDBSCAN", merge=False):
        """
        This method will be in charge of splitting data into subsystems.
        """
-        w_df = pd.read_csv(f"{self.data_path}/dataset_train.csv", index_col="Timestamp")
-        df = pd.read_csv(f"{self.data_path}/dataset.csv", index_col="Timestamp")
-        train_df = pd.read_csv(f"{self.data_path}/dataset_train.csv", index_col="Timestamp")
-        labels_df = pd.read_csv(f"{self.data_path}/dataset_variables_labels.csv")
-
-        # Remove the categorial variables from the correlation computation
-        w_df.drop(columns=df.select_dtypes(include="int64").columns, inplace=True)
-        w_df.drop(columns=["Timestamp", "is_anomaly"], errors="ignore", inplace=True)
-        w_df.drop(columns=w_df.columns[w_df.nunique() <= 1], inplace=True)
+        x = self._compute_correlations(merge=merge)
+        self._split_data(x, method=method)

+    def _split_data(self, x, method="HDBSCAN"):
        # Compute all the correlations for the dataset and find the coefficient that produces the best
        # cluters from its coefficient
        max_silhouette = 0
        best_clusters = None
-        x = self._compute_correlations(w_df, merge=merge)

        if "HDBSCAN" == method:
            model = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1)
            best_clusters = model.fit_predict(x)
        elif "kmeans" == method:
-            for i in range(1, len(w_df.columns)):
+            for i in range(1, len(self.w_df.columns)):
                model = KMeans(n_clusters=i, n_init="auto")
                # Compute clusters and their silhouette score
@@ -75,18 +77,19 @@ class BaseSplitter:
        # Split the dataset
        for i in range(-1 if -1 in best_clusters else 0, max(best_clusters)+1):
-            drop = [col for idx, col in enumerate(w_df.columns) if best_clusters[idx] != i]
-            df.drop(columns=drop)\
+            drop = [col for idx, col in enumerate(self.w_df.columns) if best_clusters[idx] != i]
+            self.df.drop(columns=drop)\
                .to_csv(f"{self.data_path}/dataset_{i}_auto_split.csv", index_label="Timestamp")
-            train_df.drop(columns=drop)\
+            self.train_df.drop(columns=drop)\
                .to_csv(f"{self.data_path}/dataset_{i}_auto_split_train.csv", index_label="Timestamp")
-            labels = np.bitwise_or.reduce(labels_df.drop(columns=drop).to_numpy(), axis=1, dtype=np.int32)
+            labels = np.bitwise_or.reduce(self.labels_df.drop(columns=drop).to_numpy(), axis=1, dtype=np.int32)
            pd.DataFrame(labels).to_csv(f"{self.data_path}/dataset_{i}_auto_split_labels.csv", index=False)

    def _compute_correlations(self, merge=False):
        """
        Compute the vector of correlation coefficients for each of the variable of the dataset.
        """
+        sns.set(font_scale=3.0)
        with open(f"{self.output_path}/split_time.csv", "a", encoding="utf-8") as f:
            f.write("Algorithm,duration\n")
            x = []
@@ -94,7 +97,7 @@ class BaseSplitter:
                coeff_name = coeff.__name__[:-11]
                start_time = time()
-                correlation_matrix = coeff().compute(data)
+                correlation_matrix = coeff().compute(self.w_df)
                duration = time() - start_time
                f.write(f"{coeff_name},{duration}\n")
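The refactor turns split_data into a small template method: the dataframes are loaded once in __init__, _compute_correlations builds one correlation-feature vector per variable, and the actual clustering is delegated to _split_data, which also exports the per-cluster dataset_<i>_auto_split*.csv files. A new strategy therefore only has to override _split_data, which is exactly what the SoftSplitter below does. A hypothetical sketch, not part of this merge (MedianSplitter and its one-line rule are made up for illustration):

    import numpy as np
    from split import BaseSplitter

    class MedianSplitter(BaseSplitter):
        # Toy strategy: two hard clusters split on the first correlation feature.
        def _split_data(self, x, method=""):
            x = np.asarray(x)
            labels = (x[:, 0] > np.median(x[:, 0])).astype(int)  # one label per column of self.w_df
            # A real implementation would go on to export the per-cluster CSVs,
            # as BaseSplitter._split_data and SoftSplitter._split_data do.
            print(dict(zip(self.w_df.columns, labels)))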
"""
This module contains the basic splitter algorithm based on the correlation coefficients computing and
soft clustering.
"""
import json
import os
from time import time
from warnings import simplefilter
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.spatial.distance import euclidean
from skfuzzy.cluster import cmeans
from sklearn.exceptions import ConvergenceWarning
from sklearn.metrics import silhouette_score
from .correlations import CORRELATION_CLASSES
from .base import BaseSplitter
simplefilter("ignore", category=ConvergenceWarning)
class SoftSplitter(BaseSplitter):
    """
    Class in charge of doing basic splitting using correlation coefficients
    """

    def _fuzzy_silhouette(self, fuzzy_matrix, points, distances_matrix):
        s = np.zeros(len(points))
        ranked_scores = []
        for i in range(len(points)):
            la = [
                euclidean(points[i], points[x])
                for x in range(len(points))
                if x != i and np.argmax(fuzzy_matrix[:, i]) == np.argmax(fuzzy_matrix[:, x])
            ]
            if la:
                a = np.mean(la)
                b = np.sort(distances_matrix[:, i])[1]
                s[i] = (b - a) / max(a, b)
            ranked_scores.append(-np.sort(-fuzzy_matrix[:, i]))

        alpha = 1
        non_summed = np.array([(ranked_scores[i][0] - ranked_scores[i][1]) ** alpha for i in range(len(points))])
        denom = sum(non_summed)
        num = sum(non_summed * s)

        return num / denom
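For reference, the weighted average returned here appears to follow the fuzzy silhouette criterion of Campello and Hruschka (2006): with \mu_{1j} and \mu_{2j} the largest and second-largest membership degrees of variable j and s_j its silhouette value,

    FS = \frac{\sum_j (\mu_{1j} - \mu_{2j})^{\alpha} \, s_j}{\sum_j (\mu_{1j} - \mu_{2j})^{\alpha}}

with \alpha = 1 here, so variables whose two best memberships are close contribute little to the score. Note that s_j itself is simplified: a is the mean distance to the other variables sharing j's hard cluster, while b is taken from the cmeans distance matrix as the distance to the second-closest cluster centre, rather than the classic mean distance to the nearest other cluster.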
    def _split_data(self, x, method=""):
        # Compute all the correlations for the dataset and find the coefficient that produces the best
        # cluters from its coefficient
        best_clusters = None
        best_fs = 0
        all_fs = []
        k = 1
        for i in range(2, len(self.w_df.columns)):
            # Compute fuzzy clusters. The third argument has been taken from the tutorial of skfuzzy
            # and might not be the best choice.
            _, scores, _, d, _, _, _ = cmeans(x.T, i, 2, error=0.005, maxiter=1000)

            score = self._fuzzy_silhouette(scores, x, d)
            all_fs.append(score)

            # Check if the proposed clustering is better than already proposed solutions
            if score > best_fs:
                best_fs = score
                best_clusters = scores.T
                k = i

        # Save choosen clustering
        with open(f"{self.output_path}/dataset_split_clusters.txt", "w", encoding="utf-8") as f:
            f.write(f"{best_clusters.tolist()}\n\n")
            f.write(f"All silhouette: {all_fs}\n\n")

            # Split the dataset
            for i in range(k):
                drop = [
                    col for idx, col in enumerate(self.w_df.columns)
                    if np.argmax(best_clusters[idx]) != i
                    and best_clusters[idx, i] < best_clusters[idx, np.argmax(best_clusters[idx])] / 2
                ]
                f.write(f"Cluster {i} -> {[col for col in self.w_df.columns if col not in drop]}\n")
                self.df.drop(columns=drop)\
                    .to_csv(f"{self.data_path}/dataset_{i}_auto_split.csv", index_label="Timestamp")
                self.train_df.drop(columns=drop)\
                    .to_csv(f"{self.data_path}/dataset_{i}_auto_split_train.csv", index_label="Timestamp")
                labels = np.bitwise_or.reduce(self.labels_df.drop(columns=drop).to_numpy(), axis=1, dtype=np.int32)
                pd.DataFrame(labels).to_csv(f"{self.data_path}/dataset_{i}_auto_split_labels.csv", index=False)

        return x
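Because membership is soft, a variable is attached to cluster i not only when i is its best cluster but also when its membership in i is at least half of its best membership, so one variable can end up in several of the exported sub-datasets. A small self-contained illustration of that rule with skfuzzy's cmeans (the 4x2 feature matrix is made-up toy data standing in for the output of _compute_correlations):

    import numpy as np
    from skfuzzy.cluster import cmeans

    # One row per variable, two made-up correlation features each
    x = np.array([[0.9, 0.1],
                  [0.8, 0.2],
                  [0.1, 0.9],
                  [0.5, 0.5]])

    # skfuzzy wants features as rows and samples as columns, hence x.T,
    # exactly like the cmeans(x.T, i, 2, ...) call above
    _, u, _, d, _, _, _ = cmeans(x.T, c=2, m=2, error=0.005, maxiter=1000)

    memberships = u.T                 # (n_variables, n_clusters), same layout as best_clusters
    top = memberships.max(axis=1)
    for i in range(2):
        # Keep a variable in cluster i if i is its best cluster, or if its
        # membership there is at least half of its best membership
        keep = (memberships.argmax(axis=1) == i) | (memberships[:, i] >= top / 2)
        print(i, [f"var{j}" for j in np.where(keep)[0]])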
One further diff in this merge request is collapsed and not shown.
@@ -24,6 +24,7 @@
torch-scatter = {version="^2.1.2", source="pyg"}
torch-spline-conv = {version="^1.2.2", source="pyg"}
torch-cluster = {version="^1.6.3", source="pyg"}
torch-sparse = {version="^0.6.18", source="pyg"}
+scikit-fuzzy = "^0.5.0"

[[tool.poetry.source]]
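For a local checkout, the new dependency can be installed with Poetry (assuming the project is managed with Poetry, as the file above suggests), for example via "poetry add scikit-fuzzy@^0.5.0" or simply by re-running "poetry install" after pulling this branch.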