Commit f75648e3 authored by Pierre LOTTE

Add frequency anomalies and a few fixes

parent bc8c3a91
@@ -32,7 +32,7 @@
         "prediction_window_size": [10, 200]
     },
     "const": {
-        "epochs": 500,
+        "epochs": 100,
         "learning_rate": 0.001
     },
     "default": {
@@ -49,15 +49,14 @@
         "prediction_window_size": [10, 100]
     },
     "const": {
-        "epochs": 500,
+        "epochs": 100,
         "learning_rate": 0.001
     },
     "default": {
-        "lstm_layers": 1,
+        "lstm_layers": 2,
         "window_size": 200,
         "prediction_window_size": 50
     }
 },
 "health_esn": {
     "training": true,
......
@@ -5,10 +5,11 @@ This module handles anomaly creation
 from .noise import NoiseAnomaly
 from .corr import CorrelationAnomaly
 from .varying_frequency import VFAnomaly
+from .frequency import FrequencyAnomaly
 
 ANOMALIES = {
     "CORRELATION": [NoiseAnomaly, CorrelationAnomaly],
     "EXP_CORRELATION": [NoiseAnomaly, CorrelationAnomaly],
-    "VFOSC": [NoiseAnomaly, VFAnomaly],
-    "OSC": [NoiseAnomaly],
+    "VFOSC": [NoiseAnomaly, VFAnomaly, FrequencyAnomaly],
+    "OSC": [NoiseAnomaly, FrequencyAnomaly],
 }
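Note: the new config-side FrequencyAnomaly referenced above is not shown in this excerpt. As a rough sketch only, it could mirror the other config generators in this module (a constructor taking the anomaly length and a generate() returning a config dict); the base class path, the "factor" parameter, and its range are illustrative assumptions, not taken from the repository:

    import numpy as np
    from typing import Dict

    from .base import BaseAnomaly  # assumed base class path

    class FrequencyAnomaly(BaseAnomaly):
        """Sketch of a config generator for a frequency-shift anomaly."""
        def generate(self) -> Dict:
            return {
                "kind": "FREQUENCY",
                "length": self.length,  # window to distort, set by the constructor
                # Assumed parameter: how much to speed up / slow down the oscillation
                "factor": round(np.random.uniform(low=0.5, high=3.0), 4),
            }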
@@ -12,12 +12,16 @@ class LinearCorrelationDimension(BaseDimension):
     def generate(self, idx:int) -> Dict:
         dimension = np.random.choice(range(idx))
+        # Choose lag
+        lag = np.random.randint(low=0, high=20)
         return {
             "kind": "CORRELATION",
             "dimension": int(dimension),
             "equation": {
                 "sign": int(np.random.choice([-1, 1])),
-                "step": round(np.random.uniform(low=0.01, high=0.1), 4)
+                "step": round(np.random.uniform(low=0.01, high=0.1), 4),
+                "lag": lag
             },
             "noise": {
                 "mean": 0.0,
@@ -36,7 +40,7 @@ class ExpCorrelationDimension(BaseDimension):
         start, end = limits
 
         # Choose lag
-        lag = np.random.randint(low=0, high=100)
+        lag = np.random.randint(low=0, high=20)
         return {
             "kind": "EXP_CORRELATION",
......
@@ -24,11 +24,13 @@ class OscillatingDimension(BaseDimension):
     def generate(self, idx:int) -> Dict:
         return {
             "kind": "OSC",
-            "equation": {
-                "function": np.random.choice(OscillatingDimension.OSC_FN),
-                "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
-                "frequency": round(np.random.uniform(low=0.1, high=5.0), 4),
-            },
+            "equation": [
+                {
+                    "function": np.random.choice(OscillatingDimension.OSC_FN),
+                    "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
+                    "frequency": round(np.random.uniform(low=0.1, high=5.0), 4),
+                } for _ in range(np.random.randint(1, 10))
+            ],
             "noise": {
                 "mean": 0.0,
                 "std": 0.1
......
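For reference, a generated "OSC" entry now carries a list of one to nine oscillator terms that are summed at generation time. An example of what generate() might emit (the drawn values are illustrative, and "sin"/"cos" assume those are keys of OSCILLATING_FUNCTIONS):

    {
        "kind": "OSC",
        "equation": [
            {"function": "sin", "amplitude": 2.3817, "frequency": 0.74},
            {"function": "cos", "amplitude": 0.9125, "frequency": 3.2051},
        ],
        "noise": {"mean": 0.0, "std": 0.1},
    }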
@@ -31,13 +31,13 @@ class VFOscillatingDimension(BaseDimension):
         return {
             "kind": "VFOSC",
-            "equation": {
+            "equation": [{
                 "function": np.random.choice(VFOscillatingDimension.OSC_FN),
                 "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
                 "frequency_min": round(f_min, 4),
                 "frequency_max": round(f_max, 4),
                 "period": int(np.random.uniform(low=50, high=300))
-            },
+            }],
             "noise": {
                 "mean": 0.0,
                 "std": 0.1
......
@@ -95,8 +95,9 @@ class ConfigGenerator:
             length_per_dim = (self.__softmax(length_per_dim) * anomalies_per_sub[idx]).astype(int)
             for i in range(nb_dim_erroneous):
-                dim_erroneous = np.random.choice(sub)
-                kind = np.random.choice(ANOMALIES[self.config["dimensions"][dim_erroneous]["kind"]])
-                self.config["dimensions"][dim_erroneous]["anomalies"].append(kind(length_per_dim[i]).generate())
+                if length_per_dim[i] > 0:
+                    dim_erroneous = np.random.choice(sub)
+                    kind = np.random.choice(ANOMALIES[self.config["dimensions"][dim_erroneous]["kind"]])
+                    self.config["dimensions"][dim_erroneous]["anomalies"].append(kind(length_per_dim[i]).generate())
         return self
@@ -3,6 +3,7 @@ This module is in charge of the generation of anomalies.
 """
 from .noise import NoiseAnomaly
 from .correlation import CorrelationAnomaly
+from .frequency import FrequencyAnomaly
 from .value import ValueAnomaly
 from .varying_frequency import VFAnomaly
@@ -11,8 +12,9 @@ ANOMALY_CLASSES = {
     "CORRELATION": CorrelationAnomaly,
     "VALUE": ValueAnomaly,
     "VF": VFAnomaly,
+    "FREQUENCY": FrequencyAnomaly,
 }
 
 PREPROCESS_ANOMALIES = []
 PROCESS_ANOMALIES = ["VF",]
-POSTPROCESS_ANOMALIES = ["NOISE", "CORRELATION", "VALUE"]
+POSTPROCESS_ANOMALIES = ["NOISE", "CORRELATION", "VALUE", "FREQUENCY"]
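These three lists suggest a phased injection pipeline: anomalies applied before the series is synthesized, while it is synthesized (varying frequency), and on the finished values, with FREQUENCY now handled in the post-processing phase. A sketch of how a driver might consume them; the driver function, the config iteration, and the constructor call are all illustrative assumptions, not code from the repository:

    def apply_anomalies(configured_anomalies):
        """Illustrative driver: apply configured anomalies phase by phase."""
        for phase in (PREPROCESS_ANOMALIES, PROCESS_ANOMALIES, POSTPROCESS_ANOMALIES):
            for cfg in configured_anomalies:
                if cfg["kind"] in phase:
                    ANOMALY_CLASSES[cfg["kind"]](cfg).inject()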
"""
This module descibes the noise anomalies.
"""
from typing import Tuple
import numpy as np
from .base import BaseAnomaly
......@@ -9,7 +11,7 @@ class NoiseAnomaly(BaseAnomaly):
"""
This class is in charge of adding noise to a given time series.
"""
def inject(self) -> np.array:
def inject(self) -> Tuple[np.array, np.array]:
start, end = self.find_anomaly_index()
noise = np.random.normal(self.params["mean"], self.params["std"], self.length)
......
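The remainder of inject() is elided above. A minimal sketch of how it might continue so that the body matches the new Tuple return type, assuming the series lives in self.data and the anomaly mask in self.labels (both attribute names are assumptions):

        # Illustrative completion of the method shown above
        self.data[start:end] += noise[: end - start]  # perturb the anomalous window
        self.labels[start:end] = 1                    # flag the window as anomalous
        return self.data, self.labels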
@@ -4,6 +4,7 @@ This module provides the main class needed to generate time series
 from typing import List, Tuple
 
 import numpy as np
+import matplotlib.pyplot as plt
 
 from generator.dimension import DIMENSION_CLASSES
......
@@ -30,4 +30,4 @@ class LinearCorrelationDimension(BaseDimension):
         steps = np.insert(self.terms["sign"] * np.sign(np.diff(corr_dim)) * self.terms["step"], 0, 0)
         self.train_data[self.idx, lag:] = np.cumsum(steps)[:-lag] if lag > 0 else np.cumsum(steps)
-        return self.data, self.train_data
\ No newline at end of file
+        return self.data, self.train_data
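For intuition on the lag applied above: the correlated dimension now reproduces the cumulative step signal shifted right by lag samples, leaving the first lag samples at their initial value. A tiny self-contained illustration:

    import numpy as np

    steps = np.array([0.0, 0.05, -0.05, 0.05, 0.05])
    lag = 2
    out = np.zeros_like(steps)
    out[lag:] = np.cumsum(steps)[:-lag]  # shift the correlated signal by `lag`
    print(out)  # [0.   0.   0.   0.05 0.  ]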
@@ -4,6 +4,7 @@ This module defines different dimension generators.
 from typing import Tuple
 
 import numpy as np
+import matplotlib.pyplot as plt
 
 from .base import BaseDimension
@@ -27,17 +28,29 @@ class OscillatingDimension(BaseDimension):
     def generate(self) -> Tuple[np.array, np.array]:
         # Prepare computing variables
-        dt = .01
-        func = self.OSCILLATING_FUNCTIONS[self.terms["function"]]
-
-        # Generate testing data
-        steps = np.zeros(self.length)
-        steps += 2 * np.pi * self.terms["frequency"] * dt
-        self.data[self.idx] = self.terms["amplitude"] * func(np.cumsum(steps) + np.random.normal(0.0, 2.0))
-
-        # Generate training data
-        steps = np.zeros(self.train_length)
-        steps += 2 * np.pi * self.terms["frequency"] * dt
-        self.train_data[self.idx] = self.terms["amplitude"] * func(np.cumsum(steps))
+        dt = .001
+        res = np.zeros(self.length)
+        res_train = np.zeros(self.train_length)
+        for term in self.terms:
+            func = self.OSCILLATING_FUNCTIONS[term["function"]]
+            # Generate testing data; build each term from a fresh buffer of phase
+            # increments (a view of self.data would be mutated in place by +=, leaking
+            # one term's increments into the next)
+            signal = np.zeros(self.length)
+            signal += 2 * np.pi * term["frequency"] * dt
+            signal = func(np.cumsum(signal))
+            signal *= term["amplitude"]
+            res += signal
+            # Generate training data
+            signal = np.zeros(self.train_length)
+            signal += 2 * np.pi * term["frequency"] * dt
+            signal = func(np.cumsum(signal))
+            signal *= term["amplitude"]
+            res_train += signal
+        self.data[self.idx] = res
+        self.train_data[self.idx] = res_train
         return self.data, self.train_data
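A standalone illustration of the summed-oscillator loop above, handy for sanity-checking the superposition (function names and values are illustrative):

    import numpy as np

    dt = .001
    length = 1000
    fns = {"sin": np.sin, "cos": np.cos}
    terms = [
        {"function": "sin", "amplitude": 1.5, "frequency": 0.8},
        {"function": "cos", "amplitude": 0.5, "frequency": 3.0},
    ]

    res = np.zeros(length)
    for term in terms:
        # Constant phase increment per sample, integrated into a phase ramp
        phase = np.cumsum(np.full(length, 2 * np.pi * term["frequency"] * dt))
        res += term["amplitude"] * fns[term["function"]](phase)
    # res now holds the superposition of both oscillator terms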
@@ -29,6 +29,7 @@ class VFOscillatingDimension(BaseDimension):
     def generate(self) -> Tuple[np.array, np.array]:
         # Prepare computing variables
         dt = .01
+        self.terms = self.terms[0]  # the config now stores a list of terms; VF dimensions use only the first
         func = self.OSCILLATING_FUNCTIONS[self.terms["function"]]
         # Prepare frequencies
......
@@ -40,6 +40,12 @@ if __name__ == "__main__":
         nargs="+"
     )
     parser.add_argument("-a", "--algorithms", help="Which algorithm to train.", default=["kmeans"], nargs="+")
+    parser.add_argument(
+        "-m", "--cluster-method",
+        help="Which clustering algorithm to use.",
+        default="HDBSCAN",
+        dest="method"
+    )
     parser.add_argument("-i", "--input", help="Input directory. Only to be used when no data will be generated")
     parser.add_argument("-o", "--output", help="Output directory")
     parser.add_argument(
@@ -144,7 +150,7 @@ if __name__ == "__main__":
     if args.split and args.task in ["train", "all"]:
         splitter = BaseSplitter(f"{INPUT_DIR}/{config_name}")
-        splitter.split_data()
+        splitter.split_data(method=args.method)
 
 # =================================================================================================================
 # Train algorithm
......
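Example invocation with the new flag, assuming the entry script in this diff is main.py (its actual name is not visible in the excerpt); the -a, -m, -i and -o flags are the ones defined above:

    python3 main.py -a health_esn -m kmeans -i data/run1 -o results/run1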
@@ -3,6 +3,7 @@ This module contains the basic splitter algorithm based on the correlation coeff
 clustering.
 """
 import json
+import os
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -27,8 +28,10 @@ class BaseSplitter:
     """
     def __init__(self, path):
         self.data_path = path
+        self.output_path = f"{path}/splitting"
+        os.makedirs(f"{path}/splitting", exist_ok=True)
 
-    def split_data(self):
+    def split_data(self, method="HDBSCAN"):
         """
         This method will be in charge of splitting data into subsystems.
         """
@@ -38,9 +41,9 @@ class BaseSplitter:
         labels_df = pd.read_csv(f"{self.data_path}/dataset_variables_labels.csv")
 
         # Remove the categorical variables from the correlation computation
-        cat_columns = df.select_dtypes(include="int64").columns
-        w_df.drop(columns=cat_columns, inplace=True)
+        w_df.drop(columns=df.select_dtypes(include="int64").columns, inplace=True)
         w_df.drop(columns=["Timestamp", "is_anomaly"], errors="ignore", inplace=True)
+        w_df.drop(columns=w_df.columns[w_df.nunique() <= 1], inplace=True)
 
         # Compute all the correlations for the dataset and find the coefficient that produces the best
         # clusters from its coefficient
@@ -49,17 +52,16 @@ class BaseSplitter:
         x = self._compute_correlations(w_df)
         for i in range(1, len(w_df.columns)):
-            # km = KMeans(n_clusters=i)
-            # clusters = km.fit_predict(x)
-            hdb_scan = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1)
-            clusters = hdb_scan.fit_predict(x)
-            if len(np.unique(clusters)) == 1:
-                score = 0.5
-            elif i > 1:
-                score = silhouette_score(x, clusters)
+            if "HDBSCAN" == method:
+                model = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1)
+            elif "kmeans" == method:
+                model = KMeans(n_clusters=i, n_init="auto")
             else:
-                score = 0.01
+                raise ValueError("Please select either kmeans or HDBSCAN as a clustering method.")
+
+            # Compute clusters and their silhouette score
+            clusters = model.fit_predict(x)
+            score = silhouette_score(x, clusters) if i > 1 else 0.5
 
             # Keep the best coeff and number of clusters in mind
             if score > max_silhouette and (score > 0.5 or i == 1):
@@ -67,12 +69,12 @@ class BaseSplitter:
                 best_clusters = clusters
 
         # Save chosen clustering
-        with open(f"{self.data_path}/dataset_clusters.txt", "w", encoding="utf-8") as f:
+        with open(f"{self.output_path}/dataset_clusters.txt", "w", encoding="utf-8") as f:
             f.write(json.dumps(best_clusters.tolist()))
 
         # Split the dataset
         for i in range(-1 if -1 in clusters else 0, max(best_clusters)+1):
-            drop = [str(col) for idx, col in enumerate(w_df.columns) if best_clusters[idx] != i]
+            drop = [col for idx, col in enumerate(w_df.columns) if best_clusters[idx] != i]
             df.drop(columns=drop)\
                 .to_csv(f"{self.data_path}/dataset_{i}_auto_split.csv", index_label="Timestamp")
             train_df.drop(columns=drop)\
@@ -84,7 +86,7 @@ class BaseSplitter:
         """
         Compute the vector of correlation coefficients for each variable of the dataset.
         """
-        with open(f"{self.data_path}/split_time.csv", "a", encoding="utf-8") as f:
+        with open(f"{self.output_path}/split_time.csv", "a", encoding="utf-8") as f:
             f.write("Algorithm,duration\n")
             x = None
             for coeff in CORRELATION_CLASSES:
@@ -95,10 +97,10 @@
                 duration = time() - start_time
                 f.write(f"{coeff_name},{duration}\n")
 
-                correlation_matrix.to_csv(f"{self.data_path}/dataset_correlation_{coeff_name}.csv")
+                correlation_matrix.to_csv(f"{self.output_path}/dataset_correlation_{coeff_name}.csv")
                 sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")\
                     .get_figure()\
-                    .savefig(f"{self.data_path}/dataset_correlation_matrix_{coeff_name}.png")
+                    .savefig(f"{self.output_path}/dataset_correlation_matrix_{coeff_name}.png")
                 plt.clf()
 
                 if x is not None:
......
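A short usage sketch of the updated splitter; the import path is an assumption, and the directory layout follows the reads and writes above:

    from splitter.base import BaseSplitter  # assumed module path

    splitter = BaseSplitter("data/my_config")  # also creates data/my_config/splitting/
    splitter.split_data(method="kmeans")       # or the default, method="HDBSCAN"
    # Cluster assignments go to data/my_config/splitting/dataset_clusters.txt;
    # the per-cluster CSV splits are written next to the original dataset.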
@@ -27,11 +27,14 @@ class BaseTrainer():
         self.params = kwargs["const"]
         self.optim_params = kwargs["optimize"]
         self.default_params = kwargs["default"]
-        self.log_path = f"{'/'.join(data_path.split('/')[:-1])}/{self.__class__.__name__}_logs"
         self.algorithm = algorithm
         self.train = train
         self.pwd = ""
 
+        # Prepare logging files and dirs
+        self.log_path = f"{data_path}/logs/{algorithm}_logs.txt"
+        os.makedirs(f"{data_path}/logs", exist_ok=True)
 
     def start(self, optim=False):
         """
         This method orchestrates the optimization, training and computing of the results for the
@@ -167,7 +170,7 @@ class BaseTrainer():
         """
         start_time = time()
         cmd = f"python3 {self.pwd}/algorithms/{self.algorithm}/algorithm.py '{json.dumps(model_args)}'"
-        cmd += f" 1>{self.log_path} 2>{self.log_path}"
+        cmd += f" 1>>{self.log_path} 2>>{self.log_path}"
         os.system(cmd)
         duration = time() - start_time
......
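The switch from 1> to 1>> makes successive algorithm runs append to the shared log instead of truncating it (note that 1>>f 2>>f opens the file twice; the conventional shell idiom is >>f 2>&1). A sketch of an alternative using subprocess instead of os.system, which sidesteps shell redirection entirely; this is an option, not what the repository does:

    import json
    import subprocess

    with open(self.log_path, "a", encoding="utf-8") as log:
        subprocess.run(
            ["python3", f"{self.pwd}/algorithms/{self.algorithm}/algorithm.py", json.dumps(model_args)],
            stdout=log,   # append stdout to the log
            stderr=log,   # interleave stderr with it
            check=False,
        )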