diff --git a/algorithm_params.json b/algorithm_params.json
index 1644b5c086c56abdc15d827c7e992073e1e723dc..be7bf26a78a5f9cc89c97b851b302b3fd6ad7928 100644
--- a/algorithm_params.json
+++ b/algorithm_params.json
@@ -32,7 +32,7 @@
             "prediction_window_size": [10, 200]
         },
         "const": {
-            "epochs": 500,
+            "epochs": 100,
             "learning_rate": 0.001
         },
         "default": {
@@ -49,15 +49,14 @@
             "prediction_window_size": [10, 100]
         },
         "const": {
-            "epochs": 500,
+            "epochs": 100,
             "learning_rate": 0.001
         },
         "default": {
-            "lstm_layers": 1,
+            "lstm_layers": 2,
             "window_size": 200,
             "prediction_window_size": 50
         }
-        },
     },
     "health_esn": {
         "training": true,
diff --git a/config_maker/anomalies/__init__.py b/config_maker/anomalies/__init__.py
index 51ef97397a4069516bbdb820960ab611b078a878..bd8060d37dfae8764fa10f20a096436a182ed11b 100644
--- a/config_maker/anomalies/__init__.py
+++ b/config_maker/anomalies/__init__.py
@@ -5,10 +5,11 @@ This module handles anomaly creation
 from .noise import NoiseAnomaly
 from .corr import CorrelationAnomaly
 from .varying_frequency import VFAnomaly
+from .frequency import FrequencyAnomaly

 ANOMALIES = {
     "CORRELATION": [NoiseAnomaly, CorrelationAnomaly],
     "EXP_CORRELATION": [NoiseAnomaly, CorrelationAnomaly],
-    "VFOSC": [NoiseAnomaly, VFAnomaly],
-    "OSC": [NoiseAnomaly],
+    "VFOSC": [NoiseAnomaly, VFAnomaly, FrequencyAnomaly],
+    "OSC": [NoiseAnomaly, FrequencyAnomaly],
 }
diff --git a/config_maker/dimensions/correlation.py b/config_maker/dimensions/correlation.py
index 6b4a87e621ea2efad79add39f95726ce0351cc6c..29d7c55bbc4d8bc0f00b2ac0bcbcd52b724f1783 100644
--- a/config_maker/dimensions/correlation.py
+++ b/config_maker/dimensions/correlation.py
@@ -12,12 +12,16 @@ class LinearCorrelationDimension(BaseDimension):
     def generate(self, idx:int) -> Dict:
         dimension = np.random.choice(range(idx))

+        # Choose lag
+        lag = np.random.randint(low=0, high=20)
+
         return {
             "kind": "CORRELATION",
             "dimension": int(dimension),
             "equation": {
                 "sign": int(np.random.choice([-1, 1])),
-                "step": round(np.random.uniform(low=0.01, high=0.1), 4)
+                "step": round(np.random.uniform(low=0.01, high=0.1), 4),
+                "lag": lag
             },
             "noise": {
                 "mean": 0.0,
@@ -36,7 +40,7 @@ class ExpCorrelationDimension(BaseDimension):
         start, end = limits

         # Choose lag
-        lag = np.random.randint(low=0, high=100)
+        lag = np.random.randint(low=0, high=20)

         return {
             "kind": "EXP_CORRELATION",
diff --git a/config_maker/dimensions/oscillating.py b/config_maker/dimensions/oscillating.py
index 4dcff42146468cb53197a96640d8d4142d3a6eaf..63ce3324e3ba856d48e59f51fb2c2866c5536d0d 100644
--- a/config_maker/dimensions/oscillating.py
+++ b/config_maker/dimensions/oscillating.py
@@ -24,11 +24,13 @@ class OscillatingDimension(BaseDimension):
     def generate(self, idx:int) -> Dict:
         return {
             "kind": "OSC",
-            "equation": {
-                "function": np.random.choice(OscillatingDimension.OSC_FN),
-                "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
-                "frequency": round(np.random.uniform(low=0.1, high=5.0), 4),
-            },
+            "equation": [
+                {
+                    "function": np.random.choice(OscillatingDimension.OSC_FN),
+                    "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
+                    "frequency": round(np.random.uniform(low=0.1, high=5.0), 4),
+                } for _ in range(np.random.randint(1, 10))
+            ],
             "noise": {
                 "mean": 0.0,
                 "std": 0.1
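Note: with the config_maker/dimensions/oscillating.py change above, the "equation" field of an OSC dimension is now a list of one to nine term dicts instead of a single dict. A minimal sketch of one possible generated entry (all values are invented, and "sin"/"cos" assume the contents of OSC_FN):

    # Illustrative only: one possible OSC dimension config after this change.
    example_osc_dimension = {
        "kind": "OSC",
        "equation": [
            {"function": "sin", "amplitude": 2.5, "frequency": 0.8},
            {"function": "cos", "amplitude": 0.7, "frequency": 3.2},
        ],
        "noise": {"mean": 0.0, "std": 0.1},
    }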
diff --git a/config_maker/dimensions/varying_frequency.py b/config_maker/dimensions/varying_frequency.py
index 0835397a054ba5ab74348980dd03f3188e19371f..c4ff6541d2c9788c28c8ec19a5f81dcf8ba5b2ef 100644
--- a/config_maker/dimensions/varying_frequency.py
+++ b/config_maker/dimensions/varying_frequency.py
@@ -31,13 +31,13 @@ class VFOscillatingDimension(BaseDimension):

         return {
             "kind": "VFOSC",
-            "equation": {
+            "equation": [{
                 "function": np.random.choice(VFOscillatingDimension.OSC_FN),
                 "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
                 "frequency_min": round(f_min, 4),
                 "frequency_max": round(f_max, 4),
                 "period": int(np.random.uniform(low=50, high=300))
-            },
+            }],
             "noise": {
                 "mean": 0.0,
                 "std": 0.1
diff --git a/config_maker/generator.py b/config_maker/generator.py
index 9b71fb04b01cbc39414dffa94b667b312a5f696f..4e0da23976fdea0b528f387aea957a385f41940e 100644
--- a/config_maker/generator.py
+++ b/config_maker/generator.py
@@ -95,8 +95,9 @@ class ConfigGenerator:
             length_per_dim = (self.__softmax(length_per_dim) * anomalies_per_sub[idx]).astype(int)

             for i in range(nb_dim_erroneous):
-                dim_erroneous = np.random.choice(sub)
-                kind = np.random.choice(ANOMALIES[self.config["dimensions"][dim_erroneous]["kind"]])
-                self.config["dimensions"][dim_erroneous]["anomalies"].append(kind(length_per_dim[i]).generate())
+                if length_per_dim[i] > 0:
+                    dim_erroneous = np.random.choice(sub)
+                    kind = np.random.choice(ANOMALIES[self.config["dimensions"][dim_erroneous]["kind"]])
+                    self.config["dimensions"][dim_erroneous]["anomalies"].append(kind(length_per_dim[i]).generate())

         return self
diff --git a/generator/anomaly/__init__.py b/generator/anomaly/__init__.py
index 7af34faf719356540832ef8154fb5ea135dc7a17..68b5f3252e1c21ac6209cd84c9988514f87783f9 100644
--- a/generator/anomaly/__init__.py
+++ b/generator/anomaly/__init__.py
@@ -3,6 +3,7 @@ This module is in charge of the generation of anomalies.
 """
 from .noise import NoiseAnomaly
 from .correlation import CorrelationAnomaly
+from .frequency import FrequencyAnomaly
 from .value import ValueAnomaly
 from .varying_frequency import VFAnomaly

@@ -11,8 +12,9 @@ ANOMALY_CLASSES = {
     "CORRELATION": CorrelationAnomaly,
     "VALUE": ValueAnomaly,
     "VF": VFAnomaly,
+    "FREQUENCY": FrequencyAnomaly,
 }

 PREPROCESS_ANOMALIES = []
 PROCESS_ANOMALIES = ["VF",]
-POSTPROCESS_ANOMALIES = ["NOISE", "CORRELATION", "VALUE"]
+POSTPROCESS_ANOMALIES = ["NOISE", "CORRELATION", "VALUE", "FREQUENCY"]
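Note: the registry above wires each anomaly kind to its class and to a processing phase; "FREQUENCY" is registered as a post-process anomaly, i.e. it is applied after the base signal has been generated. A hypothetical driver loop (not repo code) illustrating how the two structures compose:

    # Hypothetical consumer of the registry: select the classes to apply
    # after signal generation based on each spec's "kind".
    from generator.anomaly import ANOMALY_CLASSES, POSTPROCESS_ANOMALIES

    def postprocess_classes(anomaly_specs):
        """anomaly_specs: list of dicts that carry at least a 'kind' key."""
        return [
            ANOMALY_CLASSES[spec["kind"]]  # e.g. FrequencyAnomaly for "FREQUENCY"
            for spec in anomaly_specs
            if spec["kind"] in POSTPROCESS_ANOMALIES
        ]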
""" - def inject(self) -> np.array: + def inject(self) -> Tuple[np.array, np.array]: start, end = self.find_anomaly_index() noise = np.random.normal(self.params["mean"], self.params["std"], self.length) diff --git a/generator/dataset.py b/generator/dataset.py index 06e6a93ed35cc530767de409797b4ffdbd06ab5b..d38cf0424edb84f775c5e3a35f89a68c1d0db4fe 100644 --- a/generator/dataset.py +++ b/generator/dataset.py @@ -4,6 +4,7 @@ This module provides the main class needed to generate time series from typing import List, Tuple import numpy as np +import matplotlib.pyplot as plt from generator.dimension import DIMENSION_CLASSES diff --git a/generator/dimension/linear_correlation.py b/generator/dimension/linear_correlation.py index 75354c539e498a04824c1f45fd2c42f20ef369d1..c581a4378e4803f72f86193865f8eb5178156ca9 100644 --- a/generator/dimension/linear_correlation.py +++ b/generator/dimension/linear_correlation.py @@ -30,4 +30,4 @@ class LinearCorrelationDimension(BaseDimension): steps = np.insert(self.terms["sign"] * np.sign(np.diff(corr_dim)) * self.terms["step"], 0, 0) self.train_data[self.idx, lag:] = np.cumsum(steps)[:-lag] if lag > 0 else np.cumsum(steps) - return self.data, self.train_data \ No newline at end of file + return self.data, self.train_data diff --git a/generator/dimension/oscillating.py b/generator/dimension/oscillating.py index 8af6fb80606201ed814d92c8919867b1f224f8e1..8193107f2d664736df1adc1ff64e0c640c818639 100644 --- a/generator/dimension/oscillating.py +++ b/generator/dimension/oscillating.py @@ -4,6 +4,7 @@ This module defines different dimension generator. from typing import Tuple import numpy as np +import matplotlib.pyplot as plt from .base import BaseDimension @@ -27,17 +28,29 @@ class OscillatingDimension(BaseDimension): def generate(self) -> Tuple[np.array, np.array]: # Prepare computing variables - dt = .01 - func = self.OSCILLATING_FUNCTIONS[self.terms["function"]] - - # Generate testing data - steps = np.zeros(self.length) - steps += 2 * np.pi * self.terms["frequency"] * dt - self.data[self.idx] = self.terms["amplitude"] * func(np.cumsum(steps) + np.random.normal(0.0, 2.0)) - - # Generate training data - steps = np.zeros(self.train_length) - steps += 2 * np.pi * self.terms["frequency"] * dt - self.train_data[self.idx] = self.terms["amplitude"] * func(np.cumsum(steps)) + dt = .001 + + res = np.zeros(self.length) + res_train = np.zeros(self.train_length) + + for term in self.terms: + func = self.OSCILLATING_FUNCTIONS[term["function"]] + + # Generate testing data + signal = self.data[self.idx] + signal += 2 * np.pi * term["frequency"] * dt + signal = func(np.cumsum(signal)) + signal *= term["amplitude"] + res += signal + + # Generate training data + signal = self.train_data[self.idx] + signal += 2 * np.pi * term["frequency"] * dt + signal = func(np.cumsum(signal)) + signal *= term["amplitude"] + res_train += signal + + self.data[self.idx] = res + self.train_data[self.idx] = res_train return self.data, self.train_data diff --git a/generator/dimension/varying_frequency.py b/generator/dimension/varying_frequency.py index 3dff8867759e9628746be73ab323ffd239ea9dec..f816d3f48e7556a8e7d860a4593316a07db5be22 100644 --- a/generator/dimension/varying_frequency.py +++ b/generator/dimension/varying_frequency.py @@ -29,6 +29,7 @@ class VFOscillatingDimension(BaseDimension): def generate(self) -> Tuple[np.array, np.array]: # Prepare computing variables dt = .01 + self.terms = self.terms[0] func = self.OSCILLATING_FUNCTIONS[self.terms["function"]] # Prepare frequencies diff --git 
diff --git a/main.py b/main.py
index 0788ca8453f60bd4d2476f0d65d70755beb227b6..3b405dc0be22846acd452d978c33d771c950a745 100755
--- a/main.py
+++ b/main.py
@@ -40,6 +40,12 @@ if __name__ == "__main__":
         nargs="+"
     )
     parser.add_argument("-a", "--algorithms", help="Which algorithm to train.", default=["kmeans"], nargs="+")
+    parser.add_argument(
+        "-m", "--cluster-method",
+        help="Which clustering algorithm to use.",
+        default="HDBSCAN",
+        dest="method"
+    )
     parser.add_argument("-i", "--input", help="Input directory. Only to be used when no data will be generated")
     parser.add_argument("-o", "--output", help="Output directory")
     parser.add_argument(
@@ -144,7 +150,7 @@ if __name__ == "__main__":

     if args.split and args.task in ["train", "all"]:
         splitter = BaseSplitter(f"{INPUT_DIR}/{config_name}")
-        splitter.split_data()
+        splitter.split_data(method=args.method)

     # =================================================================================================================
     # Train algorithm
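Note: the new -m/--cluster-method flag is stored on args.method and forwarded to BaseSplitter.split_data; per split/base.py below, the accepted values are "HDBSCAN" (the default) and "kmeans", and anything else raises a ValueError. A programmatic usage sketch (the input directory is hypothetical):

    # Equivalent of running main.py with `-m kmeans` and splitting enabled.
    from split.base import BaseSplitter

    splitter = BaseSplitter("input/my_dataset")   # hypothetical path
    splitter.split_data(method="kmeans")          # or "HDBSCAN", the default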
diff --git a/split/base.py b/split/base.py
index b39d797c584aa2b04777cdf2c6d5e5cc1bbab092..2e14789b7cae160d4b589988a1537731c815ac9a 100644
--- a/split/base.py
+++ b/split/base.py
@@ -3,6 +3,7 @@ This module contains the basic splitter algorithm based on the correlation coeff
 clustering.
 """
 import json
+import os

 import matplotlib.pyplot as plt
 import numpy as np
@@ -27,8 +28,10 @@
     """
     def __init__(self, path):
         self.data_path = path
+        self.output_path = f"{path}/splitting"
+        os.makedirs(f"{path}/splitting", exist_ok=True)

-    def split_data(self):
+    def split_data(self, method="HDBSCAN"):
         """
         This method will be in charge of splitting data into subsystems.
         """
@@ -38,9 +41,9 @@
         labels_df = pd.read_csv(f"{self.data_path}/dataset_variables_labels.csv")

         # Remove the categorical variables from the correlation computation
-        cat_columns = df.select_dtypes(include="int64").columns
-        w_df.drop(columns=cat_columns, inplace=True)
+        w_df.drop(columns=df.select_dtypes(include="int64").columns, inplace=True)
         w_df.drop(columns=["Timestamp", "is_anomaly"], errors="ignore", inplace=True)
+        w_df.drop(columns=w_df.columns[w_df.nunique() <= 1], inplace=True)

         # Compute all the correlations for the dataset and find the coefficient that produces the best
         # clusters from its coefficient
@@ -49,17 +52,16 @@
         x = self._compute_correlations(w_df)

         for i in range(1, len(w_df.columns)):
-            # km = KMeans(n_clusters=i)
-            # clusters = km.fit_predict(x)
-            hdb_scan = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1)
-            clusters = hdb_scan.fit_predict(x)
-
-            if len(np.unique(clusters)) == 1:
-                score = 0.5
-            elif i > 1:
-                score = silhouette_score(x, clusters)
+            if method == "HDBSCAN":
+                model = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1)
+            elif method == "kmeans":
+                model = KMeans(n_clusters=i, n_init="auto")
             else:
-                score = 0.01
+                raise ValueError("Please select either kmeans or HDBSCAN as a clustering method.")
+
+            # Compute clusters and their silhouette score
+            clusters = model.fit_predict(x)
+            score = silhouette_score(x, clusters) if i > 1 else 0.5

             # Keep the best coeff and number of clusters in mind
             if score > max_silhouette and (score > 0.5 or i == 1):
                 best_clusters = clusters

@@ -67,12 +69,12 @@
         # Save chosen clustering
-        with open(f"{self.data_path}/dataset_clusters.txt", "w", encoding="utf-8") as f:
+        with open(f"{self.output_path}/dataset_clusters.txt", "w", encoding="utf-8") as f:
             f.write(json.dumps(best_clusters.tolist()))

         # Split the dataset
         for i in range(-1 if -1 in clusters else 0, max(best_clusters)+1):
-            drop = [str(col) for idx, col in enumerate(w_df.columns) if best_clusters[idx] != i]
+            drop = [col for idx, col in enumerate(w_df.columns) if best_clusters[idx] != i]
             df.drop(columns=drop)\
                 .to_csv(f"{self.data_path}/dataset_{i}_auto_split.csv", index_label="Timestamp")
             train_df.drop(columns=drop)\
@@ -84,7 +86,7 @@
         """
         Compute the vector of correlation coefficients for each variable of the dataset.
         """
-        with open(f"{self.data_path}/split_time.csv", "a", encoding="utf-8") as f:
+        with open(f"{self.output_path}/split_time.csv", "a", encoding="utf-8") as f:
             f.write("Algorithm,duration\n")
         x = None
         for coeff in CORRELATION_CLASSES:
@@ -95,10 +97,10 @@
             duration = time() - start_time
             f.write(f"{coeff_name},{duration}\n")

-            correlation_matrix.to_csv(f"{self.data_path}/dataset_correlation_{coeff_name}.csv")
+            correlation_matrix.to_csv(f"{self.output_path}/dataset_correlation_{coeff_name}.csv")
             sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")\
                 .get_figure()\
-                .savefig(f"{self.data_path}/dataset_correlation_matrix_{coeff_name}.png")
+                .savefig(f"{self.output_path}/dataset_correlation_matrix_{coeff_name}.png")
             plt.clf()

         if x is not None:
diff --git a/trainers/base.py b/trainers/base.py
index eacfaaa9ccbc4abb756c5167fb741d15ba2bbe2f..b52bf055d0fbe275811420db61a28018002a32e0 100644
--- a/trainers/base.py
+++ b/trainers/base.py
@@ -27,11 +27,14 @@ class BaseTrainer():
         self.params = kwargs["const"]
         self.optim_params = kwargs["optimize"]
         self.default_params = kwargs["default"]
-        self.log_path = f"{'/'.join(data_path.split('/')[:-1])}/{self.__class__.__name__}_logs"
         self.algorithm = algorithm
         self.train = train
         self.pwd = ""

+        # Prepare logging files and dirs
+        self.log_path = f"{data_path}/logs/{algorithm}_logs.txt"
+        os.makedirs(f"{data_path}/logs", exist_ok=True)
+
     def start(self, optim=False):
         """
         This method orchestrates the optimization, training and computing of the results for the
@@ -167,7 +170,7 @@
         """
         start_time = time()
         cmd = f"python3 {self.pwd}/algorithms/{self.algorithm}/algorithm.py '{json.dumps(model_args)}'"
-        cmd += f" 1>{self.log_path} 2>{self.log_path}"
+        cmd += f" 1>>{self.log_path} 2>>{self.log_path}"
         os.system(cmd)

         duration = time() - start_time
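Note: the redirection change in trainers/base.py matters because `>` truncates the log file on every subprocess launch, so only the last run's output survived, while `>>` appends. A minimal demonstration (POSIX shell via os.system; the paths are temporary):

    import os
    import tempfile

    log = os.path.join(tempfile.mkdtemp(), "demo_logs.txt")
    os.system(f"echo first 1>>{log} 2>>{log}")   # appends instead of truncating
    os.system(f"echo second 1>>{log} 2>>{log}")
    with open(log, encoding="utf-8") as f:
        print(f.read())   # "first\nsecond\n": earlier output survives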