Commit f75648e3 authored by Pierre LOTTE

Add frequency anomalies and a few fixes

parent bc8c3a91
@@ -32,7 +32,7 @@
         "prediction_window_size": [10, 200]
     },
     "const": {
-        "epochs": 500,
+        "epochs": 100,
         "learning_rate": 0.001
     },
     "default": {
@@ -49,15 +49,14 @@
         "prediction_window_size": [10, 100]
     },
     "const": {
-        "epochs": 500,
+        "epochs": 100,
         "learning_rate": 0.001
     },
     "default": {
-        "lstm_layers": 1,
+        "lstm_layers": 2,
         "window_size": 200,
         "prediction_window_size": 50
     }
 },
 "health_esn": {
     "training": true,
......
@@ -5,10 +5,11 @@ This module handles anomaly creation
 from .noise import NoiseAnomaly
 from .corr import CorrelationAnomaly
 from .varying_frequency import VFAnomaly
+from .frequency import FrequencyAnomaly
 
 ANOMALIES = {
     "CORRELATION": [NoiseAnomaly, CorrelationAnomaly],
     "EXP_CORRELATION": [NoiseAnomaly, CorrelationAnomaly],
-    "VFOSC": [NoiseAnomaly, VFAnomaly],
-    "OSC": [NoiseAnomaly],
+    "VFOSC": [NoiseAnomaly, VFAnomaly, FrequencyAnomaly],
+    "OSC": [NoiseAnomaly, FrequencyAnomaly],
 }
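Note: the new config-side FrequencyAnomaly referenced above is not shown in this excerpt. As a rough sketch only, it could mirror the other config generators in this module (a constructor taking the anomaly length and a generate() returning a config dict); the base class path, the "factor" parameter, and its range are illustrative assumptions, not taken from the repository:

    import numpy as np
    from typing import Dict

    from .base import BaseAnomaly  # assumed base class path

    class FrequencyAnomaly(BaseAnomaly):
        """Sketch of a config generator for a frequency-shift anomaly."""
        def generate(self) -> Dict:
            return {
                "kind": "FREQUENCY",
                "length": self.length,  # window to distort, set by the constructor
                # Assumed parameter: how much to speed up / slow down the oscillation
                "factor": round(np.random.uniform(low=0.5, high=3.0), 4),
            }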
@@ -12,12 +12,16 @@ class LinearCorrelationDimension(BaseDimension):
     def generate(self, idx:int) -> Dict:
         dimension = np.random.choice(range(idx))
+        # Choose lag
+        lag = np.random.randint(low=0, high=20)
         return {
             "kind": "CORRELATION",
             "dimension": int(dimension),
             "equation": {
                 "sign": int(np.random.choice([-1, 1])),
-                "step": round(np.random.uniform(low=0.01, high=0.1), 4)
+                "step": round(np.random.uniform(low=0.01, high=0.1), 4),
+                "lag": lag
             },
             "noise": {
                 "mean": 0.0,
@@ -36,7 +40,7 @@ class ExpCorrelationDimension(BaseDimension):
         start, end = limits
 
         # Choose lag
-        lag = np.random.randint(low=0, high=100)
+        lag = np.random.randint(low=0, high=20)
         return {
             "kind": "EXP_CORRELATION",
......
@@ -24,11 +24,13 @@ class OscillatingDimension(BaseDimension):
     def generate(self, idx:int) -> Dict:
         return {
             "kind": "OSC",
-            "equation": {
-                "function": np.random.choice(OscillatingDimension.OSC_FN),
-                "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
-                "frequency": round(np.random.uniform(low=0.1, high=5.0), 4),
-            },
+            "equation": [
+                {
+                    "function": np.random.choice(OscillatingDimension.OSC_FN),
+                    "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
+                    "frequency": round(np.random.uniform(low=0.1, high=5.0), 4),
+                } for _ in range(np.random.randint(1, 10))
+            ],
             "noise": {
                 "mean": 0.0,
                 "std": 0.1
......
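For reference, a generated "OSC" entry now carries a list of one to nine oscillator terms that are summed at generation time. An example of what generate() might emit (the drawn values are illustrative, and "sin"/"cos" assume those are keys of OSCILLATING_FUNCTIONS):

    {
        "kind": "OSC",
        "equation": [
            {"function": "sin", "amplitude": 2.3817, "frequency": 0.74},
            {"function": "cos", "amplitude": 0.9125, "frequency": 3.2051},
        ],
        "noise": {"mean": 0.0, "std": 0.1},
    }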
@@ -31,13 +31,13 @@ class VFOscillatingDimension(BaseDimension):
         return {
             "kind": "VFOSC",
-            "equation": {
+            "equation": [{
                 "function": np.random.choice(VFOscillatingDimension.OSC_FN),
                 "amplitude": round(np.random.uniform(low=0.1, high=5.0), 4),
                 "frequency_min": round(f_min, 4),
                 "frequency_max": round(f_max, 4),
                 "period": int(np.random.uniform(low=50, high=300))
-            },
+            }],
             "noise": {
                 "mean": 0.0,
                 "std": 0.1
......
@@ -95,8 +95,9 @@ class ConfigGenerator:
             length_per_dim = (self.__softmax(length_per_dim) * anomalies_per_sub[idx]).astype(int)
             for i in range(nb_dim_erroneous):
-                dim_erroneous = np.random.choice(sub)
-                kind = np.random.choice(ANOMALIES[self.config["dimensions"][dim_erroneous]["kind"]])
-                self.config["dimensions"][dim_erroneous]["anomalies"].append(kind(length_per_dim[i]).generate())
+                if length_per_dim[i] > 0:
+                    dim_erroneous = np.random.choice(sub)
+                    kind = np.random.choice(ANOMALIES[self.config["dimensions"][dim_erroneous]["kind"]])
+                    self.config["dimensions"][dim_erroneous]["anomalies"].append(kind(length_per_dim[i]).generate())
         return self
@@ -3,6 +3,7 @@ This module is in charge of the generation of anomalies.
 """
 from .noise import NoiseAnomaly
 from .correlation import CorrelationAnomaly
+from .frequency import FrequencyAnomaly
 from .value import ValueAnomaly
 from .varying_frequency import VFAnomaly
@@ -11,8 +12,9 @@ ANOMALY_CLASSES = {
     "CORRELATION": CorrelationAnomaly,
     "VALUE": ValueAnomaly,
     "VF": VFAnomaly,
+    "FREQUENCY": FrequencyAnomaly,
 }
 
 PREPROCESS_ANOMALIES = []
 PROCESS_ANOMALIES = ["VF",]
-POSTPROCESS_ANOMALIES = ["NOISE", "CORRELATION", "VALUE"]
+POSTPROCESS_ANOMALIES = ["NOISE", "CORRELATION", "VALUE", "FREQUENCY"]
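These three lists suggest a phased injection pipeline: anomalies applied before the series is synthesized, while it is synthesized (varying frequency), and on the finished values, with FREQUENCY now handled in the post-processing phase. A sketch of how a driver might consume them; the driver function, the config iteration, and the constructor call are all illustrative assumptions, not code from the repository:

    def apply_anomalies(configured_anomalies):
        """Illustrative driver: apply configured anomalies phase by phase."""
        for phase in (PREPROCESS_ANOMALIES, PROCESS_ANOMALIES, POSTPROCESS_ANOMALIES):
            for cfg in configured_anomalies:
                if cfg["kind"] in phase:
                    ANOMALY_CLASSES[cfg["kind"]](cfg).inject()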
"""
This module descibes the noise anomalies.
"""
from typing import Tuple
import numpy as np
from .base import BaseAnomaly
......@@ -9,7 +11,7 @@ class NoiseAnomaly(BaseAnomaly):
"""
This class is in charge of adding noise to a given time series.
"""
def inject(self) -> np.array:
def inject(self) -> Tuple[np.array, np.array]:
start, end = self.find_anomaly_index()
noise = np.random.normal(self.params["mean"], self.params["std"], self.length)
......
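The remainder of inject() is elided above. A minimal sketch of how it might continue so that the body matches the new Tuple return type, assuming the series lives in self.data and the anomaly mask in self.labels (both attribute names are assumptions):

        # Illustrative completion of the method shown above
        self.data[start:end] += noise[: end - start]  # perturb the anomalous window
        self.labels[start:end] = 1                    # flag the window as anomalous
        return self.data, self.labels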
@@ -4,6 +4,7 @@ This module provides the main class needed to generate time series
 from typing import List, Tuple
 
 import numpy as np
+import matplotlib.pyplot as plt
 
 from generator.dimension import DIMENSION_CLASSES
......
@@ -30,4 +30,4 @@ class LinearCorrelationDimension(BaseDimension):
         steps = np.insert(self.terms["sign"] * np.sign(np.diff(corr_dim)) * self.terms["step"], 0, 0)
         self.train_data[self.idx, lag:] = np.cumsum(steps)[:-lag] if lag > 0 else np.cumsum(steps)
-        return self.data, self.train_data
\ No newline at end of file
+        return self.data, self.train_data
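For intuition on the lag applied above: the correlated dimension now reproduces the cumulative step signal shifted right by lag samples, leaving the first lag samples at their initial value. A tiny self-contained illustration:

    import numpy as np

    steps = np.array([0.0, 0.05, -0.05, 0.05, 0.05])
    lag = 2
    out = np.zeros_like(steps)
    out[lag:] = np.cumsum(steps)[:-lag]  # shift the correlated signal by `lag`
    print(out)  # [0.   0.   0.   0.05 0.  ]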
@@ -4,6 +4,7 @@ This module defines different dimension generators.
 from typing import Tuple
 
 import numpy as np
+import matplotlib.pyplot as plt
 
 from .base import BaseDimension
@@ -27,17 +28,29 @@ class OscillatingDimension(BaseDimension):
     def generate(self) -> Tuple[np.array, np.array]:
         # Prepare computing variables
-        dt = .01
-        func = self.OSCILLATING_FUNCTIONS[self.terms["function"]]
-
-        # Generate testing data
-        steps = np.zeros(self.length)
-        steps += 2 * np.pi * self.terms["frequency"] * dt
-        self.data[self.idx] = self.terms["amplitude"] * func(np.cumsum(steps) + np.random.normal(0.0, 2.0))
-
-        # Generate training data
-        steps = np.zeros(self.train_length)
-        steps += 2 * np.pi * self.terms["frequency"] * dt
-        self.train_data[self.idx] = self.terms["amplitude"] * func(np.cumsum(steps))
+        dt = .001
+        res = np.zeros(self.length)
+        res_train = np.zeros(self.train_length)
+        for term in self.terms:
+            func = self.OSCILLATING_FUNCTIONS[term["function"]]
+            # Generate testing data; build each term from a fresh buffer of phase
+            # increments (a view of self.data would be mutated in place by +=, leaking
+            # one term's increments into the next)
+            signal = np.zeros(self.length)
+            signal += 2 * np.pi * term["frequency"] * dt
+            signal = func(np.cumsum(signal))
+            signal *= term["amplitude"]
+            res += signal
+            # Generate training data
+            signal = np.zeros(self.train_length)
+            signal += 2 * np.pi * term["frequency"] * dt
+            signal = func(np.cumsum(signal))
+            signal *= term["amplitude"]
+            res_train += signal
+        self.data[self.idx] = res
+        self.train_data[self.idx] = res_train
         return self.data, self.train_data
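A standalone illustration of the summed-oscillator loop above, handy for sanity-checking the superposition (function names and values are illustrative):

    import numpy as np

    dt = .001
    length = 1000
    fns = {"sin": np.sin, "cos": np.cos}
    terms = [
        {"function": "sin", "amplitude": 1.5, "frequency": 0.8},
        {"function": "cos", "amplitude": 0.5, "frequency": 3.0},
    ]

    res = np.zeros(length)
    for term in terms:
        # Constant phase increment per sample, integrated into a phase ramp
        phase = np.cumsum(np.full(length, 2 * np.pi * term["frequency"] * dt))
        res += term["amplitude"] * fns[term["function"]](phase)
    # res now holds the superposition of both oscillator terms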
@@ -29,6 +29,7 @@ class VFOscillatingDimension(BaseDimension):
     def generate(self) -> Tuple[np.array, np.array]:
         # Prepare computing variables
         dt = .01
+        self.terms = self.terms[0]  # the config now stores a list of terms; VF dimensions use only the first
         func = self.OSCILLATING_FUNCTIONS[self.terms["function"]]
         # Prepare frequencies
......
@@ -40,6 +40,12 @@ if __name__ == "__main__":
         nargs="+"
     )
     parser.add_argument("-a", "--algorithms", help="Which algorithm to train.", default=["kmeans"], nargs="+")
+    parser.add_argument(
+        "-m", "--cluster-method",
+        help="Which clustering algorithm to use.",
+        default="HDBSCAN",
+        dest="method"
+    )
     parser.add_argument("-i", "--input", help="Input directory. Only to be used when no data will be generated")
     parser.add_argument("-o", "--output", help="Output directory")
     parser.add_argument(
@@ -144,7 +150,7 @@ if __name__ == "__main__":
     if args.split and args.task in ["train", "all"]:
         splitter = BaseSplitter(f"{INPUT_DIR}/{config_name}")
-        splitter.split_data()
+        splitter.split_data(method=args.method)
 
 # =================================================================================================================
 # Train algorithm
......
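Example invocation with the new flag, assuming the entry script in this diff is main.py (its actual name is not visible in the excerpt); the -a, -m, -i and -o flags are the ones defined above:

    python3 main.py -a health_esn -m kmeans -i data/run1 -o results/run1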
@@ -3,6 +3,7 @@ This module contains the basic splitter algorithm based on the correlation coeff
 clustering.
 """
 import json
+import os
 
 import matplotlib.pyplot as plt
 import numpy as np
@@ -27,8 +28,10 @@ class BaseSplitter:
     """
     def __init__(self, path):
         self.data_path = path
+        self.output_path = f"{path}/splitting"
+        os.makedirs(f"{path}/splitting", exist_ok=True)
 
-    def split_data(self):
+    def split_data(self, method="HDBSCAN"):
         """
         This method will be in charge of splitting data into subsystems.
         """
@@ -38,9 +41,9 @@ class BaseSplitter:
         labels_df = pd.read_csv(f"{self.data_path}/dataset_variables_labels.csv")
 
         # Remove the categorical variables from the correlation computation
-        cat_columns = df.select_dtypes(include="int64").columns
-        w_df.drop(columns=cat_columns, inplace=True)
+        w_df.drop(columns=df.select_dtypes(include="int64").columns, inplace=True)
         w_df.drop(columns=["Timestamp", "is_anomaly"], errors="ignore", inplace=True)
+        w_df.drop(columns=w_df.columns[w_df.nunique() <= 1], inplace=True)
 
         # Compute all the correlations for the dataset and find the coefficient that produces the best
         # clusters from its coefficient
@@ -49,17 +52,16 @@ class BaseSplitter:
         x = self._compute_correlations(w_df)
         for i in range(1, len(w_df.columns)):
-            # km = KMeans(n_clusters=i)
-            # clusters = km.fit_predict(x)
-            hdb_scan = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1)
-            clusters = hdb_scan.fit_predict(x)
-            if len(np.unique(clusters)) == 1:
-                score = 0.5
-            elif i > 1:
-                score = silhouette_score(x, clusters)
+            if "HDBSCAN" == method:
+                model = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1)
+            elif "kmeans" == method:
+                model = KMeans(n_clusters=i, n_init="auto")
             else:
-                score = 0.01
+                raise ValueError("Please select either kmeans or HDBSCAN as a clustering method.")
+
+            # Compute clusters and their silhouette score
+            clusters = model.fit_predict(x)
+            score = silhouette_score(x, clusters) if i > 1 else 0.5
 
             # Keep the best coeff and number of clusters in mind
             if score > max_silhouette and (score > 0.5 or i == 1):
@@ -67,12 +69,12 @@ class BaseSplitter:
                 best_clusters = clusters
 
         # Save chosen clustering
-        with open(f"{self.data_path}/dataset_clusters.txt", "w", encoding="utf-8") as f:
+        with open(f"{self.output_path}/dataset_clusters.txt", "w", encoding="utf-8") as f:
             f.write(json.dumps(best_clusters.tolist()))
 
         # Split the dataset
         for i in range(-1 if -1 in clusters else 0, max(best_clusters)+1):
-            drop = [str(col) for idx, col in enumerate(w_df.columns) if best_clusters[idx] != i]
+            drop = [col for idx, col in enumerate(w_df.columns) if best_clusters[idx] != i]
             df.drop(columns=drop)\
                 .to_csv(f"{self.data_path}/dataset_{i}_auto_split.csv", index_label="Timestamp")
             train_df.drop(columns=drop)\
@@ -84,7 +86,7 @@ class BaseSplitter:
         """
         Compute the vector of correlation coefficients for each variable of the dataset.
         """
-        with open(f"{self.data_path}/split_time.csv", "a", encoding="utf-8") as f:
+        with open(f"{self.output_path}/split_time.csv", "a", encoding="utf-8") as f:
             f.write("Algorithm,duration\n")
             x = None
             for coeff in CORRELATION_CLASSES:
@@ -95,10 +97,10 @@
                 duration = time() - start_time
                 f.write(f"{coeff_name},{duration}\n")
 
-                correlation_matrix.to_csv(f"{self.data_path}/dataset_correlation_{coeff_name}.csv")
+                correlation_matrix.to_csv(f"{self.output_path}/dataset_correlation_{coeff_name}.csv")
                 sns.heatmap(correlation_matrix, annot=True, cmap="coolwarm")\
                     .get_figure()\
-                    .savefig(f"{self.data_path}/dataset_correlation_matrix_{coeff_name}.png")
+                    .savefig(f"{self.output_path}/dataset_correlation_matrix_{coeff_name}.png")
                 plt.clf()
 
                 if x is not None:
......
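A short usage sketch of the updated splitter; the import path is an assumption, and the directory layout follows the reads and writes above:

    from splitter.base import BaseSplitter  # assumed module path

    splitter = BaseSplitter("data/my_config")  # also creates data/my_config/splitting/
    splitter.split_data(method="kmeans")       # or the default, method="HDBSCAN"
    # Cluster assignments go to data/my_config/splitting/dataset_clusters.txt;
    # the per-cluster CSV splits are written next to the original dataset.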
@@ -27,11 +27,14 @@ class BaseTrainer():
         self.params = kwargs["const"]
         self.optim_params = kwargs["optimize"]
         self.default_params = kwargs["default"]
-        self.log_path = f"{'/'.join(data_path.split('/')[:-1])}/{self.__class__.__name__}_logs"
         self.algorithm = algorithm
         self.train = train
         self.pwd = ""
 
+        # Prepare logging files and dirs
+        self.log_path = f"{data_path}/logs/{algorithm}_logs.txt"
+        os.makedirs(f"{data_path}/logs", exist_ok=True)
 
     def start(self, optim=False):
         """
         This method orchestrates the optimization, training and computing of the results for the
@@ -167,7 +170,7 @@ class BaseTrainer():
         """
         start_time = time()
         cmd = f"python3 {self.pwd}/algorithms/{self.algorithm}/algorithm.py '{json.dumps(model_args)}'"
-        cmd += f" 1>{self.log_path} 2>{self.log_path}"
+        cmd += f" 1>>{self.log_path} 2>>{self.log_path}"
         os.system(cmd)
         duration = time() - start_time
......
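The switch from 1> to 1>> makes successive algorithm runs append to the shared log instead of truncating it (note that 1>>f 2>>f opens the file twice; the conventional shell idiom is >>f 2>&1). A sketch of an alternative using subprocess instead of os.system, which sidesteps shell redirection entirely; this is an option, not what the repository does:

    import json
    import subprocess

    with open(self.log_path, "a", encoding="utf-8") as log:
        subprocess.run(
            ["python3", f"{self.pwd}/algorithms/{self.algorithm}/algorithm.py", json.dumps(model_args)],
            stdout=log,   # append stdout to the log
            stderr=log,   # interleave stderr with it
            check=False,
        )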