Skip to content
Snippets Groups Projects
Commit 43a3caa3 authored by Pierre LOTTE's avatar Pierre LOTTE
Browse files

Add option to merge (or keep separate) the correlation matrices during splitting

parent a60e4fc7
Branches
No related tags found
No related merge requests found
...@@ -63,6 +63,11 @@ if __name__ == "__main__": ...@@ -63,6 +63,11 @@ if __name__ == "__main__":
help="Use Docker containers directly to run the algorithm. Allow to run algorithms without cloning repo", help="Use Docker containers directly to run the algorithm. Allow to run algorithms without cloning repo",
action="store_true" action="store_true"
) )
parser.add_argument(
"--merge",
help="Whether the method should merge the correlation matrices or not.",
action="store_true"
)
# Load args # Load args
args = parser.parse_args() args = parser.parse_args()
...@@ -157,7 +162,7 @@ if __name__ == "__main__": ...@@ -157,7 +162,7 @@ if __name__ == "__main__":
if args.split and args.task in ["train", "all"]: if args.split and args.task in ["train", "all"]:
splitter = BaseSplitter(f"{INPUT_DIR}/{config_name}") splitter = BaseSplitter(f"{INPUT_DIR}/{config_name}")
splitter.split_data(method=args.method) splitter.split_data(method=args.method, merge=args.merge)
# ================================================================================================================= # =================================================================================================================
# Train algorithm # Train algorithm
......
...@@ -31,7 +31,7 @@ class BaseSplitter: ...@@ -31,7 +31,7 @@ class BaseSplitter:
self.output_path = f"{path}/splitting" self.output_path = f"{path}/splitting"
os.makedirs(f"{path}/splitting", exist_ok=True) os.makedirs(f"{path}/splitting", exist_ok=True)
def split_data(self, method="HDBSCAN"): def split_data(self, method="HDBSCAN", merge=False):
""" """
This method will be in charge of splitting data into subsystems. This method will be in charge of splitting data into subsystems.
""" """
...@@ -49,7 +49,7 @@ class BaseSplitter: ...@@ -49,7 +49,7 @@ class BaseSplitter:
# clusters from its coefficient # clusters from its coefficient
max_silhouette = 0 max_silhouette = 0
best_clusters = None best_clusters = None
x = self._compute_correlations(w_df) x = self._compute_correlations(w_df, merge=merge)
if "HDBSCAN" == method: if "HDBSCAN" == method:
model = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1) model = HDBSCAN(min_cluster_size=2, allow_single_cluster=True, n_jobs=-1)
...@@ -83,7 +83,7 @@ class BaseSplitter: ...@@ -83,7 +83,7 @@ class BaseSplitter:
labels = np.bitwise_or.reduce(labels_df.drop(columns=drop).to_numpy(), axis=1, dtype=np.int32) labels = np.bitwise_or.reduce(labels_df.drop(columns=drop).to_numpy(), axis=1, dtype=np.int32)
pd.DataFrame(labels).to_csv(f"{self.data_path}/dataset_{i}_auto_split_labels.csv", index=False) pd.DataFrame(labels).to_csv(f"{self.data_path}/dataset_{i}_auto_split_labels.csv", index=False)
def _compute_correlations(self, data): def _compute_correlations(self, data, merge=False):
""" """
Compute the vector of correlation coefficients for each of the variable of the dataset. Compute the vector of correlation coefficients for each of the variable of the dataset.
""" """
...@@ -106,11 +106,14 @@ class BaseSplitter: ...@@ -106,11 +106,14 @@ class BaseSplitter:
x.append(np.abs(correlation_matrix)) x.append(np.abs(correlation_matrix))
x = np.array(x) if merge:
x = np.mean(x, axis=0) x = np.array(x)
x = np.mean(x, axis=0)
sns.heatmap(x, annot=True, cmap="coolwarm")\ sns.heatmap(x, annot=True, cmap="coolwarm")\
.get_figure()\ .get_figure()\
.savefig(f"{self.output_path}/dataset_final_matrix.png") .savefig(f"{self.output_path}/dataset_final_matrix.png")
else:
x = np.concatenate(x, axis=1)
return x return x
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment