From 897a4516bf31f15776f007289563db1b0176c2fb Mon Sep 17 00:00:00 2001
From: Caroline DE POURTALES <caroline.de-pourtales@irit.fr>
Date: Mon, 5 Dec 2022 15:54:56 +0100
Subject: [PATCH] adding predict

---
 Configuration/config.ini                      |   9 +-
 Linker/Linker.py                              | 136 +++++++--------
 Linker/utils_linker.py                        |   5 +-
 NeuralProofNet/NeuralProofNet.py              |  60 ++-----
 README.md                                     |   8 +-
 SuperTagger/Datasets/processingTXT.py         |  31 ++--
 SuperTagger/README.md                         |   3 +-
 SuperTagger/SuperTagger/SentencesTokenizer.py |  42 +++++
 SuperTagger/SuperTagger/SuperTagger.py        | 160 ++++++++----------
 .../{Utils => }/SymbolTokenizer.py            |  28 ++-
 .../{Utils => }/Tagging_bert_model.py         |  24 +++
 .../SuperTagger/Utils/SentencesTokenizer.py   |  18 --
 SuperTagger/SuperTagger/Utils/helpers.py      |  42 -----
 SuperTagger/SuperTagger/eval.py               |  19 +++
 SuperTagger/__init__.py                       |   1 -
 predict_links.py                              |  12 +-
 predict_supertags.py                          |   4 +-
 train_neuralproofnet.py                       |  19 +--
 train_supertagger.py                          |   7 +-
 utils.py                                      | 112 ++++++++----
 20 files changed, 368 insertions(+), 372 deletions(-)
 create mode 100644 SuperTagger/SuperTagger/SentencesTokenizer.py
 rename SuperTagger/SuperTagger/{Utils => }/SymbolTokenizer.py (71%)
 rename SuperTagger/SuperTagger/{Utils => }/Tagging_bert_model.py (72%)
 delete mode 100644 SuperTagger/SuperTagger/Utils/SentencesTokenizer.py
 delete mode 100644 SuperTagger/SuperTagger/Utils/helpers.py
 create mode 100644 SuperTagger/SuperTagger/eval.py

diff --git a/Configuration/config.ini b/Configuration/config.ini
index 30dfc89..a695829 100644
--- a/Configuration/config.ini
+++ b/Configuration/config.ini
@@ -20,11 +20,4 @@ dim_cat_out = 256
 dim_intermediate_ffn = 128
 dim_pre_sinkhorn_transfo = 32
 dropout = 0.1
-sinkhorn_iters = 5
-
-[MODEL_TRAINING]
-batch_size = 32
-pretrain_linker_epochs = 10
-epoch = 20
-seed_val = 42
-learning_rate = 2e-3
\ No newline at end of file
+sinkhorn_iters = 5
\ No newline at end of file
diff --git a/Linker/Linker.py b/Linker/Linker.py
index 58197bd..e8494c9 100644
--- a/Linker/Linker.py
+++ b/Linker/Linker.py
@@ -1,4 +1,3 @@
-import datetime
 import math
 import os
 import sys
@@ -11,7 +10,6 @@ from torch.nn import Sequential, LayerNorm, Module, Linear, Dropout, Transformer
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import StepLR
 from torch.utils.data import TensorDataset, random_split
-from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 
 from Configuration import Configuration
@@ -21,42 +19,15 @@ from Linker.Sinkhorn import sinkhorn_fn_no_exp as sinkhorn
 from Linker.atom_map import atom_map, atom_map_redux
 from Linker.eval import measure_accuracy, SinkhornLoss
 from Linker.utils_linker import FFN, get_axiom_links, get_GOAL, get_pos_idx, get_neg_idx, get_atoms_batch, \
-    find_pos_neg_idexes, get_num_atoms_batch
+    find_pos_neg_idexes, get_num_atoms_batch, generate_square_subsequent_mask
 from SuperTagger import SuperTagger
-from utils import pad_sequence
-
-
-def format_time(elapsed):
-    '''
-    Takes a time in seconds and returns a string hh:mm:ss
-    '''
-    # Round to the nearest second.
-    elapsed_rounded = int(round(elapsed))
-
-    # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))
-
-
-def output_create_dir():
-    """
-    Create le output dir for tensorboard and checkpoint
-    @return: output dir, tensorboard writter
-    """
-    from datetime import datetime
-    outpout_path = 'TensorBoard'
-    training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
-    logs_dir = os.path.join(training_dir, 'logs')
-    writer = SummaryWriter(log_dir=logs_dir)
-    return training_dir, writer
-
-
-def generate_square_subsequent_mask(sz):
-    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
-    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
+from utils import pad_sequence, format_time, output_create_dir
 
 
 class Linker(Module):
     
+    # region initialization
+
     def __init__(self, supertagger_path_model):
         super(Linker, self).__init__()
 
@@ -65,7 +36,6 @@ class Linker(Module):
         datasetConfig = config["DATASET_PARAMS"]
         modelEncoderConfig = config["MODEL_ENCODER"]
         modelLinkerConfig = config["MODEL_LINKER"]
-        modelTrainingConfig = config["MODEL_TRAINING"]
         dim_encoder = int(modelEncoderConfig['dim_encoder'])
         atom_vocab_size = int(datasetConfig['atom_vocab_size'])
         
@@ -85,7 +55,6 @@ class Linker(Module):
         self.max_len_sentence = int(datasetConfig['max_len_sentence'])
         self.max_atoms_in_sentence = int(datasetConfig['max_atoms_in_sentence'])
         self.max_atoms_in_one_type = int(datasetConfig['max_atoms_in_one_type'])
-        learning_rate = float(modelTrainingConfig['learning_rate'])
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
         # endregion
 
@@ -125,10 +94,32 @@ class Linker(Module):
 
         # Learning
         self.cross_entropy_loss = SinkhornLoss()
-        self.optimizer = AdamW(self.parameters(), lr=learning_rate)
+        self.optimizer = AdamW(self.parameters(), lr=0.001)
         self.scheduler = StepLR(self.optimizer, step_size=2, gamma=0.5)
         self.to(self.device)
 
+    def load_weights(self, model_file):
+        print("#" * 15)
+        try:
+            params = torch.load(model_file, map_location=self.device)
+            self.atom_encoder.load_state_dict(params['atom_encoder'])
+            self.position_encoder.load_state_dict(params['position_encoder'])
+            self.transformer.load_state_dict(params['transformer'])
+            self.linker_encoder.load_state_dict(params['linker_encoder'])
+            self.pos_transformation.load_state_dict(params['pos_transformation'])
+            self.neg_transformation.load_state_dict(params['neg_transformation'])
+            self.cross_entropy_loss.load_state_dict(params['cross_entropy_loss'])
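+            # The checkpoint stores the optimizer object itself (see __checkpoint_save), not a state_dict.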
+            self.optimizer = params['optimizer']
+            print("\n The loading checkpoint was successful ! \n")
+        except Exception as e:
+            print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
+            raise e
+        print("#" * 15)
+
+    #endregion
+
+    # region data
+
     def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.1):
         r"""
         Args:
@@ -172,6 +163,26 @@ class Linker(Module):
         print("End preprocess Data")
         return training_dataloader, validation_dataloader
 
+    #endregion
+
+    # region training
+
+    def make_sinkhorn_inputs(self, bsd_tensor, positional_ids, atom_type):
+        """
+        :param bsd_tensor:
+            Tensor of shape batch size \times sequence length \times feature dimensionality.
+        :param positional_ids:
+            A List of batch_size elements, each being a List of num_atoms LongTensors.
+            Each LongTensor in positional_ids[b][a] indexes the location of atoms of type a in sentence b.
+        :param atom_type:
+        :return:
+        """
+
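+        # For each sentence i, gather the encoder outputs at the positions of the
+        # atoms of the given type; positions marked -1 (padding) become zero vectors.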
+        return torch.stack([torch.stack([bsd_tensor.select(0, index=i).select(0, index=int(atom)).to(self.device)
+                                         if atom != -1 else torch.zeros(self.dim_cat_out, device=self.device)
+                                         for atom in sentence])
+                            for i, sentence in enumerate(positional_ids[:, self.atom_map_redux[atom_type], :])])
+
     def forward(self, batch_num_atoms_per_word, batch_atoms, batch_pos_idx, batch_neg_idx, sents_embedding):
         r"""
         Args:
@@ -327,6 +338,10 @@ class Linker(Module):
 
         return avg_train_loss, avg_accuracy_train, training_time
 
+    #endregion
+
+    # region evaluation
+
     def eval_batch(self, batch):
         batch_num_atoms = batch[0].to(self.device)
         batch_atoms_tok = batch[1].to(self.device)
@@ -343,8 +358,9 @@ class Linker(Module):
         axiom_links_pred = torch.argmax(logits_predictions, dim=3)  # atom_vocab, batch_size, max atoms in one type
 
         print('\n')
-        print("Les vrais liens de la catégorie n : ", batch_true_links[1][2][:100])
-        print("Les prédictions : ", axiom_links_pred[2][1][:100])
+        print(batch_true_links)
+        print("Les vrais liens de la catégorie n : ", batch_true_links[0][2][:100])
+        print("Les prédictions : ", axiom_links_pred[2][0][:100])
         print('\n')
 
         accuracy = measure_accuracy(batch_true_links, axiom_links_pred)
@@ -369,6 +385,10 @@ class Linker(Module):
 
         return loss_average / len(dataloader), accuracy_average / len(dataloader)
 
+    #endregion
+
+    #region prediction 
+
     def predict_with_categories(self, sentence, categories):
         r""" Predict the links from a sentence and its categories
 
@@ -443,24 +463,8 @@ class Linker(Module):
             axiom_links_pred = torch.argmax(logits_predictions, dim=3)
 
         return categories, axiom_links_pred
-
-    def load_weights(self, model_file):
-        print("#" * 15)
-        try:
-            params = torch.load(model_file, map_location=self.device)
-            self.atom_encoder.load_state_dict(params['atom_encoder'])
-            self.position_encoder.load_state_dict(params['position_encoder'])
-            self.transformer.load_state_dict(params['transformer'])
-            self.linker_encoder.load_state_dict(params['linker_encoder'])
-            self.pos_transformation.load_state_dict(params['pos_transformation'])
-            self.neg_transformation.load_state_dict(params['neg_transformation'])
-            self.cross_entropy_loss.load_state_dict(params['cross_entropy_loss'])
-            self.optimizer.load_state_dict(params['optimizer'])
-            print("\n The loading checkpoint was successful ! \n")
-        except Exception as e:
-            print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
-            raise e
-        print("#" * 15)
+    
+    #endregion
 
     def __checkpoint_save(self, path='/linker.pt'):
         """
@@ -470,28 +474,12 @@ class Linker(Module):
 
         torch.save({
             'atom_encoder': self.atom_encoder.state_dict(),
-            'position_encoder': self.position_encoder,
+            'position_encoder': self.position_encoder.state_dict(),
             'transformer': self.transformer.state_dict(),
             'linker_encoder': self.linker_encoder.state_dict(),
             'pos_transformation': self.pos_transformation.state_dict(),
             'neg_transformation': self.neg_transformation.state_dict(),
-            'cross_entropy_loss': self.cross_entropy_loss,
+            'cross_entropy_loss': self.cross_entropy_loss.state_dict(),
             'optimizer': self.optimizer,
         }, path)
         self.to(self.device)
-
-    def make_sinkhorn_inputs(self, bsd_tensor, positional_ids, atom_type):
-        """
-        :param bsd_tensor:
-            Tensor of shape batch size \times sequence length \times feature dimensionality.
-        :param positional_ids:
-            A List of batch_size elements, each being a List of num_atoms LongTensors.
-            Each LongTensor in positional_ids[b][a] indexes the location of atoms of type a in sentence b.
-        :param atom_type:
-        :return:
-        """
-
-        return torch.stack([torch.stack([bsd_tensor.select(0, index=i).select(0, index=int(atom)).to(self.device)
-                                         if atom != -1 else torch.zeros(self.dim_cat_out, device=self.device)
-                                         for atom in sentence])
-                            for i, sentence in enumerate(positional_ids[:, self.atom_map_redux[atom_type], :])])
diff --git a/Linker/utils_linker.py b/Linker/utils_linker.py
index ddf97bb..5f16c82 100644
--- a/Linker/utils_linker.py
+++ b/Linker/utils_linker.py
@@ -25,7 +25,10 @@ class FFN(Module):
     def forward(self, x):
         return self.ffn(x)
 
-
+def generate_square_subsequent_mask(sz):
+    """Generates an upper-triangular matrix of -inf, with zeros on diag."""
+    return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
+    
 ################################ Regex ########################################
 regex_categories_axiom_links = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
 regex_categories = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
diff --git a/NeuralProofNet/NeuralProofNet.py b/NeuralProofNet/NeuralProofNet.py
index 73ee607..ab12693 100644
--- a/NeuralProofNet/NeuralProofNet.py
+++ b/NeuralProofNet/NeuralProofNet.py
@@ -1,6 +1,3 @@
-import os
-import datetime
-import os
 import time
 
 import torch
@@ -8,7 +5,6 @@ from torch.nn import Module
 from torch.optim import AdamW
 from torch.optim.lr_scheduler import StepLR
 from torch.utils.data import TensorDataset, random_split
-from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 
 from Configuration import Configuration
@@ -16,31 +12,7 @@ from Linker import Linker
 from Linker.eval import measure_accuracy, SinkhornLoss
 from Linker.utils_linker import get_axiom_links, get_GOAL, get_pos_idx, get_num_atoms_batch, get_neg_idx
 from NeuralProofNet.utils_proofnet import get_info_for_tagger
-from utils import pad_sequence
-
-
-def format_time(elapsed):
-    '''
-    Takes a time in seconds and returns a string hh:mm:ss
-    '''
-    # Round to the nearest second.
-    elapsed_rounded = int(round(elapsed))
-
-    # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))
-
-
-def output_create_dir():
-    """
-    Create le output dir for tensorboard and checkpoint
-    @return: output dir, tensorboard writter
-    """
-    from datetime import datetime
-    outpout_path = 'TensorBoard'
-    training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
-    logs_dir = os.path.join(training_dir, 'logs')
-    writer = SummaryWriter(log_dir=logs_dir)
-    return training_dir, writer
+from utils import pad_sequence, format_time, output_create_dir
 
 
 class NeuralProofNet(Module):
@@ -49,38 +21,30 @@ class NeuralProofNet(Module):
         super(NeuralProofNet, self).__init__()
         config = Configuration.read_config()
         datasetConfig = config["DATASET_PARAMS"]
-        modelTrainingConfig = config["MODEL_TRAINING"]
 
-        # pretrain settings
-        self.pretrain_linker_epochs = int(modelTrainingConfig['pretrain_linker_epochs'])
         # settings
         self.max_len_sentence = int(datasetConfig['max_len_sentence'])
         self.max_atoms_in_sentence = int(datasetConfig['max_atoms_in_sentence'])
         self.max_atoms_in_one_type = int(datasetConfig['max_atoms_in_one_type'])
-        learning_rate = float(modelTrainingConfig['learning_rate'])
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.batch_size = int(modelTrainingConfig['batch_size'])
 
         linker = Linker(supertagger_path_model)
         if linker_path_model is not None:
             linker.load_weights(linker_path_model)
-            self.pretrain_linker_epochs = 0
         self.linker = linker
 
         # Learning
         self.linker_loss = SinkhornLoss()
         self.linker_optimizer = AdamW(self.linker.parameters(),
-                                      lr=learning_rate)
+                                      lr=0.001)
         self.linker_scheduler = StepLR(self.linker_optimizer, step_size=2, gamma=0.5)
 
         self.to(self.device)
 
-    def __pretrain_linker__(self, df_axiom_links, checkpoint=False, tensorboard=True):
+    def __pretrain_linker__(self, df_axiom_links, pretrain_linker_epochs, batch_size, checkpoint=False, tensorboard=True):
         print("\nLinker Pre-Training\n")
-        self.linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=self.pretrain_linker_epochs,
-                                 batch_size=self.batch_size,
-                                 checkpoint=checkpoint,
-                                 tensorboard=tensorboard)
+        self.linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=pretrain_linker_epochs,
+                                 batch_size=batch_size, checkpoint=checkpoint, tensorboard=tensorboard)
         print("\nEND Linker Pre-Training\n")
 
     def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.1):
@@ -144,11 +108,11 @@ class NeuralProofNet(Module):
         batch_neg_idx = batch_neg_idx.to(self.device)
 
         logits_links = self.linker(batch_num_atoms_per_word, atoms_batch_tokenized, batch_pos_idx, batch_neg_idx,
-                                   output['word_embeding'])
+                                   output['word_embedding'])
 
         return torch.log_softmax(logits_links, dim=3)
 
-    def train_neuralproofnet(self, df_axiom_links, validation_rate=0.1, epochs=20,
+    def train_neuralproofnet(self, df_axiom_links, validation_rate=0.1, epochs=20, pretrain_linker_epochs=0, 
                              batch_size=32, checkpoint=True, tensorboard=False):
         r"""
         Args:
@@ -162,7 +126,7 @@ class NeuralProofNet(Module):
             Final accuracy and final loss
         """
         # Pretrain the linker
-        self.__pretrain_linker__(df_axiom_links)
+        self.__pretrain_linker__(df_axiom_links, pretrain_linker_epochs, batch_size)
 
         # Start learning with output from tagger
         training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, df_axiom_links,
@@ -262,8 +226,8 @@ class NeuralProofNet(Module):
                                         dim=3)  # atom_vocab, batch_size, max atoms in one type
 
         print('\n')
-        print("Les vrais liens de la catégorie n : ", batch_true_links[1][2][:100])
-        print("Les prédictions : ", axiom_links_pred[2][1][:100])
+        print("Les vrais liens de la catégorie n : ", batch_true_links[0][2][:100])
+        print("Les prédictions : ", axiom_links_pred[2][0][:100])
         print('\n')
 
         accuracy = measure_accuracy(batch_true_links, axiom_links_pred)
@@ -296,12 +260,12 @@ class NeuralProofNet(Module):
 
         torch.save({
             'atom_encoder': self.linker.atom_encoder.state_dict(),
-            'position_encoder': self.linker.position_encoder,
+            'position_encoder': self.linker.position_encoder.state_dict(),
             'transformer': self.linker.transformer.state_dict(),
             'linker_encoder': self.linker.linker_encoder.state_dict(),
             'pos_transformation': self.linker.pos_transformation.state_dict(),
             'neg_transformation': self.linker.neg_transformation.state_dict(),
-            'cross_entropy_loss': self.linker_loss,
+            'cross_entropy_loss': self.linker_loss.state_dict(),
             'optimizer': self.linker_optimizer,
         }, path)
         self.to(self.device)
\ No newline at end of file
diff --git a/README.md b/README.md
index 154f242..28e986e 100644
--- a/README.md
+++ b/README.md
@@ -38,11 +38,9 @@ The structure should look like this :
 │    ├── Datasets                    # TLGbank data with supertags
 │    └──  SuperTagger                # BertForTokenClassification
 │       ├── SuperTagger.py           # Main class
-│       └── Utils
-│          ├── Tagging_bert_model.py # Bert model
-│          ├── SymbolTokenizer       # Tags tokenizer
-│          ├── SentencesTokenizer    # Words tokenizer
-│          └── helpers               # utils
+│       ├── Tagging_bert_model.py    # Bert model
+│       ├── SymbolTokenizer          # Tags tokenizer
+│       └── SentencesTokenizer       # Words tokenizer
 ├── Linker                           # The Linker directory (that you need to install)
 │    ├── ...
 │    └── Linker.py                   # Linker class containing the neural network
diff --git a/SuperTagger/Datasets/processingTXT.py b/SuperTagger/Datasets/processingTXT.py
index a0dbd84..320d004 100644
--- a/SuperTagger/Datasets/processingTXT.py
+++ b/SuperTagger/Datasets/processingTXT.py
@@ -5,6 +5,9 @@ import re
 import numpy as np
 import pandas as pd
 
+"""
+Formats the supertagger training data: converts the txt source to csv and saves the tag dictionaries as pkl.
+"""
 
 # dr = /
 # dl = \
@@ -38,21 +41,18 @@ def sub_tree_line(line_with_data: str):
     sub_trees.append(["[SOS]"])
     return sentence, list(itertools.chain(*sub_trees))
 
-def Txt_to_csv(file_name: str):
+def Txt_to_csv(file_name: str, csv_name: str = "../Datasets/m2V2_dataset.csv"):
     file = open(file_name, "r", encoding="utf8")
     text = file.readlines()
-
     sub = [sub_tree_line(data) for data in text]
-
     df = pd.DataFrame(data=sub, columns = ['Sentences', 'sub_tree'])
-
-    df.to_csv("../Datasets/" + file_name[:-4] + "_dataset.csv", index=False)
+    df.to_csv(csv_name, index=False)
 
 def normalize_word(orig_word):
     word = orig_word.lower()
-    if (word is "["):
+    if (word == "["):
         word = "("
-    if (word is "]"):
+    if (word == "]"):
         word = ")"
 
     return word
@@ -118,27 +118,18 @@ def save_obj(obj, name):
     with open(name + '.pkl', 'wb+') as f:
         pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
 
-def load_obj(name):
-    with open(name + '.pkl', 'rb') as f:
-        return pickle.load(f)
-
+# Format from txt to csv 
 # Txt_to_csv("m2.txt")
-
 X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen = read_maxentdata("SuperTagger/Datasets/m2.txt")
-
-df = pd.DataFrame(columns = ["X", "Y1", "Y2", "Z"])
-
-df['X'] = X[:-1]
-df['Y1'] = Y1[:-1]
-df['Y2'] = Y2[:-1]
-df['Z'] = Z[:-1]
-
+df = pd.DataFrame({"X":X[:-1], "Y1":Y1[:-1], "Y2":Y2[:-1], "Z":Z[:-1]})
 df.to_csv("SuperTagger/Datasets/m2_dataset_V2.csv", index=False)
 
+# Dictionary for supertags
 t =  np.unique(np.array(list(itertools.chain(*Z))))
 dict = { i : t[i] for i in range(0, len(t) ) }
 save_obj(dict,"SuperTagger/Datasets/index_to_super")
 
+# Dictionary for grammar tags (not used)
 t =  np.unique(np.array(list(itertools.chain(*Y1))))
 dict = { i : t[i] for i in range(0, len(t) ) }
 save_obj(dict,"SuperTagger/Datasets/index_to_pos1")
\ No newline at end of file
diff --git a/SuperTagger/README.md b/SuperTagger/README.md
index 140b2b1..a6b7651 100644
--- a/SuperTagger/README.md
+++ b/SuperTagger/README.md
@@ -13,8 +13,7 @@ to provide a wide coverage syntactic and semantic parser for French. But the Tag
 .
 ├── Datasets                      # TLGbank data
 └──  SuperTagger                  # BertForTokenClassification
-   ├── SuperTagger.py             # Main class
-   └── Utils
+        ├── SuperTagger.py        # Main class
         ├── Tagging_bert_model.py # Bert model
         ├── SymbolTokenizer       # Tags tokenizer
         ├── SentencesTokenizer    # Words tokenizer
diff --git a/SuperTagger/SuperTagger/SentencesTokenizer.py b/SuperTagger/SuperTagger/SentencesTokenizer.py
new file mode 100644
index 0000000..104577f
--- /dev/null
+++ b/SuperTagger/SuperTagger/SentencesTokenizer.py
@@ -0,0 +1,42 @@
+
+class SentencesTokenizer():
+    """
+    Tokenizer for sentences, based on a pretrained tokenizer.
+
+    Attributes:
+    ----------
+        tokenizer : Tokenizer
+            Pretrained tokenizer
+        max_length :
+            Maximal length of a sentence (i.e. maximum number of words)
+    """
+
+    def __init__(self, tokenizer, max_length):
+        """
+        Parameters :
+        ------------
+            tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text 
+            max_length : Maximal length of a sentence
+        """
+        self.tokenizer = tokenizer
+        self.max_length = max_length
+
+    def fit_transform(self, sents):
+        """
+        Tokenizes the given sentences
+        """
+        return self.tokenizer(sents, padding=True)
+
+    def fit_transform_tensors(self, sents):
+        """
+        Tokenizes the sentences and returns tensors (input ids and attention mask)
+        """
+        temp = self.tokenizer(sents, padding='max_length', truncation=True, return_tensors = 'pt', max_length=self.max_length)
+
+        return temp["input_ids"], temp["attention_mask"]
+
+    def convert_ids_to_tokens(self, inputs_ids, skip_special_tokens=False):
+        """
+        Decodes a sentence.
+        """
+        return self.tokenizer.batch_decode(inputs_ids, skip_special_tokens=skip_special_tokens)
diff --git a/SuperTagger/SuperTagger/SuperTagger.py b/SuperTagger/SuperTagger/SuperTagger.py
index 80e006d..4f405a0 100644
--- a/SuperTagger/SuperTagger/SuperTagger.py
+++ b/SuperTagger/SuperTagger/SuperTagger.py
@@ -1,80 +1,31 @@
-import datetime
 import os
 import sys
 import time
 
 import torch
 import transformers
-from torch import Tensor
 from torch.optim import Adam
 from torch.utils.data import TensorDataset, random_split
-from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 from transformers import AutoTokenizer
 from transformers import logging
 
 from Configuration import Configuration
+from .SentencesTokenizer import SentencesTokenizer
+from .SymbolTokenizer import SymbolTokenizer
+from .Tagging_bert_model import Tagging_bert_model
+from .eval import categorical_accuracy
+from utils import format_time, output_create_dir
 
-from .Utils.SentencesTokenizer import SentencesTokenizer
-from .Utils.SymbolTokenizer import SymbolTokenizer
-from .Utils.Tagging_bert_model import Tagging_bert_model
 
 logging.set_verbosity(logging.ERROR)
 
 
-# region Utils
-
-def output_create_dir():
-    """
-    Create le output dir for tensorboard and checkpoint
-    @return: output dir, tensorboard writter
-    """
-    from datetime import datetime
-    outpout_path = 'TensorBoard'
-    training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
-    logs_dir = os.path.join(training_dir, 'logs')
-    writer = SummaryWriter(log_dir=logs_dir)
-    return training_dir, writer
-
-
-def categorical_accuracy(preds, truth):
-    """
-    Calculates how often predictions match argmax labels.
-    @param preds: batch of prediction. (argmax)
-    @param truth: batch of truth label.
-    @return: scoring of batch prediction. (Categorical accuracy values)
-    """
-    good_label = 0
-    nb_label = 0
-    for i in range(len(truth)):
-        sublist_truth = truth[i]
-        sublist_preds = preds[i]
-        for j in range(len(sublist_truth)):
-            if sublist_truth[j] != 0:
-                if sublist_truth[j] == sublist_preds[j]:
-                    good_label += 1
-                nb_label += 1
-    return good_label / nb_label
-
-
-def format_time(elapsed):
-    '''
-    Takes a time in seconds and returns a string hh:mm:ss
-    '''
-    # Round to the nearest second.
-    elapsed_rounded = int(round(elapsed))
-
-    # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))
-
-
-# endregion Utils
-
 # region Class
 
 class SuperTagger:
     """
-    Implements the SuperTagger to assign each word a supertag. A supertag is a tree of tags such as np, s, ...
+    Implements the SuperTagger to assign each word a supertag (also called a symbol). A supertag is a tree of tags such as np, s, ...
 
     Attributes:
     -----------
@@ -101,7 +52,7 @@ class SuperTagger:
     model_load : bool
     """
 
-    # region Constructor
+    # region Instanciation
 
     def __init__(self):
         """
@@ -126,20 +77,19 @@ class SuperTagger:
         self.trainable = False
         self.model_load = False
 
-    # endregion Constructor
-
-    # region Instanciation
-
     def load_weights(self, model_file):
         """
         Loads an SupperTagger saved with SupperTagger.__checkpoint_save() (during a train) from a file.
 
-        @param model_file: path of .pt save of model
+        Parameters:
+        -----------
+            model_file:
+                path to the .pt file of the saved model
         """
         self.trainable = False
 
         print("#" * 20)
-        print("\n Loading...")
+        print("\n Loading model for supertagger ...")
         try:
             params = torch.load(model_file, map_location=self.device)
             args = params['args']
@@ -167,11 +117,16 @@ class SuperTagger:
 
     def create_new_model(self, num_label, bert_name, index_to_tags):
         """
-        Instantiation and parameterization of a new bert model
-
-        @param num_label: number of diferent labels (tags)
-        @param bert_name: name of model available on Hugging Face `<https://huggingface.co/models>`
-        @param index_to_tags: Dict for convert ID to tags
+        Instantiation and parameterization of a new bert model.
+
+        Parameters:
+        -----------
+            num_label:
+                number of different labels (tags)
+            bert_name:
+                name of a model available on Hugging Face `<https://huggingface.co/models>`
+            index_to_tags:
+                Dict mapping IDs to tags
         """
         assert len(
             index_to_tags) == num_label, f" len(index_to_tags) : {len(index_to_tags)} must be equels with num_label: {num_label}"
@@ -198,8 +153,13 @@ class SuperTagger:
         """
         Predict and convert sentences in tags (depends on the dictation given when the model was created)
 
-        @param sentences: list of sentences : list[str] OR one sentences : str
-        @return: tags prediction for all sentences (no argmax tags, convert tags, embedding layer of bert )
+        Parameters:
+        -----------
+            sentences: list of sentences (list[str]) OR one sentence (str)
+        
+        Returns:
+        --------
+            tag predictions for all sentences (tag scores before argmax, converted tags, bert embedding output)
         """
         assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) " \
                                                         "function before the predict, the model is not integrated "
@@ -219,7 +179,7 @@ class SuperTagger:
 
     def forward(self, b_sents_tokenized, b_sents_mask):
         """
-        Function used for the linker (same of predict)
+        Forward pass through the model
         """
         with torch.no_grad():
             output = self.model.predict((b_sents_tokenized, b_sents_mask))
@@ -231,13 +191,15 @@ class SuperTagger:
         """
         Starts the training of the model, either new or previously loaded
 
-        @param sentences: list of sentences for train (X)
-        @param tags: list of tags for train (Y)
-        @param validation_rate: percentage of validation data [0-1]
-        @param epochs: number of epoch (50 recommended)
-        @param batch_size:  number of sample in batch (32 recommended, attention to memory)
-        @param tensorboard: use tensorboard for see loss and accuracy
-        @param checkpoint: save the model after each epoch
+        Parameters:
+        -----------
+            sentences: list of training sentences (X)
+            tags: list of training tags (Y)
+            validation_rate: fraction of validation data [0-1]
+            epochs: number of epochs (50 recommended)
+            batch_size: number of samples per batch (32 recommended, mind the memory)
+            tensorboard: use tensorboard to track loss and accuracy
+            checkpoint: save the model after each epoch
         """
         assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the train, the model is not integrated"
 
@@ -296,12 +258,17 @@ class SuperTagger:
                           validation_rate):
         """
         Create torch dataloader for training
-
-        @param batch_size: number of sample in batch
-        @param sentences: list of sentences for train (X)
-        @param tags: list of tags for train (Y)
-        @param validation_rate: percentage of validation data [0-1]
-        @return: (training dataloader, validation dataloader)
+        
+        Parameters:
+        -----------
+            batch_size: number of samples per batch
+            sentences: list of training sentences (X)
+            tags: list of training tags (Y)
+            validation_rate: fraction of validation data [0-1]
+
+        Returns:
+        --------
+            training dataloader, validation dataloader
         """
         validation_dataloader = None
 
@@ -325,9 +292,14 @@ class SuperTagger:
     def __train_epoch(self, training_dataloader):
         """
         Train on epoch
-
-        @param training_dataloader: dataloader of training data
-        @return: (epoch accuracy, epoch loss, training time)
+        
+        Parameters:
+        -----------
+            training_dataloader: dataloader of training data
+
+        Returns:
+        --------
+            epoch accuracy, epoch loss, training time
         """
         self.model.train()
         epoch_loss = 0
@@ -370,8 +342,13 @@ class SuperTagger:
         """
         Validation on epoch
 
-        @param validation_dataloader:  dataloader of validation data
-        @return: (epoch accuracy, epoch loss, num step)
+        Parameters:
+        -----------
+            validation_dataloader:  dataloader of validation data
+
+        Returns: 
+        --------
+            epoch accuracy, epoch loss, num step
         """
         self.model.eval()
         eval_loss = 0
@@ -404,7 +381,10 @@ class SuperTagger:
     def __checkpoint_save(self, path='/model_check.pt'):
         """
         Save the model with good parameters
-        @param path: poth and name for save
+
+        Parameters:
+        -----------
+            path: path and file name for the save
         """
         self.model.cpu()
         # print('save model parameters to [%s]' % path, file=sys.stderr)
@@ -418,4 +398,4 @@ class SuperTagger:
 
     # endregion Private
 
-# endregion Class
+# endregion Class
\ No newline at end of file
diff --git a/SuperTagger/SuperTagger/Utils/SymbolTokenizer.py b/SuperTagger/SuperTagger/SymbolTokenizer.py
similarity index 71%
rename from SuperTagger/SuperTagger/Utils/SymbolTokenizer.py
rename to SuperTagger/SuperTagger/SymbolTokenizer.py
index e5095d1..dafcc12 100644
--- a/SuperTagger/SuperTagger/Utils/SymbolTokenizer.py
+++ b/SuperTagger/SuperTagger/SymbolTokenizer.py
@@ -8,19 +8,39 @@ def load_obj(name):
     with open(name + '.pkl', 'rb') as f:
         return pickle.load(f)
 
+def pad_sequence(sequences, max_len=400):
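+    # Right-pads a label sequence with zeros up to max_len (assumes len(sequences) <= max_len).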
+    padded = [0] * max_len
+    padded[:len(sequences)] = sequences
+    return padded
 
 class SymbolTokenizer():
+    """
+    Tokenizer for tags, based on a dictionary.
+
+    Attributes:
+    ----------
+        index_to_super : dict
+            Maps id to supertag
+        super_to_index : dict
+            Maps supertag to id
+    """
 
     def __init__(self, index_to_super):
-        """@params index_to_super: Dict for convert ID to tags """
+        """
+        Parameters:
+        -----------
+            index_to_super: Dict mapping IDs to tags """
         self.index_to_super = index_to_super
         self.super_to_index = {v: int(k) for k, v in self.index_to_super.items()}
 
     def lenSuper(self):
-        """@return len of dict for convert ID to tags """
+        """Returns len of dict for convert ID to tags """
         return len(self.index_to_super) + 1
 
     def convert_batchs_to_ids(self, tags, sents_tokenized):
+        """
+        Converts a batch of tags to ids
+        """
         encoded_labels = []
         labels = [[self.super_to_index[str(symbol)] for symbol in sents] for sents in tags]
         for l, s in zip(labels, sents_tokenized):
@@ -36,7 +56,3 @@ class SymbolTokenizer():
         return labels
 
 
-def pad_sequence(sequences, max_len=400):
-    padded = [0] * max_len
-    padded[:len(sequences)] = sequences
-    return padded
diff --git a/SuperTagger/SuperTagger/Utils/Tagging_bert_model.py b/SuperTagger/SuperTagger/Tagging_bert_model.py
similarity index 72%
rename from SuperTagger/SuperTagger/Utils/Tagging_bert_model.py
rename to SuperTagger/SuperTagger/Tagging_bert_model.py
index b5331ff..96dfbb6 100644
--- a/SuperTagger/SuperTagger/Utils/Tagging_bert_model.py
+++ b/SuperTagger/SuperTagger/Tagging_bert_model.py
@@ -26,6 +26,18 @@ class Tagging_bert_model(Module):
         self.bert = transformers.AutoModelForTokenClassification.from_pretrained(bert_name, config=config)
 
     def forward(self, batch):
+        """
+        Forward pass through the model.
+
+        Parameters:
+        -----------
+            batch :
+                batch of tokenized sentences
+
+        Returns:
+        --------
+            result : dict containing logit, word_embedding and last_hidden_state
+        """
         b_input_ids = batch[0]
         b_input_mask = batch[1]
         labels = batch[2]
@@ -38,6 +50,18 @@ class Tagging_bert_model(Module):
         return result
 
     def predict(self, batch):
+        """
+        Prediction of supertags for a batch of sentences
+
+        Parameters:
+        -----------
+            batch :
+                batch of tokenized sentences
+
+        Returns:
+        --------
+            result : dict containing logit, word_embedding and last_hidden_state
+        """
         b_input_ids = batch[0]
         b_input_mask = batch[1]
 
diff --git a/SuperTagger/SuperTagger/Utils/SentencesTokenizer.py b/SuperTagger/SuperTagger/Utils/SentencesTokenizer.py
deleted file mode 100644
index 3b637bf..0000000
--- a/SuperTagger/SuperTagger/Utils/SentencesTokenizer.py
+++ /dev/null
@@ -1,18 +0,0 @@
-
-class SentencesTokenizer():
-
-    def __init__(self, tokenizer, max_length):
-        """@params tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text """
-        self.tokenizer = tokenizer
-        self.max_length = max_length
-
-    def fit_transform(self, sents):
-        return self.tokenizer(sents, padding=True)
-
-    def fit_transform_tensors(self, sents):
-        temp = self.tokenizer(sents, padding='max_length', return_tensors = 'pt', max_length=300)
-
-        return temp["input_ids"], temp["attention_mask"]
-
-    def convert_ids_to_tokens(self, inputs_ids, skip_special_tokens=False):
-        return self.tokenizer.batch_decode(inputs_ids, skip_special_tokens=skip_special_tokens)
diff --git a/SuperTagger/SuperTagger/Utils/helpers.py b/SuperTagger/SuperTagger/Utils/helpers.py
deleted file mode 100644
index 21a60b8..0000000
--- a/SuperTagger/SuperTagger/Utils/helpers.py
+++ /dev/null
@@ -1,42 +0,0 @@
-import pandas as pd
-from tqdm import tqdm
-
-
-def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=100):
-    print("\n" + "#" * 20)
-    print("Loading csv...")
-
-    rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1  # minus the header
-    chunk_list = []
-
-    if rows > nrows:
-        rows = nrows
-
-    with tqdm(total=rows, desc='Rows read: ') as bar:
-        for chunk in pd.read_csv(csv_path, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval}, chunksize=chunksize,
-                                 nrows=rows):
-            chunk_list.append(chunk)
-            bar.update(len(chunk))
-
-    df = pd.concat((f for f in chunk_list), axis=0)
-    print("#" * 20)
-    return df
-
-
-def load_obj(name):
-    with open(name + '.pkl', 'rb') as f:
-        import pickle
-        return pickle.load(f)
-
-
-def categorical_accuracy_str(preds, truth):
-    nb_label = 0
-    good_label = 0
-    for i in range(len(truth)):
-        sublist_truth = truth[i]
-        sublist_preds = preds[i]
-        nb_label += len(sublist_truth)
-        for j in range(min(len(sublist_truth), len(sublist_preds))):
-            if str(sublist_truth[j]) == str(sublist_preds[j]):
-                good_label += 1
-    return good_label / nb_label
diff --git a/SuperTagger/SuperTagger/eval.py b/SuperTagger/SuperTagger/eval.py
new file mode 100644
index 0000000..1fd1b97
--- /dev/null
+++ b/SuperTagger/SuperTagger/eval.py
@@ -0,0 +1,19 @@
+def categorical_accuracy(preds, truth):
+    """
+    Calculates how often predictions match the true labels.
+    preds: batch of predictions (argmax).
+    truth: batch of true labels.
+    @return: categorical accuracy of the batch.
+    """
+    good_label = 0
+    nb_label = 0
+    for i in range(len(truth)):
+        sublist_truth = truth[i]
+        sublist_preds = preds[i]
+        for j in range(len(sublist_truth)):
+            if sublist_truth[j] != 0:
+                if sublist_truth[j] == sublist_preds[j]:
+                    good_label += 1
+                nb_label += 1
+    return good_label / nb_label
+    
\ No newline at end of file
diff --git a/SuperTagger/__init__.py b/SuperTagger/__init__.py
index d0947b7..993e537 100644
--- a/SuperTagger/__init__.py
+++ b/SuperTagger/__init__.py
@@ -1,2 +1 @@
-from .SuperTagger.Utils import *
 from .SuperTagger.SuperTagger import SuperTagger
\ No newline at end of file
diff --git a/predict_links.py b/predict_links.py
index 3bc1db1..f3d05a6 100644
--- a/predict_links.py
+++ b/predict_links.py
@@ -3,22 +3,24 @@ from NeuralProofNet.NeuralProofNet import NeuralProofNet
 # region data
 a_s = "( 1 ) parmi les huit \" partants \" acquis ou potentiels , MM. Lacombe , Koehler et Laroze ne sont pas membres " \
       "du PCF . "
-tags_s = [['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)',
+tags_s = ['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)',
            'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)',
            'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np',
-           'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']]
+           'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']
 # endregion
 
 
 # region model
-neuralproofnet = NeuralProofNet()
-model = "models/linker.pt"
+model_tagger = "models/flaubert_super_98_V2_50e.pt"
+neuralproofnet = NeuralProofNet(model_tagger)
+model = "Output/linker.pt"
 neuralproofnet.linker.load_weights(model)
 # endregion
 
 
 # region prediction
 linker = neuralproofnet.linker
-links = linker.predict_without_categories("le chat est noir")
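+# predict_without_categories infers the supertags with the model itself;
+# predict_with_categories (commented below) reuses the gold tags given in tags_s.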
+links = linker.predict_without_categories(a_s)
+#links = linker.predict_with_categories(a_s, tags_s)
 print(links)
 # endregion
\ No newline at end of file
diff --git a/predict_supertags.py b/predict_supertags.py
index 22725ab..88ec004 100644
--- a/predict_supertags.py
+++ b/predict_supertags.py
@@ -1,5 +1,5 @@
 from SuperTagger.SuperTagger.SuperTagger import SuperTagger
-from SuperTagger.SuperTagger.Utils.helpers import categorical_accuracy_str
+from SuperTagger.SuperTagger.eval import categorical_accuracy
 
 # region data
 a_s = "( 1 ) parmi les huit \" partants \" acquis ou potentiels , MM. Lacombe , Koehler et Laroze ne sont pas membres " \
@@ -33,5 +33,5 @@ print("\tTags               : ", tags_s[0])
 print()
 print("\tPred_convert       : ", pred_convert[0])
 print()
-print("\tScore              :", categorical_accuracy_str(pred_convert, tags_s))
+print("\tScore              :", f"{categorical_accuracy(pred_convert, tags_s)*100}%" )
 # endregion
\ No newline at end of file
diff --git a/train_neuralproofnet.py b/train_neuralproofnet.py
index 8bbc086..daa6f65 100644
--- a/train_neuralproofnet.py
+++ b/train_neuralproofnet.py
@@ -1,26 +1,13 @@
 import torch
 from Linker import *
 from NeuralProofNet.NeuralProofNet import NeuralProofNet
-from utils import read_csv_pgbar
-from Configuration import Configuration
+from utils import read_links_csv
 torch.cuda.empty_cache()
 
-# region config
-config = Configuration.read_config()
-version = config["VERSION"]
-datasetConfig = config["DATASET_PARAMS"]
-modelEncoderConfig = config["MODEL_ENCODER"]
-modelLinkerConfig = config["MODEL_LINKER"]
-modelTrainingConfig = config["MODEL_TRAINING"]
-epochs = int(modelTrainingConfig['epoch'])
-batch_size = int(modelTrainingConfig['batch_size'])
-# endregion
-
 
 # region data
-nb_sentences = 100
 file_path_axiom_links = 'Datasets/goldANDsilver_dataset_links.csv'
-df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
+df_axiom_links = read_links_csv(file_path_axiom_links)
 # endregion
 
 
@@ -29,7 +16,7 @@ print("#" * 20)
 print("#" * 20)
 model_tagger = "models/flaubert_super_98_V2_50e.pt"
 neural_proof_net = NeuralProofNet(model_tagger)
-neural_proof_net.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=epochs, batch_size=batch_size,
+neural_proof_net.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=5, batch_size=16,
                                       checkpoint=True, tensorboard=True)
 print("#" * 20)
 print("#" * 20)
diff --git a/train_supertagger.py b/train_supertagger.py
index 3c0bde0..edba002 100644
--- a/train_supertagger.py
+++ b/train_supertagger.py
@@ -1,10 +1,9 @@
 from SuperTagger.SuperTagger.SuperTagger import SuperTagger
-from SuperTagger.SuperTagger.Utils.helpers import read_csv_pgbar, load_obj
+from utils import read_supertags_csv, load_obj
 
 # region data
 file_path = 'SuperTagger/Datasets/m2_dataset_V2.csv'
-nb_sentences = 100
-df = read_csv_pgbar(file_path,nb_sentences)
+df = read_supertags_csv(file_path)
 texts = df['X'].tolist()
 tags = df['Z'].tolist()
 
@@ -15,7 +14,7 @@ index_to_super = load_obj('SuperTagger/Datasets/index_to_super')
 # region model
 tagger = SuperTagger()
 tagger.create_new_model(len(index_to_super),'camembert-base',index_to_super)
-## If you wnat to upload a pretrained model
+## If you want to load a pretrained model
 # tagger.load_weights("models/model_check.pt")
 tagger.train(texts, tags, epochs=2, batch_size=16, validation_rate=0.1, 
             tensorboard=True, checkpoint=True)
diff --git a/utils.py b/utils.py
index c4fae14..9640a0b 100644
--- a/utils.py
+++ b/utils.py
@@ -1,17 +1,78 @@
 import datetime
+import os
+from torch.utils.tensorboard import SummaryWriter
 
 import pandas as pd
-import torch
 from tqdm import tqdm
 
+# region load data
+
+def read_links_csv(csv_path, nrows=float('inf'), chunksize=100):
+    r"""
+    Loads the axiom-links csv dataset.
+
+    Parameters:
+    -----------
+        csv_path: path of the csv file
+        nrows: maximum number of rows to read
+        chunksize: number of rows read per chunk
+    """
+    print("\n" + "#" * 20)
+    print("Loading csv...")
+
+    chunk_list = []
+
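+    # The Y and Z columns hold list-valued fields stored as text; pd.eval parses them back into Python lists.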
+    with tqdm(total=nrows, desc='Rows read: ') as bar:
+        for chunk in pd.read_csv(csv_path, header=0, converters={'Y': pd.eval, 'Z': pd.eval}, 
+                                chunksize=chunksize, nrows=nrows):
+            chunk_list.append(chunk)
+            bar.update(len(chunk))
+
+    df = pd.concat((f for f in chunk_list), axis=0)
+    print("#" * 20)
+    return df
+
+def read_supertags_csv(csv_path, nrows=float('inf'), chunksize=100):
+    r"""
+    Loads the supertags csv dataset.
+
+    Parameters:
+    -----------
+        csv_path: path of the csv file
+        nrows: maximum number of rows to read
+        chunksize: number of rows read per chunk
+    """
+    print("\n" + "#" * 20)
+    print("Loading csv...")
+
+    chunk_list = []
+    with tqdm(total=nrows, desc='Rows read: ') as bar:
+        for chunk in pd.read_csv(csv_path, header=0, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval}, 
+                                chunksize=chunksize, nrows=nrows):
+            chunk_list.append(chunk)
+            bar.update(len(chunk))
+
+    df = pd.concat((f for f in chunk_list), axis=0)
+    print("#" * 20)
+    return df
+
+
+def load_obj(name):
+    with open(name + '.pkl', 'rb') as f:
+        import pickle
+        return pickle.load(f)
+
+#endregion
+
+# region format data
 
 def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400):
     r"""
     Padding sequence for preparation to tensorDataset
-    :param sequences: data to pad
-    :param batch_first: boolean indicating whether the batch are in first dimension
-    :param padding_value: the value for pad
-    :param max_len: the maximum length
+     sequences: data to pad
+     batch_first: boolean indicating whether the batch is the first dimension
+     padding_value: the value used for padding
+     max_len: the maximum length
     :return: padding sequences
     """
     max_size = sequences[0].size()
@@ -32,32 +93,21 @@ def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400):
 
     return out_tensor
 
+#endregion
 
-def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500):
-    r"""
-    Preparing csv dataset
-    :param csv_path:
-    :param nrows:
-    :param chunksize:
-    :return:
-    """
-    print("Loading csv...")
-
-    rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1  # minus the header
-    chunk_list = []
-
-    if rows > nrows:
-        rows = nrows
-        chunksize = nrows
-
-    with tqdm(total=rows, desc='Rows read: ') as bar:
-        for chunk in pd.read_csv(csv_path, converters={'Y': pd.eval, 'Z': pd.eval}, chunksize=chunksize, nrows=rows):
-            chunk_list.append(chunk)
-            bar.update(len(chunk))
+# region utils training
 
-    df = pd.concat((f for f in chunk_list), axis=0)
-
-    return df
+def output_create_dir():
+    """
+    Create the output dir for tensorboard and checkpoints
+    @return: output dir, tensorboard writer
+    """
+    from datetime import datetime
+    output_path = 'TensorBoard'
+    training_dir = os.path.join(output_path, 'Training_' + datetime.today().strftime('%d-%m_%H-%M'))
+    logs_dir = os.path.join(training_dir, 'logs')
+    writer = SummaryWriter(log_dir=logs_dir)
+    return training_dir, writer
 
 
 def format_time(elapsed):
@@ -68,4 +118,6 @@ def format_time(elapsed):
     elapsed_rounded = int(round(elapsed))
 
     # Format as hh:mm:ss
-    return str(datetime.timedelta(seconds=elapsed_rounded))
\ No newline at end of file
+    return str(datetime.timedelta(seconds=elapsed_rounded))
+
+#endregion
\ No newline at end of file
-- 
GitLab