Compare revisions: pnria/global-helper/deepgrail-tagger-linker

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (2)
Showing with 48277 additions and 256 deletions
......@@ -4,7 +4,7 @@ transformers = 4.16.2
[DATASET_PARAMS]
symbols_vocab_size = 26
atom_vocab_size = 18
max_len_sentence = 290
max_len_sentence = 300
max_atoms_in_sentence = 900
max_atoms_in_one_type = 360
......@@ -20,11 +20,4 @@ dim_cat_out = 256
dim_intermediate_ffn = 128
dim_pre_sinkhorn_transfo = 32
dropout = 0.1
sinkhorn_iters = 5
[MODEL_TRAINING]
batch_size = 32
pretrain_linker_epochs = 10
epoch = 20
seed_val = 42
learning_rate = 2e-3
\ No newline at end of file
sinkhorn_iters = 5
\ No newline at end of file
......@@ -8,55 +8,16 @@ import pandas as pd
# dr = /
# dl = \
#
# def sub_tree_word(word_with_data: str):
# word = ""
# if not word_with_data.startswith("GOAL:"):
# s = word_with_data.split('|')
# word = s[0]
# tree = s[1]
# else:
# tree = word_with_data
# return word, tree
#
#
# def sub_tree_line(line_with_data: str):
# line_list = line_with_data.split()
# sentence = ""
# sub_trees = []
# for word_with_data in line_list:
# w, t = sub_tree_word(word_with_data)
# sentence += ' ' + w
# if t not in ["\\", "/", "let"] and len(t) > 0:
# sub_trees.append([t])
# """if ('ppp' in list(itertools.chain(*sub_trees))):
# print(sentence)"""
# return sentence, list(itertools.chain(*sub_trees))
#
#
# def Txt_to_csv(file_name: str, result_name):
# file = open(file_name, "r", encoding="utf8")
# text = file.readlines()
# sub = [sub_tree_line(data) for data in text]
# df = pd.DataFrame(data=sub, columns=['X', 'Y'])
# df.to_csv("../Datasets/" + result_name + "_dataset_links.csv", mode='a', index=False, header=False)
#
# def Txt_to_csv_header(file_name: str, result_name):
# file = open(file_name, "r", encoding="utf8")
# text = file.readlines()
# sub = [sub_tree_line(data) for data in text]
# df = pd.DataFrame(data=sub, columns=['X', 'Y'])
# df.to_csv("../Datasets/" + result_name + "_dataset_links.csv", index=False)
def normalize_word(orig_word):
word = orig_word.lower()
if (word is "["):
if (word == "["):
word = "("
if (word is "]"):
if (word == "]"):
word = ")"
return word
def read_maxentdata(path):
allwords = []
allsuper = []
......
import datetime
import math
import os
import sys
......@@ -11,7 +10,6 @@ from torch.nn import Sequential, LayerNorm, Module, Linear, Dropout, Transformer
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import TensorDataset, random_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from Configuration import Configuration
......@@ -21,41 +19,15 @@ from Linker.Sinkhorn import sinkhorn_fn_no_exp as sinkhorn
from Linker.atom_map import atom_map, atom_map_redux
from Linker.eval import measure_accuracy, SinkhornLoss
from Linker.utils_linker import FFN, get_axiom_links, get_GOAL, get_pos_idx, get_neg_idx, get_atoms_batch, \
find_pos_neg_idexes, get_num_atoms_batch
find_pos_neg_idexes, get_num_atoms_batch, generate_square_subsequent_mask
from SuperTagger import SuperTagger
from utils import pad_sequence
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round(elapsed))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
def output_create_dir():
"""
Create le output dir for tensorboard and checkpoint
@return: output dir, tensorboard writter
"""
from datetime import datetime
outpout_path = 'TensorBoard'
training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
logs_dir = os.path.join(training_dir, 'logs')
writer = SummaryWriter(log_dir=logs_dir)
return training_dir, writer
def generate_square_subsequent_mask(sz):
"""Generates an upper-triangular matrix of -inf, with zeros on diag."""
return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
from utils import pad_sequence, format_time, output_create_dir
class Linker(Module):
# region initialization
def __init__(self, supertagger_path_model):
super(Linker, self).__init__()
......@@ -64,11 +36,9 @@ class Linker(Module):
datasetConfig = config["DATASET_PARAMS"]
modelEncoderConfig = config["MODEL_ENCODER"]
modelLinkerConfig = config["MODEL_LINKER"]
modelTrainingConfig = config["MODEL_TRAINING"]
dim_encoder = int(modelEncoderConfig['dim_encoder'])
# atom settings
atom_vocab_size = int(datasetConfig['atom_vocab_size'])
# Transformer
self.nhead = int(modelLinkerConfig['nhead'])
self.dim_emb_atom = int(modelLinkerConfig['dim_emb_atom'])
......@@ -85,7 +55,6 @@ class Linker(Module):
self.max_len_sentence = int(datasetConfig['max_len_sentence'])
self.max_atoms_in_sentence = int(datasetConfig['max_atoms_in_sentence'])
self.max_atoms_in_one_type = int(datasetConfig['max_atoms_in_one_type'])
learning_rate = float(modelTrainingConfig['learning_rate'])
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# endregion
......@@ -113,27 +82,44 @@ class Linker(Module):
Linear(dim_cat, self.dim_cat_out),
GELU(),
Dropout(dropout),
LayerNorm(self.dim_cat_out, eps=1e-8)
)
LayerNorm(self.dim_cat_out, eps=1e-8))
# Division into positive and negative
self.pos_transformation = Sequential(
FFN(self.dim_cat_out, dim_intermediate_FFN, dropout, d_out=dim_pre_sinkhorn_transfo),
LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-8)
)
LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-8))
self.neg_transformation = Sequential(
FFN(self.dim_cat_out, dim_intermediate_FFN, dropout, d_out=dim_pre_sinkhorn_transfo),
LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-8)
)
LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-8))
# Learning
self.cross_entropy_loss = SinkhornLoss()
self.optimizer = AdamW(self.parameters(),
lr=learning_rate)
self.optimizer = AdamW(self.parameters(), lr=0.001)
self.scheduler = StepLR(self.optimizer, step_size=2, gamma=0.5)
self.to(self.device)
def load_weights(self, model_file):
print("#" * 15)
try:
params = torch.load(model_file, map_location=self.device)
self.atom_encoder.load_state_dict(params['atom_encoder'])
self.position_encoder.load_state_dict(params['position_encoder'])
self.transformer.load_state_dict(params['transformer'])
self.linker_encoder.load_state_dict(params['linker_encoder'])
self.pos_transformation.load_state_dict(params['pos_transformation'])
self.neg_transformation.load_state_dict(params['neg_transformation'])
self.cross_entropy_loss.load_state_dict(params['cross_entropy_loss'])
self.optimizer = params['optimizer']
print("\n The loading checkpoint was successful ! \n")
except Exception as e:
print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
raise e
print("#" * 15)
#endregion
# region data
def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.1):
r"""
Args:
......@@ -177,6 +163,26 @@ class Linker(Module):
print("End preprocess Data")
return training_dataloader, validation_dataloader
#endregion
# region training
def make_sinkhorn_inputs(self, bsd_tensor, positional_ids, atom_type):
"""
:param bsd_tensor:
Tensor of shape batch size \times sequence length \times feature dimensionality.
:param positional_ids:
A List of batch_size elements, each being a List of num_atoms LongTensors.
Each LongTensor in positional_ids[b][a] indexes the location of atoms of type a in sentence b.
:param atom_type:
:return:
"""
return torch.stack([torch.stack([bsd_tensor.select(0, index=i).select(0, index=int(atom)).to(self.device)
if atom != -1 else torch.zeros(self.dim_cat_out, device=self.device)
for atom in sentence])
for i, sentence in enumerate(positional_ids[:, self.atom_map_redux[atom_type], :])])
def forward(self, batch_num_atoms_per_word, batch_atoms, batch_pos_idx, batch_neg_idx, sents_embedding):
r"""
Args:
......@@ -307,7 +313,7 @@ class Linker(Module):
# Run the Linker on the atoms
logits_predictions = self(batch_num_atoms, batch_atoms_tok, batch_pos_idx, batch_neg_idx,
output['word_embeding'])
output['word_embedding'])
linker_loss = self.cross_entropy_loss(logits_predictions, batch_true_links)
# Perform a backward pass to calculate the gradients.
......@@ -332,6 +338,10 @@ class Linker(Module):
return avg_train_loss, avg_accuracy_train, training_time
#endregion
# region evaluation
def eval_batch(self, batch):
batch_num_atoms = batch[0].to(self.device)
batch_atoms_tok = batch[1].to(self.device)
......@@ -344,12 +354,13 @@ class Linker(Module):
output = self.Supertagger.forward(batch_sentences_tokens, batch_sentences_mask)
logits_predictions = self(batch_num_atoms, batch_atoms_tok, batch_pos_idx, batch_neg_idx, output[
'word_embeding']) # atom_vocab, batch_size, max atoms in one type, max atoms in one type
'word_embedding']) # atom_vocab, batch_size, max atoms in one type, max atoms in one type
axiom_links_pred = torch.argmax(logits_predictions, dim=3) # atom_vocab, batch_size, max atoms in one type
print('\n')
print("Les vrais liens de la catégorie n : ", batch_true_links[1][2][:100])
print("Les prédictions : ", axiom_links_pred[2][1][:100])
print(batch_true_links)
print("Les vrais liens de la catégorie n : ", batch_true_links[0][2][:100])
print("Les prédictions : ", axiom_links_pred[2][0][:100])
print('\n')
accuracy = measure_accuracy(batch_true_links, axiom_links_pred)
......@@ -374,6 +385,10 @@ class Linker(Module):
return loss_average / len(dataloader), accuracy_average / len(dataloader)
#endregion
#region prediction
def predict_with_categories(self, sentence, categories):
r""" Predict the links from a sentence and its categories
......@@ -406,7 +421,7 @@ class Linker(Module):
output = self.Supertagger.forward(sentences_tokens, sentences_mask)
logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embeding'])
logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embedding'])
axiom_links_pred = torch.argmax(logits_predictions, dim=3)
return axiom_links_pred
......@@ -444,28 +459,12 @@ class Linker(Module):
pos_idx = get_pos_idx(atoms, polarities, self.max_atoms_in_one_type)
neg_idx = get_neg_idx(atoms, polarities, self.max_atoms_in_one_type)
logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embeding'])
logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embedding'])
axiom_links_pred = torch.argmax(logits_predictions, dim=3)
return categories, axiom_links_pred
def load_weights(self, model_file):
print("#" * 15)
try:
params = torch.load(model_file, map_location=self.device)
self.atom_encoder.load_state_dict(params['atom_encoder'])
self.position_encoder.load_state_dict(params['position_encoder'])
self.transformer.load_state_dict(params['transformer'])
self.linker_encoder.load_state_dict(params['linker_encoder'])
self.pos_transformation.load_state_dict(params['pos_transformation'])
self.neg_transformation.load_state_dict(params['neg_transformation'])
self.cross_entropy_loss.load_state_dict(params['cross_entropy_loss'])
self.optimizer.load_state_dict(params['optimizer'])
print("\n The loading checkpoint was successful ! \n")
except Exception as e:
print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
raise e
print("#" * 15)
#endregion
def __checkpoint_save(self, path='/linker.pt'):
"""
......@@ -475,28 +474,12 @@ class Linker(Module):
torch.save({
'atom_encoder': self.atom_encoder.state_dict(),
'position_encoder': self.position_encoder,
'position_encoder': self.position_encoder.state_dict(),
'transformer': self.transformer.state_dict(),
'linker_encoder': self.linker_encoder.state_dict(),
'pos_transformation': self.pos_transformation.state_dict(),
'neg_transformation': self.neg_transformation.state_dict(),
'cross_entropy_loss': self.cross_entropy_loss,
'cross_entropy_loss': self.cross_entropy_loss.state_dict(),
'optimizer': self.optimizer,
}, path)
self.to(self.device)
def make_sinkhorn_inputs(self, bsd_tensor, positional_ids, atom_type):
"""
:param bsd_tensor:
Tensor of shape batch size \times sequence length \times feature dimensionality.
:param positional_ids:
A List of batch_size elements, each being a List of num_atoms LongTensors.
Each LongTensor in positional_ids[b][a] indexes the location of atoms of type a in sentence b.
:param atom_type:
:return:
"""
return torch.stack([torch.stack([bsd_tensor.select(0, index=i).select(0, index=int(atom)).to(self.device)
if atom != -1 else torch.zeros(self.dim_cat_out, device=self.device)
for atom in sentence])
for i, sentence in enumerate(positional_ids[:, self.atom_map_redux[atom_type], :])])
......@@ -25,7 +25,10 @@ class FFN(Module):
def forward(self, x):
return self.ffn(x)
def generate_square_subsequent_mask(sz):
"""Generates an upper-triangular matrix of -inf, with zeros on diag."""
return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
################################ Regex ########################################
regex_categories_axiom_links = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
regex_categories = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
......@@ -106,15 +109,6 @@ def get_atoms_links_batch(category_batch):
return batch
print("test to create links ",
get_axiom_links(20, torch.stack([torch.as_tensor(
[True, False, True, False, False, False, True, False, True, False,
False, True, False, False, False, True, False, False, True, False,
True, False, False, True, False, False, False, False, False, False])]),
[['dr(0,np_1,n_2)', 'n_2', 'dr(0,dl(0,np_1,np_3),np_4)', 'dr(0,np_4,n_5)', 'n_6', 'dl(0,n_6,n_5)',
'dr(0,dl(0,np_3,np_7),np_8)', 'dr(0,np_8,np_9)', 'np_9', 'GOAL:np_7']]))
# endregion
# region get atoms in sentence
......@@ -159,10 +153,6 @@ def get_atoms_batch(category_batch):
return batch
print(" test for get atoms in categories on ['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'let']",
get_atoms_batch([['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'let']]))
# endregion
# region calculate num atoms per category
......@@ -211,11 +201,6 @@ def get_num_atoms_batch(category_batch, max_len_sentence):
batch.append(torch.as_tensor(num_atoms_sentence))
return pad_sequence(batch, max_len=max_len_sentence, padding_value=0)
print(" test for get number of atoms in categories on ['dr(0,s,np)', 'let']",
get_num_atoms_batch([["dr(0,s,np)", "let"]], 10))
# endregion
# region get polarity
......@@ -309,11 +294,6 @@ def find_pos_neg_idexes(atoms_batch):
return list_batch
print(" test for get polarities for atoms in categories on ['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)', 'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np'] \n",
find_pos_neg_idexes([['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)',
'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np']]))
# endregion
# region get atoms and polarities with GOAL
......@@ -336,15 +316,6 @@ def get_GOAL(max_len_sentence, df_axiom_links):
return atoms_batch, polarities, num_atoms_batch
df_axiom_links = pd.DataFrame({"Z": [['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)',
'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np']],
"Y": [['dr(0,np_1,n_2)', 'n_2', 'dr(0,dl(0,np_1,np_3),np_4)', 'dr(0,np_4,n_5)', 'n_6',
'dl(0,n_6,n_5)', 'dr(0,dl(0,np_3,np_7),np_8)', 'dr(0,np_8,np_9)', 'np_9',
'GOAL:np_7']]})
print(" test for get GOAL ", get_GOAL(10, df_axiom_links))
# endregion
# region get idx for pos and neg
......@@ -370,13 +341,40 @@ def get_neg_idx(atoms_batch, atoms_polarity_batch, max_atoms_in_one_type):
return torch.stack(pos_idx).permute(1, 0, 2)
# endregion
print(" test for cut into pos neg on ['dr(0,s,np)', 's']",
get_neg_idx([['s', 's', 'np', 's', 'np', '[SEP]', 's', '[SEP]']],
torch.as_tensor(
[[True, True, False, False,
True, False, False, False,
False, False,
False, False]]), 10))
# endregion
\ No newline at end of file
if __name__ == '__main__':
print("test to create links ",
get_axiom_links(20, torch.stack([torch.as_tensor(
[True, False, True, False, False, False, True, False, True, False,
False, True, False, False, False, True, False, False, True, False,
True, False, False, True, False, False, False, False, False, False])]),
[['dr(0,np_1,n_2)', 'n_2', 'dr(0,dl(0,np_1,np_3),np_4)', 'dr(0,np_4,n_5)', 'n_6', 'dl(0,n_6,n_5)',
'dr(0,dl(0,np_3,np_7),np_8)', 'dr(0,np_8,np_9)', 'np_9', 'GOAL:np_7']]))
print(" test for get atoms in categories on ['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'let']",
get_atoms_batch([['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'let']]))
print(" test for get number of atoms in categories on ['dr(0,s,np)', 'let']",
get_num_atoms_batch([["dr(0,s,np)", "let"]], 10))
print(" test for get polarities for atoms in categories on ['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)', 'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np'] \n",
find_pos_neg_idexes([['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)',
'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np']]))
df_axiom_links = pd.DataFrame({"Z": [['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)',
'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np']],
"Y": [['dr(0,np_1,n_2)', 'n_2', 'dr(0,dl(0,np_1,np_3),np_4)', 'dr(0,np_4,n_5)', 'n_6',
'dl(0,n_6,n_5)', 'dr(0,dl(0,np_3,np_7),np_8)', 'dr(0,np_8,np_9)', 'np_9',
'GOAL:np_7']]})
print(" test for get GOAL ", get_GOAL(10, df_axiom_links))
print(" test for cut into pos neg on ['dr(0,s,np)', 's']",
get_neg_idx([['s', 's', 'np', 's', 'np', '[SEP]', 's', '[SEP]']],
torch.as_tensor(
[[True, True, False, False,
True, False, False, False,
False, False,
False, False]]), 10))
\ No newline at end of file
import os
import datetime
import os
import time
import torch
......@@ -8,7 +5,6 @@ from torch.nn import Module
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import TensorDataset, random_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from Configuration import Configuration
......@@ -16,70 +12,39 @@ from Linker import Linker
from Linker.eval import measure_accuracy, SinkhornLoss
from Linker.utils_linker import get_axiom_links, get_GOAL, get_pos_idx, get_num_atoms_batch, get_neg_idx
from NeuralProofNet.utils_proofnet import get_info_for_tagger
from utils import pad_sequence
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round(elapsed))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
def output_create_dir():
"""
Create le output dir for tensorboard and checkpoint
@return: output dir, tensorboard writter
"""
from datetime import datetime
outpout_path = 'TensorBoard'
training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
logs_dir = os.path.join(training_dir, 'logs')
writer = SummaryWriter(log_dir=logs_dir)
return training_dir, writer
from utils import pad_sequence, format_time, output_create_dir
class NeuralProofNet(Module):
def __init__(self, supertagger_path_model, linker_path_model=None):
super(NeuralProofNet, self).__init__()
config = Configuration.read_config()
datasetConfig = config["DATASET_PARAMS"]
modelTrainingConfig = config["MODEL_TRAINING"]
# pretrain settings
self.pretrain_linker_epochs = int(modelTrainingConfig['pretrain_linker_epochs'])
# settings
self.max_len_sentence = int(datasetConfig['max_len_sentence'])
self.max_atoms_in_sentence = int(datasetConfig['max_atoms_in_sentence'])
self.max_atoms_in_one_type = int(datasetConfig['max_atoms_in_one_type'])
learning_rate = float(modelTrainingConfig['learning_rate'])
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.batch_size = int(modelTrainingConfig['batch_size'])
linker = Linker(supertagger_path_model)
if linker_path_model is not None:
linker.load_weights(linker_path_model)
self.pretrain_linker_epochs = 0
self.linker = linker
# Learning
self.linker_loss = SinkhornLoss()
self.linker_optimizer = AdamW(self.linker.parameters(),
lr=learning_rate)
lr=0.001)
self.linker_scheduler = StepLR(self.linker_optimizer, step_size=2, gamma=0.5)
self.to(self.device)
def __pretrain_linker__(self, df_axiom_links, checkpoint=False, tensorboard=True):
def __pretrain_linker__(self, df_axiom_links, pretrain_linker_epochs, batch_size, checkpoint=False, tensorboard=True):
print("\nLinker Pre-Training\n")
self.linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=self.pretrain_linker_epochs,
batch_size=self.batch_size,
checkpoint=checkpoint,
tensorboard=tensorboard)
self.linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=pretrain_linker_epochs,
batch_size=batch_size, checkpoint=checkpoint, tensorboard=tensorboard)
print("\nEND Linker Pre-Training\n")
def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.1):
......@@ -143,11 +108,11 @@ class NeuralProofNet(Module):
batch_neg_idx = batch_neg_idx.to(self.device)
logits_links = self.linker(batch_num_atoms_per_word, atoms_batch_tokenized, batch_pos_idx, batch_neg_idx,
output['word_embeding'])
output['word_embedding'])
return torch.log_softmax(logits_links, dim=3)
def train_neuralproofnet(self, df_axiom_links, validation_rate=0.1, epochs=20,
def train_neuralproofnet(self, df_axiom_links, validation_rate=0.1, epochs=20, pretrain_linker_epochs=0,
batch_size=32, checkpoint=True, tensorboard=False):
r"""
Args:
......@@ -161,7 +126,7 @@ class NeuralProofNet(Module):
Final accuracy and final loss
"""
# Pretrain the linker
self.__pretrain_linker__(df_axiom_links)
self.__pretrain_linker__(df_axiom_links, pretrain_linker_epochs, batch_size)
# Start learning with output from tagger
training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, df_axiom_links,
......@@ -261,8 +226,8 @@ class NeuralProofNet(Module):
dim=3) # atom_vocab, batch_size, max atoms in one type
print('\n')
print("Les vrais liens de la catégorie n : ", batch_true_links[1][2][:100])
print("Les prédictions : ", axiom_links_pred[2][1][:100])
print("Les vrais liens de la catégorie n : ", batch_true_links[0][2][:100])
print("Les prédictions : ", axiom_links_pred[2][0][:100])
print('\n')
accuracy = measure_accuracy(batch_true_links, axiom_links_pred)
......@@ -295,12 +260,12 @@ class NeuralProofNet(Module):
torch.save({
'atom_encoder': self.linker.atom_encoder.state_dict(),
'position_encoder': self.linker.position_encoder,
'position_encoder': self.linker.position_encoder.state_dict(),
'transformer': self.linker.transformer.state_dict(),
'linker_encoder': self.linker.linker_encoder.state_dict(),
'pos_transformation': self.linker.pos_transformation.state_dict(),
'neg_transformation': self.linker.neg_transformation.state_dict(),
'cross_entropy_loss': self.linker_loss,
'cross_entropy_loss': self.linker_loss.state_dict(),
'optimizer': self.linker_optimizer,
}, path)
self.to(self.device)
\ No newline at end of file
......@@ -6,7 +6,8 @@ This code was designed to work with the [DeepGrail Tagger](https://gitlab.irit.f
[DeepGrail Linker](https://gitlab.irit.fr/pnria/global-helper/deepgrail-linker).
In this version the tagger is not retrained with the linker.
In this version the tagger is not retrained with the linker: they are trained separately, but at inference time the tagger's predictions feed the linker's inputs.
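A minimal sketch of this two-stage inference, assuming pretrained checkpoints in `models/` (the paths are illustrative; see the Predicting section below):

```
from Linker import Linker

# The Linker wraps the pretrained supertagger: the tagger first predicts the
# supertags, and its predictions feed the linker.
linker = Linker("models/supertagger.pt")
linker.load_weights("models/linker.pt")

links = linker.predict_without_categories("le chat est noir")
```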
## Usage
......@@ -17,24 +18,14 @@ Clone the project locally.
### Libraries installation
Run the following script :
```bash
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
git clone https://gitlab.irit.fr/pnria/global-helper/deepgrail_tagger
mkdir Output
mkdir TensorBoard
```
Run the script init.sh
Optional : Upload the tagger.pt and linker.pt in models. (You may need to modify 'model_tagger' in train.py.)
Optional: upload the .pt files containing the model weights to the **models** directory.
### Structure
The structure should look like this:
```
.
.
......@@ -43,23 +34,28 @@ The structure should look like this :
│ └── config.ini # contains parameters
├── requirements.txt # librairies needed
├── Datasets # TLGbank data with links
├── SuperTagger # The Supertagger directory (that you need to install)
│ ├── ...
│ └── SuperTagger # Implementation of BertForTokenClassification
│ ├── SuperTagger.py # Main class
│ └── Tagging_bert_model.py # Bert model
├── SuperTagger # The Supertagger directory (that you need to install)
│ ├── Datasets # TLGbank data with supertags
│ └── SuperTagger # BertForTokenClassification
│ ├── SuperTagger.py # Main class
│ ├── Tagging_bert_model.py # Bert model
│ ├── SymbolTokenizer # Tags tokenizer
│ └── SentencesTokenizer # Words tokenizer
├── Linker # The Linker directory (that you need to install)
│ ├── ...
│ └── Linker.py # Linker class containing the neural network
├── NeuralProofNet # The NeuralProofNet directory
│ ├── ...
│ └── NeuralProofNet.py # NeuralProofNet class containing the linker and supertagger
│ ├── utils_proofnet # utils for NeuralProofNet
│ └── NeuralProofNet.py # NeuralProofNet class
├── models
│ ├── linker.pt # OPTIONAL : the pt file contaning the pretrained linker (you need to install it)
│ └── supertagger.pt # the pt file contaning the pretrained supertagger (you need to install it)
├── Output # Directory where your linker models will be saved if checkpoint=True in train
├── TensorBoard # Directory where the stats will be saved if tensorboard=True in train
└── train.py # Example of train
│ ├── linker.pt # OPTIONAL : pretrained linker
│ └── supertagger.pt # pretrained supertagger
├── Output # Directory with model backups saved during training
├── TensorBoard # Directory with training stats
├── train_neuralproofnet.py # train the linker with the pretrained supertagger
├── train_supertagger.py # train the supertagger
├── predict_supertags.py # tags predictions
└── predict_links.py # links predictions
```
......@@ -68,15 +64,72 @@ The structure should look like this :
The sentences should be in a column "X", the links (atoms with an "_x" postfix) in a column "Y", and the categories in a column "Z".
For the links, each atom_x is paired with the one and only other atom_x in the sentence.
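For instance, a single row could look like this (purely illustrative values, modelled on the test data in `Linker/utils_linker.py`):

```
import pandas as pd

# Each "_x" index appears on exactly two atoms: the two ends of one axiom link.
df = pd.DataFrame({
    "X": ["le chat noir dort"],
    "Y": [['dr(0,np_1,n_2)', 'n_3', 'dl(0,n_3,n_2)', 'dl(0,np_1,s_4)', 'GOAL:s_4']],
    "Z": [['dr(0,np,n)', 'n', 'dl(0,n,n)', 'dl(0,np,s)']],
})
```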
### Utils
In order to load **m2_dataset.csv**, you can use `utils.read_csv_pgbar(...)`. This function returns a pandas DataFrame.
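For example (the dataset path is an assumption, adjust it to where your CSV lives):

```
from utils import read_csv_pgbar

# Load the first 1000 rows of the dataset into a DataFrame.
df = read_csv_pgbar("Datasets/m2_dataset.csv", 1000)
```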
## Training
### Training of supertagger
```
df = read_csv_pgbar(file_path,1000)
texts = df['X'].tolist()
tags = df['Z'].tolist()
# Dict to convert IDs to tokens (the dict is saved with the model for prediction)
index_to_super = load_obj('Datasets/index_to_super')
tagger = SuperTagger()
bert_name = 'camembert-base'
tagger.create_new_model(len(index_to_super), bert_name, index_to_super)
# You can load an existing model to retrain it
# tagger.load_weights("your/model/path")
tagger.train(texts, tags, checkpoint=True)
pred_without_argmax, pred_convert = tagger.predict(texts[7])
```
During training, if you use `checkpoint=True`, the model is automatically saved after each epoch in a folder named Training_XX-XX_XX-XX. Use `tensorboard=True` to write logs in the same folder (`tensorboard --logdir=logs` to view them).
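Continuing the example above, a hedged call showing the full set of training options (the keyword names and defaults come from `SuperTagger.train` in this repository):

```
# checkpoint and tensorboard both write into the Training_XX-XX_XX-XX folder
# created for this run.
tagger.train(texts, tags, validation_rate=0.1, epochs=20, batch_size=16,
             tensorboard=True, checkpoint=True)
```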
`bert_name` can be any model available on [Hugging Face](https://huggingface.co/models)
### Training of linker
Launch train.py; in that script you can point to another dataset file and another tagging model.
During training, if you use `checkpoint=True`, the model is automatically saved after each epoch in a folder named Training_XX-XX_XX-XX. Use `tensorboard=True` to write logs in the same folder (`tensorboard --logdir=logs` to view them).
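A minimal sketch, assuming a pretrained supertagger checkpoint in `models/` and the linked dataset described above (the paths, import path and epoch counts are illustrative):

```
from utils import read_csv_pgbar
from NeuralProofNet.NeuralProofNet import NeuralProofNet

df_axiom_links = read_csv_pgbar("Datasets/m2_dataset.csv", 1000)

# NeuralProofNet wraps the linker together with the pretrained supertagger.
proofnet = NeuralProofNet("models/supertagger.pt")
proofnet.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=20,
                              pretrain_linker_epochs=10, batch_size=32,
                              checkpoint=True, tensorboard=True)
```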
## Predicting
### Prediction of supertags
To predict on your own data you need to load a model (saved with this code).
```
df = read_csv_pgbar(file_path,20)
texts = df['X'].tolist()
tagger = SuperTagger()
tagger.load_weights("your/model/path")
pred_without_argmax, pred_convert = tagger.predict(texts[7])
print(pred_convert)
#['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)', 'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np', 'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']
```
### Prediction of links
To predict on your own data you need to load a model (saved with this code).
```
......@@ -85,9 +138,9 @@ links = linker.predict_without_categories("le chat est noir")
print(links)
```
The file ```postprocessing.py``` will allow you to draw the prediction. (limited sentence length otherwise it will be confusing)
The file ```postprocessing.py``` lets you draw the predictions with graphviz (which you need to install). Keep the sentences short when predicting, otherwise the graph becomes unreadable.
You can also use the function ```predict_without_categories``` which only needs the sentence.
You can also use the function ```predict_without_categories```, which only needs the sentence (it uses the supertagger to predict the tags), or ```predict_with_categories```, which lets you give the categories directly (useful to check the links without bias from the supertagger).
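A hedged example of ```predict_with_categories``` (the checkpoint paths and category strings are illustrative, and the expected shape of `categories`, one supertag per word, is an assumption):

```
from Linker import Linker

linker = Linker("models/supertagger.pt")   # pretrained supertagger checkpoint
linker.load_weights("models/linker.pt")    # pretrained linker checkpoint

categories = ['dr(0,np,n)', 'n', 'dl(0,n,n)', 'dl(0,np,s)']  # one supertag per word (assumed format)
links = linker.predict_with_categories("le chat noir dort", categories)
print(links)
```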
## LICENSE
......
File added
File added
import itertools
import pickle
import re
import numpy as np
import pandas as pd
"""
Format data for training supertagger from txt to csv and tags pkl
"""
# dr = /
# dl = \
def sub_tree_word(word_with_data: str):
s = word_with_data.split('|')
word = s[0]
tree = s[2]
tree = re.sub("dr", "/", tree)
tree = re.sub("dl", "\\\\", tree)
tree = re.sub("dia", "dia,", tree)
tree = re.sub("box", "box,", tree)
tree = re.sub("dl", "\\\\,", tree)
tree = re.sub(",\(1,|,\(0,|\(1,|\(0,", ",", tree)
tree = re.sub("|\)", "", tree)
return word, tree.split(',')
def sub_tree_line(line_with_data: str):
line_list = line_with_data.split()
sentence = ""
sub_trees = []
#sub_trees.append(["[START]"])
for word_with_data in line_list:
w, t = sub_tree_word(word_with_data)
sentence += ' ' +w
t.append("[SEP]")
sub_trees.append(t)
"""if ('ppp' in list(itertools.chain(*sub_trees))):
print(sentence)"""
sub_trees.append(["[SOS]"])
return sentence, list(itertools.chain(*sub_trees))
def Txt_to_csv(file_name: str, csv_name:str = "../Datasets/m2V2_dataset.csv"):
file = open(file_name, "r", encoding="utf8")
text = file.readlines()
sub = [sub_tree_line(data) for data in text]
df = pd.DataFrame(data=sub, columns = ['Sentences', 'sub_tree'])
df.to_csv(csv_name, index=False)
def normalize_word(orig_word):
word = orig_word.lower()
if (word == "["):
word = "("
if (word == "]"):
word = ")"
return word
def read_maxentdata(file):
with open(file, 'r', encoding="UTF8") as f:
vocabulary = set()
vnorm = set()
partsofspeech1 = set()
partsofspeech2 = set()
superset = set()
sentno = 0
maxlen = 0
words = ""
postags1 = []
postags2 = []
supertags = []
allwords = []
allpos1 = []
allpos2 = []
allsuper = []
for line in f:
line = line.strip().split()
length = len(line)
if (length > maxlen):
maxlen = length
for l in range(length):
item = line[l].split('|')
if len(item) > 2:
orig_word = item[0]
word = normalize_word(orig_word)
postag = item[1]
supertag = item[2]
poslist = postag.split('-')
pos1 = poslist[0]
pos2 = poslist[1]
vocabulary.add(orig_word)
vnorm.add(word)
partsofspeech1.add(pos1)
partsofspeech2.add(pos2)
superset.add(supertag)
# words += ' ' +(str(orig_word))
words += ' ' + (str(orig_word))
postags1.append(pos1)
postags2.append(pos2)
supertags.append(supertag)
allwords.append(words)
allpos1.append(postags1)
allpos2.append(postags2)
allsuper.append(supertags)
words = ""
postags1 = []
postags2 = []
supertags = []
X = np.asarray(allwords)
Y1 = np.asarray(allpos1)
Y2 = np.asarray(allpos2)
Z = np.asarray(allsuper)
return X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen
def save_obj(obj, name):
with open(name + '.pkl', 'wb+') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
# Format from txt to csv
# Txt_to_csv("m2.txt")
X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen = read_maxentdata("SuperTagger/Datasets/m2.txt")
df = pd.DataFrame({"X":X[:-1], "Y1":Y1[:-1], "Y2":Y2[:-1], "Z":Z[:-1]})
df.to_csv("SuperTagger/Datasets/m2_dataset_V2.csv", index=False)
# Dictionary for supertags
t = np.unique(np.array(list(itertools.chain(*Z))))
dict = { i : t[i] for i in range(0, len(t) ) }
save_obj(dict,"SuperTagger/Datasets/index_to_super")
# Dictionary for grammar tags (not used)
t = np.unique(np.array(list(itertools.chain(*Y1))))
dict = { i : t[i] for i in range(0, len(t) ) }
save_obj(dict,"SuperTagger/Datasets/index_to_pos1")
\ No newline at end of file
# DeepGrail
This repository contains a Python implementation of BertForTokenClassification using TLGbank data to develop
part-of-speech taggers and supertaggers.
This code was designed to work with the [DeepGrail Linker](https://gitlab.irit.fr/pnria/global-helper/deepgrail-linker)
to provide a wide-coverage syntactic and semantic parser for French. The Tagger is nevertheless independent, and you can use it with your own tags.
## Structure
```
.
├── Datasets # TLGbank data
└── SuperTagger # BertForTokenClassification
├── SuperTagger.py # Main class
├── Tagging_bert_model.py # Bert model
├── SymbolTokenizer # Tags tokenizer
├── SentencesTokenizer # Words tokenizer
└── helpers # utils
```
class SentencesTokenizer():
"""
Tokenizer for sentences: based on a pretrained tokenizer
Attributes:
----------
tokenizer : Tokenizer
Pretrained Tokenizer
max_length :
Maximal length of a sentence (i.e maximum number of words)
"""
def __init__(self, tokenizer, max_length):
"""
Parameters :
------------
tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text
max_length : Maximal length of a sentence
"""
self.tokenizer = tokenizer
self.max_length = max_length
def fit_transform(self, sents):
"""
Tokenizes the given sentences
"""
return self.tokenizer(sents, padding=True)
def fit_transform_tensors(self, sents):
"""
Tokenizes the sentences and returns tensor
"""
temp = self.tokenizer(sents, padding='max_length', truncation=True, return_tensors = 'pt', max_length=self.max_length)
return temp["input_ids"], temp["attention_mask"]
def convert_ids_to_tokens(self, inputs_ids, skip_special_tokens=False):
"""
Decodes a sentence.
"""
return self.tokenizer.batch_decode(inputs_ids, skip_special_tokens=skip_special_tokens)
import os
import sys
import time
import torch
import transformers
from torch.optim import Adam
from torch.utils.data import TensorDataset, random_split
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import logging
from Configuration import Configuration
from .SentencesTokenizer import SentencesTokenizer
from .SymbolTokenizer import SymbolTokenizer
from .Tagging_bert_model import Tagging_bert_model
from .eval import categorical_accuracy
from utils import format_time, output_create_dir
logging.set_verbosity(logging.ERROR)
# region Class
class SuperTagger:
"""
Implements the SuperTagger to assign each word a supertag (also named symbol). A supertag is a tree of tags such as np, s, ...
Attributes:
-----------
max_len_sentence : int
Maximum length of sentence, equals to the maximum number of supertags
index_to_tags : dict
num_label : int
number of possible supertags
bert_name :
name of BERT model
sent_tokenizer : Tokenizer
Tokenize words to word_token
tags_tokenizer : Tokenizer
Tokenize supertag to supertag_token
model : TokenClassifier
Model for classification of tokens. Classify word_token to supertag_token.
optimizer : Optimizer
Optimizer used to update the parameters during training
epoch_i : int
Current number of epoch
device : Device
CPU or cuda
trainable : bool
model_load : bool
"""
# region Instantiation
def __init__(self):
"""
Python implementation of BertForTokenClassification using TLGbank data to develop supertaggers.
"""
config = Configuration.read_config()
datasetConfig = config["DATASET_PARAMS"]
self.max_len_sentence = int(datasetConfig['max_len_sentence'])
self.index_to_tags = None
self.num_label = None
self.bert_name = None
self.sent_tokenizer = None
self.tags_tokenizer = None
self.model = None
self.optimizer = None
self.epoch_i = 0
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.trainable = False
self.model_load = False
def load_weights(self, model_file):
"""
Loads a SuperTagger saved with SuperTagger.__checkpoint_save() (during training) from a file.
Parameters:
-----------
model_file:
path of .pt save of model
"""
self.trainable = False
print("#" * 20)
print("\n Loading model for supertagger ...")
try:
params = torch.load(model_file, map_location=self.device)
args = params['args']
self.bert_name = args['bert_name']
self.index_to_tags = args['index_to_tags']
self.num_label = len(self.index_to_tags)
self.model = Tagging_bert_model(self.bert_name, self.num_label)
self.tags_tokenizer = SymbolTokenizer(self.index_to_tags)
self.sent_tokenizer = SentencesTokenizer(transformers.AutoTokenizer.from_pretrained(self.bert_name,do_lower_case=True),
self.max_len_sentence)
self.model.load_state_dict(params['state_dict'])
self.optimizer = params['optimizer']
# self.epoch_i = args['epoch']
print("\n The loading checkpoint was successful ! \n")
print("\tBert model : ", self.bert_name)
print("\tLast epoch : ", self.epoch_i)
print()
except Exception as e:
print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
raise e
print("#" * 20)
self.model_load = True
self.trainable = True
def create_new_model(self, num_label, bert_name, index_to_tags):
"""
Instantiation and parameterization of a new bert model.
Parameters:
-----------
num_label:
number of different labels (tags)
bert_name:
name of model available on Hugging Face `<https://huggingface.co/models>`
index_to_tags:
Dict for convert ID to tags
"""
assert len(
index_to_tags) == num_label, f"len(index_to_tags): {len(index_to_tags)} must equal num_label: {num_label}"
self.model = Tagging_bert_model(bert_name, num_label + 1)
index_to_tags = {k + 1: v for k, v in index_to_tags.items()}
# <unk> is used for the pad AND unknown tags
index_to_tags[0] = '<unk>'
self.index_to_tags = index_to_tags
self.bert_name = bert_name
self.sent_tokenizer = SentencesTokenizer(AutoTokenizer.from_pretrained(bert_name,do_lower_case=True),
self.max_len_sentence)
self.optimizer = Adam(params=self.model.parameters(), lr=2e-4, eps=1e-8)
self.tags_tokenizer = SymbolTokenizer(index_to_tags)
self.trainable = True
self.model_load = True
# endregion Instantiation
# region Usage
def predict(self, sentences):
"""
Predict and convert sentences into tags (depends on the dictionary given when the model was created)
Parameters:
-----------
sentences: list of sentences : list[str] OR one sentences : str
Returns:
--------
tags prediction for all sentences (no argmax tags, convert tags, embedding layer of bert )
"""
assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) " \
"function before the predict, the model is not integrated "
assert type(sentences) == str or type(sentences) == list, "param sentences: list of sentences : list[" \
"str] OR one sentences : str "
sentences = [sentences] if type(sentences) == str else sentences
self.model.eval()
with torch.no_grad():
sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
self.model = self.model.cpu()
output = self.model.predict((sents_tokenized_t, sents_mask_t))
return output['logit'], self.tags_tokenizer.convert_ids_to_tags(torch.argmax(output['logit'], dim=2).detach())
def forward(self, b_sents_tokenized, b_sents_mask):
"""
Forward to the model
"""
with torch.no_grad():
output = self.model.predict((b_sents_tokenized, b_sents_mask))
return output
def train(self, sentences, tags, validation_rate=0.1, epochs=20, batch_size=16,
tensorboard=False,
checkpoint=False):
"""
Starts the training of the model, either new or previously loaded
Parameters:
-----------
sentences: list of sentences for train (X)
tags: list of tags for train (Y)
validation_rate: percentage of validation data [0-1]
epochs: number of epoch (50 recommended)
batch_size: number of sample in batch (32 recommended, attention to memory)
tensorboard: use tensorboard for see loss and accuracy
checkpoint: save the model after each epoch
"""
assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the train, the model is not integrated"
assert len(sentences) == len(
tags), f" num of sentences (X): {len(sentences)} must be equals with num of labels " \
f"(Y): {len(tags)} "
if checkpoint or tensorboard:
checkpoint_dir, writer = output_create_dir()
training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, sentences, tags,
1 - validation_rate)
epochs = epochs - self.epoch_i
self.model = self.model.to(self.device)
self.model.train()
for epoch_i in range(0, epochs):
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i+1, epochs))
print('Training...')
# Train
epoch_acc, epoch_loss, training_time = self.__train_epoch(training_dataloader)
# Validation
if validation_rate > 0.0:
eval_accuracy, eval_loss, nb_eval_steps = self.__eval_epoch(validation_dataloader)
print("")
print(f'Epoch: {epoch_i+1:02} | Epoch Time: {training_time}')
print(f'\tTrain Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc * 100:.2f}%')
if validation_rate > 0.0:
print(f'\tVal Loss: {eval_loss:.3f} | Val Acc: {eval_accuracy * 100:.2f}%')
if tensorboard:
writer.add_scalars(f'Accuracy', {
'Train': epoch_acc}, epoch_i+1)
writer.add_scalars(f'Loss', {
'Train': epoch_loss}, epoch_i+1)
if validation_rate > 0.0:
writer.add_scalars(f'Accuracy', {
'Validation': eval_accuracy}, epoch_i+1)
writer.add_scalars(f'Loss', {
'Validation': eval_loss}, epoch_i+1)
self.epoch_i += 1
if checkpoint:
self.__checkpoint_save(path=os.path.join(checkpoint_dir, 'model_check.pt'))
# endregion Usage
# region Private
def __preprocess_data(self, batch_size, sentences, tags,
validation_rate):
"""
Create torch dataloader for training
Parameters:
-----------
batch_size: number of sample in batch
sentences: list of sentences for train (X)
tags: list of tags for train (Y)
validation_rate: percentage of validation data [0-1]
Returns:
--------
training dataloader, validation dataloader
"""
validation_dataloader = None
sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
tags_t = self.tags_tokenizer.convert_batchs_to_ids(tags, sents_tokenized_t)
dataset = TensorDataset(sents_tokenized_t, sents_mask_t, tags_t)
train_size = int(validation_rate * len(dataset))
print('{:>5,} training samples'.format(train_size))
if validation_rate < 1:
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} validation samples'.format(val_size))
validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
else:
train_dataset = dataset
training_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
return training_dataloader, validation_dataloader
def __train_epoch(self, training_dataloader):
"""
Train on epoch
Parameters:
-----------
training_dataloader: dataloader of training data
Returns:
--------
epoch accuracy, epoch loss, training time
"""
self.model.train()
epoch_loss = 0
epoch_acc = 0
t0 = time.time()
i = 0
with tqdm(training_dataloader, unit="batch") as tepoch:
for batch in tepoch:
# Convert to device
b_sents_tokenized = batch[0].to(self.device)
b_sents_mask = batch[1].to(self.device)
targets = batch[2].to(self.device)
self.optimizer.zero_grad()
output = self.model((b_sents_tokenized, b_sents_mask, targets))
loss = output['loss']
predictions = torch.argmax(output['logit'], dim=2).detach().cpu().numpy()
label_ids = targets.cpu().numpy()
acc = categorical_accuracy(predictions, label_ids)
loss.backward()
epoch_acc += acc
epoch_loss += loss.item()
self.optimizer.step()
i += 1
# Measure how long this epoch took.
training_time = format_time(time.time() - t0)
epoch_acc = epoch_acc / i
epoch_loss = epoch_loss / i
return epoch_acc, epoch_loss, training_time
def __eval_epoch(self, validation_dataloader):
"""
Validation on epoch
Parameters:
-----------
validation_dataloader: dataloader of validation data
Returns:
--------
epoch accuracy, epoch loss, num step
"""
self.model.eval()
eval_loss = 0
eval_accuracy = 0
nb_eval_steps, nb_eval_examples = 0, 0
with torch.no_grad():
print("Start eval")
for step, batch in enumerate(validation_dataloader):
# Convert to device
b_sents_tokenized = batch[0].to(self.device)
b_sents_mask = batch[1].to(self.device)
b_symbols_tokenized = batch[2].to(self.device)
output = self.model((b_sents_tokenized, b_sents_mask, b_symbols_tokenized))
loss = output['loss']
predictions = torch.argmax(output['logit'], dim=2).detach().cpu().numpy()
label_ids = b_symbols_tokenized.cpu().numpy()
accuracy = categorical_accuracy(predictions, label_ids)
eval_loss += loss.item()
eval_accuracy += accuracy
nb_eval_examples += b_sents_tokenized.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_steps
return eval_accuracy, eval_loss, nb_eval_steps
def __checkpoint_save(self, path='/model_check.pt'):
"""
Save the model with good parameters
Parameters:
-----------
path: path and file name for the save
"""
self.model.cpu()
# print('save model parameters to [%s]' % path, file=sys.stderr)
torch.save({
'args': dict(bert_name=self.bert_name, index_to_tags=self.index_to_tags, epoch=self.epoch_i),
'state_dict': self.model.state_dict(),
'optimizer': self.optimizer,
}, path)
self.model.to(self.device)
# endregion Private
# endregion Class
\ No newline at end of file
import pickle
import numpy as np
import torch
def load_obj(name):
with open(name + '.pkl', 'rb') as f:
return pickle.load(f)
def pad_sequence(sequences, max_len=400):
padded = [0] * max_len
padded[:len(sequences)] = sequences
return padded
class SymbolTokenizer():
"""
Tokenizer for tags: based on a dictionary
Attributes:
----------
index_to_super : dict
Convert id to supertag
super_to_index : dict
Convert supertag to id
"""
def __init__(self, index_to_super):
"""
Parameters:
-----------
index_to_super: dict to convert IDs to tags """
self.index_to_super = index_to_super
self.super_to_index = {v: int(k) for k, v in self.index_to_super.items()}
def lenSuper(self):
"""Returns len of dict for convert ID to tags """
return len(self.index_to_super) + 1
def convert_batchs_to_ids(self, tags, sents_tokenized):
"""
Convert batch of tags to id
"""
encoded_labels = []
labels = [[self.super_to_index[str(symbol)] for symbol in sents] for sents in tags]
for l, s in zip(labels, sents_tokenized):
super_tok = pad_sequence(l, len(s))
encoded_labels.append(super_tok)
return torch.tensor(encoded_labels)
def convert_ids_to_tags(self, tags_ids):
labels = [[self.index_to_super[int(symbol)] for symbol in sents if self.index_to_super[int(symbol)] != '<unk>']
for sents in tags_ids]
return labels
import torch
import transformers
from torch.nn import Module
from transformers import logging
class Tagging_bert_model(Module):
""" Implements a Token Classification model with transformers library.
Attributes:
-----------
bert_name : str
Name of BERT model to upload
num_labels : int
number of possible supertags
config : transformer Config
bert : TokenClassification model
"""
def __init__(self, bert_name, num_labels):
super(Tagging_bert_model, self).__init__()
self.bert_name = bert_name
self.num_labels = num_labels
config = transformers.AutoConfig.from_pretrained(bert_name, output_hidden_states=True, num_labels=num_labels)
self.bert = transformers.AutoModelForTokenClassification.from_pretrained(bert_name, config=config)
def forward(self, batch):
"""
Forward to the model.
Parameters:
-----------
batch :
batch of tokenized sentences
Returns:
--------
result : dict containing logit, word_embedding and last_hidden_state
"""
b_input_ids = batch[0]
b_input_mask = batch[1]
labels = batch[2]
output = self.bert(
input_ids=b_input_ids, attention_mask=b_input_mask, labels=labels)
result = {'loss': output[0],'logit': output[1], 'word_embedding': output[2][0], 'last_hidden_state': output[2][1]}
return result
def predict(self, batch):
"""
Prediction of supertags for a batch of sentences
Parameters:
-----------
batch :
batch of tokenized sentences
Returns:
--------
result : dict containing logit, word_embedding and last_hidden_state
"""
b_input_ids = batch[0]
b_input_mask = batch[1]
output = self.bert(
input_ids=b_input_ids, attention_mask=b_input_mask)
result = {'logit' : output[0], 'word_embedding': output[1][0], 'last_hidden_state':output[1][1]}
return result
def categorical_accuracy(preds, truth):
"""
Calculates how often predictions match argmax labels.
preds: batch of prediction. (argmax)
truth: batch of truth label.
@return: scoring of batch prediction. (Categorical accuracy values)
"""
good_label = 0
nb_label = 0
for i in range(len(truth)):
sublist_truth = truth[i]
sublist_preds = preds[i]
for j in range(len(sublist_truth)):
if sublist_truth[j] != 0:
if sublist_truth[j] == sublist_preds[j]:
good_label += 1
nb_label += 1
return good_label / nb_label
\ No newline at end of file
from .SuperTagger.SuperTagger import SuperTagger
\ No newline at end of file
[metadata]
name = SuperTagger
version = 1.0