From ea12a5e78085dc81b9f9bebb51ce068cd5346e2c Mon Sep 17 00:00:00 2001
From: PNRIA - Julien <julien.rabault@irit.fr>
Date: Tue, 17 May 2022 17:50:58 +0200
Subject: [PATCH] Add comment on code

---
 README.md                               |   7 +-
 SuperTagger/SuperTagger.py              | 155 +++++++++++++++---------
 SuperTagger/Utils/Tagging_bert_model.py |   3 -
 SuperTagger/Utils/utils.py              |   7 +-
 4 files changed, 108 insertions(+), 64 deletions(-)

diff --git a/README.md b/README.md
index 5ece8c3..23a2c05 100644
--- a/README.md
+++ b/README.md
@@ -74,14 +74,15 @@ tagger.create_new_model(len(index_to_super), bert_name, index_to_super)
 # You can load your model for re-train this
 # tagger.load_weights("your/model/path")
 
-tagger.train(texts,tags, checkpoint=True)
+tagger.train(texts, tags, checkpoint=True)
 
 pred_without_argmax, pred_convert, bert_hidden_state = tagger.predict(texts[7])
 ```
 
-In train, if you use `checkpoint=True`, the model is automatically saved in a folder: Training_XX-XX_XX-XX. It saves after each epoch.
-Use `tensorboard=True` for log in same folder. (`tensorboard --logdir=logs` for see logs)
+During training, if you use `checkpoint=True`, the model is automatically saved after each epoch in a folder named
+Training_XX-XX_XX-XX. Use `tensorboard=True` to write logs to the same folder (run `tensorboard --logdir=logs` to view them).
 
+`bert_name` can be the name of any model available on [Hugging Face](https://huggingface.co/models).
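+
+For example (illustrative; any compatible checkpoint name can be substituted):
+
+```python
+bert_name = "camembert-base"  # illustrative checkpoint name; any Hugging Face model should work
+tagger.create_new_model(len(index_to_super), bert_name, index_to_super)
+```
+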
 ## Authors
 
 [Rabault Julien](https://www.linkedin.com/in/julienrabault), de Pourtales Caroline
\ No newline at end of file
diff --git a/SuperTagger/SuperTagger.py b/SuperTagger/SuperTagger.py
index 047acb4..b49485e 100644
--- a/SuperTagger/SuperTagger.py
+++ b/SuperTagger/SuperTagger.py
@@ -5,7 +5,7 @@ import time
 
 import torch
 import transformers
-from torch import nn, Tensor
+from torch import Tensor
 from torch.optim import Adam
 from torch.utils.data import Dataset, TensorDataset, random_split, DataLoader
 from torch.utils.tensorboard import SummaryWriter
@@ -20,10 +20,12 @@ from SuperTagger.Utils.Tagging_bert_model import Tagging_bert_model
 logging.set_verbosity(logging.ERROR)
 
 
+# region Utils
+
 def output_create_dir():
     """
-
-    @return:
+    Creates the output directory for tensorboard logs and checkpoints.
+    @return: output dir, tensorboard writer
     """
     from datetime import datetime
     outpout_path = 'TensorBoard'
@@ -35,10 +37,10 @@ def output_create_dir():
 
 def categorical_accuracy(preds: list[list[int]], truth: list[list[int]]) -> float:
     """
-
-    @param preds:
-    @param truth:
-    @return:
+    Calculates how often predictions match the ground-truth labels (categorical accuracy).
+    @param preds: batch of predictions (after argmax)
+    @param truth: batch of ground-truth labels
+    @return: categorical accuracy of the batch of predictions
     """
     good_label = 0
     nb_label = 0
@@ -64,11 +66,17 @@ def format_time(elapsed):
     return str(datetime.timedelta(seconds=elapsed_rounded))
 
 
+# endregion Utils
+
+# region Class
+
 class SuperTagger:
 
+    # region Constructor
+
     def __init__(self):
         """
-
+        Python implementation of BertForTokenClassification using TLGbank data to develop supertaggers.
         """
         self.index_to_tags = None
         self.num_label = None
@@ -81,15 +89,19 @@ class SuperTagger:
 
         self.epoch_i = 0
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-        self.loss = None
 
         self.trainable = False
         self.model_load = False
 
+    # endregion Constructor
+
+    # region Instantiation
+
     def load_weights(self, model_file: str):
         """
-        yo mec
-        @param model_file:
+        Loads a SuperTagger saved with SuperTagger.__checkpoint_save() (i.e. a training checkpoint) from a file.
+
+        @param model_file: path to the saved .pt model file
         """
         self.trainable = False
 
@@ -107,10 +119,10 @@ class SuperTagger:
                 do_lower_case=True))
             self.model.load_state_dict(params['state_dict'])
             self.optimizer = params['optimizer']
-            self.epoch_i = args['epoch']
+            # self.epoch_i = args['epoch']
             print("\n The loading checkpoint was successful ! \n")
             print("\tBert model : ", self.bert_name)
-            # print("\tLast epoch : ", self.epoch_i)
+            print("\tLast epoch : ", self.epoch_i)
             print()
         except Exception as e:
             print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
@@ -122,35 +134,45 @@ class SuperTagger:
 
     def create_new_model(self, num_label: int, bert_name: str, index_to_tags: dict):
         """
+        Instantiates and configures a new BERT tagging model.
 
-        @param num_label:
-        @param bert_name:
-        @param index_to_tags:
+        @param num_label: number of different labels (tags)
+        @param bert_name: name of a model available on Hugging Face `<https://huggingface.co/models>`
+        @param index_to_tags: dict mapping label IDs to tags
         """
         assert len(
             index_to_tags) == num_label, f" len(index_to_tags) : {len(index_to_tags)} must be equels with num_label: {num_label}"
 
         self.model = Tagging_bert_model(bert_name, num_label + 1)
         index_to_tags = {k + 1: v for k, v in index_to_tags.items()}
+        # <unk> is used both for padding and for unknown tags
         index_to_tags[0] = '<unk>'
+
         self.index_to_tags = index_to_tags
         self.bert_name = bert_name
         self.sent_tokenizer = SentencesTokenizer(AutoTokenizer.from_pretrained(
             bert_name,
             do_lower_case=True))
-        self.loss = nn.CrossEntropyLoss(ignore_index=0)
         self.optimizer = Adam(params=self.model.parameters(), lr=2e-4, eps=1e-8)
         self.tags_tokenizer = SymbolTokenizer(index_to_tags)
         self.trainable = True
         self.model_load = True
 
-    def predict(self, sentences: list[str]) -> (list[list[list[float]]], list[list[str]], Tensor):
+    # endregion Instantiation
+
+    # region Usage
+
+    def predict(self, sentences) -> (list[list[list[float]]], list[list[str]], Tensor):
         """
+        Predicts and converts sentences into tags (based on the tag dictionary given when the model was created).
 
-        @param sentences:
-        @return:
+        @param sentences: list of sentences (list[str]) OR a single sentence (str)
+        @return: tag predictions for all sentences (raw predictions before argmax, converted tags, BERT hidden state)
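+
+        Example (illustrative, assuming the model has already been created or loaded):
+
+            pred_without_argmax, pred_convert, bert_hidden_state = tagger.predict("Le chat dort .")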
         """
-        assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the predict, the model is not integrated"
+        assert self.trainable or self.model is None, "Please use create_new_model(...) or load_weights(...) " \
+                                                     "before predict: the model is not initialised"
+        assert isinstance(sentences, (str, list)), "param sentences must be a list of sentences (list[str]) " \
+                                                   "or a single sentence (str)"
         sentences = [sentences] if type(sentences) == str else sentences
 
         self.model.eval()
@@ -163,21 +185,34 @@ class SuperTagger:
 
             return preds, self.tags_tokenizer.convert_ids_to_tags(torch.argmax(preds, dim=2).detach()), hidden
 
-    def train(self, sentences: list[str], tags: list[list[str]], validation_rate=0.1, epochs=20, batch_size=32,
+    def forward(self, b_sents_tokenized: Tensor, b_sents_mask: Tensor) -> (Tensor, Tensor):
+        """
+        Forward pass used by the linker (same as predict, but takes already tokenized sentences and their mask).
+        """
+        with torch.no_grad():
+            logit, hidden = self.model.predict((b_sents_tokenized, b_sents_mask))
+            return logit, hidden
+
+    def train(self, sentences: list[str], tags: list[list[str]], validation_rate=0.1, epochs=20, batch_size=16,
               tensorboard=False,
               checkpoint=False):
         """
-
-        @param sentences:
-        @param tags:
-        @param validation_rate:
-        @param epochs:
-        @param batch_size:
-        @param tensorboard:
-        @param checkpoint:
+        Starts the training of the model, either newly created or previously loaded.
+
+        @param sentences: list of training sentences (X)
+        @param tags: list of training tag sequences (Y)
+        @param validation_rate: fraction of the data used for validation [0-1]
+        @param epochs: number of epochs (50 recommended)
+        @param batch_size: number of samples per batch (32 recommended, watch memory usage)
+        @param tensorboard: use tensorboard to log loss and accuracy
+        @param checkpoint: save the model after each epoch
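+
+        Example (illustrative call, values are only suggestions):
+
+            tagger.train(sentences, tags, validation_rate=0.1, epochs=20,
+                         batch_size=16, tensorboard=True, checkpoint=True)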
         """
         assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the train, the model is not integrated"
 
+        assert len(sentences) == len(
+            tags), f"num of sentences (X): {len(sentences)} must equal num of tag sequences " \
+                   f"(Y): {len(tags)}"
+
         if checkpoint or tensorboard:
             checkpoint_dir, writer = output_create_dir()
 
@@ -189,49 +224,59 @@ class SuperTagger:
 
         for epoch_i in range(0, epochs):
             print("")
-            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+            print('======== Epoch {:} / {:} ========'.format(epoch_i, epochs))
             print('Training...')
 
+            # Train
             epoch_acc, epoch_loss, training_time = self.__train_epoch(training_dataloader)
 
+            # Validation
             if validation_rate > 0.0:
                 eval_accuracy, eval_loss, nb_eval_steps = self.__eval_epoch(validation_dataloader)
 
             print("")
-            print(f'Epoch: {epoch_i + 1:02} | Epoch Time: {training_time}')
+            print(f'Epoch: {epoch_i:02} | Epoch Time: {training_time}')
             print(f'\tTrain Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc * 100:.2f}%')
             if validation_rate > 0.0:
                 print(f'\tVal Loss: {eval_loss:.3f} | Val Acc: {eval_accuracy * 100:.2f}%')
 
             if tensorboard:
                 writer.add_scalars(f'Accuracy', {
-                    'Train': epoch_acc}, epoch_i + 1)
+                    'Train': epoch_acc}, epoch_i)
                 writer.add_scalars(f'Loss', {
-                    'Train': epoch_loss}, epoch_i + 1)
+                    'Train': epoch_loss}, epoch_i)
                 if validation_rate > 0.0:
                     writer.add_scalars(f'Accuracy', {
-                        'Validation': eval_accuracy}, epoch_i + 1)
+                        'Validation': eval_accuracy}, epoch_i)
                     writer.add_scalars(f'Loss', {
-                        'Validation': eval_loss}, epoch_i + 1)
+                        'Validation': eval_loss}, epoch_i)
+
+            self.epoch_i += 1
 
             if checkpoint:
                 self.__checkpoint_save(path=os.path.join(checkpoint_dir, 'model_check.pt'))
 
+    # endregion Usage
+
+    # region Private
+
     def __preprocess_data(self, batch_size: int, sentences: list[str], tags: list[list[str]],
                           validation_rate: float) -> (DataLoader, DataLoader):
         """
+        Creates the torch dataloaders for training and validation.
 
-        @param batch_size:
-        @param sentences:
-        @param tags:
-        @param validation_rate:
-        @return:
+        @param batch_size: number of samples per batch
+        @param sentences: list of training sentences (X)
+        @param tags: list of training tag sequences (Y)
+        @param validation_rate: fraction of the data used for validation [0-1]
+        @return: (training dataloader, validation dataloader)
         """
         validation_dataloader = None
 
         sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
         tags_t = self.tags_tokenizer.convert_batchs_to_ids(tags, sents_tokenized_t)
         dataset = TensorDataset(sents_tokenized_t, sents_mask_t, tags_t)
+
         train_size = int(validation_rate * len(dataset))
         print('{:>5,} training samples'.format(train_size))
 
@@ -247,9 +292,10 @@ class SuperTagger:
 
     def __train_epoch(self, training_dataloader: DataLoader) -> (float, float, str):
         """
+        Trains the model for one epoch.
 
-        @param training_dataloader:
-        @return:
+        @param training_dataloader: dataloader of training data
+        @return: (epoch accuracy, epoch loss, training time)
         """
         self.model.train()
         epoch_loss = 0
@@ -258,7 +304,7 @@ class SuperTagger:
         i = 0
         with tqdm(training_dataloader, unit="batch") as tepoch:
             for batch in tepoch:
-                # Unpack this training batch from our dataloader.
+                # Move the batch to the device
                 b_sents_tokenized = batch[0].to(self.device)
                 b_sents_mask = batch[1].to(self.device)
                 targets = batch[2].to(self.device)
@@ -287,17 +333,13 @@ class SuperTagger:
 
         return epoch_acc, epoch_loss, training_time
 
-    def foward(self, b_sents_tokenized: Tensor, b_sents_mask: Tensor) -> (Tensor, Tensor):
+    def __eval_epoch(self, validation_dataloader: DataLoader) -> (float, float, int):
         """
+        Evaluates the model for one epoch on the validation set.
 
-        @param b_sents_tokenized:
-        @param b_sents_mask:
-        @return:
+        @param validation_dataloader: dataloader of validation data
+        @return: (epoch accuracy, epoch loss, number of evaluation steps)
         """
-        _, logit, hidden = self.model((b_sents_tokenized, b_sents_mask))
-        return logit, hidden
-
-    def __eval_epoch(self, validation_dataloader: DataLoader) -> (float, float, int):
         self.model.eval()
         eval_loss = 0
         eval_accuracy = 0
@@ -305,6 +347,7 @@ class SuperTagger:
         with torch.no_grad():
             print("Start eval")
             for step, batch in enumerate(validation_dataloader):
+                # Move the batch to the device
                 b_sents_tokenized = batch[0].to(self.device)
                 b_sents_mask = batch[1].to(self.device)
                 b_symbols_tokenized = batch[2].to(self.device)
@@ -326,8 +369,8 @@ class SuperTagger:
 
     def __checkpoint_save(self, path='/model_check.pt'):
         """
-
-        @param path:
+        Saves the model and its current training state (checkpoint).
+        @param path: path and file name of the checkpoint
         """
         self.model.cpu()
         # print('save model parameters to [%s]' % path, file=sys.stderr)
@@ -338,3 +381,7 @@ class SuperTagger:
             'optimizer': self.optimizer,
         }, path)
         self.model.to(self.device)
+
+    # endregion Private
+
+# endregion Class
diff --git a/SuperTagger/Utils/Tagging_bert_model.py b/SuperTagger/Utils/Tagging_bert_model.py
index da37bfe..83ef46f 100644
--- a/SuperTagger/Utils/Tagging_bert_model.py
+++ b/SuperTagger/Utils/Tagging_bert_model.py
@@ -7,10 +7,7 @@ from transformers import logging
 
 class Tagging_bert_model(Module):
     """
-    A standard Encoder-Decoder architecture. Base for this and many
-    other models.
 
-    decoder : instance of Decoder
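+    BERT-based model for token-level tag prediction (supertagging).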
     """
 
     def __init__(self, bert_name, num_labels):
diff --git a/SuperTagger/Utils/utils.py b/SuperTagger/Utils/utils.py
index 863e7e2..7c4a2a0 100644
--- a/SuperTagger/Utils/utils.py
+++ b/SuperTagger/Utils/utils.py
@@ -22,12 +22,14 @@ def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=100):
     print("#" * 20)
     return df
 
+
 def load_obj(name):
     with open(name + '.pkl', 'rb') as f:
         import pickle
         return pickle.load(f)
 
-def categorical_accuracy_str(preds : list[list[float]], truth: list[list[float]]) -> float:
+
+def categorical_accuracy_str(preds: list[list[float]], truth: list[list[float]]) -> float:
     nb_label = 0
     good_label = 0
     for i in range(len(truth)):
@@ -39,6 +41,3 @@ def categorical_accuracy_str(preds : list[list[float]], truth: list[list[float]]
             nb_label += 1
 
     return good_label / nb_label
-
-
-
-- 
GitLab