From 3672f80350587cbff5ce6990355412f84107fd82 Mon Sep 17 00:00:00 2001
From: CarodePourtales <49019500+CarodePourtales@users.noreply.github.com>
Date: Fri, 8 Jul 2022 12:05:54 +0200
Subject: [PATCH] cleaning

---
 Linker/AtomTokenizer.py | 23 +++++++++++++++++
 Linker/Linker.py        | 56 +++++++++++++++++++++++++++++++++--------
 Linker/README.md        | 50 ------------------------------------
 Linker/eval.py          |  5 +++-
 README.md               | 47 +++++++++++++++++++++++++++++++++-
 init.sh                 |  3 +++
 train.py                |  8 ++++--
 7 files changed, 127 insertions(+), 65 deletions(-)
 delete mode 100644 Linker/README.md
 create mode 100644 init.sh

diff --git a/Linker/AtomTokenizer.py b/Linker/AtomTokenizer.py
index c72b73e..1f5c1a1 100644
--- a/Linker/AtomTokenizer.py
+++ b/Linker/AtomTokenizer.py
@@ -3,6 +3,9 @@ from utils import pad_sequence
 
 
 class AtomTokenizer(object):
+    r"""
+    Tokenizer mapping atoms to ids, with padding
+    """
     def __init__(self, atom_map, max_atoms_in_sentence):
         self.atom_map = atom_map
         self.max_atoms_in_sentence = max_atoms_in_sentence
@@ -14,14 +17,34 @@ class AtomTokenizer(object):
         return len(self.atom_map)
 
     def convert_atoms_to_ids(self, atom):
+        r"""
+        Convert an atom to its id
+        :param atom: atom string
+        :return: atom id
+        """
         return self.atom_map[str(atom)]
 
     def convert_sents_to_ids(self, sentences):
+        r"""
+        Convert the atoms of a sentence to their ids
+        :param sentences: list of atoms in a sentence
+        :return: tensor of atom ids
+        """
         return torch.as_tensor([self.convert_atoms_to_ids(atom) for atom in sentences])
 
     def convert_batchs_to_ids(self, batchs_sentences):
+        r"""
+        Convert a batch of sentences of atoms to their ids
+        :param batchs_sentences: batch of sentences of atoms
+        :return: padded tensor of atom ids
+        """
         return torch.as_tensor(pad_sequence([self.convert_sents_to_ids(sents) for sents in batchs_sentences],
                                             max_len=self.max_atoms_in_sentence, padding_value=self.pad_token_id))
 
     def convert_ids_to_atoms(self, ids):
+        r"""
+        Translate ids back to atoms
+        :param ids: list of atom ids
+        :return: list of atom strings
+        """
         return [self.inverse_atom_map[int(i)] for i in ids]
diff --git a/Linker/Linker.py b/Linker/Linker.py
index 2f10844..5223065 100644
--- a/Linker/Linker.py
+++ b/Linker/Linker.py
@@ -19,7 +19,7 @@ from Linker.AtomTokenizer import AtomTokenizer
 from Linker.PositionalEncoding import PositionalEncoding
 from Linker.Sinkhorn import sinkhorn_fn_no_exp as sinkhorn
 from Linker.atom_map import atom_map, atom_map_redux
-from Linker.eval import mesure_accuracy, SinkhornLoss
+from Linker.eval import measure_accuracy, SinkhornLoss
 from Linker.utils_linker import FFN, get_axiom_links, get_GOAL, get_pos_idx, get_neg_idx, get_atoms_batch, \
     find_pos_neg_idexes, get_num_atoms_batch
 from SuperTagger import SuperTagger
@@ -315,7 +315,7 @@ class Linker(Module):
                 self.optimizer.step()
 
                 pred_axiom_links = torch.argmax(logits_predictions, dim=3)
-                accuracy_train += mesure_accuracy(batch_true_links, pred_axiom_links, self.max_atoms_in_one_type)
+                accuracy_train += measure_accuracy(batch_true_links, pred_axiom_links, self.max_atoms_in_one_type)
 
         self.scheduler.step()
 
@@ -346,7 +346,7 @@ class Linker(Module):
         print("Les prédictions : ", axiom_links_pred[2][1][:100])
         print('\n')
 
-        accuracy = mesure_accuracy(batch_true_links, axiom_links_pred, self.max_atoms_in_one_type)
+        accuracy = measure_accuracy(batch_true_links, axiom_links_pred, self.max_atoms_in_one_type)
         loss = self.cross_entropy_loss(logits_predictions, batch_true_links, self.max_atoms_in_one_type)
 
         return loss, accuracy
@@ -368,7 +368,7 @@ class Linker(Module):
 
         return loss_average / len(dataloader), accuracy_average / len(dataloader)
 
-    def predict(self, sentence, categories):
+    def predict_with_categories(self, sentence, categories):
         r""" Predict the links from a sentence and its categories
 
         Args :
@@ -377,23 +377,23 @@ class Linker(Module):
         """
         self.eval()
         with torch.no_grad():
+            self.cpu()
+            self.device = torch.device("cpu")
             sentences_tokens, sentences_mask = self.Supertagger.sent_tokenizer.fit_transform_tensors([sentence])
-            sentences_tokens = sentences_tokens.to(self.device)
             nb_sentence, len_sentence = sentences_tokens.shape
-            sentences_mask = sentences_mask.to(self.device)
 
             atoms = get_atoms_batch([categories])
-            atoms_tokenized = self.atoms_tokenizer.convert_batchs_to_ids(atoms).to(self.device)
+            atoms_tokenized = self.atoms_tokenizer.convert_batchs_to_ids(atoms)
 
             polarities = find_pos_neg_idexes([categories])
             polarities = pad_sequence(
                 [torch.as_tensor(polarities[i], dtype=torch.bool) for i in range(len(polarities))],
-                max_len=self.max_atoms_in_sentence, padding_value=0).to(self.device)
+                max_len=self.max_atoms_in_sentence, padding_value=0)
 
-            num_atoms_per_word = get_num_atoms_batch([categories], len_sentence).to(self.device)
+            num_atoms_per_word = get_num_atoms_batch([categories], len_sentence)
 
-            pos_idx = get_pos_idx(atoms, polarities, self.max_atoms_in_one_type).to(self.device)
-            neg_idx = get_neg_idx(atoms, polarities, self.max_atoms_in_one_type).to(self.device)
+            pos_idx = get_pos_idx(atoms, polarities, self.max_atoms_in_one_type)
+            neg_idx = get_neg_idx(atoms, polarities, self.max_atoms_in_one_type)
 
             output = self.Supertagger.forward(sentences_tokens, sentences_mask)
 
@@ -402,6 +402,40 @@ class Linker(Module):
 
         return axiom_links_pred
 
+    def predict_without_categories(self, sentence):
+        r""" Predict the links from a sentence
+
+        Args :
+            sentence : list of words composing the sentence
+        """
+        self.eval()
+        with torch.no_grad():
+            self.cpu()
+            self.device = torch.device("cpu")
+            sentences_tokens, sentences_mask = self.Supertagger.sent_tokenizer.fit_transform_tensors([sentence])
+            nb_sentence, len_sentence = sentences_tokens.shape
+
+            hidden_state, categories = self.Supertagger.predict(sentence)
+
+            output = self.Supertagger.forward(sentences_tokens, sentences_mask)
+            atoms = get_atoms_batch(categories)
+            atoms_tokenized = self.atoms_tokenizer.convert_batchs_to_ids(atoms)
+
+            polarities = find_pos_neg_idexes(categories)
+            polarities = pad_sequence(
+                [torch.as_tensor(polarities[i], dtype=torch.bool) for i in range(len(polarities))],
+                max_len=self.max_atoms_in_sentence, padding_value=0)
+
+            num_atoms_per_word = get_num_atoms_batch(categories, len_sentence)
+
+            pos_idx = get_pos_idx(atoms, polarities, self.max_atoms_in_one_type)
+            neg_idx = get_neg_idx(atoms, polarities, self.max_atoms_in_one_type)
+
+            logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embeding'])
+            axiom_links_pred = torch.argmax(logits_predictions, dim=3)
+
+        return axiom_links_pred
+
     def load_weights(self, model_file):
         print("#" * 15)
         try:
diff --git a/Linker/README.md b/Linker/README.md
deleted file mode 100644
index d6903fa..0000000
--- a/Linker/README.md
+++ /dev/null
@@ -1,50 +0,0 @@
-# DeepGrail Linker
-
-This repository contains a Python implementation of a Neural Proof Net using TLGbank data.
-
-This code was designed to work with the [DeepGrail Tagger](https://gitlab.irit.fr/pnria/global-helper/deepgrail_tagger). 
-In this repository we only use the embedding of the word from the tagger and the tags from the dataset, but next step is to use the prediction of the tagger for the linking step.
- 
-## Usage
-
-### Installation
-Python 3.9.10 **(Warning don't use Python 3.10**+**)**
-Clone the project locally.
-
-### Libraries installation
-
-In a clean python venv do `pip install -r requirements.txt`
-
-### Dataset format
-
-The sentences should be in a column "X", the links with '_x' postfix should be in a column "Y" and the categories in a column "Z".
-For the links each atom_x goes with the one and only other atom_x in the sentence.
-
-## Training
-
-Launch train.py, if you look at it you can give another dataset file and another tagging model.
-
-In train, if you use `checkpoint=True`, the model is automatically saved in a folder: Training_XX-XX_XX-XX. It saves
-after each epoch. Use `tensorboard=True` for log in same folder. (`tensorboard --logdir=logs` for see logs)
-
-## Predicting
-
-For predict on your data you need to load a model (save with this code).
-
-```
-df = read_csv_pgbar(file_path,20)
-texts = df['X'].tolist()
-categories = df['Z'].tolist()
-
-linker = Linker(tagging_model)
-linker.load_weights("your/linker/path")
-
-links = linker.predict(texts[7], categories[7])
-print(links)
-```
-
-The file ```postprocessing.py``` will allow you to draw the prediction. (limited sentence length otherwise it will be confusing) 
-
-## Authors
-
-[de Pourtales Caroline](https://www.linkedin.com/in/caroline-de-pourtales/), [Rabault Julien](https://www.linkedin.com/in/julienrabault)
\ No newline at end of file
diff --git a/Linker/eval.py b/Linker/eval.py
index 05c0966..086f2a9 100644
--- a/Linker/eval.py
+++ b/Linker/eval.py
@@ -5,6 +5,9 @@ from Linker.atom_map import atom_map, atom_map_redux
 
 
 class SinkhornLoss(Module):
+    r"""
+    Loss for the linker
+    """
     def __init__(self):
         super(SinkhornLoss, self).__init__()
 
@@ -13,7 +16,7 @@ class SinkhornLoss(Module):
                    for link, perm in zip(predictions, truths.permute(1, 0, 2)))
 
 
-def mesure_accuracy(batch_true_links, axiom_links_pred, max_atoms_in_one_type):
+def measure_accuracy(batch_true_links, axiom_links_pred, max_atoms_in_one_type):
     r"""
     batch_true_links : (atom_vocab_size, batch_size, max_atoms_in_one_cat) contains the index of the negative atoms
     axiom_links_pred : (atom_vocab_size, batch_size, max_atoms_in_one_cat) contains the index of the negative atoms
diff --git a/README.md b/README.md
index 15a8616..3348ca6 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,52 @@
-# DeepGrail
+# DeepGrail Linker
 
 This repository contains a Python implementation of a Neural Proof Net using TLGbank data.
 
+This code was designed to work with the [DeepGrail Tagger](https://gitlab.irit.fr/pnria/global-helper/deepgrail_tagger). 
+In this repository we only use the word embeddings from the tagger and the tags from the dataset; the next step is to use the tagger's predictions for the linking step.
+ 
+## Usage
+
+### Installation
+Python 3.9.10 is required **(warning: do not use Python 3.10+)**.
+Clone the project locally.
+
+### Libraries installation
+
+Run the init.sh script, or clone the Tagger project into a folder named SuperTagger and install the requirements with pip.
+
+### Dataset format
+
+The sentences should be in a column "X", the links (atoms with an '_x' postfix) in a column "Y", and the categories in a column "Z".
+For the links, each atom_x is paired with the one and only other atom_x in the sentence.
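+
+A minimal sketch of such a dataset, assuming a CSV file; the column names come from this repository, but the example values and the exact serialization of "Y" and "Z" are only illustrative:
+
+```
+import pandas as pd
+
+# Hypothetical rows: "X" holds the sentence, "Y" the linked atoms (paired by their _x postfix),
+# "Z" the categories. Check an existing dataset file for the exact serialization.
+df = pd.DataFrame({
+    "X": ["a small example sentence"],
+    "Y": ["np_1 s_2 np_1 s_2"],
+    "Z": ["np np\\s"],
+})
+df.to_csv("Datasets/my_dataset_links.csv", index=False)
+```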
+
+## Training
+
+Launch train.py; inside it you can point to another dataset file and another tagging model.
+
+When training, if you use `checkpoint=True`, the model is automatically saved after each epoch in a folder named Training_XX-XX_XX-XX.
+Use `tensorboard=True` to write logs to the same folder (run `tensorboard --logdir=logs` to view them).
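+
+A sketch of the relevant lines of train.py as they appear in this repository (swap the dataset path and tagger model for your own):
+
+```
+file_path_axiom_links = 'Datasets/goldANDsilver_dataset_links.csv'
+model_tagger = "models/flaubert_super_98_V2_50e.pt"
+
+df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
+
+linker = Linker(model_tagger)
+linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=epochs, batch_size=batch_size,
+                    checkpoint=True, tensorboard=True)
+```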
+
+## Predicting
+
+To predict on your own data, you need to load a model (saved with this code).
+
+```
+df = read_csv_pgbar(file_path,20)
+texts = df['X'].tolist()
+categories = df['Z'].tolist()
+
+linker = Linker(tagging_model)
+linker.load_weights("your/linker/path")
+
+links = linker.predict_with_categories(texts[7], categories[7])
+print(links)
+```
+
+The file ```postprocessing.py``` lets you draw the predicted links (keep sentences short, otherwise the drawing gets confusing).
+
+You can also use the function ```predict_without_categories```, which only needs the sentence (see the example below).
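+
+A minimal sketch, reusing the `linker` and `texts` variables from the snippet above:
+
+```
+links = linker.predict_without_categories(texts[7])
+print(links)
+```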
+
 ## Authors
 
 [de Pourtales Caroline](https://www.linkedin.com/in/caroline-de-pourtales/), [Rabault Julien](https://www.linkedin.com/in/julienrabault)
\ No newline at end of file
diff --git a/init.sh b/init.sh
new file mode 100644
index 0000000..be8706d
--- /dev/null
+++ b/init.sh
@@ -0,0 +1,3 @@
+git clone https://gitlab.irit.fr/pnria/global-helper/deepgrail_tagger.git SuperTagger
+
+pip install -r requirements.txt
\ No newline at end of file
diff --git a/train.py b/train.py
index 0f1d17c..b1164f4 100644
--- a/train.py
+++ b/train.py
@@ -6,7 +6,7 @@ from find_config import configurate
 
 torch.cuda.empty_cache()
 batch_size = int(Configuration.modelTrainingConfig['batch_size'])
-nb_sentences = batch_size * 4
+nb_sentences = batch_size * 800
 file_path_axiom_links = 'Datasets/goldANDsilver_dataset_links.csv'
 model_tagger = "models/flaubert_super_98_V2_50e.pt"
 configurate(file_path_axiom_links, model_tagger, nb_sentences=nb_sentences)
@@ -14,9 +14,13 @@ configurate(file_path_axiom_links, model_tagger, nb_sentences=nb_sentences)
 epochs = int(Configuration.modelTrainingConfig['epoch'])
 df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
 
+print("#" * 20)
+print("#" * 20)
 print("Linker")
 # Load the Linker with trained tagger
 linker = Linker(model_tagger)
 print("\nLinker Training\n")
-linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=1, batch_size=batch_size,
+linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=epochs, batch_size=batch_size,
                     checkpoint=True, tensorboard=True)
+print("#" * 20)
+print("#" * 20)
\ No newline at end of file
-- 
GitLab