From 897a4516bf31f15776f007289563db1b0176c2fb Mon Sep 17 00:00:00 2001 From: Caroline DE POURTALES <caroline.de-pourtales@irit.fr> Date: Mon, 5 Dec 2022 15:54:56 +0100 Subject: [PATCH] adding predict --- Configuration/config.ini | 9 +- Linker/Linker.py | 136 +++++++-------- Linker/utils_linker.py | 5 +- NeuralProofNet/NeuralProofNet.py | 60 ++----- README.md | 8 +- SuperTagger/Datasets/processingTXT.py | 31 ++-- SuperTagger/README.md | 3 +- SuperTagger/SuperTagger/SentencesTokenizer.py | 42 +++++ SuperTagger/SuperTagger/SuperTagger.py | 160 ++++++++---------- .../{Utils => }/SymbolTokenizer.py | 28 ++- .../{Utils => }/Tagging_bert_model.py | 24 +++ .../SuperTagger/Utils/SentencesTokenizer.py | 18 -- SuperTagger/SuperTagger/Utils/helpers.py | 42 ----- SuperTagger/SuperTagger/eval.py | 19 +++ SuperTagger/__init__.py | 1 - predict_links.py | 12 +- predict_supertags.py | 4 +- train_neuralproofnet.py | 19 +-- train_supertagger.py | 7 +- utils.py | 112 ++++++++---- 20 files changed, 368 insertions(+), 372 deletions(-) create mode 100644 SuperTagger/SuperTagger/SentencesTokenizer.py rename SuperTagger/SuperTagger/{Utils => }/SymbolTokenizer.py (71%) rename SuperTagger/SuperTagger/{Utils => }/Tagging_bert_model.py (72%) delete mode 100644 SuperTagger/SuperTagger/Utils/SentencesTokenizer.py delete mode 100644 SuperTagger/SuperTagger/Utils/helpers.py create mode 100644 SuperTagger/SuperTagger/eval.py diff --git a/Configuration/config.ini b/Configuration/config.ini index 30dfc89..a695829 100644 --- a/Configuration/config.ini +++ b/Configuration/config.ini @@ -20,11 +20,4 @@ dim_cat_out = 256 dim_intermediate_ffn = 128 dim_pre_sinkhorn_transfo = 32 dropout = 0.1 -sinkhorn_iters = 5 - -[MODEL_TRAINING] -batch_size = 32 -pretrain_linker_epochs = 10 -epoch = 20 -seed_val = 42 -learning_rate = 2e-3 \ No newline at end of file +sinkhorn_iters = 5 \ No newline at end of file diff --git a/Linker/Linker.py b/Linker/Linker.py index 58197bd..e8494c9 100644 --- a/Linker/Linker.py +++ b/Linker/Linker.py @@ -1,4 +1,3 @@ -import datetime import math import os import sys @@ -11,7 +10,6 @@ from torch.nn import Sequential, LayerNorm, Module, Linear, Dropout, Transformer from torch.optim import AdamW from torch.optim.lr_scheduler import StepLR from torch.utils.data import TensorDataset, random_split -from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from Configuration import Configuration @@ -21,42 +19,15 @@ from Linker.Sinkhorn import sinkhorn_fn_no_exp as sinkhorn from Linker.atom_map import atom_map, atom_map_redux from Linker.eval import measure_accuracy, SinkhornLoss from Linker.utils_linker import FFN, get_axiom_links, get_GOAL, get_pos_idx, get_neg_idx, get_atoms_batch, \ - find_pos_neg_idexes, get_num_atoms_batch + find_pos_neg_idexes, get_num_atoms_batch, generate_square_subsequent_mask from SuperTagger import SuperTagger -from utils import pad_sequence - - -def format_time(elapsed): - ''' - Takes a time in seconds and returns a string hh:mm:ss - ''' - # Round to the nearest second. 
- elapsed_rounded = int(round(elapsed)) - - # Format as hh:mm:ss - return str(datetime.timedelta(seconds=elapsed_rounded)) - - -def output_create_dir(): - """ - Create le output dir for tensorboard and checkpoint - @return: output dir, tensorboard writter - """ - from datetime import datetime - outpout_path = 'TensorBoard' - training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M')) - logs_dir = os.path.join(training_dir, 'logs') - writer = SummaryWriter(log_dir=logs_dir) - return training_dir, writer - - -def generate_square_subsequent_mask(sz): - """Generates an upper-triangular matrix of -inf, with zeros on diag.""" - return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1) +from utils import pad_sequence, format_time, output_create_dir class Linker(Module): + # region initialization + def __init__(self, supertagger_path_model): super(Linker, self).__init__() @@ -65,7 +36,6 @@ class Linker(Module): datasetConfig = config["DATASET_PARAMS"] modelEncoderConfig = config["MODEL_ENCODER"] modelLinkerConfig = config["MODEL_LINKER"] - modelTrainingConfig = config["MODEL_TRAINING"] dim_encoder = int(modelEncoderConfig['dim_encoder']) atom_vocab_size = int(datasetConfig['atom_vocab_size']) @@ -85,7 +55,6 @@ class Linker(Module): self.max_len_sentence = int(datasetConfig['max_len_sentence']) self.max_atoms_in_sentence = int(datasetConfig['max_atoms_in_sentence']) self.max_atoms_in_one_type = int(datasetConfig['max_atoms_in_one_type']) - learning_rate = float(modelTrainingConfig['learning_rate']) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # endregion @@ -125,10 +94,32 @@ class Linker(Module): # Learning self.cross_entropy_loss = SinkhornLoss() - self.optimizer = AdamW(self.parameters(), lr=learning_rate) + self.optimizer = AdamW(self.parameters(), lr=0.001) self.scheduler = StepLR(self.optimizer, step_size=2, gamma=0.5) self.to(self.device) + def load_weights(self, model_file): + print("#" * 15) + try: + params = torch.load(model_file, map_location=self.device) + self.atom_encoder.load_state_dict(params['atom_encoder']) + self.position_encoder.load_state_dict(params['position_encoder']) + self.transformer.load_state_dict(params['transformer']) + self.linker_encoder.load_state_dict(params['linker_encoder']) + self.pos_transformation.load_state_dict(params['pos_transformation']) + self.neg_transformation.load_state_dict(params['neg_transformation']) + self.cross_entropy_loss.load_state_dict(params['cross_entropy_loss']) + self.optimizer = params['optimizer'] + print("\n The loading checkpoint was successful ! \n") + except Exception as e: + print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr) + raise e + print("#" * 15) + + #endregion + + # region data + def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.1): r""" Args: @@ -172,6 +163,26 @@ class Linker(Module): print("End preprocess Data") return training_dataloader, validation_dataloader + #endregion + + # region training + + def make_sinkhorn_inputs(self, bsd_tensor, positional_ids, atom_type): + """ + :param bsd_tensor: + Tensor of shape batch size \times sequence length \times feature dimensionality. + :param positional_ids: + A List of batch_size elements, each being a List of num_atoms LongTensors. + Each LongTensor in positional_ids[b][a] indexes the location of atoms of type a in sentence b. 
+ :param atom_type: + :return: + """ + + return torch.stack([torch.stack([bsd_tensor.select(0, index=i).select(0, index=int(atom)).to(self.device) + if atom != -1 else torch.zeros(self.dim_cat_out, device=self.device) + for atom in sentence]) + for i, sentence in enumerate(positional_ids[:, self.atom_map_redux[atom_type], :])]) + def forward(self, batch_num_atoms_per_word, batch_atoms, batch_pos_idx, batch_neg_idx, sents_embedding): r""" Args: @@ -327,6 +338,10 @@ class Linker(Module): return avg_train_loss, avg_accuracy_train, training_time + #endregion + + # region evaluation + def eval_batch(self, batch): batch_num_atoms = batch[0].to(self.device) batch_atoms_tok = batch[1].to(self.device) @@ -343,8 +358,9 @@ class Linker(Module): axiom_links_pred = torch.argmax(logits_predictions, dim=3) # atom_vocab, batch_size, max atoms in one type print('\n') - print("Les vrais liens de la catégorie n : ", batch_true_links[1][2][:100]) - print("Les prédictions : ", axiom_links_pred[2][1][:100]) + print(batch_true_links) + print("Les vrais liens de la catégorie n : ", batch_true_links[0][2][:100]) + print("Les prédictions : ", axiom_links_pred[2][0][:100]) print('\n') accuracy = measure_accuracy(batch_true_links, axiom_links_pred) @@ -369,6 +385,10 @@ class Linker(Module): return loss_average / len(dataloader), accuracy_average / len(dataloader) + #endregion + + #region prediction + def predict_with_categories(self, sentence, categories): r""" Predict the links from a sentence and its categories @@ -443,24 +463,8 @@ class Linker(Module): axiom_links_pred = torch.argmax(logits_predictions, dim=3) return categories, axiom_links_pred - - def load_weights(self, model_file): - print("#" * 15) - try: - params = torch.load(model_file, map_location=self.device) - self.atom_encoder.load_state_dict(params['atom_encoder']) - self.position_encoder.load_state_dict(params['position_encoder']) - self.transformer.load_state_dict(params['transformer']) - self.linker_encoder.load_state_dict(params['linker_encoder']) - self.pos_transformation.load_state_dict(params['pos_transformation']) - self.neg_transformation.load_state_dict(params['neg_transformation']) - self.cross_entropy_loss.load_state_dict(params['cross_entropy_loss']) - self.optimizer.load_state_dict(params['optimizer']) - print("\n The loading checkpoint was successful ! \n") - except Exception as e: - print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr) - raise e - print("#" * 15) + + #endregion def __checkpoint_save(self, path='/linker.pt'): """ @@ -470,28 +474,12 @@ class Linker(Module): torch.save({ 'atom_encoder': self.atom_encoder.state_dict(), - 'position_encoder': self.position_encoder, + 'position_encoder': self.position_encoder.state_dict(), 'transformer': self.transformer.state_dict(), 'linker_encoder': self.linker_encoder.state_dict(), 'pos_transformation': self.pos_transformation.state_dict(), 'neg_transformation': self.neg_transformation.state_dict(), - 'cross_entropy_loss': self.cross_entropy_loss, + 'cross_entropy_loss': self.cross_entropy_loss.state_dict(), 'optimizer': self.optimizer, }, path) self.to(self.device) - - def make_sinkhorn_inputs(self, bsd_tensor, positional_ids, atom_type): - """ - :param bsd_tensor: - Tensor of shape batch size \times sequence length \times feature dimensionality. - :param positional_ids: - A List of batch_size elements, each being a List of num_atoms LongTensors. - Each LongTensor in positional_ids[b][a] indexes the location of atoms of type a in sentence b. 
- :param atom_type: - :return: - """ - - return torch.stack([torch.stack([bsd_tensor.select(0, index=i).select(0, index=int(atom)).to(self.device) - if atom != -1 else torch.zeros(self.dim_cat_out, device=self.device) - for atom in sentence]) - for i, sentence in enumerate(positional_ids[:, self.atom_map_redux[atom_type], :])]) diff --git a/Linker/utils_linker.py b/Linker/utils_linker.py index ddf97bb..5f16c82 100644 --- a/Linker/utils_linker.py +++ b/Linker/utils_linker.py @@ -25,7 +25,10 @@ class FFN(Module): def forward(self, x): return self.ffn(x) - +def generate_square_subsequent_mask(sz): + """Generates an upper-triangular matrix of -inf, with zeros on diag.""" + return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1) + ################################ Regex ######################################## regex_categories_axiom_links = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)' regex_categories = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)' diff --git a/NeuralProofNet/NeuralProofNet.py b/NeuralProofNet/NeuralProofNet.py index 73ee607..ab12693 100644 --- a/NeuralProofNet/NeuralProofNet.py +++ b/NeuralProofNet/NeuralProofNet.py @@ -1,6 +1,3 @@ -import os -import datetime -import os import time import torch @@ -8,7 +5,6 @@ from torch.nn import Module from torch.optim import AdamW from torch.optim.lr_scheduler import StepLR from torch.utils.data import TensorDataset, random_split -from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from Configuration import Configuration @@ -16,31 +12,7 @@ from Linker import Linker from Linker.eval import measure_accuracy, SinkhornLoss from Linker.utils_linker import get_axiom_links, get_GOAL, get_pos_idx, get_num_atoms_batch, get_neg_idx from NeuralProofNet.utils_proofnet import get_info_for_tagger -from utils import pad_sequence - - -def format_time(elapsed): - ''' - Takes a time in seconds and returns a string hh:mm:ss - ''' - # Round to the nearest second. 
- elapsed_rounded = int(round(elapsed)) - - # Format as hh:mm:ss - return str(datetime.timedelta(seconds=elapsed_rounded)) - - -def output_create_dir(): - """ - Create le output dir for tensorboard and checkpoint - @return: output dir, tensorboard writter - """ - from datetime import datetime - outpout_path = 'TensorBoard' - training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M')) - logs_dir = os.path.join(training_dir, 'logs') - writer = SummaryWriter(log_dir=logs_dir) - return training_dir, writer +from utils import pad_sequence, format_time, output_create_dir class NeuralProofNet(Module): @@ -49,38 +21,30 @@ class NeuralProofNet(Module): super(NeuralProofNet, self).__init__() config = Configuration.read_config() datasetConfig = config["DATASET_PARAMS"] - modelTrainingConfig = config["MODEL_TRAINING"] - # pretrain settings - self.pretrain_linker_epochs = int(modelTrainingConfig['pretrain_linker_epochs']) # settings self.max_len_sentence = int(datasetConfig['max_len_sentence']) self.max_atoms_in_sentence = int(datasetConfig['max_atoms_in_sentence']) self.max_atoms_in_one_type = int(datasetConfig['max_atoms_in_one_type']) - learning_rate = float(modelTrainingConfig['learning_rate']) self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") - self.batch_size = int(modelTrainingConfig['batch_size']) linker = Linker(supertagger_path_model) if linker_path_model is not None: linker.load_weights(linker_path_model) - self.pretrain_linker_epochs = 0 self.linker = linker # Learning self.linker_loss = SinkhornLoss() self.linker_optimizer = AdamW(self.linker.parameters(), - lr=learning_rate) + lr=0.001) self.linker_scheduler = StepLR(self.linker_optimizer, step_size=2, gamma=0.5) self.to(self.device) - def __pretrain_linker__(self, df_axiom_links, checkpoint=False, tensorboard=True): + def __pretrain_linker__(self, df_axiom_links, pretrain_linker_epochs, batch_size, checkpoint=False, tensorboard=True): print("\nLinker Pre-Training\n") - self.linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=self.pretrain_linker_epochs, - batch_size=self.batch_size, - checkpoint=checkpoint, - tensorboard=tensorboard) + self.linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=pretrain_linker_epochs, + batch_size=batch_size, checkpoint=checkpoint, tensorboard=tensorboard) print("\nEND Linker Pre-Training\n") def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.1): @@ -144,11 +108,11 @@ class NeuralProofNet(Module): batch_neg_idx = batch_neg_idx.to(self.device) logits_links = self.linker(batch_num_atoms_per_word, atoms_batch_tokenized, batch_pos_idx, batch_neg_idx, - output['word_embeding']) + output['word_embedding']) return torch.log_softmax(logits_links, dim=3) - def train_neuralproofnet(self, df_axiom_links, validation_rate=0.1, epochs=20, + def train_neuralproofnet(self, df_axiom_links, validation_rate=0.1, epochs=20, pretrain_linker_epochs=0, batch_size=32, checkpoint=True, tensorboard=False): r""" Args: @@ -162,7 +126,7 @@ class NeuralProofNet(Module): Final accuracy and final loss """ # Pretrain the linker - self.__pretrain_linker__(df_axiom_links) + self.__pretrain_linker__(df_axiom_links, pretrain_linker_epochs, batch_size) # Start learning with output from tagger training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, df_axiom_links, @@ -262,8 +226,8 @@ class NeuralProofNet(Module): dim=3) # atom_vocab, batch_size, max atoms in one type print('\n') - print("Les vrais liens de la 
catégorie n : ", batch_true_links[1][2][:100]) - print("Les prédictions : ", axiom_links_pred[2][1][:100]) + print("Les vrais liens de la catégorie n : ", batch_true_links[0][2][:100]) + print("Les prédictions : ", axiom_links_pred[2][0][:100]) print('\n') accuracy = measure_accuracy(batch_true_links, axiom_links_pred) @@ -296,12 +260,12 @@ class NeuralProofNet(Module): torch.save({ 'atom_encoder': self.linker.atom_encoder.state_dict(), - 'position_encoder': self.linker.position_encoder, + 'position_encoder': self.linker.position_encoder.state_dict(), 'transformer': self.linker.transformer.state_dict(), 'linker_encoder': self.linker.linker_encoder.state_dict(), 'pos_transformation': self.linker.pos_transformation.state_dict(), 'neg_transformation': self.linker.neg_transformation.state_dict(), - 'cross_entropy_loss': self.linker_loss, + 'cross_entropy_loss': self.linker_loss.state_dict(), 'optimizer': self.linker_optimizer, }, path) self.to(self.device) \ No newline at end of file diff --git a/README.md b/README.md index 154f242..28e986e 100644 --- a/README.md +++ b/README.md @@ -38,11 +38,9 @@ The structure should look like this : │ ├── Datasets # TLGbank data with supertags │ └── SuperTagger # BertForTokenClassification │ ├── SuperTagger.py # Main class -│ └── Utils -│ ├── Tagging_bert_model.py # Bert model -│ ├── SymbolTokenizer # Tags tokenizer -│ ├── SentencesTokenizer # Words tokenizer -│ └── helpers # utils +│ ├── Tagging_bert_model.py # Bert model +│ ├── SymbolTokenizer # Tags tokenizer +│ └── SentencesTokenizer # Words tokenizer ├── Linker # The Linker directory (that you need to install) │ ├── ... │ └── Linker.py # Linker class containing the neural network diff --git a/SuperTagger/Datasets/processingTXT.py b/SuperTagger/Datasets/processingTXT.py index a0dbd84..320d004 100644 --- a/SuperTagger/Datasets/processingTXT.py +++ b/SuperTagger/Datasets/processingTXT.py @@ -5,6 +5,9 @@ import re import numpy as np import pandas as pd +""" +Format data for training supertagger from txt to csv and tags pkl +""" # dr = / # dl = \ @@ -38,21 +41,18 @@ def sub_tree_line(line_with_data: str): sub_trees.append(["[SOS]"]) return sentence, list(itertools.chain(*sub_trees)) -def Txt_to_csv(file_name: str): +def Txt_to_csv(file_name: str, csv_name:str = "../Datasets/m2V2_dataset.csv"): file = open(file_name, "r", encoding="utf8") text = file.readlines() - sub = [sub_tree_line(data) for data in text] - df = pd.DataFrame(data=sub, columns = ['Sentences', 'sub_tree']) - - df.to_csv("../Datasets/" + file_name[:-4] + "_dataset.csv", index=False) + df.to_csv(csv_name, index=False) def normalize_word(orig_word): word = orig_word.lower() - if (word is "["): + if (word == "["): word = "(" - if (word is "]"): + if (word == "]"): word = ")" return word @@ -118,27 +118,18 @@ def save_obj(obj, name): with open(name + '.pkl', 'wb+') as f: pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL) -def load_obj(name): - with open(name + '.pkl', 'rb') as f: - return pickle.load(f) - +# Format from txt to csv # Txt_to_csv("m2.txt") - X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen = read_maxentdata("SuperTagger/Datasets/m2.txt") - -df = pd.DataFrame(columns = ["X", "Y1", "Y2", "Z"]) - -df['X'] = X[:-1] -df['Y1'] = Y1[:-1] -df['Y2'] = Y2[:-1] -df['Z'] = Z[:-1] - +df = pd.DataFrame({"X":X[:-1], "Y1":Y1[:-1], "Y2":Y2[:-1], "Z":Z[:-1]}) df.to_csv("SuperTagger/Datasets/m2_dataset_V2.csv", index=False) +# Dictionary for supertags t = np.unique(np.array(list(itertools.chain(*Z)))) dict = { i : t[i] for i 
in range(0, len(t) ) } save_obj(dict,"SuperTagger/Datasets/index_to_super") +# Dictionary for grammar tags (not used) t = np.unique(np.array(list(itertools.chain(*Y1)))) dict = { i : t[i] for i in range(0, len(t) ) } save_obj(dict,"SuperTagger/Datasets/index_to_pos1") \ No newline at end of file diff --git a/SuperTagger/README.md b/SuperTagger/README.md index 140b2b1..a6b7651 100644 --- a/SuperTagger/README.md +++ b/SuperTagger/README.md @@ -13,8 +13,7 @@ to provide a wide coverage syntactic and semantic parser for French. But the Tag . ├── Datasets # TLGbank data └── SuperTagger # BertForTokenClassification - ├── SuperTagger.py # Main class - └── Utils + ├── SuperTagger.py # Main class ├── Tagging_bert_model.py # Bert model ├── SymbolTokenizer # Tags tokenizer ├── SentencesTokenizer # Words tokenizer diff --git a/SuperTagger/SuperTagger/SentencesTokenizer.py b/SuperTagger/SuperTagger/SentencesTokenizer.py new file mode 100644 index 0000000..104577f --- /dev/null +++ b/SuperTagger/SuperTagger/SentencesTokenizer.py @@ -0,0 +1,42 @@ + +class SentencesTokenizer(): + """ + Tokenizer for sentences : Based on a pretrained tokenzer + + Atributes: + ---------- + tokenizer : Tokenizer + Pretrained Tokenizer + max_length : + Maximal length of a sentence (i.e maximum number of words) + """ + + def __init__(self, tokenizer, max_length): + """ + Parameters : + ------------ + tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text + max_length : Maximal length of a sentence + """ + self.tokenizer = tokenizer + self.max_length = max_length + + def fit_transform(self, sents): + """ + Tokenizes the given sentences + """ + return self.tokenizer(sents, padding=True) + + def fit_transform_tensors(self, sents): + """ + Tokenizes the sentences and returns tensor + """ + temp = self.tokenizer(sents, padding='max_length', truncation=True, return_tensors = 'pt', max_length=self.max_length) + + return temp["input_ids"], temp["attention_mask"] + + def convert_ids_to_tokens(self, inputs_ids, skip_special_tokens=False): + """ + Decodes a sentence. 
+ """ + return self.tokenizer.batch_decode(inputs_ids, skip_special_tokens=skip_special_tokens) diff --git a/SuperTagger/SuperTagger/SuperTagger.py b/SuperTagger/SuperTagger/SuperTagger.py index 80e006d..4f405a0 100644 --- a/SuperTagger/SuperTagger/SuperTagger.py +++ b/SuperTagger/SuperTagger/SuperTagger.py @@ -1,80 +1,31 @@ -import datetime import os import sys import time import torch import transformers -from torch import Tensor from torch.optim import Adam from torch.utils.data import TensorDataset, random_split -from torch.utils.tensorboard import SummaryWriter from tqdm import tqdm from transformers import AutoTokenizer from transformers import logging from Configuration import Configuration +from .SentencesTokenizer import SentencesTokenizer +from .SymbolTokenizer import SymbolTokenizer +from .Tagging_bert_model import Tagging_bert_model +from .eval import categorical_accuracy +from utils import format_time, output_create_dir -from .Utils.SentencesTokenizer import SentencesTokenizer -from .Utils.SymbolTokenizer import SymbolTokenizer -from .Utils.Tagging_bert_model import Tagging_bert_model logging.set_verbosity(logging.ERROR) -# region Utils - -def output_create_dir(): - """ - Create le output dir for tensorboard and checkpoint - @return: output dir, tensorboard writter - """ - from datetime import datetime - outpout_path = 'TensorBoard' - training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M')) - logs_dir = os.path.join(training_dir, 'logs') - writer = SummaryWriter(log_dir=logs_dir) - return training_dir, writer - - -def categorical_accuracy(preds, truth): - """ - Calculates how often predictions match argmax labels. - @param preds: batch of prediction. (argmax) - @param truth: batch of truth label. - @return: scoring of batch prediction. (Categorical accuracy values) - """ - good_label = 0 - nb_label = 0 - for i in range(len(truth)): - sublist_truth = truth[i] - sublist_preds = preds[i] - for j in range(len(sublist_truth)): - if sublist_truth[j] != 0: - if sublist_truth[j] == sublist_preds[j]: - good_label += 1 - nb_label += 1 - return good_label / nb_label - - -def format_time(elapsed): - ''' - Takes a time in seconds and returns a string hh:mm:ss - ''' - # Round to the nearest second. - elapsed_rounded = int(round(elapsed)) - - # Format as hh:mm:ss - return str(datetime.timedelta(seconds=elapsed_rounded)) - - -# endregion Utils - # region Class class SuperTagger: """ - Implements the SuperTagger to assign each word a supertag. A supertag is a tree of tags such as np, s, ... + Implements the SuperTagger to assign each word a supertag (also named symbol). A supertag is a tree of tags such as np, s, ... Attributes: ----------- @@ -101,7 +52,7 @@ class SuperTagger: model_load : bool """ - # region Constructor + # region Instanciation def __init__(self): """ @@ -126,20 +77,19 @@ class SuperTagger: self.trainable = False self.model_load = False - # endregion Constructor - - # region Instanciation - def load_weights(self, model_file): """ Loads an SupperTagger saved with SupperTagger.__checkpoint_save() (during a train) from a file. 
- @param model_file: path of .pt save of model + Parameters: + ----------- + model_file: + path of .pt save of model """ self.trainable = False print("#" * 20) - print("\n Loading...") + print("\n Loading model for supertagger ...") try: params = torch.load(model_file, map_location=self.device) args = params['args'] @@ -167,11 +117,16 @@ class SuperTagger: def create_new_model(self, num_label, bert_name, index_to_tags): """ - Instantiation and parameterization of a new bert model - - @param num_label: number of diferent labels (tags) - @param bert_name: name of model available on Hugging Face `<https://huggingface.co/models>` - @param index_to_tags: Dict for convert ID to tags + Instantiation and parameterization of a new bert model. + + Parameters: + ----------- + num_label: + number of diferent labels (tags) + bert_name: + name of model available on Hugging Face `<https://huggingface.co/models>` + index_to_tags: + Dict for convert ID to tags """ assert len( index_to_tags) == num_label, f" len(index_to_tags) : {len(index_to_tags)} must be equels with num_label: {num_label}" @@ -198,8 +153,13 @@ class SuperTagger: """ Predict and convert sentences in tags (depends on the dictation given when the model was created) - @param sentences: list of sentences : list[str] OR one sentences : str - @return: tags prediction for all sentences (no argmax tags, convert tags, embedding layer of bert ) + Parameters: + ----------- + sentences: list of sentences : list[str] OR one sentences : str + + Returns: + -------- + tags prediction for all sentences (no argmax tags, convert tags, embedding layer of bert ) """ assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) " \ "function before the predict, the model is not integrated " @@ -219,7 +179,7 @@ class SuperTagger: def forward(self, b_sents_tokenized, b_sents_mask): """ - Function used for the linker (same of predict) + Forward to the model """ with torch.no_grad(): output = self.model.predict((b_sents_tokenized, b_sents_mask)) @@ -231,13 +191,15 @@ class SuperTagger: """ Starts the training of the model, either new or previously loaded - @param sentences: list of sentences for train (X) - @param tags: list of tags for train (Y) - @param validation_rate: percentage of validation data [0-1] - @param epochs: number of epoch (50 recommended) - @param batch_size: number of sample in batch (32 recommended, attention to memory) - @param tensorboard: use tensorboard for see loss and accuracy - @param checkpoint: save the model after each epoch + Parameters: + ----------- + sentences: list of sentences for train (X) + tags: list of tags for train (Y) + validation_rate: percentage of validation data [0-1] + epochs: number of epoch (50 recommended) + batch_size: number of sample in batch (32 recommended, attention to memory) + tensorboard: use tensorboard for see loss and accuracy + checkpoint: save the model after each epoch """ assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) 
function before the train, the model is not integrated" @@ -296,12 +258,17 @@ class SuperTagger: validation_rate): """ Create torch dataloader for training - - @param batch_size: number of sample in batch - @param sentences: list of sentences for train (X) - @param tags: list of tags for train (Y) - @param validation_rate: percentage of validation data [0-1] - @return: (training dataloader, validation dataloader) + + Parameters: + ----------- + batch_size: number of sample in batch + sentences: list of sentences for train (X) + tags: list of tags for train (Y) + validation_rate: percentage of validation data [0-1] + + Returns: + -------- + training dataloader, validation dataloader """ validation_dataloader = None @@ -325,9 +292,14 @@ class SuperTagger: def __train_epoch(self, training_dataloader): """ Train on epoch - - @param training_dataloader: dataloader of training data - @return: (epoch accuracy, epoch loss, training time) + + Parameters: + ----------- + training_dataloader: dataloader of training data + + Returns: + -------- + epoch accuracy, epoch loss, training time """ self.model.train() epoch_loss = 0 @@ -370,8 +342,13 @@ class SuperTagger: """ Validation on epoch - @param validation_dataloader: dataloader of validation data - @return: (epoch accuracy, epoch loss, num step) + Parameters: + ----------- + validation_dataloader: dataloader of validation data + + Returns: + -------- + epoch accuracy, epoch loss, num step """ self.model.eval() eval_loss = 0 @@ -404,7 +381,10 @@ class SuperTagger: def __checkpoint_save(self, path='/model_check.pt'): """ Save the model with good parameters - @param path: poth and name for save + + Parameters: + ----------- + path: poth and name for save """ self.model.cpu() # print('save model parameters to [%s]' % path, file=sys.stderr) @@ -418,4 +398,4 @@ class SuperTagger: # endregion Private -# endregion Class +# endregion Class \ No newline at end of file diff --git a/SuperTagger/SuperTagger/Utils/SymbolTokenizer.py b/SuperTagger/SuperTagger/SymbolTokenizer.py similarity index 71% rename from SuperTagger/SuperTagger/Utils/SymbolTokenizer.py rename to SuperTagger/SuperTagger/SymbolTokenizer.py index e5095d1..dafcc12 100644 --- a/SuperTagger/SuperTagger/Utils/SymbolTokenizer.py +++ b/SuperTagger/SuperTagger/SymbolTokenizer.py @@ -8,19 +8,39 @@ def load_obj(name): with open(name + '.pkl', 'rb') as f: return pickle.load(f) +def pad_sequence(sequences, max_len=400): + padded = [0] * max_len + padded[:len(sequences)] = sequences + return padded class SymbolTokenizer(): + """ + Tokenizer for tags : Based on a dictionary + + Atributes: + ---------- + index_to_super : dict + Convert id to supertag + super_to_index : dict + Convert supertag to id + """ def __init__(self, index_to_super): - """@params index_to_super: Dict for convert ID to tags """ + """ + Parameters: + ----------- + index_to_super: Dict for convert ID to tags """ self.index_to_super = index_to_super self.super_to_index = {v: int(k) for k, v in self.index_to_super.items()} def lenSuper(self): - """@return len of dict for convert ID to tags """ + """Returns len of dict for convert ID to tags """ return len(self.index_to_super) + 1 def convert_batchs_to_ids(self, tags, sents_tokenized): + """ + Convert batch of tags to id + """ encoded_labels = [] labels = [[self.super_to_index[str(symbol)] for symbol in sents] for sents in tags] for l, s in zip(labels, sents_tokenized): @@ -36,7 +56,3 @@ class SymbolTokenizer(): return labels -def pad_sequence(sequences, max_len=400): - padded = [0] * 
max_len - padded[:len(sequences)] = sequences - return padded diff --git a/SuperTagger/SuperTagger/Utils/Tagging_bert_model.py b/SuperTagger/SuperTagger/Tagging_bert_model.py similarity index 72% rename from SuperTagger/SuperTagger/Utils/Tagging_bert_model.py rename to SuperTagger/SuperTagger/Tagging_bert_model.py index b5331ff..96dfbb6 100644 --- a/SuperTagger/SuperTagger/Utils/Tagging_bert_model.py +++ b/SuperTagger/SuperTagger/Tagging_bert_model.py @@ -26,6 +26,18 @@ class Tagging_bert_model(Module): self.bert = transformers.AutoModelForTokenClassification.from_pretrained(bert_name, config=config) def forward(self, batch): + """ + Forward to the model. + + Parameters: + ----------- + batch : + batch of tokenized sentences + + Returns: + -------- + result : dict containing logit, word_embedding and last_hidden_state + """ b_input_ids = batch[0] b_input_mask = batch[1] labels = batch[2] @@ -38,6 +50,18 @@ class Tagging_bert_model(Module): return result def predict(self, batch): + """ + Prediction of supertags for a batch of sentences + + Parameters: + ----------- + batch : + batch of tokenized sentences + + Returns: + -------- + result : dict containing logit, word_embedding and last_hidden_state + """ b_input_ids = batch[0] b_input_mask = batch[1] diff --git a/SuperTagger/SuperTagger/Utils/SentencesTokenizer.py b/SuperTagger/SuperTagger/Utils/SentencesTokenizer.py deleted file mode 100644 index 3b637bf..0000000 --- a/SuperTagger/SuperTagger/Utils/SentencesTokenizer.py +++ /dev/null @@ -1,18 +0,0 @@ - -class SentencesTokenizer(): - - def __init__(self, tokenizer, max_length): - """@params tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text """ - self.tokenizer = tokenizer - self.max_length = max_length - - def fit_transform(self, sents): - return self.tokenizer(sents, padding=True) - - def fit_transform_tensors(self, sents): - temp = self.tokenizer(sents, padding='max_length', return_tensors = 'pt', max_length=300) - - return temp["input_ids"], temp["attention_mask"] - - def convert_ids_to_tokens(self, inputs_ids, skip_special_tokens=False): - return self.tokenizer.batch_decode(inputs_ids, skip_special_tokens=skip_special_tokens) diff --git a/SuperTagger/SuperTagger/Utils/helpers.py b/SuperTagger/SuperTagger/Utils/helpers.py deleted file mode 100644 index 21a60b8..0000000 --- a/SuperTagger/SuperTagger/Utils/helpers.py +++ /dev/null @@ -1,42 +0,0 @@ -import pandas as pd -from tqdm import tqdm - - -def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=100): - print("\n" + "#" * 20) - print("Loading csv...") - - rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1 # minus the header - chunk_list = [] - - if rows > nrows: - rows = nrows - - with tqdm(total=rows, desc='Rows read: ') as bar: - for chunk in pd.read_csv(csv_path, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval}, chunksize=chunksize, - nrows=rows): - chunk_list.append(chunk) - bar.update(len(chunk)) - - df = pd.concat((f for f in chunk_list), axis=0) - print("#" * 20) - return df - - -def load_obj(name): - with open(name + '.pkl', 'rb') as f: - import pickle - return pickle.load(f) - - -def categorical_accuracy_str(preds, truth): - nb_label = 0 - good_label = 0 - for i in range(len(truth)): - sublist_truth = truth[i] - sublist_preds = preds[i] - nb_label += len(sublist_truth) - for j in range(min(len(sublist_truth), len(sublist_preds))): - if str(sublist_truth[j]) == str(sublist_preds[j]): - good_label += 1 - return good_label / nb_label diff --git a/SuperTagger/SuperTagger/eval.py 
b/SuperTagger/SuperTagger/eval.py new file mode 100644 index 0000000..1fd1b97 --- /dev/null +++ b/SuperTagger/SuperTagger/eval.py @@ -0,0 +1,19 @@ +def categorical_accuracy(preds, truth): + """ + Calculates how often predictions match argmax labels. + preds: batch of prediction. (argmax) + truth: batch of truth label. + @return: scoring of batch prediction. (Categorical accuracy values) + """ + good_label = 0 + nb_label = 0 + for i in range(len(truth)): + sublist_truth = truth[i] + sublist_preds = preds[i] + for j in range(len(sublist_truth)): + if sublist_truth[j] != 0: + if sublist_truth[j] == sublist_preds[j]: + good_label += 1 + nb_label += 1 + return good_label / nb_label + \ No newline at end of file diff --git a/SuperTagger/__init__.py b/SuperTagger/__init__.py index d0947b7..993e537 100644 --- a/SuperTagger/__init__.py +++ b/SuperTagger/__init__.py @@ -1,2 +1 @@ -from .SuperTagger.Utils import * from .SuperTagger.SuperTagger import SuperTagger \ No newline at end of file diff --git a/predict_links.py b/predict_links.py index 3bc1db1..f3d05a6 100644 --- a/predict_links.py +++ b/predict_links.py @@ -3,22 +3,24 @@ from NeuralProofNet.NeuralProofNet import NeuralProofNet # region data a_s = "( 1 ) parmi les huit \" partants \" acquis ou potentiels , MM. Lacombe , Koehler et Laroze ne sont pas membres " \ "du PCF . " -tags_s = [['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)', +tags_s = ['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)', 'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np', - 'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']] + 'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)'] # endregion # region model -neuralproofnet = NeuralProofNet() -model = "models/linker.pt" +model_tagger = "models/flaubert_super_98_V2_50e.pt" +neuralproofnet = NeuralProofNet(model_tagger) +model = "Output/linker.pt" neuralproofnet.linker.load_weights(model) # endregion # region prediction linker = neuralproofnet.linker -links = linker.predict_without_categories("le chat est noir") +links = linker.predict_without_categories(a_s) +#links = linker.predict_with_categories(a_s, tags_s) print(links) # endregion \ No newline at end of file diff --git a/predict_supertags.py b/predict_supertags.py index 22725ab..88ec004 100644 --- a/predict_supertags.py +++ b/predict_supertags.py @@ -1,5 +1,5 @@ from SuperTagger.SuperTagger.SuperTagger import SuperTagger -from SuperTagger.SuperTagger.Utils.helpers import categorical_accuracy_str +from SuperTagger.SuperTagger.eval import categorical_accuracy # region data a_s = "( 1 ) parmi les huit \" partants \" acquis ou potentiels , MM. 
Lacombe , Koehler et Laroze ne sont pas membres " \ @@ -33,5 +33,5 @@ print("\tTags : ", tags_s[0]) print() print("\tPred_convert : ", pred_convert[0]) print() -print("\tScore :", categorical_accuracy_str(pred_convert, tags_s)) +print("\tScore :", f"{categorical_accuracy(pred_convert, tags_s)*100}%" ) # endregion \ No newline at end of file diff --git a/train_neuralproofnet.py b/train_neuralproofnet.py index 8bbc086..daa6f65 100644 --- a/train_neuralproofnet.py +++ b/train_neuralproofnet.py @@ -1,26 +1,13 @@ import torch from Linker import * from NeuralProofNet.NeuralProofNet import NeuralProofNet -from utils import read_csv_pgbar -from Configuration import Configuration +from utils import read_links_csv torch.cuda.empty_cache() -# region config -config = Configuration.read_config() -version = config["VERSION"] -datasetConfig = config["DATASET_PARAMS"] -modelEncoderConfig = config["MODEL_ENCODER"] -modelLinkerConfig = config["MODEL_LINKER"] -modelTrainingConfig = config["MODEL_TRAINING"] -epochs = int(modelTrainingConfig['epoch']) -batch_size = int(modelTrainingConfig['batch_size']) -# endregion - # region data -nb_sentences = 100 file_path_axiom_links = 'Datasets/goldANDsilver_dataset_links.csv' -df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences) +df_axiom_links = read_links_csv(file_path_axiom_links) # endregion @@ -29,7 +16,7 @@ print("#" * 20) print("#" * 20) model_tagger = "models/flaubert_super_98_V2_50e.pt" neural_proof_net = NeuralProofNet(model_tagger) -neural_proof_net.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=epochs, batch_size=batch_size, +neural_proof_net.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=5, batch_size=16, checkpoint=True, tensorboard=True) print("#" * 20) print("#" * 20) diff --git a/train_supertagger.py b/train_supertagger.py index 3c0bde0..edba002 100644 --- a/train_supertagger.py +++ b/train_supertagger.py @@ -1,10 +1,9 @@ from SuperTagger.SuperTagger.SuperTagger import SuperTagger -from SuperTagger.SuperTagger.Utils.helpers import read_csv_pgbar, load_obj +from utils import read_supertags_csv, load_obj # region data file_path = 'SuperTagger/Datasets/m2_dataset_V2.csv' -nb_sentences = 100 -df = read_csv_pgbar(file_path,nb_sentences) +df = read_supertags_csv(file_path) texts = df['X'].tolist() tags = df['Z'].tolist() @@ -15,7 +14,7 @@ index_to_super = load_obj('SuperTagger/Datasets/index_to_super') # region model tagger = SuperTagger() tagger.create_new_model(len(index_to_super),'camembert-base',index_to_super) -## If you wnat to upload a pretrained model +## If you want to upload a pretrained model # tagger.load_weights("models/model_check.pt") tagger.train(texts, tags, epochs=2, batch_size=16, validation_rate=0.1, tensorboard=True, checkpoint=True) diff --git a/utils.py b/utils.py index c4fae14..9640a0b 100644 --- a/utils.py +++ b/utils.py @@ -1,17 +1,78 @@ import datetime +import os +from torch.utils.tensorboard import SummaryWriter import pandas as pd -import torch from tqdm import tqdm +# region load data + +def read_links_csv(csv_path, nrows=float('inf'), chunksize=100): + r""" + Preparing csv dataset. 
+ + Parameters: + ----------- + csv_path: + nrows: + chunksize: + """ + print("\n" + "#" * 20) + print("Loading csv...") + + chunk_list = [] + + with tqdm(total=nrows, desc='Rows read: ') as bar: + for chunk in pd.read_csv(csv_path, header=0, converters={'Y': pd.eval, 'Z': pd.eval}, + chunksize=chunksize, nrows=nrows): + chunk_list.append(chunk) + bar.update(len(chunk)) + + df = pd.concat((f for f in chunk_list), axis=0) + print("#" * 20) + return df + +def read_supertags_csv(csv_path, nrows=float('inf'), chunksize=100): + r""" + Preparing csv dataset. + + Parameters: + ----------- + csv_path: + nrows: + chunksize: + """ + print("\n" + "#" * 20) + print("Loading csv...") + + chunk_list = [] + with tqdm(total=nrows, desc='Rows read: ') as bar: + for chunk in pd.read_csv(csv_path, header=0, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval}, + chunksize=chunksize, nrows=nrows): + chunk_list.append(chunk) + bar.update(len(chunk)) + + df = pd.concat((f for f in chunk_list), axis=0) + print("#" * 20) + return df + + +def load_obj(name): + with open(name + '.pkl', 'rb') as f: + import pickle + return pickle.load(f) + +#endregion + +# region format data def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400): r""" Padding sequence for preparation to tensorDataset - :param sequences: data to pad - :param batch_first: boolean indicating whether the batch are in first dimension - :param padding_value: the value for pad - :param max_len: the maximum length + sequences: data to pad + batch_first: boolean indicating whether the batch are in first dimension + padding_value: the value for pad + max_len: the maximum length :return: padding sequences """ max_size = sequences[0].size() @@ -32,32 +93,21 @@ def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400): return out_tensor +#endregion -def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500): - r""" - Preparing csv dataset - :param csv_path: - :param nrows: - :param chunksize: - :return: - """ - print("Loading csv...") - - rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1 # minus the header - chunk_list = [] - - if rows > nrows: - rows = nrows - chunksize = nrows - - with tqdm(total=rows, desc='Rows read: ') as bar: - for chunk in pd.read_csv(csv_path, converters={'Y': pd.eval, 'Z': pd.eval}, chunksize=chunksize, nrows=rows): - chunk_list.append(chunk) - bar.update(len(chunk)) +# region utils training - df = pd.concat((f for f in chunk_list), axis=0) - - return df +def output_create_dir(): + """ + Create le output dir for tensorboard and checkpoint + @return: output dir, tensorboard writter + """ + from datetime import datetime + outpout_path = 'TensorBoard' + training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M')) + logs_dir = os.path.join(training_dir, 'logs') + writer = SummaryWriter(log_dir=logs_dir) + return training_dir, writer def format_time(elapsed): @@ -68,4 +118,6 @@ def format_time(elapsed): elapsed_rounded = int(round(elapsed)) # Format as hh:mm:ss - return str(datetime.timedelta(seconds=elapsed_rounded)) \ No newline at end of file + return str(datetime.timedelta(seconds=elapsed_rounded)) + +#endregion \ No newline at end of file -- GitLab
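
Usage note: below is a minimal sketch of the prediction flow this patch wires up, following the patched predict_links.py in the diff above. The checkpoint paths are the ones hard-coded in that script and are assumed to exist locally; the sample sentence is the one used in the pre-patch script.

    from NeuralProofNet.NeuralProofNet import NeuralProofNet

    # Paths taken from the patched predict_links.py; both checkpoints are assumed to be present.
    model_tagger = "models/flaubert_super_98_V2_50e.pt"   # trained SuperTagger weights
    model_linker = "Output/linker.pt"                     # trained Linker weights

    # NeuralProofNet now takes the supertagger checkpoint path and builds its Linker around it.
    neuralproofnet = NeuralProofNet(model_tagger)
    neuralproofnet.linker.load_weights(model_linker)

    # Supertags are predicted internally; predict_with_categories(sentence, tags)
    # is the alternative entry point when gold supertags are already available.
    sentence = "le chat est noir"
    links = neuralproofnet.linker.predict_without_categories(sentence)
    print(links)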