diff --git a/Configuration/config.ini b/Configuration/config.ini
index 8e6c08cc532c9390444b52bc9da07564ddd04d6c..15547f6dd2817c8887fe7e7e309110a660b8b893 100644
--- a/Configuration/config.ini
+++ b/Configuration/config.ini
@@ -4,7 +4,7 @@ transformers = 4.16.2
 
 [DATASET_PARAMS]
 symbols_vocab_size=26
 atom_vocab_size=20
-max_len_sentence=148
+max_len_sentence=109
 max_atoms_in_sentence=1250
 max_atoms_in_one_type=250
diff --git a/Linker/AtomTokenizer.py b/Linker/AtomTokenizer.py
index 568b3a5e3c8fb66058192ab5d005ab5cf41330c4..c72b73e3ea4720daca99b844098b65b7da1e0e74 100644
--- a/Linker/AtomTokenizer.py
+++ b/Linker/AtomTokenizer.py
@@ -1,5 +1,5 @@
 import torch
-from ..utils import pad_sequence
+from utils import pad_sequence
 
 
 class AtomTokenizer(object):
diff --git a/Linker/Linker.py b/Linker/Linker.py
index 167c923af4e2035cbeb3a2f3ff9aa27421363618..4d4eaaadee8560ef15667203cafdb3d8e9f0598f 100644
--- a/Linker/Linker.py
+++ b/Linker/Linker.py
@@ -12,18 +12,18 @@ from torch.utils.data import TensorDataset, random_split
 from transformers import get_cosine_schedule_with_warmup
 
 from Configuration import Configuration
-from AtomEmbedding import AtomEmbedding
-from AtomTokenizer import AtomTokenizer
-from MHA import AttentionDecoderLayer
-from atom_map import atom_map
-from Sinkhorn import sinkhorn_fn_no_exp as sinkhorn
-from utils_linker import find_pos_neg_idexes, get_atoms_batch, FFN, get_axiom_links
-from eval import mesure_accuracy, SinkhornLoss
-from ..utils import pad_sequence
+from Linker.AtomEmbedding import AtomEmbedding
+from Linker.AtomTokenizer import AtomTokenizer
+from Linker.MHA import AttentionDecoderLayer
+from Linker.atom_map import atom_map
+from Linker.Sinkhorn import sinkhorn_fn_no_exp as sinkhorn
+from Linker.utils_linker import find_pos_neg_idexes, get_atoms_batch, FFN, get_axiom_links
+from Linker.eval import mesure_accuracy, SinkhornLoss
+from utils import pad_sequence
 
 
 class Linker(Module):
-    def __init__(self):
+    def __init__(self, supertagger):
         super(Linker, self).__init__()
 
         self.dim_encoder = int(Configuration.modelEncoderConfig['dim_encoder'])
@@ -39,6 +39,8 @@ class Linker(Module):
         self.dropout = Dropout(0.1)
         self.device = ""
 
+        self.Supertagger = supertagger
+
         self.atom_map = atom_map
         self.padding_id = self.atom_map['[PAD]']
         self.atoms_tokenizer = AtomTokenizer(atom_map, self.max_atoms_in_sentence)
@@ -63,7 +65,7 @@ class Linker(Module):
                                                          num_warmup_steps=0,
                                                          num_training_steps=100)
 
-    def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.0):
+    def __preprocess_data(self, batch_size, df_axiom_links, sentences_tokens, sentences_mask, validation_rate=0.0):
         atoms_batch = get_atoms_batch(df_axiom_links["sub_tree"])
         atom_tokenizer = AtomTokenizer(atom_map, self.max_atoms_in_sentence)
         atoms_batch_tokenized = atom_tokenizer.convert_batchs_to_ids(atoms_batch)
@@ -75,7 +77,8 @@ class Linker(Module):
         truth_links_batch = truth_links_batch.permute(1, 0, 2)
 
         # Construction tensor dataset
-        dataset = TensorDataset(atoms_batch_tokenized, atoms_polarity_batch, truth_links_batch)
+        dataset = TensorDataset(atoms_batch_tokenized, atoms_polarity_batch, truth_links_batch, sentences_tokens,
+                                sentences_mask)
 
         if validation_rate > 0:
             train_size = int(0.9 * len(dataset))
@@ -109,8 +112,7 @@ class Linker(Module):
         atoms_embedding = self.atoms_embedding(atoms_batch_tokenized)
 
         # MHA ou LSTM avec sortie de BERT
-        sents_embedding = torch.randn(32, self.max_len_sentence, self.dim_encoder)
-        batch_size, len_sentence, sents_embedding_dim = sents_embedding.shape
+        batch_size, _, _ = sents_embedding.shape
         sents_mask = torch.randn(batch_size * self.nhead, self.max_atoms_in_sentence, self.max_len_sentence)
         atoms_encoding = self.linker_encoder(atoms_embedding, sents_embedding, sents_mask,
                                              self.make_decoder_mask(atoms_batch_tokenized))
@@ -143,12 +145,15 @@ class Linker(Module):
 
         return torch.stack(link_weights)
 
-    def train_linker(self, df_axiom_links, validation_rate=0.1, epochs=20, batch_size=32, checkpoint=True, validate=True):
+    def train_linker(self, df_axiom_links, sentences_tokens, sentences_mask, validation_rate=0.1, epochs=20,
+                     batch_size=32, checkpoint=True, validate=True):
 
-        training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, df_axiom_links, validation_rate)
+        training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, df_axiom_links,
+                                                                            sentences_tokens, sentences_mask,
+                                                                            validation_rate)
 
         for epoch_i in range(0, epochs):
-            epoch_acc, epoch_loss = self.train_epoch(training_dataloader, validation_dataloader)
+            epoch_acc, epoch_loss = self.train_epoch(training_dataloader, validation_dataloader, checkpoint, validate)
 
 
     def train_epoch(self, training_dataloader, validation_dataloader, checkpoint=True, validate=True):
@@ -163,15 +168,16 @@ class Linker(Module):
             batch_atoms = batch[0].to("cuda" if torch.cuda.is_available() else "cpu")
             batch_polarity = batch[1].to("cuda" if torch.cuda.is_available() else "cpu")
             batch_true_links = batch[2].to("cuda" if torch.cuda.is_available() else "cpu")
-            # batch_sentences = batch[3].to("cuda" if torch.cuda.is_available() else "cpu")
+            batch_sentences_tokens = batch[3].to("cuda" if torch.cuda.is_available() else "cpu")
+            batch_sentences_mask = batch[4].to("cuda" if torch.cuda.is_available() else "cpu")
 
             self.optimizer.zero_grad()
 
             # get sentence embedding from BERT which is already trained
-            # sentences_embedding = supertagger(batch_sentences)
+            logits, sentences_embedding = self.Supertagger.foward(batch_sentences_tokens, batch_sentences_mask)
 
             # Run the kinker on the categories predictions
-            logits_predictions = self(batch_atoms, batch_polarity, [])
+            logits_predictions = self(batch_atoms, batch_polarity, sentences_embedding, batch_sentences_mask)
 
             linker_loss = self.cross_entropy_loss(logits_predictions.permute(1, 0, 2, 3), batch_true_links)
             # Perform a backward pass to calculate the gradients.
@@ -256,9 +262,11 @@ class Linker(Module):
             batch_atoms = batch[0].to("cuda" if torch.cuda.is_available() else "cpu")
             batch_polarity = batch[1].to("cuda" if torch.cuda.is_available() else "cpu")
             batch_true_links = batch[2].to("cuda" if torch.cuda.is_available() else "cpu")
-            # batch_sentences = batch[3].to("cuda" if torch.cuda.is_available() else "cpu")
+            batch_sentences_tokens = batch[3].to("cuda" if torch.cuda.is_available() else "cpu")
+            batch_sentences_mask = batch[4].to("cuda" if torch.cuda.is_available() else "cpu")
 
-            logits_axiom_links_pred = self.forward(batch_atoms, batch_polarity, [])
+            logits_axiom_links_pred = self.forward(batch_atoms, batch_polarity, batch_sentences_tokens,
+                                                   batch_sentences_mask)
             logits_axiom_links_pred = logits_axiom_links_pred.permute(1, 0, 2, 3)
 
             axiom_links_pred = torch.argmax(F.softmax(logits_axiom_links_pred, dim=3), dim=3)
diff --git a/Linker/MHA.py b/Linker/MHA.py
index c1554f9a3454a8be0ed66917824e49534bb01f6a..c66580617a7c665111b0da1711c3f66c5f8abe16 100644
--- a/Linker/MHA.py
+++ b/Linker/MHA.py
@@ -2,7 +2,7 @@ from torch import Tensor
 from torch.nn import (Dropout, LayerNorm, Module, MultiheadAttention)
 
 from Configuration import Configuration
-from utils_linker import FFN
+from Linker.utils_linker import FFN
 
 
 class AttentionDecoderLayer(Module):
@@ -35,8 +35,6 @@ class AttentionDecoderLayer(Module):
         # init params
         dim_encoder = int(Configuration.modelEncoderConfig['dim_encoder'])
         dim_decoder = int(Configuration.modelDecoderConfig['dim_decoder'])
-        max_len_sentence = int(Configuration.datasetConfig['max_len_sentence'])
-        atom_vocab_size = int(Configuration.datasetConfig['atom_vocab_size'])
         nhead = int(Configuration.modelLinkerConfig['nhead'])
         dropout = float(Configuration.modelLinkerConfig['dropout'])
         dim_feedforward = int(Configuration.modelLinkerConfig['dim_feedforward'])
diff --git a/Linker/utils_linker.py b/Linker/utils_linker.py
index f968984872d4513c0b31ae5ca6e5fc06ced70da0..da295deec08fd3693c204691d19a59bd74571aed 100644
--- a/Linker/utils_linker.py
+++ b/Linker/utils_linker.py
@@ -3,8 +3,8 @@ import regex
 import torch
 from torch.nn import Sequential, Linear, Dropout, GELU
 from torch.nn import Module
-from atom_map import atom_map
-from ..utils import pad_sequence
+from Linker.atom_map import atom_map
+from utils import pad_sequence
 
 
 class FFN(Module):
diff --git a/requirements.txt b/requirements.txt
index c611e9b46c45eac6d06ab14c47a951c0c641796c..1491f06d96fdbaac1338114c77226ffc488f38a7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ transformers==4.16.2
 torch==1.10.2
 huggingface-hub==0.4.0
 pandas==1.4.1
-sentencepiece
\ No newline at end of file
+sentencepiece
+git+https://gitlab.irit.fr/pnria/global-helper/deepgrail-rnn/
\ No newline at end of file
diff --git a/train.py b/train.py
index a37aaa46a6a12b8f1272914c1b6a85be627c2b5a..f83a951477de82d4a8be7e11e58c1de1ddfb5d41 100644
--- a/train.py
+++ b/train.py
@@ -1,195 +1,28 @@
-import os
-import time
-from datetime import datetime
-
-import numpy as np
 import torch
-from torch.optim import AdamW
-from torch.utils.data import Dataset, TensorDataset, random_split
-from transformers import (get_cosine_schedule_with_warmup)
 
 from Configuration import Configuration
-from Linker.AtomTokenizer import AtomTokenizer
 from Linker.Linker import Linker
-from Linker.atom_map import atom_map
-from Linker.utils_linker import get_axiom_links, get_atoms_batch, find_pos_neg_idexes
-from Linker.eval import SinkhornLoss
-from utils import format_time, read_csv_pgbar
+from Supertagger.SuperTagger.SuperTagger import SuperTagger
+from utils import read_csv_pgbar
 
 torch.cuda.empty_cache()
 
-# region ParamsModel
-
-max_len_sentence = int(Configuration.datasetConfig['max_len_sentence'])
-max_atoms_in_sentence = int(Configuration.datasetConfig['max_atoms_in_sentence'])
-max_atoms_in_one_type = int(Configuration.datasetConfig['max_atoms_in_one_type'])
-atom_vocab_size = int(Configuration.datasetConfig['atom_vocab_size'])
-
-# endregion ParamsModel
-
-# region ParamsTraining
-
 batch_size = int(Configuration.modelTrainingConfig['batch_size'])
 nb_sentences = batch_size * 10
 epochs = int(Configuration.modelTrainingConfig['epoch'])
-seed_val = int(Configuration.modelTrainingConfig['seed_val'])
-learning_rate = float(Configuration.modelTrainingConfig['learning_rate'])
-
-# endregion ParamsTraining
-
-# region Data loader
 
 file_path_axiom_links = 'Datasets/aa1_links_dataset_links.csv'
 df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
 
-sentences_batch = df_axiom_links["Sentences"]
-
-atoms_batch = get_atoms_batch(df_axiom_links["sub_tree"])
-atom_tokenizer = AtomTokenizer(atom_map, max_atoms_in_sentence)
-atoms_batch_tokenized = atom_tokenizer.convert_batchs_to_ids(atoms_batch)
-print("atoms_tokens", atoms_batch_tokenized.shape)
-
-atoms_polarity_batch = find_pos_neg_idexes(max_atoms_in_sentence, df_axiom_links["sub_tree"])
-print("atoms_polarity_batch", atoms_polarity_batch.shape)
-
-torch.set_printoptions(edgeitems=20)
-truth_links_batch = get_axiom_links(max_atoms_in_one_type, atoms_polarity_batch, df_axiom_links["sub_tree"])
-truth_links_batch = truth_links_batch.permute(1, 0, 2)
-print("truth_links_batch", truth_links_batch.shape)
-print("sentence", sentences_batch[14])
-print("categories ", df_axiom_links["sub_tree"][14])
-print("atoms_batch", atoms_batch[14])
-print("atoms_polarity_batch", atoms_polarity_batch[14])
-print(" truth_links_batch example on a sentence class txt", truth_links_batch[14][16])
-
-# Construction tensor dataset
-dataset = TensorDataset(atoms_batch_tokenized, atoms_polarity_batch, truth_links_batch)
-
-# Calculate the number of samples to include in each set.
-train_size = int(0.9 * len(dataset))
-val_size = len(dataset) - train_size
-
-# Divide the dataset by randomly selecting samples.
-train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
-
-training_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
-validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
-
-# endregion Data loader
-
-
-# region Models
-
-# supertagger = SuperTagger()
-# supertagger.load_weights("models/model_check.pt")
-
-linker = Linker()
-
-# endregion Models
-
-
-# region Fit tunning
-
-# Optimizer
-optimizer_linker = AdamW(linker.parameters(),
-                         weight_decay=1e-5,
-                         lr=learning_rate)
-
-# Create the learning rate scheduler.
-scheduler_linker = get_cosine_schedule_with_warmup(optimizer_linker,
-                                                   num_warmup_steps=0,
-                                                   num_training_steps=100)
-
-# Loss
-cross_entropy_loss = SinkhornLoss()
-
-np.random.seed(seed_val)
-torch.manual_seed(seed_val)
-torch.cuda.manual_seed_all(seed_val)
-torch.autograd.set_detect_anomaly(True)
-
-# endregion Fit tunning
-
-# region Train
-
-# Measure the total training time for the whole run.
-total_t0 = time.time()
-
-validate = True
-checkpoint = True
-
-
-def run_epochs(epochs):
-    # For each epoch...
-    for epoch_i in range(0, epochs):
-        # ========================================
-        #               Training
-        # ========================================
-
-        # Perform one full pass over the training set.
-
-        print("")
-        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
-        print('Training...')
-
-        # Measure how long the training epoch takes.
-        t0 = time.time()
-
-        # Reset the total loss for this epoch.
-        total_train_loss = 0
-
-        linker.train()
-
-        # For each batch of training data...
-        for step, batch in enumerate(training_dataloader):
-            # Unpack this training batch from our dataloader
-            batch_atoms = batch[0].to("cuda" if torch.cuda.is_available() else "cpu")
-            batch_polarity = batch[1].to("cuda" if torch.cuda.is_available() else "cpu")
-            batch_true_links = batch[2].to("cuda" if torch.cuda.is_available() else "cpu")
-            # batch_sentences = batch[3].to("cuda" if torch.cuda.is_available() else "cpu")
-
-            optimizer_linker.zero_grad()
-
-            # get sentence embedding from BERT which is already trained
-            # sentences_embedding = supertagger(batch_sentences)
-
-            # Run the kinker on the categories predictions
-            logits_predictions = linker(batch_atoms, batch_polarity, [])
-
-            linker_loss = cross_entropy_loss(logits_predictions.permute(1, 0, 2, 3), batch_true_links)
-            # Perform a backward pass to calculate the gradients.
-            total_train_loss += float(linker_loss)
-            linker_loss.backward()
-
-            # This is to help prevent the "exploding gradients" problem.
-            # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)
-
-            # Update parameters and take a step using the computed gradient.
-            optimizer_linker.step()
-            scheduler_linker.step()
-
-        avg_train_loss = total_train_loss / len(training_dataloader)
-
-        # Measure how long this epoch took.
-        training_time = format_time(time.time() - t0)
-
-        if checkpoint:
-            checkpoint_dir = os.path.join("Output", 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
-            linker.__checkpoint_save(path=os.path.join(checkpoint_dir, 'model_check.pt'))
+sentences_batch = df_axiom_links["Sentences"].tolist()
 
-        if validate:
-            linker.eval()
-            with torch.no_grad():
-                print("Start eval")
-                accuracy, loss = linker.eval_epoch(validation_dataloader, cross_entropy_loss)
-                print("")
-                print(" Average accuracy on epoch: {0:.2f}".format(accuracy))
-                print(" Average loss on epoch: {0:.2f}".format(loss))
+supertagger = SuperTagger()
+supertagger.load_weights("models/model_supertagger.pt")
 
-        print("")
-        print(" Average training loss: {0:.2f}".format(avg_train_loss))
-        print(" Training epcoh took: {:}".format(training_time))
+sents_tokenized, sents_mask = supertagger.sent_tokenizer.fit_transform_tensors(sentences_batch)
 
+print("Linker")
+linker = Linker(supertagger)
 
-run_epochs(epochs)
-# endregion Train
+print("Linker Training")
+linker.train_linker(df_axiom_links, sents_tokenized, sents_mask, validation_rate=0.1, epochs=epochs, batch_size=batch_size, checkpoint=True, validate=True)