diff --git a/.gitignore b/.gitignore
index 181d654b177d22316dfa2932f8c6dae230970f11..843b687520e3b945b3a565aa9952586917dede64 100644
--- a/.gitignore
+++ b/.gitignore
@@ -4,3 +4,5 @@ venv
 *.pyc
 .DS_Store
 .env
+./bash_GPU.sh
+push pull texte
\ No newline at end of file
diff --git a/Configuration/config.ini b/Configuration/config.ini
index 425af5e5890518ebe30742150896c3f9e1c758d2..9ed1544510f063d857b1569da8b720c59806046d 100644
--- a/Configuration/config.ini
+++ b/Configuration/config.ini
@@ -5,17 +5,17 @@ dim_encoder = 768
 dim_decoder = 768
 num_rnn_layers=1
 dropout=0.1
-teacher_forcing=0.8
+teacher_forcing=0.05
 symbols_vocab_size=26
 max_symbols_in_sentence=1250
 max_len_sentence=112
 [MODEL_TRAINING]
 device=cpu
-batch_size=16
+batch_size=32
 epoch=20
 seed_val=42
 learning_rate=0.005
-use_checkpoint_SAVE=1
+use_checkpoint_SAVE=0
 output_path=Output
 use_checkpoint_LOAD=0
 input_path=Input
diff --git a/Output/Tranning_28-04_16-09/logs/events.out.tfevents.1651154974.montana.67010.0 b/Output/Tranning_28-04_16-09/logs/events.out.tfevents.1651154974.montana.67010.0
new file mode 100644
index 0000000000000000000000000000000000000000..10099c083d655678d19ae67ed771450c0d165dc9
Binary files /dev/null and b/Output/Tranning_28-04_16-09/logs/events.out.tfevents.1651154974.montana.67010.0 differ
diff --git a/Output/Tranning_28-04_16-10/logs/events.out.tfevents.1651155018.montana.67155.0 b/Output/Tranning_28-04_16-10/logs/events.out.tfevents.1651155018.montana.67155.0
new file mode 100644
index 0000000000000000000000000000000000000000..ebc368200c0cc5bdf5db857a05c5c06b81794e75
Binary files /dev/null and b/Output/Tranning_28-04_16-10/logs/events.out.tfevents.1651155018.montana.67155.0 differ
diff --git a/Output/Tranning_28-04_16-56/logs/events.out.tfevents.1651157799.montana.73738.0 b/Output/Tranning_28-04_16-56/logs/events.out.tfevents.1651157799.montana.73738.0
new file mode 100644
index 0000000000000000000000000000000000000000..08366b9b591915d9747cff6329633338415f3124
Binary files /dev/null and b/Output/Tranning_28-04_16-56/logs/events.out.tfevents.1651157799.montana.73738.0 differ
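Aside on the config change above: `use_checkpoint_SAVE` and `use_checkpoint_LOAD` are read back in the train.py hunks further down with `getboolean()`. The sketch below (not part of the patch; the two flags are inlined for illustration) shows why that matters: `configparser` hands back strings, and `bool("0")` is truthy, so the old `bool(config[...])` pattern could never turn a flag off.

```python
# Minimal sketch, assuming only the two flags shown in config.ini above.
import configparser

config = configparser.ConfigParser()
config.read_string("""
[MODEL_TRAINING]
use_checkpoint_SAVE=0
use_checkpoint_LOAD=0
""")

section = config["MODEL_TRAINING"]
print(bool(section["use_checkpoint_SAVE"]))       # True  -> "0" is a non-empty string
print(section.getboolean("use_checkpoint_SAVE"))  # False -> parsed as an off flag
```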
diff --git a/SuperTagger/Decoder/RNNDecoderLayer.py b/SuperTagger/Decoder/RNNDecoderLayer.py
index 89f0798647deed1bc950aae0b34a8e49ad14fa39..f4f4f1faa3b58a91867f46344d696a56edd5954c 100644
--- a/SuperTagger/Decoder/RNNDecoderLayer.py
+++ b/SuperTagger/Decoder/RNNDecoderLayer.py
@@ -2,7 +2,7 @@ import random
 
 import torch
 import torch.nn.functional as F
-from torch.nn import (Dropout, Module, Linear, LSTM)
+from torch.nn import (Dropout, Module, ModuleList, Linear, LSTM, GRU)
 
 from Configuration import Configuration
 from SuperTagger.Symbol.SymbolEmbedding import SymbolEmbedding
@@ -18,6 +18,8 @@ class RNNDecoderLayer(Module):
         self.symbols_vocab_size = int(Configuration.modelDecoderConfig['symbols_vocab_size'])
         dropout = float(Configuration.modelDecoderConfig['dropout'])
         self.num_rnn_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
+        self.teacher_forcing = float(Configuration.modelDecoderConfig['teacher_forcing'])
+        self.bidirectional = False
 
         self.symbols_map = symbols_map
         self.symbols_padding_id = self.symbols_map["[PAD]"]
@@ -25,8 +27,6 @@ class RNNDecoderLayer(Module):
         self.symbols_start_id = self.symbols_map["[START]"]
         self.symbols_sos_id = self.symbols_map["[SOS]"]
 
-        self.teacher_forcing = float(Configuration.modelDecoderConfig['teacher_forcing'])
-
         # Different layers
         # Symbols Embedding
         self.symbols_embedder = SymbolEmbedding(self.hidden_dim, self.symbols_vocab_size,
@@ -35,7 +35,14 @@
         self.dropout = Dropout(dropout)
         # rnn Layer
         self.rnn = LSTM(input_size=self.hidden_dim, hidden_size=self.hidden_dim, num_layers=self.num_rnn_layers,
+                        dropout=dropout, bidirectional=self.bidirectional,
                         batch_first=True)
+
+        self.intermediate = ModuleList()
+        for _ in range(3):
+            self.intermediate.append(Linear(self.hidden_dim, self.hidden_dim))
+        self.activation = F.gelu
+
         # Projection on vocab_size
         if self.bidirectional:
             self.proj = Linear(self.hidden_dim * 2, self.symbols_vocab_size)
@@ -57,22 +64,22 @@ class RNNDecoderLayer(Module):
         # y_hat[batch_size, max_len_sentence, vocab_size] init with probability pad =1
         y_hat = torch.zeros(batch_size, self.max_symbols_in_sentence, self.symbols_vocab_size,
-                            dtype=torch.float,device="cuda" if torch.cuda.is_available() else "cpu")
+                            dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
         y_hat[:, :, self.symbols_padding_id] = 1
 
-        use_teacher_forcing = True if random.random() < 0.05 else False
-        if use_teacher_forcing:
-            print("\n FORCING TEACHING \n")
-            decoded_i = symbols_tokenized_batch[:, 0].unsqueeze(1)
-        else :
-            decoded_i = torch.ones(batch_size, 1, dtype=torch.long,device="cuda" if torch.cuda.is_available() else "cpu")* self.symbols_start_id
-        sos_mask = torch.zeros(batch_size, dtype=torch.bool,device="cuda" if torch.cuda.is_available() else "cpu")
+        decoded_i = torch.ones(batch_size, 1, dtype=torch.long,
+                               device="cuda" if torch.cuda.is_available() else "cpu") * self.symbols_start_id
+
+        sos_mask = torch.zeros(batch_size, dtype=torch.bool, device="cuda" if torch.cuda.is_available() else "cpu")
 
         # hidden_state goes through multiple linear layers
         hidden_state = pooler_output.unsqueeze(0).repeat(self.num_rnn_layers * (1 + self.bidirectional), 1, 1)
         c_state = torch.zeros(self.num_rnn_layers * (1 + self.bidirectional), batch_size, hidden_size,
-                              dtype=torch.float,device="cuda" if torch.cuda.is_available() else "cpu")
+                              dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
+        for intermediate in self.intermediate:
+            hidden_state = self.dropout(self.activation(intermediate(hidden_state)))
+        use_teacher_forcing = True if random.random() < self.teacher_forcing else False
 
         # for each symbol
         for i in range(self.max_symbols_in_sentence):
@@ -97,8 +104,6 @@ class RNNDecoderLayer(Module):
             y_hat[~sos_mask, i, :-2] = proj[~sos_mask, -1, :]
             sos_mask = sos_mask_i | sos_mask
 
-            use_teacher_forcing = True if random.random() < 0.25 else False
-
             # Stop if every sentence says padding or if we are full
             if not torch.any(~sos_mask):
                 break
@@ -116,20 +121,23 @@ class RNNDecoderLayer(Module):
         # contains the predictions
         y_hat = torch.zeros(batch_size, self.max_symbols_in_sentence, self.symbols_vocab_size,
-                            dtype=torch.float,device="cuda" if torch.cuda.is_available() else "cpu")
+                            dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
         y_hat[:, :, self.symbols_padding_id] = 1
         # input of the embedder, a created vector that replace the true value
-        decoded_i = torch.ones(batch_size, 1, dtype=torch.long,device="cuda" if torch.cuda.is_available() else "cpu") * self.symbols_start_id
+        decoded_i = torch.ones(batch_size, 1, dtype=torch.long,
+                               device="cuda" if torch.cuda.is_available() else "cpu") * self.symbols_start_id
 
-        sos_mask = torch.zeros(batch_size, dtype=torch.bool,device="cuda" if torch.cuda.is_available() else "cpu")
+        sos_mask = torch.zeros(batch_size, dtype=torch.bool, device="cuda" if torch.cuda.is_available() else "cpu")
 
         hidden_state = pooler_output.unsqueeze(0).repeat(self.num_rnn_layers * (1 + self.bidirectional), 1, 1)
         c_state = torch.zeros(self.num_rnn_layers * (1 + self.bidirectional), batch_size, hidden_size,
-                              dtype=torch.float,device="cuda" if torch.cuda.is_available() else "cpu")
+                              dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
+        for intermediate in self.intermediate:
+            hidden_state = self.dropout(self.activation(intermediate(hidden_state)))
 
         for i in range(self.max_symbols_in_sentence):
             symbols_embedding = self.symbols_embedder(decoded_i)
-            #symbols_embedding = self.dropout(symbols_embedding)
+            symbols_embedding = self.dropout(symbols_embedding)
 
             output, (hidden_state, c_state) = self.rnn(symbols_embedding, (hidden_state, c_state))
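The decoder hunks above do three things: they read `teacher_forcing` from config.ini (now 0.05) and draw it once per forward pass instead of re-rolling hard-coded probabilities inside the loop, they pass `dropout` and `bidirectional` to the LSTM, and they run the pooled encoder output through a small stack of `Linear` + GELU + dropout layers before it seeds the RNN hidden state. A standalone sketch of that initialisation path follows; the shapes and hyperparameters are taken from config.ini for illustration, not from the module itself.

```python
import torch
import torch.nn.functional as F
from torch.nn import Dropout, Linear, ModuleList

hidden_dim, num_rnn_layers, batch_size = 768, 1, 32

intermediate = ModuleList([Linear(hidden_dim, hidden_dim) for _ in range(3)])
dropout = Dropout(0.1)

pooler_output = torch.randn(batch_size, hidden_dim)  # encoder sentence summary
hidden_state = pooler_output.unsqueeze(0).repeat(num_rnn_layers, 1, 1)

# Three Linear + GELU + dropout passes refine the state before the LSTM uses it.
for layer in intermediate:
    hidden_state = dropout(F.gelu(layer(hidden_state)))

print(hidden_state.shape)  # torch.Size([1, 32, 768]) = (num_layers, batch, hidden)
```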
diff --git a/SuperTagger/EncoderDecoder.py b/SuperTagger/EncoderDecoder.py
index 4efa870990d8f2b5b4fda48f1d6623d7e1a830f6..0519bc2a855b31f023851406f4932ff91883edaf 100644
--- a/SuperTagger/EncoderDecoder.py
+++ b/SuperTagger/EncoderDecoder.py
@@ -41,7 +41,7 @@ class EncoderDecoder(Module):
         symbols_tokenized_batch: [batch_size, max_symbols_in_sentence] the true symbols for each sentence.
         """
         last_hidden_state, pooler_output = self.encoder([sents_tokenized_batch, sents_mask_batch])
-        #last_hidden_state = self.dropout(last_hidden_state)
+        last_hidden_state = self.dropout(last_hidden_state)
         return self.decoder(symbols_tokenized_batch, last_hidden_state, pooler_output)
 
     def decode_greedy_rnn(self, sents_tokenized_batch, sents_mask_batch):
@@ -52,7 +52,7 @@ class EncoderDecoder(Module):
         sents_mask_batch : mask output from the encoder tokenizer
         """
         last_hidden_state, pooler_output = self.encoder([sents_tokenized_batch, sents_mask_batch])
-        #last_hidden_state = self.dropout(last_hidden_state)
+        last_hidden_state = self.dropout(last_hidden_state)
 
         predictions = self.decoder.predict_rnn(last_hidden_state, pooler_output)
 
@@ -78,10 +78,11 @@ class EncoderDecoder(Module):
             print("\nsub true (", l, ") : ",
                   [token for token in true_trad if token != '[PAD]'])
             print("\nsub predict (", len([i for i in predict_trad if i != '[PAD]']), ") : ",
-                  [token for token in predict_trad[:l] if token != '[PAD]'])
+                  [token for token in predict_trad if token != '[PAD]'])
 
         return measure_supertagging_accuracy(pred, b_symbols_tokenized,
-                                             ignore_idx=self.symbols_map["[PAD]"]), float(cross_entropy_loss(type_predictions, b_symbols_tokenized))
+                                             ignore_idx=self.symbols_map["[PAD]"]), float(
+            cross_entropy_loss(type_predictions, b_symbols_tokenized))
 
     def eval_epoch(self, dataloader, cross_entropy_loss):
         r"""Average the evaluation of all the batch.
@@ -93,7 +94,7 @@ class EncoderDecoder(Module):
         for step, batch in enumerate(dataloader):
             batch = batch
 
-            batch_output , loss = self.eval_batch(batch, cross_entropy_loss)
+            batch_output, loss = self.eval_batch(batch, cross_entropy_loss)
             ((bs_correct, bs_total), (bw_correct, bw_total)) = batch_output
             s_total += bs_total
             s_correct += bs_correct
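In both `forward` and `decode_greedy_rnn` the dropout on `last_hidden_state` is re-enabled above. One point worth keeping in mind, illustrated below outside the patch: `nn.Dropout` only fires in training mode, so as long as greedy decoding is run under `model.eval()` the extra dropout in the decode path is a no-op.

```python
import torch
from torch.nn import Dropout

dropout = Dropout(p=0.5)
x = torch.ones(4)

dropout.train()
print(dropout(x))  # some entries zeroed, the survivors scaled by 1 / (1 - p)

dropout.eval()
print(dropout(x))  # tensor([1., 1., 1., 1.]) -- identity at inference time
```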
diff --git a/SuperTagger/utils.py b/SuperTagger/utils.py
index 5c0ec76ed9082510c5b0d5264e10d42e48d9070e..8712cca081d5897d460d451b05814dfa46bfc538 100644
--- a/SuperTagger/utils.py
+++ b/SuperTagger/utils.py
@@ -5,7 +5,6 @@ import torch
 from tqdm import tqdm
 
 
-
 def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500):
     print("\n" + "#" * 20)
     print("Loading csv...")
@@ -38,8 +37,8 @@ def format_time(elapsed):
     # Format as hh:mm:ss
     return str(datetime.timedelta(seconds=elapsed_rounded))
 
 
-def checkpoint_save(model, opt, epoch, dir, loss):
+def checkpoint_save(model, opt, epoch, dir, loss):
     torch.save({
         'epoch': epoch,
         'model_state_dict': model.state_dict(),
@@ -51,7 +50,7 @@ def checkpoint_save(model, opt, epoch, dir, loss):
 def checkpoint_load(model, opt, path):
     epoch = 0
     loss = 0
-    print("#" *15)
+    print("#" * 15)
 
     try:
         checkpoint = torch.load(path)
@@ -62,8 +61,6 @@ def checkpoint_load(model, opt, path):
         print("\n The loading checkpoint was successful ! \n")
         print("#" * 10)
     except Exception as e:
-        print("\nCan't load checkpoint model because : "+ str(e) +"\n\nUse default model \n")
+        print("\nCan't load checkpoint model because : " + str(e) + "\n\nUse default model \n")
         print("#" * 15)
     return model, opt, epoch, loss
-
-
diff --git a/bash_GPU.sh b/bash_GPU.sh
index 834ebf556347599d1c9d108c5aa4740dfcb844dd..665f769d8046d6fd61167efbbbdd8e46d5495d94 100644
--- a/bash_GPU.sh
+++ b/bash_GPU.sh
@@ -4,10 +4,10 @@
 #SBATCH --gres=gpu:1
 #SBATCH --mem=32000
 #SBATCH --gres-flags=enforce-binding
-#SBATCH --error="/users/celdev/jrabault/PNRIA - DeepGrail/OUT/error_rtx1.err"
-#SBATCH --output="/users/celdev/jrabault/PNRIA - DeepGrail/OUT/out_rtx1.out"
 
 module purge
 module load singularity/3.0.3
+#SBATCH --error="error_rtx1.err"
+#SBATCH --output="out_rtx1.out"
 
-srun singularity exec /logiciels/containerCollections/CUDA11/pytorch-NGC-21-03-py3.sif python "/users/celdev/jrabault/PNRIA - DeepGrail/train.py"
\ No newline at end of file
+srun singularity exec /logiciels/containerCollections/CUDA11/pytorch-NGC-21-03-py3.sif python "train.py"
\ No newline at end of file
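For context on the two helpers touched above, here is a hypothetical save/load round trip with the same signatures. The `'epoch'` and `'model_state_dict'` keys appear in the hunk; the `'optimizer_state_dict'` and `'loss'` keys and the checkpoint file name are assumptions made only to keep the sketch self-contained.

```python
import os
import torch

def checkpoint_save(model, opt, epoch, dir, loss):
    torch.save({'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': opt.state_dict(),   # assumed key
                'loss': loss},                               # assumed key
               os.path.join(dir, 'checkpoint.pt'))           # assumed file name

def checkpoint_load(model, opt, path):
    epoch, loss = 0, 0
    try:
        checkpoint = torch.load(path)
        model.load_state_dict(checkpoint['model_state_dict'])
        opt.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch, loss = checkpoint['epoch'], checkpoint['loss']
    except Exception as e:
        # Fall back to the freshly built model, as the real helper does.
        print("Can't load checkpoint model because : " + str(e) + "\n\nUse default model \n")
    return model, opt, epoch, loss
```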
diff --git a/main.py b/main.py
deleted file mode 100644
index 20a033ab9212e826f91eabd0ea1069dd2fd63a08..0000000000000000000000000000000000000000
--- a/main.py
+++ /dev/null
@@ -1,16 +0,0 @@
-# This is a sample Python script.
-
-# Press Maj+F10 to execute it or replace it with your code.
-# Press Double Shift to search everywhere for classes, files, tool windows, actions, and settings.
-
-
-def print_hi(name):
-    # Use a breakpoint in the code line below to debug your script.
-    print(f'Hi, {name}')  # Press Ctrl+F8 to toggle the breakpoint.
-
-
-# Press the green button in the gutter to run the script.
-if __name__ == '__main__':
-    print_hi('PyCharm')
-
-# See PyCharm help at https://www.jetbrains.com/help/pycharm/
diff --git a/push pull texte b/push pull texte
new file mode 100644
index 0000000000000000000000000000000000000000..f80b3ef41b4ce5aa94eed2543895c4d8e50613b9
--- /dev/null
+++ b/push pull texte
@@ -0,0 +1 @@
+ https://gitlab-ci-token:glpat-AZdpzmAPDFCSK8nPZxCw@gitlab.irit.fr/pnria/global-helper/deepgrail-rnn.git
\ No newline at end of file
diff --git a/train.py b/train.py
index 6353e912a4068745a9f3a1ad5ad1e01dd88c4a20..63070df10f3281c279634763701fa0d4c66e51c1 100644
--- a/train.py
+++ b/train.py
@@ -1,16 +1,16 @@
-import math
+import os
 import os
 import time
+from datetime import datetime
 
 import numpy as np
 import torch
-from torch.optim import AdamW
-import transformers
 import torch.nn.functional as F
+import transformers
+from torch.optim import SGD
 from torch.utils.data import Dataset, TensorDataset, random_split
 from transformers import (AutoTokenizer, get_cosine_schedule_with_warmup)
 from transformers import (CamembertModel)
-from torch.utils.tensorboard import SummaryWriter
 
 from Configuration import Configuration
 from SuperTagger.Encoder.EncoderInput import EncoderInput
@@ -20,7 +20,7 @@ from SuperTagger.Symbol.symbol_map import symbol_map
 from SuperTagger.eval import NormCrossEntropy
 from SuperTagger.utils import format_time, read_csv_pgbar, checkpoint_save, checkpoint_load
 
-from datetime import datetime
+from torch.utils.tensorboard import SummaryWriter
 
 transformers.TOKENIZERS_PARALLELISM = True
 torch.cuda.empty_cache()
@@ -38,8 +38,7 @@ num_gru_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
 
 file_path = 'Datasets/m2_dataset.csv'
 batch_size = int(Configuration.modelTrainingConfig['batch_size'])
-nb_sentences = batch_size * 10
-# Number of training epochs. The BERT authors recommend between 2 and 4.
+nb_sentences = batch_size * 50
 epochs = int(Configuration.modelTrainingConfig['epoch'])
 seed_val = int(Configuration.modelTrainingConfig['seed_val'])
 learning_rate = float(Configuration.modelTrainingConfig['learning_rate'])
@@ -57,7 +56,7 @@ logs_dir = os.path.join(training_dir, 'logs')
 checkpoint_dir = training_dir
 writer = SummaryWriter(log_dir=logs_dir)
 
-use_checkpoint_SAVE = bool(Configuration.modelTrainingConfig['use_checkpoint_SAVE'])
+use_checkpoint_SAVE = bool(Configuration.modelTrainingConfig.getboolean('use_checkpoint_SAVE'))
 
 # endregion OutputTraining
@@ -66,8 +65,7 @@ use_checkpoint_SAVE = bool(Configuration.modelTrainingConfig['use_checkpoint_SAV
 input_path = str(Configuration.modelTrainingConfig['input_path'])
 model_to_load = str(Configuration.modelTrainingConfig['model_to_load'])
 model_to_load_path = os.path.join(input_path, model_to_load)
-use_checkpoint_LOAD = bool(Configuration.modelTrainingConfig['use_checkpoint_LOAD'])
-print(use_checkpoint_LOAD)
+use_checkpoint_LOAD = bool(Configuration.modelTrainingConfig.getboolean('use_checkpoint_LOAD'))
 
 # endregion InputTraining
@@ -136,7 +134,7 @@ sents_tokenized, sents_mask = sents_tokenizer.fit_transform_tensors(df['Sentence
 dataset = TensorDataset(sents_tokenized, sents_mask, symbols_tokenized)
 
 # Calculate the number of samples to include in each set.
-train_size = int(0.95 * len(dataset))
+train_size = int(0.9 * len(dataset))
 val_size = len(dataset) - train_size
 
 # Divide the dataset by randomly selecting samples.
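With the values above, the data pipeline now reads 32 * 50 = 1600 sentences and splits them 90/10 between training and validation. A compact sketch of that part of the setup, with dummy tensors standing in for the project tokenizers (shapes come from config.ini; the DataLoader flags are illustrative, not taken from the patch):

```python
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split

batch_size = 32
nb_sentences = batch_size * 50  # 1600

sents_tokenized = torch.randint(0, 100, (nb_sentences, 112))    # max_len_sentence
sents_mask = torch.ones_like(sents_tokenized)
symbols_tokenized = torch.randint(0, 26, (nb_sentences, 1250))  # max_symbols_in_sentence

dataset = TensorDataset(sents_tokenized, sents_mask, symbols_tokenized)

train_size = int(0.9 * len(dataset))
train_dataset, val_dataset = random_split(dataset, [train_size, len(dataset) - train_size])

training_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
validation_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
```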
@@ -153,23 +151,20 @@ validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batc
 # region Fit tunning
 
 # Optimizer
-# optimizer_encoder = AdamW(model.encoder.parameters(),
-#                           lr=5e-5,
-#                           eps=1e-8)
-# optimizer_decoder = AdamW(model.decoder.parameters(),
-#                           lr=learning_rate,
-#                           eps=1e-8)
-
-optimizer = AdamW(model.parameters(),
-                  lr=learning_rate,
-                  eps=1e-8)
+optimizer_encoder = SGD(model.encoder.parameters(),
+                        lr=5e-5)
+optimizer_decoder = SGD(model.decoder.parameters(),
+                        lr=learning_rate)
 
 # Total number of training steps is [number of batches] x [number of epochs].
 # (Note that this is not the same as the number of training samples).
 total_steps = len(training_dataloader) * epochs
 
 # Create the learning rate scheduler.
-scheduler = get_cosine_schedule_with_warmup(optimizer,
+scheduler_encoder = get_cosine_schedule_with_warmup(optimizer_encoder,
+                                                    num_warmup_steps=0,
+                                                    num_training_steps=5)
+scheduler_decoder = get_cosine_schedule_with_warmup(optimizer_decoder,
                                             num_warmup_steps=0,
                                             num_training_steps=total_steps)
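The single AdamW optimizer is replaced above by two SGD optimizers with separate cosine schedules. Note the asymmetry the patch introduces: the encoder schedule is built with num_training_steps=5, while the decoder schedule spans the whole run. A self-contained sketch of the same wiring (dummy Linear modules stand in for the CamemBERT encoder and the RNN decoder; the step count is worked out from the config values, 1440 training sentences / 32 per batch * 20 epochs = 900):

```python
import torch
from torch.nn import Linear
from torch.optim import SGD
from transformers import get_cosine_schedule_with_warmup

encoder, decoder = Linear(8, 8), Linear(8, 8)  # placeholders for model.encoder / model.decoder

optimizer_encoder = SGD(encoder.parameters(), lr=5e-5)
optimizer_decoder = SGD(decoder.parameters(), lr=0.005)

total_steps = 45 * 20  # batches per epoch * epochs

scheduler_encoder = get_cosine_schedule_with_warmup(optimizer_encoder,
                                                    num_warmup_steps=0,
                                                    num_training_steps=5)
scheduler_decoder = get_cosine_schedule_with_warmup(optimizer_decoder,
                                                    num_warmup_steps=0,
                                                    num_training_steps=total_steps)
```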
@@ -199,7 +194,7 @@ total_t0 = time.time()
 validate = True
 
 if use_checkpoint_LOAD:
-    model, optimizer, last_epoch, loss = checkpoint_load(model, optimizer, model_to_load_path)
+    model, optimizer_decoder, last_epoch, loss = checkpoint_load(model, optimizer_decoder, model_to_load_path)
     epochs = epochs - last_epoch
 
@@ -242,46 +237,40 @@ def run_epochs(epochs):
             b_sents_mask = batch[1].to("cuda" if torch.cuda.is_available() else "cpu")
             b_symbols_tokenized = batch[2].to("cuda" if torch.cuda.is_available() else "cpu")
 
-            # optimizer_encoder.zero_grad()
-            # optimizer_decoder.zero_grad()
-
-            optimizer.zero_grad()
+            optimizer_encoder.zero_grad()
+            optimizer_decoder.zero_grad()
 
             logits_predictions = model(b_sents_tokenized, b_sents_mask, b_symbols_tokenized)
 
             predict_trad = [{v: k for k, v in symbol_map.items()}[int(i)] for i in
                             torch.argmax(F.softmax(logits_predictions, dim=2), dim=2)[0]]
             true_trad = [{v: k for k, v in symbol_map.items()}[int(i)] for i in b_symbols_tokenized[0]]
-
             l = len([i for i in true_trad if i != '[PAD]'])
             if step % 40 == 0 and not step == 0:
-                writer.add_text("Sample", "\ntrain true (" + str(l) + ") : " + str([token for token in true_trad if token != '[PAD]']) + "\ntrain predict (" + str(len([i for i in predict_trad if i != '[PAD]'])) + ") : " + str([token for token in predict_trad[:l] if token != '[PAD]']))
+                writer.add_text("Sample", "\ntrain true (" + str(l) + ") : " + str(
+                    [token for token in true_trad if token != '[PAD]']) + "\ntrain predict (" + str(
+                    len([i for i in predict_trad if i != '[PAD]'])) + ") : " + str(
+                    [token for token in predict_trad[:l] if token != '[PAD]']))
 
             loss = cross_entropy_loss(logits_predictions, b_symbols_tokenized)
             # Perform a backward pass to calculate the gradients.
             total_train_loss += float(loss)
             loss.backward()
-
             # Clip the norm of the gradients to 1.0.
             # This is to help prevent the "exploding gradients" problem.
-            # torch.nn.utils.clip_grad_norm_(model.encoder.parameters(), 1.0)
-            # torch.nn.utils.clip_grad_norm_(model.decoder.parameters(), 1.0)
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)
 
             # Update parameters and take a step using the computed gradient.
-            # The optimizer dictates the "update rule"--how the parameters are
-            # modified based on their gradients, the learning rate, etc.
-            # optimizer_encoder.step()
-            # optimizer_decoder.step()
+            optimizer_encoder.step()
+            optimizer_decoder.step()
 
-            optimizer.step()
-
-            scheduler.step()
+            scheduler_encoder.step()
+            scheduler_decoder.step()
 
         # checkpoint
         if use_checkpoint_SAVE:
-            checkpoint_save(model, optimizer, epoch_i, checkpoint_dir, loss)
+            checkpoint_save(model, optimizer_decoder, epoch_i, checkpoint_dir, loss)
 
         avg_train_loss = total_train_loss / len(training_dataloader)
@@ -313,5 +302,3 @@
 
 run_epochs(epochs)
 # endregion Train
-
-
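Putting the training-loop changes together, one batch now runs in this order: zero both optimizers, forward, loss, backward, clip the gradient norm of the whole model at 5.0, then step both optimizers and both schedulers. A minimal sketch with placeholder modules and a placeholder loss (the real loop also logs sample predictions to TensorBoard and checkpoints once per epoch):

```python
import torch
from torch.nn import Linear, Sequential
from torch.optim import SGD

model = Sequential(Linear(8, 8), Linear(8, 2))  # stand-ins for encoder / decoder
optimizer_encoder = SGD(model[0].parameters(), lr=5e-5)
optimizer_decoder = SGD(model[1].parameters(), lr=0.005)

x, y = torch.randn(4, 8), torch.randint(0, 2, (4,))

optimizer_encoder.zero_grad()
optimizer_decoder.zero_grad()

loss = torch.nn.functional.cross_entropy(model(x), y)
loss.backward()

# Clip the global L2 gradient norm at 5.0, as in the patched loop.
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)

optimizer_encoder.step()
optimizer_decoder.step()
# scheduler_encoder.step() and scheduler_decoder.step() follow, once per batch.
```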