diff --git a/Datasets/index_to_pos1.pkl b/Datasets/index_to_pos1.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..005a0bc309f087dd5b4af75a12d59239b9b723b6
Binary files /dev/null and b/Datasets/index_to_pos1.pkl differ
diff --git a/Datasets/index_to_pos2.pkl b/Datasets/index_to_pos2.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..ab2916fbf6a280de8ecbffdf6108dcad81cce6a2
Binary files /dev/null and b/Datasets/index_to_pos2.pkl differ
diff --git a/Datasets/index_to_super.pkl b/Datasets/index_to_super.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..606848f7652155e0048c281320e36f46886661e5
Binary files /dev/null and b/Datasets/index_to_super.pkl differ
diff --git a/SuperTagger/Decoder/RNNDecoderLayer.py b/SuperTagger/Decoder/RNNDecoderLayer.py
deleted file mode 100644
index 69ca8fcaa7e0dcdfef2db99c8c31e90edee7b7fb..0000000000000000000000000000000000000000
--- a/SuperTagger/Decoder/RNNDecoderLayer.py
+++ /dev/null
@@ -1,177 +0,0 @@
-import random
-
-import torch
-import torch.nn.functional as F
-from torch.nn import (Dropout, Module, ModuleList, Linear, LSTM, GRU)
-
-from Configuration import Configuration
-
-
-class RNNDecoderLayer(Module):
-    def __init__(self, symbols_map):
-        super(RNNDecoderLayer, self).__init__()
-
-        # init params
-        self.dim_encoder = int(Configuration.modelDecoderConfig['dim_encoder'])
-        self.dim_decoder = int(Configuration.modelDecoderConfig['dim_decoder'])
-        self.max_symbols_per_word = int(Configuration.modelDecoderConfig['max_symbols_per_word'])
-        self.max_len_sentence = int(Configuration.modelDecoderConfig['max_len_sentence'])
-        self.symbols_vocab_size = int(Configuration.modelDecoderConfig['symbols_vocab_size'])
-        dropout = float(Configuration.modelDecoderConfig['dropout'])
-        self.num_rnn_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
-        self.teacher_forcing = float(Configuration.modelDecoderConfig['teacher_forcing'])
-
-        self.bidirectional = False
-        self.use_attention = True
-        self.symbols_map = symbols_map
-        self.symbols_padding_id = self.symbols_map["[PAD]"]
-        self.symbols_sep_id = self.symbols_map["[SEP]"]
-        self.symbols_start_id = self.symbols_map["[START]"]
-        self.symbols_sos_id = self.symbols_map["[SOS]"]
-
-        # Different layers
-        # Symbols Embedding
-
-        # For hidden_state
-        self.dropout = Dropout(dropout)
-        # rnn Layer
-        if self.use_attention:
-            self.rnn = LSTM(input_size=self.dim_encoder, hidden_size=self.dim_encoder, num_layers=self.num_rnn_layers,
-                            dropout=dropout,
-                            bidirectional=self.bidirectional, batch_first=True)
-        else :
-            self.rnn = LSTM(input_size=self.dim_decoder, hidden_size=self.dim_encoder, num_layers=self.num_rnn_layers,
-                            dropout=dropout,
-                            bidirectional=self.bidirectional, batch_first=True)
-
-        # Projection on vocab_size
-        if self.bidirectional:
-            self.proj = Linear(self.dim_encoder * 2, self.symbols_vocab_size)
-        else:
-            self.proj = Linear(self.dim_encoder, self.symbols_vocab_size)
-
-        self.attn_combine = Linear(self.dim_decoder + self.dim_encoder, self.dim_encoder)
-
-    def sos_mask(self, y):
-        return torch.eq(y, self.symbols_sos_id)
-
-    def forward(self, symbols_tokenized_batch, last_hidden_state, pooler_output):
-        r"""Training the translation from encoded sentences to symbols
-
-        Args:
-            symbols_tokenized_batch: [batch_size, max_symbols_in_sentence] the true symbols for each sentence.
-            last_hidden_state: [batch_size, max_len_sentence, dim_encoder] Sequence of hidden-states at the output of the last layer of the model.
-            pooler_output: [batch_size, dim_encoder] Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task
-        """
-        batch_size, sequence_length, hidden_size = last_hidden_state.shape
-
-        # y_hat[batch_size, max_len_sentence, vocab_size] init with probability pad =1
-        y_hat = torch.zeros(batch_size, self.max_len_sentence, self.max_symbols_per_word, self.symbols_vocab_size,
-                            dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
-        y_hat[:, :, self.symbols_padding_id] = 1
-
-        decoded_ij = torch.ones(batch_size, 1, dtype=torch.long,
-                                device="cuda" if torch.cuda.is_available() else "cpu") * self.symbols_start_id
-
-        sos_mask = torch.zeros(batch_size, dtype=torch.bool, device="cuda" if torch.cuda.is_available() else "cpu")
-
-        # hidden_state goes through multiple linear layers
-        hidden_state = pooler_output.unsqueeze(0).repeat(self.num_rnn_layers * (1 + self.bidirectional), 1, 1)
-
-        c_state = torch.zeros(self.num_rnn_layers * (1 + self.bidirectional), batch_size, hidden_size,
-                              dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
-
-        use_teacher_forcing = False
-
-        # for each symbol
-        for i in range(self.max_len_sentence):
-            for j in range(self.max_symbols_per_word) :
-                symbols_embedding = self.symbols_embedder(decoded_ij)
-                symbols_embedding = self.dropout(symbols_embedding)
-
-                output = symbols_embedding
-                if self.use_attention:
-                    output = torch.cat((symbols_embedding, last_hidden_state[:, i, :].unsqueeze(1)), 2)
-                    output = self.attn_combine(output)
-                    output = F.relu(output)
-
-                # rnn layer
-                output, (hidden_state, c_state) = self.rnn(output, (hidden_state, c_state))
-
-                # Projection of the output of the rnn omitting the last probability (which is pad) so we dont predict PAD
-                proj = self.proj(output)[:, :, :-2]
-
-                if use_teacher_forcing:
-                    decoded_ij = symbols_tokenized_batch[:, i, j].unsqueeze(1)
-                else:
-                    decoded_ij = torch.argmax(F.softmax(proj, dim=2), dim=2)
-
-                # Calculate sos and pad
-                sos_mask_ij = self.sos_mask(torch.argmax(F.softmax(proj, dim=2), dim=2)[:, -1])
-                y_hat[~sos_mask, i, j, self.symbols_padding_id] = 0
-                y_hat[~sos_mask, i, j, :-2] = proj[~sos_mask, -1, :]
-                sos_mask = sos_mask_ij | sos_mask
-
-            # Stop if every sentence says padding or if we are full
-            if not torch.any(~sos_mask):
-                break
-        return y_hat
-
-    def predict_rnn(self, last_hidden_state, pooler_output):
-        r"""Predicts the symbols from the output of the encoder.
-
-        Args:
-            last_hidden_state: [batch_size, max_len_sentence, dim_encoder] the output of the encoder
-            pooler_output: [batch_size, dim_encoder] Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task
-        """
-        batch_size, sequence_length, hidden_size = last_hidden_state.shape
-
-        # y_hat[batch_size, max_len_sentence, vocab_size] init with probability pad =1
-        y_hat = torch.zeros(batch_size, self.max_len_sentence, self.max_symbols_per_word, self.symbols_vocab_size,
-                            dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
-        y_hat[:, :, self.symbols_padding_id] = 1
-
-        decoded_ij = torch.ones(batch_size, 1, dtype=torch.long,
-                                device="cuda" if torch.cuda.is_available() else "cpu") * self.symbols_start_id
-
-        sos_mask = torch.zeros(batch_size, dtype=torch.bool, device="cuda" if torch.cuda.is_available() else "cpu")
-
-        # hidden_state goes through multiple linear layers
-        hidden_state = pooler_output.unsqueeze(0).repeat(self.num_rnn_layers * (1 + self.bidirectional), 1, 1)
-
-        c_state = torch.zeros(self.num_rnn_layers * (1 + self.bidirectional), batch_size, hidden_size,
-                              dtype=torch.float, device="cuda" if torch.cuda.is_available() else "cpu")
-        symbols_embedding = self.symbols_embedder(decoded_ij)
-
-        symbols_embedding = self.dropout(symbols_embedding)
-
-        # for each symbol
-        for i in range(self.max_len_sentence):
-            output = symbols_embedding
-            if self.use_attention:
-                output = torch.cat((symbols_embedding, last_hidden_state[:, i, :].unsqueeze(1)), 2)
-                output = self.attn_combine(output)
-                output = F.relu(output)
-            for j in range(self.max_symbols_per_word) :
-                symbols_embedding = self.symbols_embedder(decoded_ij)
-
-                symbols_embedding = self.dropout(symbols_embedding)
-
-                # rnn layer
-                output, (hidden_state, c_state) = self.rnn(output, (hidden_state, c_state))
-
-                # Projection of the output of the rnn omitting the last probability (which is pad) so we dont predict PAD
-                proj_softmax = F.softmax(self.proj(output)[:, :, :-2], dim=2)
-                decoded_ij = torch.argmax(proj_softmax, dim=2)
-
-                # Set sos and pad
-                sos_mask_ij = self.sos_mask(decoded_ij[:, -1])
-                y_hat[~sos_mask, i, j, self.symbols_padding_id] = 0
-                y_hat[~sos_mask, i, j, :-2] = proj_softmax[~sos_mask, -1, :]
-                sos_mask = sos_mask_ij | sos_mask
-
-            # Stop if every sentence says padding or if we are full
-            if not torch.any(~sos_mask):
-                break
-
-        return y_hat
diff --git a/SuperTagger/Encoder/EncoderLayer.py b/SuperTagger/Encoder/EncoderLayer.py
deleted file mode 100644
index c954584f332ff6207371cda0bc93aae8fe6edfea..0000000000000000000000000000000000000000
--- a/SuperTagger/Encoder/EncoderLayer.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import sys
-
-import torch
-from torch import nn
-
-from Configuration import Configuration
-
-
-class EncoderLayer(nn.Module):
-    """Encoder class, imput of supertagger"""
-
-    def __init__(self, model):
-        super(EncoderLayer, self).__init__()
-        self.name = "Encoder"
-
-        self.bert = model
-
-        self.hidden_size = self.bert.config.hidden_size
-
-    def forward(self, batch):
-        r"""
-        Args :
-            batch: list[str,mask], list of sentences (NOTE: untokenized, continuous sentences)
-        Returns :
-            last_hidden_state: [batch_size, max_len_sentence, dim_encoder] Sequence of hidden-states at the output of the last layer of the model.
-            pooler_output: [batch_size, dim_encoder] Last layer hidden-state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task
-        """
-        b_input_ids = batch[0]
-        b_input_mask = batch[1]
-
-        outputs = self.bert(
-            input_ids=b_input_ids, attention_mask=b_input_mask)
-
-        return outputs[0], outputs[1]
-
-    @staticmethod
-    def load(model_path: str):
-        r""" Load the model from a file.
-        Args :
-            model_path (str): path to model
-        Returns :
-            model (nn.Module): model with saved parameters
-        """
-        params = torch.load(
-            model_path, map_location=lambda storage, loc: storage)
-        args = params['args']
-        model = EncoderLayer(**args)
-        model.load_state_dict(params['state_dict'])
-
-        return model
-
-    def save(self, path: str):
-        r""" Save the model to a file.
-        Args :
-            path (str): path to the model
-        """
-        print('save model parameters to [%s]' % path, file=sys.stderr)
-
-        params = {
-            'args': dict(bert_config=self.bert.config, dropout_rate=self.dropout_rate),
-            'state_dict': self.state_dict()
-        }
-
-        torch.save(params, path)
-
-    def to_dict(self):
-        return {}
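Note: the two deletions above remove the dedicated RNN decoder and the thin `EncoderLayer` wrapper; the encoder's role is now played by calling the Hugging Face `CamembertModel` directly inside `EncoderDecoder` (next file). For reference, a minimal sketch of that direct call, not part of the patch and assuming the standard `camembert-base` checkpoint; with `return_dict=False` the model returns the same `(last_hidden_state, pooler_output)` pair the deleted wrapper used to forward.

```python
# Sketch only: direct CamemBERT call replacing the deleted EncoderLayer wrapper.
import torch
from transformers import AutoTokenizer, CamembertModel

tokenizer = AutoTokenizer.from_pretrained('camembert-base')
bert = CamembertModel.from_pretrained('camembert-base')

enc = tokenizer(["le chat dort"], return_tensors='pt', padding=True)
with torch.no_grad():
    last_hidden_state, pooler_output = bert(
        input_ids=enc['input_ids'], attention_mask=enc['attention_mask'], return_dict=False)

print(last_hidden_state.shape)  # [batch_size, seq_len, 768]
print(pooler_output.shape)      # [batch_size, 768]
```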
diff --git a/SuperTagger/EncoderDecoder.py b/SuperTagger/EncoderDecoder.py
index 273b3e8bac9fc0844c6c102c5856d6f1953b1ed8..1c8e51090a71ba46c8b70eda8a687696276579b4 100644
--- a/SuperTagger/EncoderDecoder.py
+++ b/SuperTagger/EncoderDecoder.py
@@ -4,9 +4,7 @@
 from torch.nn import Dropout, LSTM
 from torch.nn import Module
 
 from Configuration import Configuration
-from SuperTagger.Decoder.RNNDecoderLayer import RNNDecoderLayer
 from torch.nn.utils.rnn import pack_padded_sequence
-from SuperTagger.Encoder.EncoderLayer import EncoderLayer
 
 from SuperTagger.eval import measure_supertagging_accuracy
@@ -21,6 +19,8 @@ class EncoderDecoder(Module):
     def __init__(self, BASE_MODEL, numPos1Classes, numPos2Classes, numSuperClasses):
         super(EncoderDecoder, self).__init__()
 
+        self.bert = BASE_MODEL
+
         self.dim_encoder = int(Configuration.modelDecoderConfig['dim_encoder'])
         self.dim_decoder = int(Configuration.modelDecoderConfig['dim_decoder'])
         self.num_rnn_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
@@ -28,18 +28,18 @@ class EncoderDecoder(Module):
         dropout = float(Configuration.modelDecoderConfig['dropout'])
         self.dropout = Dropout(dropout)
 
-        self.encoder = EncoderLayer(BASE_MODEL)
+        self.bert = BASE_MODEL
 
         self.lstm_shared = LSTM(input_size=self.dim_encoder, hidden_size=self.dim_encoder,
                                 num_layers=self.num_rnn_layers, dropout=dropout,
                                 bidirectional=self.bidirectional, batch_first=True, )
 
         #Pos1
-        self.pos1_1 = nn.Linear(self.dim_encoder*2,self.dim_decoder)
+        self.pos1_1 = nn.Linear(self.dim_encoder,self.dim_decoder)
         self.pos1_2 = nn.Linear(self.dim_decoder, numPos1Classes)
 
         #Pos2
-        self.pos2_1 = nn.Linear(self.dim_encoder*2, self.dim_decoder)
+        self.pos2_1 = nn.Linear(self.dim_encoder, self.dim_decoder)
         self.pos2_2 = nn.Linear(self.dim_decoder, numPos2Classes)
 
         #super
@@ -47,24 +47,27 @@ class EncoderDecoder(Module):
                                  num_layers=self.num_rnn_layers, dropout=dropout,
                                  bidirectional=self.bidirectional, batch_first=True, )
-        self.pos_super_1 = nn.Linear(self.dim_encoder*2,self.dim_decoder)
+        self.pos_super_1 = nn.Linear(self.dim_encoder,self.dim_decoder)
         self.pos_super_2 = nn.Linear(self.dim_decoder, numSuperClasses)
 
     def forward(self, batch):
-        encoded_layers, pooled_output = self.encoder(batch)
-        encoded_layers = self.dropout(encoded_layers)
-        # encoded_layers = encoded_layers.permute(1, 0, 2)
-        print("encoded_layers : ", encoded_layers.size())
+        b_input_ids = batch[0]
+        b_input_mask = batch[1]
+        encoded_layers, _ = self.bert(
+            input_ids=b_input_ids, attention_mask=b_input_mask, return_dict=False)
-        lstm_output, (h, c) = self.lstm_shared(encoded_layers)  ## extract the 1st token's embeddings
-        print("last_hidden : ", lstm_output.size())
-        # output_shared = torch.cat((lstm_output[:, :, :self.dim_encoder], lstm_output[:, :, self.dim_encoder:]), dim=2)
+        lstm_output = self.dropout(encoded_layers)
+
+        print("encoded_layers : ", encoded_layers.size())
-        print("output_shared : ", lstm_output.size())
+        # lstm_output, _ = self.lstm_shared(encoded_layers)  ## extract the 1st token's embeddings
+        # print("last_hidden : ", lstm_output.size())
+        #
+        # print("output_shared : ", lstm_output.size())
 
         # Pos1
         pos_1_output= self.pos1_1(lstm_output)
@@ -77,9 +80,8 @@ class EncoderDecoder(Module):
         pos_2_output = self.pos2_2(pos_2_output)
 
         # super
-        enc_hiddens, (last_hidden_super, last_cell_super) = self.lstm_super(lstm_output)
-        print(enc_hiddens.size())
-        super_output = self.pos_super_1(enc_hiddens)
+        # enc_hiddens, _ = self.lstm_super(lstm_output)
+        super_output = self.pos_super_1(lstm_output)
         super_output = self.dropout(super_output)
         super_output = self.pos_super_2(super_output)
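For orientation, a short usage sketch of the reworked `EncoderDecoder` (not part of the patch). It assumes the project's `Configuration` has already been loaded, that `forward` still ends by returning the three head outputs in the order `(pos1, pos2, super)`, and the class counts `30, 30, 900` are placeholders; in `train.py` they come from `SymbolTokenizer`.

```python
# Sketch only: driving the three-head forward pass.
from transformers import AutoTokenizer, CamembertModel
from SuperTagger.EncoderDecoder import EncoderDecoder

tokenizer = AutoTokenizer.from_pretrained('camembert-base')
model = EncoderDecoder(CamembertModel.from_pretrained('camembert-base'), 30, 30, 900)

enc = tokenizer(["le chat dort"], return_tensors='pt', padding=True)
pos1_logits, pos2_logits, super_logits = model((enc['input_ids'], enc['attention_mask']))
# each head produces per-token logits of shape [batch_size, seq_len, num_classes]
```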
diff --git a/SuperTagger/Encoder/EncoderInput.py b/SuperTagger/EncoderTokenizer.py
similarity index 95%
rename from SuperTagger/Encoder/EncoderInput.py
rename to SuperTagger/EncoderTokenizer.py
index e19da7d0d28e27e7b191d4333659f58c27e59f09..865f5a731eaa233fe928ce34311a7013557d01a1 100644
--- a/SuperTagger/Encoder/EncoderInput.py
+++ b/SuperTagger/EncoderTokenizer.py
@@ -1,7 +1,7 @@
 import torch
 
 
-class EncoderInput():
+class EncoderTokenizer():
 
     def __init__(self, tokenizer):
         """@params tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text """
diff --git a/SuperTagger/SymbolTokenizer.py b/SuperTagger/SymbolTokenizer.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a4948c5b8c541b5b9d49266eed1507151bc822d
--- /dev/null
+++ b/SuperTagger/SymbolTokenizer.py
@@ -0,0 +1,56 @@
+import pickle
+
+import numpy as np
+import torch
+
+
+def load_obj(name):
+    with open(name + '.pkl', 'rb') as f:
+        return pickle.load(f)
+
+
+class SymbolTokenizer():
+
+    def __init__(self):
+        """@params tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text """
+        self.index_to_super = load_obj('Datasets/index_to_super')
+        self.index_to_pos1 = load_obj('Datasets/index_to_pos1')
+        self.index_to_pos2 = load_obj('Datasets/index_to_pos2')
+        self.super_to_index = {v: int(k) for k, v in self.index_to_super.items()}
+        self.pos1_to_index = {v: int(k) for k, v in self.index_to_pos1.items()}
+        self.pos2_to_index = {v: int(k) for k, v in self.index_to_pos2.items()}
+
+    def lenPOS1(self):
+        print(self.pos1_to_index)
+        return len(self.index_to_pos1) + 1
+
+    def lenPOS2(self):
+        return len(self.index_to_pos2) + 1
+
+    def lenSuper(self):
+        return len(self.index_to_super) + 1
+
+    def convert_batchs_to_ids(self, Y1, Y2, Super):
+        max_len_Y1 = max(len(elem) for elem in Y1)
+        max_len_Y2 = max(len(elem) for elem in Y2)
+        max_len_S = max(len(elem) for elem in Super)
+        Y1_tok = torch.as_tensor(pad_sequence([[self.pos1_to_index[str(symbol)] for symbol in sents] for sents in Y1]))
+        Y2_tok = torch.as_tensor(pad_sequence(
+            [[self.pos2_to_index[str(symbol)] for symbol in sents] for sents in Y2]))
+        super_tok = torch.as_tensor(pad_sequence(
+            [[self.super_to_index[str(symbol)] for symbol in sents] for sents in Super]))
+
+        return Y1_tok, Y2_tok, super_tok
+
+    # def convert_ids_to_symbols(self, ids):
+    #     return [self.inverse_symbol_map[int(i)] for i in ids]
+
+def pad_sequence(sequences, max_len=400):
+    sequences_pad = []
+    for s in sequences:
+        padded = [0] * max_len
+        padded[:len(s)] = s
+        sequences_pad.append(padded)
+    return sequences_pad
+
+
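For reference, a small self-contained example (not part of the patch) of what the new label tokenization produces: sequences are mapped to integer ids and right-padded with 0 up to `max_len=400`, so each tensor returned by `convert_batchs_to_ids` has shape `[n_sentences, 400]`. Index 0 doubles as the padding id, which is consistent with the vocabulary sizes being `len(...) + 1` and with `ignore_index=0` in the losses configured in `train.py`.

```python
# Sketch only: the padding helper from SymbolTokenizer.py, shown on toy data.
def pad_sequence(sequences, max_len=400):
    sequences_pad = []
    for s in sequences:
        padded = [0] * max_len      # 0 acts as the padding id
        padded[:len(s)] = s
        sequences_pad.append(padded)
    return sequences_pad

print(pad_sequence([[3, 1, 4], [1, 5]], max_len=6))
# [[3, 1, 4, 0, 0, 0], [1, 5, 0, 0, 0, 0]]
```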
diff --git a/SuperTagger/eval.py b/SuperTagger/eval.py
index 3017c7f52dc53a1ce9cc8ba21d75c63446af1b83..dd9975739ef9a5fd51c2ff06308cfdd27c76ca17 100644
--- a/SuperTagger/eval.py
+++ b/SuperTagger/eval.py
@@ -33,12 +33,14 @@ class NormCrossEntropy(Module):
     r"""Loss based on the cross entropy, it considers the number of words and ignore the padding.
     """
 
-    def __init__(self, ignore_index, sep_id, weights=None):
+    def __init__(self, ignore_index, weights=None):
         super(NormCrossEntropy, self).__init__()
         self.ignore_index = ignore_index
-        self.sep_id = sep_id
         self.weights = weights
 
     def forward(self, predictions, truths):
-        return cross_entropy(predictions.flatten(0, -2), truths.flatten(), weight=self.weights,
-                             reduction='sum', ignore_index=self.ignore_index) / count_sep(truths.flatten(), self.sep_id)
+        print()
+        print("predictions : ", predictions.size())
+        print("truths : ", truths.size())
+        return cross_entropy(predictions.flatten(0, -2), truths.flatten(), weight=torch.tensor(self.weights,device="cuda" if torch.cuda.is_available() else "cpu"),
+                             reduction='sum', ignore_index=self.ignore_index)
diff --git a/SuperTagger/utils.py b/SuperTagger/utils.py
index a0438f5c342dc22e10821e1f32d6145a4ddab73c..e257eb72cd4bdae65bee5f2ed1ec517e50e4cade 100644
--- a/SuperTagger/utils.py
+++ b/SuperTagger/utils.py
@@ -5,7 +5,7 @@
 import torch
 from tqdm import tqdm
 
 
-def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500):
+def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=100):
     print("\n" + "#" * 20)
     print("Loading csv...")
diff --git a/Utils/PostpreprocesTXT.py b/Utils/PostpreprocesTXT.py
index a4168c8afc2992e0aacbe95537397c7330429940..fbfcb8d22e849e787ee91fb09b47ddc09d214f6f 100644
--- a/Utils/PostpreprocesTXT.py
+++ b/Utils/PostpreprocesTXT.py
@@ -1,4 +1,5 @@
 import itertools
+import pickle
 import re
 
 import numpy as np
@@ -115,17 +116,15 @@ def read_maxentdata(file):
                 Z = np.asarray(allsuper)
         return X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen
 
+def save_obj(obj, name):
+    with open(name + '.pkl', 'wb+') as f:
+        pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
 
-# Txt_to_csv("m2.txt")
-
-X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen = read_maxentdata("m2.txt")
+def load_obj(name):
+    with open(name + '.pkl', 'rb') as f:
+        return pickle.load(f)
 
-print("X[17] (", np.array(X[17]).shape ,") : ")
-print(X[17])
-print("Y1[17] (", np.array(Y1[17]).shape ,") : ")
-print(Y1[17])
-print("Y2[17] (", np.array(Y2[17]).shape ,") : ")
-print(Y2[17])
+# Txt_to_csv("m2.txt")
 
 X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen = read_maxentdata("m2.txt")
 
@@ -136,6 +135,13 @@ df['Y1'] = Y1
 df['Y2'] = Y2
 df['Z'] = Z
 
+df.to_csv("../Datasets/m2_dataset_V2.csv", index=False)
+
+
+t = np.unique(np.array(list(itertools.chain(*Z))))
+
+print(t.size)
+dict = { i : t[i] for i in range(0, len(t) ) }
 
-df.to_csv("../Datasets/m2_dataset_V2.csv", index=False)
\ No newline at end of file
+save_obj(dict,"../Datasets/index_to_super")
\ No newline at end of file
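For reference, a toy example (not part of the patch) of the mapping that `PostpreprocesTXT.py` now pickles and `SymbolTokenizer` loads and inverts: `np.unique` yields the sorted label inventory, the dict maps an integer index to each label string, and the inverse dict maps label strings back to ids.

```python
# Sketch only: building index_to_super and its inverse on toy supertag sequences.
import itertools
import numpy as np

Z = [["np", "v", "np"], ["np", "adj"]]                 # toy supertag sequences
t = np.unique(np.array(list(itertools.chain(*Z))))     # sorted unique labels
index_to_super = {i: t[i] for i in range(len(t))}      # index -> label ('adj', 'np', 'v')
super_to_index = {v: int(k) for k, v in index_to_super.items()}  # label -> index 0..2
print(len(index_to_super), super_to_index)
```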
diff --git a/train.py b/train.py
index 187b93a5ec08df10672627868e4865811469727f..a46b4a599eda798dd364fd6485a9b94822d16f2e 100644
--- a/train.py
+++ b/train.py
@@ -6,16 +6,18 @@
 import numpy as np
 import torch
 import torch.nn.functional as F
 import transformers
-from torch.optim import SGD, Adam
+from torch.optim import Adam, RMSprop
 from torch.utils.data import Dataset, TensorDataset, random_split
 from transformers import (AutoTokenizer, get_cosine_schedule_with_warmup)
 from transformers import (CamembertModel)
 
 from Configuration import Configuration
-from SuperTagger.Encoder.EncoderInput import EncoderInput
+
 from SuperTagger.EncoderDecoder import EncoderDecoder
+from SuperTagger.EncoderTokenizer import EncoderTokenizer
+from SuperTagger.SymbolTokenizer import SymbolTokenizer
 from SuperTagger.eval import NormCrossEntropy
-from SuperTagger.utils import format_time, read_csv_pgbar, checkpoint_save, checkpoint_load
+from SuperTagger.utils import format_time, read_csv_pgbar
 
 from torch.utils.tensorboard import SummaryWriter
@@ -24,9 +26,7 @@ torch.cuda.empty_cache()
 
 # region ParamsModel
 
-# max_symbols_per_word = int(Configuration.modelDecoderConfig['max_symbols_per_word'])
-# max_len_sentence = int(Configuration.modelDecoderConfig['max_len_sentence'])
-# symbol_vocab_size = int(Configuration.modelDecoderConfig['symbols_vocab_size'])
+
 num_gru_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
 
 # endregion ParamsModel
@@ -35,7 +35,7 @@ num_gru_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
 file_path = 'Datasets/m2_dataset_V2.csv'
 batch_size = int(Configuration.modelTrainingConfig['batch_size'])
-nb_sentences = batch_size * 300
+nb_sentences = batch_size * 50
 epochs = int(Configuration.modelTrainingConfig['epoch'])
 seed_val = int(Configuration.modelTrainingConfig['seed_val'])
 learning_rate = float(Configuration.modelTrainingConfig['learning_rate'])
@@ -115,9 +115,8 @@ BASE_TOKENIZER = AutoTokenizer.from_pretrained(
     'camembert-base', do_lower_case=True)
 BASE_MODEL = CamembertModel.from_pretrained("camembert-base")
 
-sents_tokenizer = EncoderInput(BASE_TOKENIZER)
-model = EncoderDecoder(BASE_MODEL, 20,20,20)
-model = model.to("cuda" if torch.cuda.is_available() else "cpu")
+sents_tokenizer = EncoderTokenizer(BASE_TOKENIZER)
+symbol_tokenizer = SymbolTokenizer()
 
 # endregion Model
@@ -126,7 +125,9 @@ df = read_csv_pgbar(file_path, nb_sentences)
 
 sents_tokenized, sents_mask = sents_tokenizer.fit_transform_tensors(df['X'].tolist())
 
-dataset = TensorDataset(sents_tokenized, sents_mask)
+y1, y2, super = symbol_tokenizer.convert_batchs_to_ids(df['Y1'].tolist(),df['Y2'].tolist(),df['Z'].tolist())
+
+dataset = TensorDataset(sents_tokenized, sents_mask, y1, y2, super)
 # , torch.tensor(df['Y1'].tolist()), torch.tensor(df['Y2'].tolist()), torch.tensor(df['Z'].tolist())
 
 # Calculate the number of samples to include in each set.
@@ -144,13 +145,14 @@ validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batc
 
 # endregion Data loader
 
+
+model = EncoderDecoder(BASE_MODEL, symbol_tokenizer.lenPOS1(),symbol_tokenizer.lenPOS2(),symbol_tokenizer.lenSuper())
+model = model.to("cuda" if torch.cuda.is_available() else "cpu")
+
 # region Fit tunning
 
 # Optimizer
-# optimizer_encoder = Adam(model.encoder.parameters(),
-#                          lr=5e-5)
-# optimizer_decoder = Adam(model.decoder.parameters(),
-#                          lr=learning_rate)
+optimizer = RMSprop(model.parameters())
 
 # Total number of training steps is [number of batches] x [number of epochs].
 # (Note that this is not the same as the number of training samples).
@@ -165,15 +167,9 @@ total_steps = len(training_dataloader) * epochs
 #                                             num_training_steps=total_steps)
 #
 # Loss
-# if loss_scaled_by_freq:
-#     weights = torch.as_tensor(
-#         [6.9952, 1.0763, 1.0317, 43.274, 16.5276, 11.8821, 28.2416, 2.7548, 1.0728, 3.1847, 8.4521, 6.77, 11.1887,
-#          6.6692, 23.1277, 11.8821, 4.4338, 1.2303, 5.0238, 8.4376, 1.0656, 4.6886, 1.028, 4.273, 4.273, 0],
-#         device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
-#     cross_entropy_loss = NormCrossEntropy(symbols_tokenizer.pad_token_id, symbols_tokenizer.sep_token_id,
-#                                           weights=weights)
-# else:
-#     cross_entropy_loss = NormCrossEntropy(symbols_tokenizer.pad_token_id, symbols_tokenizer.sep_token_id)
+cross_entropy_loss_Y1 = NormCrossEntropy(0,0.15)
+cross_entropy_loss_Y2 = NormCrossEntropy(0,.35)
+cross_entropy_loss_S = NormCrossEntropy(0,.5)
 
 np.random.seed(seed_val)
 torch.manual_seed(seed_val)
@@ -211,7 +207,9 @@ def run_epochs(epochs):
         t0 = time.time()
 
         # Reset the total loss for this epoch.
-        total_train_loss = 0
+        total_train_loss_Y1 =0
+        total_train_loss_Y2 =0
+        total_train_loss_S =0
 
         model.train()
@@ -221,8 +219,8 @@ def run_epochs(epochs):
             # if epoch_i == 0 and step == 0:
            #     writer.add_graph(model, input_to_model=batch[0], verbose=False)
 
-            # Progress update every 40 batches.
-            if step % 40 == 0 and not step == 0:
+            # Progress update every 10 batches.
+            if step % 10 == 0 and not step == 0:
                 # Calculate elapsed time in minutes.
                 elapsed = format_time(time.time() - t0)
                 # Report progress.
@@ -231,23 +229,36 @@ def run_epochs(epochs):
             # Unpack this training batch from our dataloader.
             b_sents_tokenized = batch[0].to("cuda" if torch.cuda.is_available() else "cpu")
             b_sents_mask = batch[1].to("cuda" if torch.cuda.is_available() else "cpu")
-            b_symbols_tokenized = batch[2].to("cuda" if torch.cuda.is_available() else "cpu")
 
-            # optimizer_encoder.zero_grad()
-            # optimizer_decoder.zero_grad()
+            optimizer.zero_grad()
+
+            logits_predictions = model((b_sents_tokenized, b_sents_mask))
+
+            output_dim_Y1 = logits_predictions[0].shape[1]
+            print(output_dim_Y1)
+            # output_Y1 = logits_predictions[0][1:].view(-1, output_dim_Y1)
+            output_dim_Y2 = logits_predictions[1].shape[1]
+            # output_Y2 = logits_predictions[1][1:].view(-1, output_dim_Y2)
+            output_dim_S = logits_predictions[2].shape[1]
+            # output_S = logits_predictions[2][1:].view(-1, output_dim_S)
 
-            logits_predictions = model(b_sents_tokenized, b_sents_mask, b_symbols_tokenized)
+            loss_Y1 = cross_entropy_loss_Y1(logits_predictions[0], batch[2][:output_dim_Y1])
+            loss_Y2 = cross_entropy_loss_Y2(logits_predictions[1], batch[3][:output_dim_Y2])
+            loss_S = cross_entropy_loss_S(logits_predictions[2], batch[4][:output_dim_S])
 
-            # loss = cross_entropy_loss(logits_predictions, b_symbols_tokenized)
-            # total_train_loss += float(loss)
-            # loss.backward()
+            total_train_loss_Y1 += float(loss_Y1)
+            total_train_loss_Y2 += float(loss_Y2)
+            total_train_loss_S += float(loss_S)
+
+            loss_Y1.backward()
+            loss_Y2.backward()
+            loss_S.backward()
 
             # This is to help prevent the "exploding gradients" problem.
             #torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)
 
             # Update parameters and take a step using the computed gradient.
-            # optimizer_encoder.step()
-            # optimizer_decoder.step()
+            optimizer.step()
 
             # # scheduler_encoder.step()
             # scheduler_decoder.step()
@@ -257,7 +268,9 @@ def run_epochs(epochs):
         # if use_checkpoint_SAVE:
         #     checkpoint_save(model, optimizer_decoder, epoch_i, checkpoint_dir, loss)
 
-        avg_train_loss = total_train_loss / len(training_dataloader)
+        avg_train_loss_Y1 = total_train_loss_Y1 / len(training_dataloader)
+        avg_train_loss_Y2 = total_train_loss_Y2 / len(training_dataloader)
+        avg_train_loss_S = total_train_loss_S / len(training_dataloader)
 
         # Measure how long this epoch took.
         training_time = format_time(time.time() - t0)
@@ -274,7 +287,9 @@ def run_epochs(epochs):
         # writer.add_scalar('Accuracy/symbol', accuracy_symbol, epoch_i + 1)
 
         print("")
-        print("  Average training loss: {0:.2f}".format(avg_train_loss))
+        print("  Average training loss: {0:.2f}".format(avg_train_loss_Y1))
+        print("  Average training loss: {0:.2f}".format(avg_train_loss_Y2))
+        print("  Average training loss: {0:.2f}".format(avg_train_loss_S))
         print("  Training epcoh took: {:}".format(training_time))
 
         # writer.add_scalar('Loss/train', total_train_loss, epoch_i+1)
@@ -287,10 +302,13 @@ def run_epochs(epochs):
 
 # run_epochs(epochs)
 # endregion Train
 
-b = next(iter(training_dataloader))
-# , y1,y2,y3
-a = model(b)
-print(len(b))
-print(a[0].size(),a[1].size(),a[2].size())
+# b1, b2 , y1,y2,y3 = next(iter(training_dataloader))
+# b =(b1, b2)
+# # , y1,y2,y3
+# a = model(b)
+# print(len(b))
+# print(a[0].size(),a[1].size(),a[2].size())
+print(symbol_tokenizer.lenPOS1())
+
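For comparison, a short sketch (not part of the patch) of an equivalent way to write the multi-task update inside the training loop above, reusing the variables defined in train.py. Since the three heads share the same CamemBERT forward graph, summing the losses and calling backward() once performs a single backward pass over the shared graph; issuing three separate backward() calls as in the patch would otherwise require retain_graph=True on the first two.

```python
# Sketch only: fragment meant to slot into the batch loop of train.py.
device = "cuda" if torch.cuda.is_available() else "cpu"

optimizer.zero_grad()
logits_predictions = model((b_sents_tokenized, b_sents_mask))

loss = (cross_entropy_loss_Y1(logits_predictions[0], batch[2].to(device))
        + cross_entropy_loss_Y2(logits_predictions[1], batch[3].to(device))
        + cross_entropy_loss_S(logits_predictions[2], batch[4].to(device)))

loss.backward()       # one backward pass over the shared encoder graph
optimizer.step()
```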