diff --git a/SuperTagger/SuperTagger.py b/SuperTagger/SuperTagger.py
index 3222dcfe75386809f42fd82f57f9ed3b8140d391..54072932deb45f25312f819ee8285ec6ef6b7b00 100644
--- a/SuperTagger/SuperTagger.py
+++ b/SuperTagger/SuperTagger.py
@@ -8,9 +8,11 @@ import datetime
 import numpy as np
 import torch
+from torch import nn
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 from transformers import AutoTokenizer
+from transformers import AdamW
 from torch.utils.data import Dataset, TensorDataset, random_split
@@ -22,8 +24,13 @@ from SuperTagger.Utils.Tagging_bert_model import Tagging_bert_model
 def categorical_accuracy(preds, truth):
     flat_preds = preds.flatten()
     flat_labels = truth.flatten()
+    good_label = 0
+    for i in range(len(flat_preds)):
+        if flat_labels[i] == flat_preds[i] and flat_labels[i]!=0:
+            good_label += 1
+
+    return good_label / len(flat_labels)
-    return np.sum(flat_preds == flat_labels) / len(flat_labels)
 def format_time(elapsed):
     '''
@@ -48,7 +55,6 @@ class SuperTagger:
         self.model = None
         self.optimizer = None
-        self.loss = None
         self.epoch_i = 0
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -56,13 +62,12 @@ class SuperTagger:
         self.trainable = False
         self.model_load = False
-
     def load_weights(self, model_file):
         self.trainable = False
         print("#" * 15)
         try:
-            params = torch.load(model_file , map_location=self.device)
+            params = torch.load(model_file, map_location=self.device)
             args = params['args']
             self.bert_name = args['bert_name']
             self.index_to_tags = args['index_to_tags']
@@ -73,11 +78,11 @@ class SuperTagger:
                 self.bert_name, do_lower_case=True))
             self.model.load_state_dict(params['state_dict'])
-            self.optimizer=params['optimizer']
+            self.optimizer = params['optimizer']
             self.epoch_i = args['epoch']
             print("\n The loading checkpoint was successful !\n")
             print("\tBert model : ", self.bert_name)
-            print("\tLast epoch : ", self.epoch_i)
+            # print("\tLast epoch : ", self.epoch_i)
             print()
         except Exception as e:
             print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
@@ -92,15 +97,16 @@ class SuperTagger:
         assert len(
             index_to_tags) == num_label, f" len(index_to_tags) : {len(index_to_tags)} must be equels with num_label: {num_label}"
-        self.model = Tagging_bert_model(bert_name, num_label+1)
+        self.model = Tagging_bert_model(bert_name, num_label + 1)
         index_to_tags = {k + 1: v for k, v in index_to_tags.items()}
         index_to_tags[0] = '<unk>'
+        print(index_to_tags)
         self.index_to_tags = index_to_tags
         self.bert_name = bert_name
         self.sent_tokenizer = SentencesTokenizer(AutoTokenizer.from_pretrained(
             bert_name, do_lower_case=True))
-        self.optimizer = torch.optim.Adam(params=self.model.parameters(), lr=2e-05)
+        self.optimizer = AdamW(params=self.model.parameters(), lr=2e-5, eps=1e-8, correct_bias=False)
         self.tags_tokenizer = SymbolTokenizer(index_to_tags)
         self.trainable = True
         self.model_load = True
@@ -126,7 +132,7 @@ class SuperTagger:
         checkpoint_dir, writer = self.__output_create()
         training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, sentences, tags,
-                                                                            1-validation_rate)
+                                                                            1 - validation_rate)
         epochs = epochs - self.epoch_i
         self.model = self.model.to(self.device)
         self.model.train()
@@ -138,7 +144,7 @@ class SuperTagger:
             epoch_acc, epoch_loss, training_time = self.__train_epoch(training_dataloader)
-            if validation_rate>0.0:
+            if validation_rate > 0.0:
                 eval_accuracy, eval_loss, nb_eval_steps = self.__eval_epoch(validation_dataloader)
             print("")
@@ -154,14 +160,14 @@ class SuperTagger:
             if validation_rate > 0.0:
                 writer.add_scalars(f'Validation_Accuracy/Loss', {
                     'Accuracy_val': eval_accuracy,
-                    'Loss_val': eval_loss,}, epoch_i + 1)
+                    'Loss_val': eval_loss, }, epoch_i + 1)
             if checkpoint:
                 self.__checkpoint_save(path=os.path.join(checkpoint_dir, 'model_check.pt'))
     def __preprocess_data(self, batch_size, sentences, tags, validation_rate):
-        validation_dataloader=None
+        validation_dataloader = None
         sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
         tags_t = self.tags_tokenizer.convert_batchs_to_ids(tags, sents_tokenized_t)
@@ -169,7 +175,7 @@ class SuperTagger:
         train_size = int(validation_rate * len(dataset))
         print('{:>5,} training samples'.format(train_size))
-        if validation_rate>0:
+        if validation_rate < 1:
             val_size = len(dataset) - train_size
             train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
             print('{:>5,} validation samples'.format(val_size))
@@ -188,13 +194,13 @@ class SuperTagger:
         return training_dir, writer
     def __train_epoch(self, training_dataloader):
+        self.model.train()
         epoch_loss = 0
         epoch_acc = 0
         t0 = time.time()
         i = 0
         with tqdm(training_dataloader, unit="batch") as tepoch:
             for batch in tepoch:
-                # Unpack this training batch from our dataloader.
                 b_sents_tokenized = batch[0].to(self.device)
                 b_sents_mask = batch[1].to(self.device)
@@ -204,22 +210,24 @@ class SuperTagger:
                 loss, logit = self.model((b_sents_tokenized, b_sents_mask, targets))
-                acc = categorical_accuracy(np.argmax(logit.detach().cpu().numpy(), axis=2), targets.detach().cpu().numpy())
+                predictions = torch.argmax(logit, dim=2).detach().cpu().numpy()
+                label_ids = targets.cpu().numpy()
-                epoch_acc += acc.item()
-                epoch_loss += loss.item()
+                acc = categorical_accuracy(predictions, label_ids)
                 loss.backward()
-                self.optimizer.step()
-                i+=1
+                epoch_acc += acc
+                epoch_loss += loss.item()
+                self.optimizer.step()
+                i += 1
         # Measure how long this epoch took.
         training_time = format_time(time.time() - t0)
-        epoch_acc = epoch_acc / len(training_dataloader)
-        epoch_loss = epoch_loss / len(training_dataloader)
+        epoch_acc = epoch_acc / i
+        epoch_loss = epoch_loss / i
         return epoch_acc, epoch_loss, training_time
@@ -227,7 +235,6 @@ class SuperTagger:
         self.model.eval()
         eval_loss = 0
         eval_accuracy = 0
-        predictions, true_labels = [], []
         nb_eval_steps, nb_eval_examples = 0, 0
         with torch.no_grad():
             print("Start eval")
@@ -236,17 +243,13 @@ class SuperTagger:
                 b_sents_mask = batch[1].to(self.device)
                 b_symbols_tokenized = batch[2].to(self.device)
-                logits = self.predict((b_sents_tokenized, b_sents_mask, b_symbols_tokenized))
+                loss, logits = self.model((b_sents_tokenized, b_sents_mask, b_symbols_tokenized))
-                logits = logits.detach().cpu().numpy()
+                predictions = torch.argmax(logits, dim=2).detach().cpu().numpy()
                 label_ids = b_symbols_tokenized.cpu().numpy()
-                predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
-                true_labels.append(label_ids)
-                accuracy = categorical_accuracy(logits, label_ids)
-                print(logits[0][:50])
-                print(label_ids[0][:50])
-                #eval_loss += loss.mean().item()
+                accuracy = categorical_accuracy(predictions, label_ids)
+                eval_loss += loss.item()
                 eval_accuracy += accuracy
                 nb_eval_examples += b_sents_tokenized.size(0)
                 nb_eval_steps += 1
@@ -265,4 +268,3 @@ class SuperTagger:
             'optimizer': self.optimizer,
         }, path)
         self.model.to(self.device)
-
diff --git a/SuperTagger/Utils/SentencesTokenizer.py b/SuperTagger/Utils/SentencesTokenizer.py
index 7aee1d4c360faeb450f979cbc21ce4cf704fb898..1cdb1ee9aea72095840845970bde33f09959b9e0 100644
--- a/SuperTagger/Utils/SentencesTokenizer.py
+++ b/SuperTagger/Utils/SentencesTokenizer.py
@@ -13,10 +13,29 @@ class SentencesTokenizer():
     def fit_transform_tensors(self, sents):
         # , return_tensors = 'pt'
-        temp = self.tokenizer(sents, padding=True, return_tensors = 'pt')
+        temp = self.tokenizer(sents, padding=True, return_offsets_mapping = True)
+        len_sent_max = len(temp['attention_mask'][0])
-        return temp['input_ids'], temp['attention_mask']
+        input_ids = np.ones((len(sents),len_sent_max))
+        attention_mask = np.zeros((len(sents),len_sent_max))
+
+        for i in range(len(temp['offset_mapping'])):
+            h = 1
+            input_ids[i][0] = self.tokenizer.cls_token_id
+            attention_mask[i][0] = 1
+            for j in range (1,len_sent_max-1):
+                if temp['offset_mapping'][i][j][1] != temp['offset_mapping'][i][j+1][0]:
+                    input_ids[i][h] = temp['input_ids'][i][j]
+                    attention_mask[i][h] = 1
+                    h += 1
+            input_ids[i][h] = self.tokenizer.eos_token_id
+            attention_mask[i][h] = 1
+
+        input_ids = torch.tensor(input_ids).long()
+        attention_mask = torch.tensor(attention_mask)
+
+        return input_ids, attention_mask
     def convert_ids_to_tokens(self, inputs_ids, skip_special_tokens=False):
         return self.tokenizer.batch_decode(inputs_ids, skip_special_tokens=skip_special_tokens)
diff --git a/SuperTagger/Utils/SymbolTokenizer.py b/SuperTagger/Utils/SymbolTokenizer.py
index 48543dab9ec07ebe7c22e20bee1e56e26521dd07..62228c94621372a0ed19e8e8d8ebf41516df5552 100644
--- a/SuperTagger/Utils/SymbolTokenizer.py
+++ b/SuperTagger/Utils/SymbolTokenizer.py
@@ -25,8 +25,8 @@ class SymbolTokenizer():
         encoded_labels = []
         labels = [[self.super_to_index[str(symbol)] for symbol in sents] for sents in tags]
         for l, s in zip(labels, sents_tokenized):
-            super_tok = torch.tensor(pad_sequence(l,len(s)))
-            encoded_labels.append(super_tok.tolist())
+            super_tok = pad_sequence(l,len(s))
+            encoded_labels.append(super_tok)
         return torch.tensor(encoded_labels)
@@ -37,7 +37,7 @@ class SymbolTokenizer():
 def pad_sequence(sequences, max_len=400):
     padded = [0] * max_len
-    padded[:len(sequences)] = sequences
+    padded[1:len(sequences)+1] = sequences
     return padded
diff --git a/train.py b/train.py
index dbf105a2a38f871f005524f15bd314938e83d6e1..e1d34ea0e5ab7293519709701fe3e8a4dc7c7feb 100644
--- a/train.py
+++ b/train.py
@@ -31,7 +31,7 @@ tagger = SuperTagger()
 tagger.create_new_model(len(index_to_super),'camembert-base',index_to_super)
-tagger.train(texts,tags,tensorboard=True,checkpoint=True)
+tagger.train(texts,tags,validation_rate=0,tensorboard=True,checkpoint=True)
 pred = tagger.predict(test_s)
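
A minimal standalone sketch of the padding behaviour this patch introduces: tag id 0 is reserved for the <unk>/padding class, gold labels are written starting at index 1 so they line up after the [CLS] token, and the accuracy counts only matches on non-zero labels while still dividing by all positions, as in the patched loop. The vectorized accuracy and the example values below are illustrative only, not code from the diff.

    import numpy as np

    def categorical_accuracy(preds, truth):
        # Count a position as correct only when the gold label is non-zero
        # (not the '<unk>'/padding class); divide by all positions, padded
        # or not, mirroring the patched categorical_accuracy.
        flat_preds = np.asarray(preds).flatten()
        flat_labels = np.asarray(truth).flatten()
        good = np.sum((flat_preds == flat_labels) & (flat_labels != 0))
        return good / len(flat_labels)

    def pad_sequence(sequence, max_len=400):
        # Pad with 0 and start writing at index 1, so position 0 stays
        # reserved for the [CLS] token, as in the patched SymbolTokenizer.
        padded = [0] * max_len
        padded[1:len(sequence) + 1] = sequence
        return padded

    labels = pad_sequence([3, 5, 2], max_len=6)   # -> [0, 3, 5, 2, 0, 0]
    preds = [0, 3, 5, 4, 0, 0]                    # two of the three real tags match
    print(categorical_accuracy(preds, labels))    # 2 matches / 6 positions = 0.333...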