diff --git a/README.md b/README.md
index 5ece8c34aedf24cfb542a22c40d3263305b501eb..23a2c052c4add9cba536c5935da858a2efb41450 100644
--- a/README.md
+++ b/README.md
@@ -74,14 +74,15 @@ tagger.create_new_model(len(index_to_super), bert_name, index_to_super)
 # You can load your model for re-train this
 # tagger.load_weights("your/model/path")
-tagger.train(texts,tags, checkpoint=True)
+tagger.train(texts, tags, checkpoint=True)
 pred_without_argmax, pred_convert, bert_hidden_state = tagger.predict(texts[7])
 ```
-In train, if you use `checkpoint=True`, the model is automatically saved in a folder: Training_XX-XX_XX-XX. It saves after each epoch.
-Use `tensorboard=True` for log in same folder. (`tensorboard --logdir=logs` for see logs)
+During training, if you use `checkpoint=True`, the model is automatically saved in a folder named Training_XX-XX_XX-XX. It saves
+after each epoch. Use `tensorboard=True` to write logs to the same folder (run `tensorboard --logdir=logs` to view them).
+`bert_name` can be any model available on [Hugging Face](https://huggingface.co/models).
 ## Authors
 [Rabault Julien](https://www.linkedin.com/in/julienrabault), de Pourtales Caroline
\ No newline at end of file
diff --git a/SuperTagger/SuperTagger.py b/SuperTagger/SuperTagger.py
index 047acb41e3f83fa3d5210b5eeee88558c4634d9e..b49485ed9e9caa59968c58e8ba7826545c66a6ac 100644
--- a/SuperTagger/SuperTagger.py
+++ b/SuperTagger/SuperTagger.py
@@ -5,7 +5,7 @@ import time

 import torch
 import transformers
-from torch import nn, Tensor
+from torch import Tensor
 from torch.optim import Adam
 from torch.utils.data import Dataset, TensorDataset, random_split, DataLoader
 from torch.utils.tensorboard import SummaryWriter
@@ -20,10 +20,12 @@ from SuperTagger.Utils.Tagging_bert_model import Tagging_bert_model

 logging.set_verbosity(logging.ERROR)

+# region Utils
+
 def output_create_dir():
     """
-
-    @return:
+    Creates the output directory for TensorBoard logs and checkpoints.
+    @return: output dir, TensorBoard writer
     """
     from datetime import datetime
     outpout_path = 'TensorBoard'
@@ -35,10 +37,10 @@ def output_create_dir():

 def categorical_accuracy(preds: list[list[int]], truth: list[list[int]]) -> float:
     """
-
-    @param preds:
-    @param truth:
-    @return:
+    Calculates how often predictions match the truth labels.
+    @param preds: batch of predictions (after argmax)
+    @param truth: batch of truth labels
+    @return: categorical accuracy of the batch predictions
     """
     good_label = 0
     nb_label = 0
@@ -64,11 +66,17 @@ def format_time(elapsed):

     return str(datetime.timedelta(seconds=elapsed_rounded))

+# endregion Utils
+
+# region Class
+
 class SuperTagger:

+    # region Constructor
+
     def __init__(self):
         """
-
+        Python implementation of BertForTokenClassification, using TLGbank data to develop supertaggers.
         """
         self.index_to_tags = None
         self.num_label = None
@@ -81,15 +89,19 @@ class SuperTagger:
         self.epoch_i = 0
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

-        self.loss = None
         self.trainable = False
         self.model_load = False

+    # endregion Constructor
+
+    # region Instantiation
+
     def load_weights(self, model_file: str):
         """
-        yo mec
-        @param model_file:
+        Loads a SuperTagger saved with SuperTagger.__checkpoint_save() (called during training) from a file.
+
+        @param model_file: path to the model's .pt save file
         """

         self.trainable = False
@@ -107,10 +119,10 @@ class SuperTagger:
                 do_lower_case=True))
             self.model.load_state_dict(params['state_dict'])
             self.optimizer = params['optimizer']
-            self.epoch_i = args['epoch']
+            # self.epoch_i = args['epoch']
             print("\n The loading checkpoint was successful ! \n")
             print("\tBert model : ", self.bert_name)
-            # print("\tLast epoch : ", self.epoch_i)
+            print("\tLast epoch : ", self.epoch_i)
             print()
         except Exception as e:
             print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
@@ -122,35 +134,45 @@ class SuperTagger:

     def create_new_model(self, num_label: int, bert_name: str, index_to_tags: dict):
         """
+        Instantiates and parameterizes a new BERT model.

-        @param num_label:
-        @param bert_name:
-        @param index_to_tags:
+        @param num_label: number of different labels (tags)
+        @param bert_name: name of a model available on Hugging Face `<https://huggingface.co/models>`
+        @param index_to_tags: dict used to convert IDs to tags
         """
         assert len(
             index_to_tags) == num_label, f" len(index_to_tags) : {len(index_to_tags)} must be equels with num_label: {num_label}"

         self.model = Tagging_bert_model(bert_name, num_label + 1)
         index_to_tags = {k + 1: v for k, v in index_to_tags.items()}
+        # <unk> is used both for padding AND for unknown tags
         index_to_tags[0] = '<unk>'
+
         self.index_to_tags = index_to_tags
         self.bert_name = bert_name
         self.sent_tokenizer = SentencesTokenizer(AutoTokenizer.from_pretrained(
             bert_name,
             do_lower_case=True))
-        self.loss = nn.CrossEntropyLoss(ignore_index=0)
         self.optimizer = Adam(params=self.model.parameters(), lr=2e-4, eps=1e-8)
         self.tags_tokenizer = SymbolTokenizer(index_to_tags)
         self.trainable = True
         self.model_load = True

-    def predict(self, sentences: list[str]) -> (list[list[list[float]]], list[list[str]], Tensor):
+    # endregion Instantiation
+
+    # region Usage
+
+    def predict(self, sentences) -> (list[list[list[float]]], list[list[str]], Tensor):
         """
+        Predicts tags for the sentences and converts them (using the dictionary given when the model was created).

-        @param sentences:
-        @return:
+        @param sentences: a list of sentences (list[str]) OR a single sentence (str)
+        @return: tag predictions for all sentences (raw predictions, converted tags, BERT embedding layer)
         """
-        assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the predict, the model is not integrated"
+        assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) " \
+                                                     "function before the predict, the model is not integrated "
+        assert isinstance(sentences, (str, list)), "param sentences: a list of sentences (list[str]) " \
+                                                   "OR a single sentence (str)"

         sentences = [sentences] if type(sentences) == str else sentences

         self.model.eval()
@@ -163,21 +185,34 @@ class SuperTagger:

         return preds, self.tags_tokenizer.convert_ids_to_tags(torch.argmax(preds, dim=2).detach()), hidden

-    def train(self, sentences: list[str], tags: list[list[str]], validation_rate=0.1, epochs=20, batch_size=32,
+    def forward(self, b_sents_tokenized: Tensor, b_sents_mask: Tensor) -> (Tensor, Tensor):
+        """
+        Function used by the linker (same as predict).
+        """
+        with torch.no_grad():
+            logit, hidden = self.model.predict((b_sents_tokenized, b_sents_mask))
+        return logit, hidden
+
+    def train(self, sentences: list[str], tags: list[list[str]], validation_rate=0.1, epochs=20, batch_size=16,
               tensorboard=False, checkpoint=False):
         """
-
-        @param sentences:
-        @param tags:
-        @param validation_rate:
-        @param epochs:
-        @param batch_size:
-        @param tensorboard:
-        @param checkpoint:
+        Starts training the model, whether newly created or previously loaded.
+
+        @param sentences: list of training sentences (X)
+        @param tags: list of training tags (Y)
+        @param validation_rate: fraction of the data used for validation [0-1]
+        @param epochs: number of epochs (50 recommended)
+        @param batch_size: number of samples per batch (32 recommended, watch memory usage)
+        @param tensorboard: use TensorBoard to track loss and accuracy
+        @param checkpoint: save the model after each epoch
         """
         assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the train, the model is not integrated"
+        assert len(sentences) == len(
+            tags), f"number of sentences (X): {len(sentences)} must equal the number of tag lists " \
+                   f"(Y): {len(tags)}"
+
         if checkpoint or tensorboard:
             checkpoint_dir, writer = output_create_dir()
@@ -189,49 +224,59 @@ class SuperTagger:
         for epoch_i in range(0, epochs):
             print("")
-            print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
+            print('======== Epoch {:} / {:} ========'.format(epoch_i, epochs))
             print('Training...')
+            # Train
             epoch_acc, epoch_loss, training_time = self.__train_epoch(training_dataloader)
+            # Validation
             if validation_rate > 0.0:
                 eval_accuracy, eval_loss, nb_eval_steps = self.__eval_epoch(validation_dataloader)
             print("")
-            print(f'Epoch: {epoch_i + 1:02} | Epoch Time: {training_time}')
+            print(f'Epoch: {epoch_i:02} | Epoch Time: {training_time}')
             print(f'\tTrain Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc * 100:.2f}%')
             if validation_rate > 0.0:
                 print(f'\tVal Loss: {eval_loss:.3f} | Val Acc: {eval_accuracy * 100:.2f}%')
             if tensorboard:
                 writer.add_scalars(f'Accuracy', {
-                    'Train': epoch_acc}, epoch_i + 1)
+                    'Train': epoch_acc}, epoch_i)
                 writer.add_scalars(f'Loss', {
-                    'Train': epoch_loss}, epoch_i + 1)
+                    'Train': epoch_loss}, epoch_i)
                 if validation_rate > 0.0:
                     writer.add_scalars(f'Accuracy', {
-                        'Validation': eval_accuracy}, epoch_i + 1)
+                        'Validation': eval_accuracy}, epoch_i)
                     writer.add_scalars(f'Loss', {
-                        'Validation': eval_loss}, epoch_i + 1)
+                        'Validation': eval_loss}, epoch_i)
+
+            self.epoch_i += 1
             if checkpoint:
                 self.__checkpoint_save(path=os.path.join(checkpoint_dir, 'model_check.pt'))

+    # endregion Usage
+
+    # region Private
+
     def __preprocess_data(self, batch_size: int, sentences: list[str], tags: list[list[str]],
                           validation_rate: float) -> (DataLoader, DataLoader):
         """
+        Creates the torch DataLoaders used for training.
-        @param batch_size:
-        @param sentences:
-        @param tags:
-        @param validation_rate:
-        @return:
+        @param batch_size: number of samples per batch
+        @param sentences: list of training sentences (X)
+        @param tags: list of training tags (Y)
+        @param validation_rate: fraction of the data used for validation [0-1]
+        @return: (training dataloader, validation dataloader)
         """
         validation_dataloader = None
         sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
         tags_t = self.tags_tokenizer.convert_batchs_to_ids(tags, sents_tokenized_t)
         dataset = TensorDataset(sents_tokenized_t, sents_mask_t, tags_t)
+
         train_size = int(validation_rate * len(dataset))
         print('{:>5,} training samples'.format(train_size))
@@ -247,9 +292,10 @@ class SuperTagger:
     def __train_epoch(self, training_dataloader: DataLoader) -> (float, float, str):
         """
+        Trains the model for one epoch.

-        @param training_dataloader:
-        @return:
+        @param training_dataloader: dataloader of training data
+        @return: (epoch accuracy, epoch loss, training time)
         """
         self.model.train()
         epoch_loss = 0
@@ -258,7 +304,7 @@ class SuperTagger:
         i = 0
         with tqdm(training_dataloader, unit="batch") as tepoch:
             for batch in tepoch:
-                # Unpack this training batch from our dataloader.
+                # Move the batch to the device
                 b_sents_tokenized = batch[0].to(self.device)
                 b_sents_mask = batch[1].to(self.device)
                 targets = batch[2].to(self.device)
@@ -287,17 +333,13 @@ class SuperTagger:
         return epoch_acc, epoch_loss, training_time

-    def foward(self, b_sents_tokenized: Tensor, b_sents_mask: Tensor) -> (Tensor, Tensor):
+    def __eval_epoch(self, validation_dataloader: DataLoader) -> (float, float, int):
         """
+        Runs validation for one epoch.

-        @param b_sents_tokenized:
-        @param b_sents_mask:
-        @return:
+        @param validation_dataloader: dataloader of validation data
+        @return: (epoch accuracy, epoch loss, number of steps)
         """
-        _, logit, hidden = self.model((b_sents_tokenized, b_sents_mask))
-        return logit, hidden
-
-    def __eval_epoch(self, validation_dataloader: DataLoader) -> (float, float, int):
         self.model.eval()
         eval_loss = 0
         eval_accuracy = 0
@@ -305,6 +347,7 @@ class SuperTagger:
         with torch.no_grad():
             print("Start eval")
             for step, batch in enumerate(validation_dataloader):
+                # Move the batch to the device
                 b_sents_tokenized = batch[0].to(self.device)
                 b_sents_mask = batch[1].to(self.device)
                 b_symbols_tokenized = batch[2].to(self.device)
@@ -326,8 +369,8 @@ class SuperTagger:
     def __checkpoint_save(self, path='/model_check.pt'):
         """
-
-        @param path:
+        Saves the model together with the parameters needed to reload it.
+        @param path: path and filename of the save
         """
         self.model.cpu()
         # print('save model parameters to [%s]' % path, file=sys.stderr)
@@ -338,3 +381,7 @@ class SuperTagger:
             'optimizer': self.optimizer,
         }, path)
         self.model.to(self.device)
+
+    # endregion Private
+
+# endregion Class
diff --git a/SuperTagger/Utils/Tagging_bert_model.py b/SuperTagger/Utils/Tagging_bert_model.py
index da37bfececbf081c48c2662e41e7abffa4879e90..83ef46f6791561549123c5901ea65f691bc07283 100644
--- a/SuperTagger/Utils/Tagging_bert_model.py
+++ b/SuperTagger/Utils/Tagging_bert_model.py
@@ -7,10 +7,7 @@ from transformers import logging

 class Tagging_bert_model(Module):
     """
-    A standard Encoder-Decoder architecture. Base for this and many
-    other models.
-    decoder : instance of Decoder
     """

     def __init__(self, bert_name, num_labels):
diff --git a/SuperTagger/Utils/utils.py b/SuperTagger/Utils/utils.py
index 863e7e276efdfae961c612d9e44071710ceee69a..7c4a2a010627656972c531f8b57118693ac85150 100644
--- a/SuperTagger/Utils/utils.py
+++ b/SuperTagger/Utils/utils.py
@@ -22,12 +22,14 @@ def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=100):
     print("#" * 20)

     return df

+
 def load_obj(name):
     with open(name + '.pkl', 'rb') as f:
         import pickle
         return pickle.load(f)

-def categorical_accuracy_str(preds : list[list[float]], truth: list[list[float]]) -> float:
+
+def categorical_accuracy_str(preds: list[list[float]], truth: list[list[float]]) -> float:
     nb_label = 0
     good_label = 0
     for i in range(len(truth)):
@@ -39,6 +41,3 @@ def categorical_accuracy_str(preds : list[list[float]], truth: list[list[float]]
             nb_label += 1

     return good_label / nb_label
-
-
-
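For context, a minimal usage sketch of the API as it stands after this patch. The toy sentences, tags, the `camembert-base` model name, and the import path are illustrative assumptions only (any Hugging Face model name works, per the README); the method names and signatures come from the diff above.

```python
from SuperTagger.SuperTagger import SuperTagger

# Placeholder data, purely illustrative; real supertags come from the TLGbank annotations.
texts = ["Le chat dort .", "Il mange une pomme ."]
tags = [["det", "n", "v", "pt"], ["cl", "v", "det", "n", "pt"]]
index_to_super = {0: "det", 1: "n", 2: "v", 3: "pt", 4: "cl"}
bert_name = "camembert-base"  # any model name available on Hugging Face

tagger = SuperTagger()
tagger.create_new_model(len(index_to_super), bert_name, index_to_super)
# or resume from a previous checkpoint:
# tagger.load_weights("your/model/path")

# checkpoint=True saves the model after every epoch in a Training_XX-XX_XX-XX folder;
# tensorboard=True writes the loss/accuracy curves to the same folder.
tagger.train(texts, tags, validation_rate=0.1, epochs=2, batch_size=16,
             tensorboard=True, checkpoint=True)

# predict accepts a single sentence (str) or a list of sentences (list[str]) and returns
# (raw predictions, converted tags, BERT hidden states).
pred_without_argmax, pred_convert, bert_hidden_state = tagger.predict(texts[0])

# forward (used by the linker) skips tokenization: it expects already tokenized sentences
# and their attention mask, as produced by the sentence tokenizer.
sents_tokenized_t, sents_mask_t = tagger.sent_tokenizer.fit_transform_tensors(texts)
logits, hidden = tagger.forward(sents_tokenized_t, sents_mask_t)
```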