diff --git a/.gitignore b/.gitignore
index 371503a80d900dde316cbdc92efdf8fa6ce1812f..f57c436fa6463b2f0a0f382938ddf148bd516eea 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,17 +1,11 @@
 .idea
-tests
 venv
 *.pyc
 .DS_Store
-.env
-./bash_GPU.sh
-push
 pull
 texte
-logs
-Output
 .data
 TensorBoard
 models
-*.pkl
-good_models/model_check.pt
+good_models
 main.py
 *.pt
+Datasets/Utils
diff --git a/Configuration/Configuration.py b/Configuration/Configuration.py
deleted file mode 100644
index 3d94c9b2e3aad20cebafe0dd5015caee1c5e50b3..0000000000000000000000000000000000000000
--- a/Configuration/Configuration.py
+++ /dev/null
@@ -1,17 +0,0 @@
-import os
-from configparser import ConfigParser
-
-# Read configuration file
-path_current_directory = os.path.dirname(__file__)
-path_config_file = os.path.join(path_current_directory, 'config.ini')
-config = ConfigParser()
-config.read(path_config_file)
-
-# region Get section
-
-version = config["VERSION"]
-
-modelDecoderConfig = config["MODEL_DECODER"]
-modelTrainingConfig = config["MODEL_TRAINING"]
-
-# endregion Get section
diff --git a/Configuration/config.ini b/Configuration/config.ini
deleted file mode 100644
index 3fb0157dd2a41afc67607cb94f996daa3236ae0b..0000000000000000000000000000000000000000
--- a/Configuration/config.ini
+++ /dev/null
@@ -1,18 +0,0 @@
-[VERSION]
-transformers = 4.16.2
-[MODEL_DECODER]
-dim_encoder = 768
-dim_decoder = 128
-num_rnn_layers=1
-dropout=0.1
-teacher_forcing=0.05
-[MODEL_TRAINING]
-batch_size=16
-epoch=10
-seed_val=42
-learning_rate=0.005
-use_checkpoint_SAVE=1
-output_path=Output
-use_checkpoint_LOAD=1
-input_path=models_save
-model_to_load=model_check.pt
\ No newline at end of file
diff --git a/Datasets/index_to_pos1.pkl b/Datasets/index_to_pos1.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..c212d7e65f99a80671dfe2dba3481bbd49c22c6e
Binary files /dev/null and b/Datasets/index_to_pos1.pkl differ
diff --git a/Datasets/index_to_super.pkl b/Datasets/index_to_super.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..606848f7652155e0048c281320e36f46886661e5
Binary files /dev/null and b/Datasets/index_to_super.pkl differ
diff --git a/Datasets/m2_dataset_V2.csv b/Datasets/m2_dataset.csv
similarity index 100%
rename from Datasets/m2_dataset_V2.csv
rename to Datasets/m2_dataset.csv
diff --git a/README.md b/README.md
index 5994f1455440e7055fec3c5dd2f7e9baaa7e0cd5..5ece8c34aedf24cfb542a22c40d3263305b501eb 100644
--- a/README.md
+++ b/README.md
@@ -1,15 +1,87 @@
 # DeepGrail
+This repository contains a Python implementation of BertForTokenClassification using TLGbank data to develop
+part-of-speech taggers and supertaggers.
+
+This code was designed to work with the [DeepGrail Linker](https://gitlab.irit.fr/pnria/global-helper/deepgrail-linker)
+to provide a wide-coverage syntactic and semantic parser for French. The Tagger is nevertheless independent: you can use it with your own tags.
+
 ## Usage
+### Structure
+
+```
+.
+├── Datasets                    # TLGbank data
+├── SuperTagger                 # Implementation of BertForTokenClassification
+│   ├── SuperTagger.py          # Main class
+│   └── Tagging_bert_model.py   # Bert model
+├── predict.py                  # Example of prediction
+└── train.py                    # Example of training
+```
+
 ### Installation
+
 Python 3.9.10 **(Warning don't use Python 3.10**+**)**
 Clone the project locally.
 In a clean python venv do `pip install -r requirements.txt`
+
+Download an already trained model, or prepare the data for **your own** training.
+
 ## How To use
-TODO ...
+
+**predict.py** and **train.py** show simple examples of how to use the model; feel free to look at them before using the
+SuperTagger.
+
+### Utils
+
+To load **m2_dataset.csv**, you can use `SuperTagger.Utils.utils.read_csv_pgbar(...)`. This function returns a pandas
+dataframe, as in the sketch below.
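+
+A minimal sketch, assuming the dataset and the `.pkl` dictionaries shipped in `Datasets/` (the row limit of 20 is arbitrary):
+
+```
+from SuperTagger.Utils.utils import read_csv_pgbar, load_obj
+
+df = read_csv_pgbar('Datasets/m2_dataset.csv', 20)
+texts = df['X'].tolist()
+
+# index_to_super.pkl maps label ids to supertags
+index_to_super = load_obj('Datasets/index_to_super')
+```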
+
+### Prediction
+
+To predict on your own data, you need to load a model (saved with this code).
+
+```
+df = read_csv_pgbar(file_path, 20)
+texts = df['X'].tolist()
+
+tagger = SuperTagger()
+
+tagger.load_weights("your/model/path")
+
+pred_without_argmax, pred_convert, bert_hidden_state = tagger.predict(texts[7])
+
+print(pred_convert)
+# ['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)', 'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np', 'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']
+```
+
+### Training
+
+```
+df = read_csv_pgbar(file_path, 1000)
+texts = df['X'].tolist()
+tags = df['Z'].tolist()
+
+# Dict to convert label ids to tags (the dict is saved with the model for prediction)
+index_to_super = load_obj('Datasets/index_to_super')
+
+tagger = SuperTagger()
+
+bert_name = 'camembert-base'
+
+tagger.create_new_model(len(index_to_super), bert_name, index_to_super)
+# You can also load a saved model and retrain it:
+# tagger.load_weights("your/model/path")
+
+tagger.train(texts, tags, checkpoint=True)
+
+pred_without_argmax, pred_convert, bert_hidden_state = tagger.predict(texts[7])
+```
+
+During training, if you pass `checkpoint=True`, the model is automatically saved after each epoch in a folder named TensorBoard/Training_DD-MM_HH-MM.
+Use `tensorboard=True` to write logs to the same folder (`tensorboard --logdir=logs` to view them).
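+
+A minimal sketch of resuming training from such a checkpoint (the folder name below is a placeholder for your own run):
+
+```
+tagger = SuperTagger()
+
+# hypothetical checkpoint folder from a previous run
+tagger.load_weights("TensorBoard/Training_01-01_00-00/model_check.pt")
+
+tagger.train(texts, tags, checkpoint=True)
+```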
-tensorboard --logdir=logs
+## Authors
+[Rabault Julien](https://www.linkedin.com/in/julienrabault), de Pourtales Caroline
\ No newline at end of file
diff --git a/SuperTagger/SuperTagger.py b/SuperTagger/SuperTagger.py
index 70c15924fe51a8d75b35490393e984175dfe18c1..95b3d59d8704efb3efc9ffed80382a8fc7146cfb 100644
--- a/SuperTagger/SuperTagger.py
+++ b/SuperTagger/SuperTagger.py
@@ -1,26 +1,45 @@
+import datetime
 import os
 import sys
-
 import time
-import datetime
-from tkinter import Variable
 import torch
 import transformers
-from torch import nn
+from torch import nn, Tensor
 from torch.optim import Adam
+from torch.utils.data import Dataset, TensorDataset, random_split, DataLoader
 from torch.utils.tensorboard import SummaryWriter
 from tqdm import tqdm
 from transformers import AutoTokenizer
-
-from torch.utils.data import Dataset, TensorDataset, random_split
+from transformers import logging
 from SuperTagger.Utils.SentencesTokenizer import SentencesTokenizer
 from SuperTagger.Utils.SymbolTokenizer import SymbolTokenizer
 from SuperTagger.Utils.Tagging_bert_model import Tagging_bert_model
+logging.set_verbosity(logging.ERROR)
+
+
+def output_create_dir():
+    """
+    Creates the output directory of a training run.
+    @return: the training directory path and a SummaryWriter writing to its logs/ subfolder
+    """
+    from datetime import datetime
+    output_path = 'TensorBoard'
+    training_dir = os.path.join(output_path, 'Training_' + datetime.today().strftime('%d-%m_%H-%M'))
+    logs_dir = os.path.join(training_dir, 'logs')
+    writer = SummaryWriter(log_dir=logs_dir)
+    return training_dir, writer
+

-def categorical_accuracy(preds, truth):
+def categorical_accuracy(preds: list[list[int]], truth: list[list[int]]) -> float:
+    """
+    Computes the share of predicted labels that match the gold labels.
+    @param preds: predicted label ids, one list per sentence
+    @param truth: gold label ids, one list per sentence
+    @return: the accuracy, between 0 and 1
+    """
     good_label = 0
     nb_label = 0
     for i in range(len(truth)):
@@ -48,7 +67,9 @@ class SuperTagger:

     def __init__(self):
+        """
+        Initializes an empty tagger; use create_new_model(...) or load_weights(...) before training or predicting.
+        """
         self.index_to_tags = None
         self.num_label = None
         self.bert_name = None
@@ -65,7 +86,11 @@
         self.trainable = False
         self.model_load = False

-    def load_weights(self, model_file):
+    def load_weights(self, model_file: str):
+        """
+        Loads an already trained model saved with this class.
+        @param model_file: path to the saved model
+        """
         self.trainable = False

         print("#" * 15)
@@ -95,8 +120,13 @@
         self.model_load = True
         self.trainable = True

-    def create_new_model(self, num_label, bert_name, index_to_tags: dict):
+    def create_new_model(self, num_label: int, bert_name: str, index_to_tags: dict):
+        """
+        Builds a new, untrained model on top of the given BERT encoder.
+        @param num_label: number of distinct tags
+        @param bert_name: name of the BERT model to use (e.g. 'camembert-base')
+        @param index_to_tags: mapping from label id to tag
+        """
         assert len(
             index_to_tags) == num_label, f" len(index_to_tags) : {len(index_to_tags)} must be equels with num_label: {num_label}"
@@ -114,9 +144,15 @@
         self.trainable = True
         self.model_load = True

-    def predict(self, sentences):
+    def predict(self, sentences: list[str]) -> (list[list[list[float]]], list[list[str]], Tensor):
+        """
+        Predicts the tags of the given sentences.
+        @param sentences: a sentence or a list of sentences
+        @return: the raw predictions, the predicted tags and the BERT hidden state
+        """
         assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the predict, the model is not integrated"
+        sentences = [sentences] if isinstance(sentences, str) else sentences
+
         self.model.eval()
         with torch.no_grad():
             sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
@@ -127,13 +163,23 @@
         return preds, self.tags_tokenizer.convert_ids_to_tags(preds.detach()), hidden

-    def train(self, sentences, tags, validation_rate=0.1, epochs=20, batch_size=32, tensorboard=False,
+    def train(self, sentences: list[str], tags: list[list[str]], validation_rate=0.1, epochs=20, batch_size=32,
+              tensorboard=False,
               checkpoint=False):
-
+        """
+        Trains the model on the given sentences and tags.
+        @param sentences: training sentences
+        @param tags: gold tags, one list per sentence
+        @param validation_rate: share of the data held out for validation
+        @param epochs: number of training epochs
+        @param batch_size: batch size
+        @param tensorboard: if True, writes TensorBoard logs to the output folder
+        @param checkpoint: if True, saves the model after each epoch
+        """
         assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the train, the model is not integrated"

         if checkpoint or tensorboard:
-            checkpoint_dir, writer = self.__output_create()
+            checkpoint_dir, writer = output_create_dir()

         training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, sentences, tags,
                                                                             1 - validation_rate)
@@ -171,8 +217,16 @@
             if checkpoint:
                 self.__checkpoint_save(path=os.path.join(checkpoint_dir, 'model_check.pt'))

-    def __preprocess_data(self, batch_size, sentences, tags, validation_rate):
+    def __preprocess_data(self, batch_size: int, sentences: list[str], tags: list[list[str]],
+                          validation_rate: float) -> (DataLoader, DataLoader):
+        """
+        Tokenizes the data and splits it into training and validation DataLoaders.
+        @param batch_size: batch size
+        @param sentences: sentences
+        @param tags: gold tags, one list per sentence
+        @param validation_rate: split ratio between the training and validation sets
+        @return: the training and validation DataLoaders
+        """
         validation_dataloader = None

         sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
@@ -191,15 +245,12 @@
         training_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
         return training_dataloader, validation_dataloader

-    def __output_create(self):
-        from datetime import datetime
-        outpout_path = 'TensorBoard'
-        training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
-        logs_dir = os.path.join(training_dir, 'logs')
-        writer = SummaryWriter(log_dir=logs_dir)
-        return training_dir, writer
+    def __train_epoch(self, training_dataloader: DataLoader) -> (float, float, str):
+        """
+        Runs one training epoch.
-    def __train_epoch(self, training_dataloader):
+        @param training_dataloader: DataLoader over the training set
+        @return: the epoch accuracy, the epoch loss and the formatted training time
+        """
         self.model.train()
         epoch_loss = 0
         epoch_acc = 0
@@ -217,8 +268,6 @@
             predictions = torch.argmax(logit, dim=2).detach().cpu().numpy()
             label_ids = targets.cpu().numpy()

-            # torch.nn.functional.one_hot(targets).long()
-            # torch.argmax(logit)

             acc = categorical_accuracy(predictions, label_ids)

@@ -238,11 +287,17 @@
         return epoch_acc, epoch_loss, training_time

-    def foward(self,b_sents_tokenized, b_sents_mask):
+    def foward(self, b_sents_tokenized: Tensor, b_sents_mask: Tensor) -> (Tensor, Tensor):
+        """
+        Runs the model on a batch of tokenized sentences.
+        @param b_sents_tokenized: batch of token ids
+        @param b_sents_mask: batch of attention masks
+        @return: the logits and the BERT hidden state
+        """
         _, logit, hidden = self.model((b_sents_tokenized, b_sents_mask))
         return logit, hidden

-    def __eval_epoch(self, validation_dataloader):
+    def __eval_epoch(self, validation_dataloader: DataLoader) -> (float, float, int):
+        """
+        Evaluates the model on the validation set.
+        @param validation_dataloader: DataLoader over the validation set
+        @return: the validation accuracy, the validation loss and the number of evaluation steps
+        """
         self.model.eval()
         eval_loss = 0
         eval_accuracy = 0
@@ -270,6 +325,10 @@
         return eval_accuracy, eval_loss, nb_eval_steps

     def __checkpoint_save(self, path='/model_check.pt'):
+        """
+        Saves the model and optimizer states to the given path.
+        @param path: destination file for the checkpoint
+        """
         self.model.cpu()
         # print('save model parameters to [%s]' % path, file=sys.stderr)
@@ -279,5 +338,3 @@
             'optimizer': self.optimizer,
         }, path)
         self.model.to(self.device)
-
-
diff --git a/SuperTagger/Utils/SentencesTokenizer.py b/SuperTagger/Utils/SentencesTokenizer.py
index f1fbea51286ffb4f86e8a0b4f199bd78eb292772..ee72006edd06bac408e2415af2202d10cf226954 100644
--- a/SuperTagger/Utils/SentencesTokenizer.py
+++ b/SuperTagger/Utils/SentencesTokenizer.py
@@ -1,6 +1,3 @@
-import numpy as np
-import torch
-

 class SentencesTokenizer():

@@ -12,28 +9,7 @@
         return self.tokenizer(sents, padding=True)

     def fit_transform_tensors(self, sents):
-        # , return_tensors = 'pt'
         temp = self.tokenizer(sents, padding=True, return_tensors = 'pt')
-        #
-        # len_sent_max = len(temp['attention_mask'][0])
-        #
-        # input_ids = np.ones((len(sents),len_sent_max))
-        # attention_mask = np.zeros((len(sents),len_sent_max))
-        #
-        # for i in range(len(temp['offset_mapping'])):
-        #     h = 1
-        #     input_ids[i][0] = self.tokenizer.cls_token_id
-        #     attention_mask[i][0] = 1
-        #     for j in range (1,len_sent_max-1):
-        #         if temp['offset_mapping'][i][j][1] != temp['offset_mapping'][i][j+1][0]:
-        #             input_ids[i][h] = temp['input_ids'][i][j]
-        #             attention_mask[i][h] = 1
-        #             h += 1
-        #     input_ids[i][h] = self.tokenizer.eos_token_id
-        #     attention_mask[i][h] = 1
-        #
-        # input_ids = torch.tensor(input_ids).long()
-        # attention_mask = torch.tensor(attention_mask)

         return temp["input_ids"], temp["attention_mask"]
diff --git a/SuperTagger/Utils/utils.py b/SuperTagger/Utils/utils.py
index 03aadfeebc90e85a8b15d912c62459efdc2c9cc1..4c22a68bbb5e2f77093792025439626b0011413b 100644
--- a/SuperTagger/Utils/utils.py
+++ b/SuperTagger/Utils/utils.py
@@ -1,7 +1,4 @@
-import datetime
-
 import pandas as pd
-import torch
 from tqdm import tqdm
@@ -16,7 +13,8 @@
     rows = nrows

     with tqdm(total=rows, desc='Rows read: ') as bar:
-        for chunk in pd.read_csv(csv_path, converters={'Y1': pd.eval,'Y2': pd.eval,'Z': pd.eval}, chunksize=chunksize, nrows=rows):
+        for chunk in pd.read_csv(csv_path, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval}, chunksize=chunksize,
+                                 nrows=rows):
             chunk_list.append(chunk)
             bar.update(len(chunk))
@@ -24,5 +22,10 @@
     print("#" * 20)
     return df

+def load_obj(name):
+    import pickle
+    with open(name + '.pkl', 'rb') as f:
+        return pickle.load(f)
+
+
diff --git a/bash_GPU.sh b/bash_GPU.sh
deleted file mode 100644
index 665f769d8046d6fd61167efbbbdd8e46d5495d94..0000000000000000000000000000000000000000
--- a/bash_GPU.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-#!/bin/sh
-#SBATCH --job-name=N-tensorboard
-#SBATCH --partition=RTX6000Node
-#SBATCH --gres=gpu:1
-#SBATCH --mem=32000
-#SBATCH --gres-flags=enforce-binding
-#SBATCH --error="error_rtx1.err"
-#SBATCH --output="out_rtx1.out"
-
-module purge
-module load singularity/3.0.3
-
-srun singularity exec /logiciels/containerCollections/CUDA11/pytorch-NGC-21-03-py3.sif python "train.py"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index c611e9b46c45eac6d06ab14c47a951c0c641796c..41ce3b5b133a0d8ce94376ac0e32283b3c9f2417 100644
Binary files a/requirements.txt and b/requirements.txt differ
diff --git a/train.py b/train.py
index d5e66aaba2d1810492e8ec908eeed05c214fc7e3..ddcda4c8eb610e6377b9842357390fa017267105 100644
--- a/train.py
+++ b/train.py
@@ -1,14 +1,7 @@
 from SuperTagger.SuperTagger import SuperTagger
-from SuperTagger.Utils.utils import read_csv_pgbar
+from SuperTagger.Utils.utils import read_csv_pgbar, load_obj

-
-def load_obj(name):
-    with open(name + '.pkl', 'rb') as f:
-        import pickle
-        return pickle.load(f)
-
-
-file_path = 'Datasets/m2_dataset_V2.csv'
+file_path = 'Datasets/m2_dataset.csv'

 df = read_csv_pgbar(file_path,1000)

@@ -25,7 +18,6 @@
 tags = tags[4:]

 index_to_super = load_obj('Datasets/index_to_pos1')
-super_to_index = {v: int(k) for k, v in index_to_super.items()}

 tagger = SuperTagger()