Commit 48c6ce7e authored by Caroline DE POURTALES
update clean
parent 7aa67b28
Merge request !6: Linker with transformer
Showing 0 additions and 16478 deletions
.idea
venv
*.pyc
.DS_Store
.data
TensorBoard
models
good_models
main.py
*.pt
Datasets/Utils
*.zip
# DeepGrail
This repository contains a Python implementation of BertForTokenClassification using TLGbank data to develop
part-of-speech taggers and supertaggers.
This code was designed to work with the [DeepGrail Linker](https://gitlab.irit.fr/pnria/global-helper/deepgrail-linker)
to provide a wide-coverage syntactic and semantic parser for French. The Tagger is nevertheless independent: you can use it with your own tags.
## Usage
### Structure
```
.
├── Datasets # TLGbank data
├── SuperTagger # Implementation of BertForTokenClassification
│ ├── SuperTagger.py # Main class
│ └── Tagging_bert_model.py # Bert model
├── predict.py # Example of prediction
└── train.py # Example of training
```
### Installation
Python 3.9.10 **(Warning: do not use Python 3.10+)**
Clone the project locally. In a clean Python venv, run `pip install -r requirements.txt`.
Download the already trained models or prepare the data for **your own** training.
## How to use
**predict.py** and **train.py** show simple examples of how to use the model; feel free to look at them before using the
SuperTagger.
### Utils
To load **m2_dataset.csv**, you can use `SuperTagger.Utils.utils.read_csv_pgbar(...)`. This function returns a pandas
dataframe.
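A minimal sketch of loading the dataset (the import path and the `X`/`Z` column names follow **train.py**; adapt the path to your own data):
```
from SuperTagger.Utils.helpers import read_csv_pgbar

df = read_csv_pgbar('Datasets/m2_dataset.csv', 100)  # read the first 100 rows
texts = df['X'].tolist()  # sentences
tags = df['Z'].tolist()   # supertag sequences, one list per sentence
```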
### Prediction
To predict on your data, you need to load a model (saved with this code). `predict` returns the raw logits and the converted tag sequences.
```
df = read_csv_pgbar(file_path,20)
texts = df['X'].tolist()
tagger = SuperTagger()
tagger.load_weights("your/model/path")
pred_without_argmax, pred_convert = tagger.predict(texts[7])
print(pred_convert)
#['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)', 'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np', 'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']
```
### Training
```
df = read_csv_pgbar(file_path,1000)
texts = df['X'].tolist()
tags = df['Z'].tolist()
# Dict to convert IDs to tokens (the dict is saved with the model for prediction)
index_to_super = load_obj('Datasets/index_to_super')
tagger = SuperTagger()
bert_name = 'camembert-base'
tagger.create_new_model(len(index_to_super), bert_name, index_to_super)
# You can load your model to retrain it:
# tagger.load_weights("your/model/path")
tagger.train(texts, tags, checkpoint=True)
pred_without_argmax, pred_convert = tagger.predict(texts[7])
```
During training, if you use `checkpoint=True`, the model is automatically saved after each epoch in a folder named
TensorBoard/Training_XX-XX_XX-XX. Use `tensorboard=True` to write logs in the same folder (run `tensorboard --logdir=logs` to view them).
`bert_name` can be any model available on [Hugging Face](https://huggingface.co/models).
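A minimal sketch of resuming a training run from such a checkpoint (the folder name below is only an example; the real name depends on the date and time of the run):
```
tagger = SuperTagger()
# Reload the checkpoint written after the last completed epoch
tagger.load_weights("TensorBoard/Training_01-01_12-00/model_check.pt")
# Continue training with per-epoch checkpoints and TensorBoard logs
tagger.train(texts, tags, epochs=50, batch_size=16, tensorboard=True, checkpoint=True)
```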
## Authors
[Rabault Julien](https://www.linkedin.com/in/julienrabault), de Pourtales Caroline
import datetime
import os
import sys
import time
import torch
import transformers
from torch import Tensor
from torch.optim import Adam
from torch.utils.data import Dataset, TensorDataset, random_split, DataLoader
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import logging
from .Utils.SentencesTokenizer import SentencesTokenizer
from .Utils.SymbolTokenizer import SymbolTokenizer
from .Utils.Tagging_bert_model import Tagging_bert_model
logging.set_verbosity(logging.ERROR)
# region Utils
def output_create_dir():
"""
Create the output dir for tensorboard and checkpoints
@return: output dir, tensorboard writer
"""
from datetime import datetime
output_path = 'TensorBoard'
training_dir = os.path.join(output_path, 'Training_' + datetime.today().strftime('%d-%m_%H-%M'))
logs_dir = os.path.join(training_dir, 'logs')
writer = SummaryWriter(log_dir=logs_dir)
return training_dir, writer
def categorical_accuracy(preds, truth) -> float:
"""
Calculates how often predictions match the reference labels, ignoring padding (label 0).
@param preds: batch of predictions (argmax of the logits)
@param truth: batch of reference labels
@return: categorical accuracy of the batch
"""
good_label = 0
nb_label = 0
for i in range(len(truth)):
sublist_truth = truth[i]
sublist_preds = preds[i]
for j in range(len(sublist_truth)):
if sublist_truth[j] != 0:
if sublist_truth[j] == sublist_preds[j]:
good_label += 1
nb_label += 1
return good_label / nb_label
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round(elapsed))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
# endregion Utils
# region Class
class SuperTagger:
# region Constructor
def __init__(self):
"""
Python implementation of BertForTokenClassification using TLGbank data to develop supertaggers.
"""
self.index_to_tags = None
self.num_label = None
self.bert_name = None
self.sent_tokenizer = None
self.tags_tokenizer = None
self.model = None
self.optimizer = None
self.epoch_i = 0
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.trainable = False
self.model_load = False
# endregion Constructor
# region Instanciation
def load_weights(self, model_file):
"""
Loads a SuperTagger saved with SuperTagger.__checkpoint_save() (during training) from a file.
@param model_file: path to the .pt save of the model
"""
self.trainable = False
print("#" * 20)
print("\n Loading...")
try:
params = torch.load(model_file, map_location=self.device)
args = params['args']
self.bert_name = args['bert_name']
self.index_to_tags = args['index_to_tags']
self.num_label = len(self.index_to_tags)
self.model = Tagging_bert_model(self.bert_name, self.num_label)
self.tags_tokenizer = SymbolTokenizer(self.index_to_tags)
self.sent_tokenizer = SentencesTokenizer(transformers.AutoTokenizer.from_pretrained(
self.bert_name,
do_lower_case=True))
self.model.load_state_dict(params['state_dict'])
self.optimizer = params['optimizer']
# self.epoch_i = args['epoch']
print("\n The loading checkpoint was successful ! \n")
print("\tBert model : ", self.bert_name)
print("\tLast epoch : ", self.epoch_i)
print()
except Exception as e:
print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
raise e
print("#" * 20)
self.model_load = True
self.trainable = True
def create_new_model(self, num_label, bert_name, index_to_tags):
"""
Instantiation and parameterization of a new BERT model
@param num_label: number of different labels (tags)
@param bert_name: name of a model available on Hugging Face `<https://huggingface.co/models>`
@param index_to_tags: dict to convert IDs to tags
"""
assert len(
index_to_tags) == num_label, f"len(index_to_tags): {len(index_to_tags)} must equal num_label: {num_label}"
self.model = Tagging_bert_model(bert_name, num_label + 1)
index_to_tags = {k + 1: v for k, v in index_to_tags.items()}
# <unk> is used for the pad AND unknown tags
index_to_tags[0] = '<unk>'
self.index_to_tags = index_to_tags
self.bert_name = bert_name
self.sent_tokenizer = SentencesTokenizer(AutoTokenizer.from_pretrained(
bert_name,
do_lower_case=True))
self.optimizer = Adam(params=self.model.parameters(), lr=2e-4, eps=1e-8)
self.tags_tokenizer = SymbolTokenizer(index_to_tags)
self.trainable = True
self.model_load = True
# endregion Instanciation
# region Usage
def predict(self, sentences):
"""
Predicts and converts sentences into tags (depends on the dictionary given when the model was created)
@param sentences: list of sentences (list[str]) OR a single sentence (str)
@return: tag predictions for all sentences (raw logits, converted tags)
"""
assert self.model is not None and self.trainable, "Please use create_new_model(...) or load_weights(...) " \
"before predict, the model is not initialised"
assert isinstance(sentences, (str, list)), "param sentences: list of sentences (list[str]) OR a single sentence (str)"
sentences = [sentences] if type(sentences) == str else sentences
self.model.eval()
with torch.no_grad():
sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
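# The model is moved to CPU for prediction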
self.model = self.model.cpu()
output = self.model.predict((sents_tokenized_t, sents_mask_t))
return output['logit'], self.tags_tokenizer.convert_ids_to_tags(torch.argmax(output['logit'], dim=2).detach())
def forward(self, b_sents_tokenized, b_sents_mask):
"""
Function used by the linker (same as predict, on already tokenized batches)
"""
with torch.no_grad():
output = self.model.predict((b_sents_tokenized, b_sents_mask))
return output
def train(self, sentences, tags, validation_rate=0.1, epochs=20, batch_size=16,
tensorboard=False,
checkpoint=False):
"""
Starts the training of the model, either new or previously loaded
@param sentences: list of training sentences (X)
@param tags: list of training tags (Y)
@param validation_rate: fraction of data used for validation [0-1]
@param epochs: number of epochs (50 recommended)
@param batch_size: number of samples per batch (32 recommended, mind the memory)
@param tensorboard: use tensorboard to log loss and accuracy
@param checkpoint: save the model after each epoch
"""
assert self.model is not None and self.trainable, "Please use create_new_model(...) or load_weights(...) before train, the model is not initialised"
assert len(sentences) == len(
tags), f"num of sentences (X): {len(sentences)} must equal num of labels " \
f"(Y): {len(tags)}"
if checkpoint or tensorboard:
checkpoint_dir, writer = output_create_dir()
training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, sentences, tags,
1 - validation_rate)
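# If training resumes from a loaded checkpoint, only the remaining epochs are run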
epochs = epochs - self.epoch_i
self.model = self.model.to(self.device)
self.model.train()
for epoch_i in range(0, epochs):
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i+1, epochs))
print('Training...')
# Train
epoch_acc, epoch_loss, training_time = self.__train_epoch(training_dataloader)
# Validation
if validation_rate > 0.0:
eval_accuracy, eval_loss, nb_eval_steps = self.__eval_epoch(validation_dataloader)
print("")
print(f'Epoch: {epoch_i+1:02} | Epoch Time: {training_time}')
print(f'\tTrain Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc * 100:.2f}%')
if validation_rate > 0.0:
print(f'\tVal Loss: {eval_loss:.3f} | Val Acc: {eval_accuracy * 100:.2f}%')
if tensorboard:
writer.add_scalars(f'Accuracy', {
'Train': epoch_acc}, epoch_i+1)
writer.add_scalars(f'Loss', {
'Train': epoch_loss}, epoch_i+1)
if validation_rate > 0.0:
writer.add_scalars(f'Accuracy', {
'Validation': eval_accuracy}, epoch_i+1)
writer.add_scalars(f'Loss', {
'Validation': eval_loss}, epoch_i+1)
self.epoch_i += 1
if checkpoint:
self.__checkpoint_save(path=os.path.join(checkpoint_dir, 'model_check.pt'))
# endregion Usage
# region Private
def __preprocess_data(self, batch_size, sentences, tags,
validation_rate):
"""
Create torch dataloaders for training
@param batch_size: number of samples per batch
@param sentences: list of training sentences (X)
@param tags: list of training tags (Y)
@param validation_rate: fraction of data kept for training (the caller passes 1 - validation rate)
@return: (training dataloader, validation dataloader)
"""
validation_dataloader = None
sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
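# Convert tag sequences to label ids, padded with 0 up to the tokenized sentence length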
tags_t = self.tags_tokenizer.convert_batchs_to_ids(tags, sents_tokenized_t)
dataset = TensorDataset(sents_tokenized_t, sents_mask_t, tags_t)
train_size = int(validation_rate * len(dataset))
print('{:>5,} training samples'.format(train_size))
if validation_rate < 1:
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} validation samples'.format(val_size))
validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
else:
train_dataset = dataset
training_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
return training_dataloader, validation_dataloader
def __train_epoch(self, training_dataloader):
"""
Trains the model for one epoch
@param training_dataloader: dataloader of training data
@return: (epoch accuracy, epoch loss, training time)
"""
self.model.train()
epoch_loss = 0
epoch_acc = 0
t0 = time.time()
i = 0
with tqdm(training_dataloader, unit="batch") as tepoch:
for batch in tepoch:
# Convert to device
b_sents_tokenized = batch[0].to(self.device)
b_sents_mask = batch[1].to(self.device)
targets = batch[2].to(self.device)
self.optimizer.zero_grad()
output = self.model((b_sents_tokenized, b_sents_mask, targets))
loss = output['loss']
predictions = torch.argmax(output['logit'], dim=2).detach().cpu().numpy()
label_ids = targets.cpu().numpy()
acc = categorical_accuracy(predictions, label_ids)
loss.backward()
epoch_acc += acc
epoch_loss += loss.item()
self.optimizer.step()
i += 1
# Measure how long this epoch took.
training_time = format_time(time.time() - t0)
epoch_acc = epoch_acc / i
epoch_loss = epoch_loss / i
return epoch_acc, epoch_loss, training_time
def __eval_epoch(self, validation_dataloader):
"""
Runs validation for one epoch
@param validation_dataloader: dataloader of validation data
@return: (epoch accuracy, epoch loss, num step)
"""
self.model.eval()
eval_loss = 0
eval_accuracy = 0
nb_eval_steps, nb_eval_examples = 0, 0
with torch.no_grad():
print("Start eval")
for step, batch in enumerate(validation_dataloader):
# Convert to device
b_sents_tokenized = batch[0].to(self.device)
b_sents_mask = batch[1].to(self.device)
b_symbols_tokenized = batch[2].to(self.device)
output = self.model((b_sents_tokenized, b_sents_mask, b_symbols_tokenized))
loss = output['loss']
predictions = torch.argmax(output['logit'], dim=2).detach().cpu().numpy()
label_ids = b_symbols_tokenized.cpu().numpy()
accuracy = categorical_accuracy(predictions, label_ids)
eval_loss += loss.item()
eval_accuracy += accuracy
nb_eval_examples += b_sents_tokenized.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_steps
return eval_accuracy, eval_loss, nb_eval_steps
def __checkpoint_save(self, path='/model_check.pt'):
"""
Saves the model together with its parameters (bert_name, index_to_tags, epoch)
@param path: path and filename for the save
"""
self.model.cpu()
# print('save model parameters to [%s]' % path, file=sys.stderr)
torch.save({
'args': dict(bert_name=self.bert_name, index_to_tags=self.index_to_tags, epoch=self.epoch_i),
'state_dict': self.model.state_dict(),
'optimizer': self.optimizer,
}, path)
self.model.to(self.device)
# endregion Private
# endregion Class
class SentencesTokenizer():
def __init__(self, tokenizer):
"""@params tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text """
self.tokenizer = tokenizer
def fit_transform(self, sents):
return self.tokenizer(sents, padding=True)
def fit_transform_tensors(self, sents):
temp = self.tokenizer(sents, padding=True, return_tensors = 'pt')
return temp["input_ids"], temp["attention_mask"]
def convert_ids_to_tokens(self, inputs_ids, skip_special_tokens=False):
return self.tokenizer.batch_decode(inputs_ids, skip_special_tokens=skip_special_tokens)
import pickle
import numpy as np
import torch
def load_obj(name):
with open(name + '.pkl', 'rb') as f:
return pickle.load(f)
class SymbolTokenizer():
def __init__(self, index_to_super):
"""@params index_to_super: Dict for convert ID to tags """
self.index_to_super = index_to_super
self.super_to_index = {v: int(k) for k, v in self.index_to_super.items()}
def lenSuper(self):
"""@return len of dict for convert ID to tags """
return len(self.index_to_super) + 1
def convert_batchs_to_ids(self, tags, sents_tokenized):
encoded_labels = []
labels = [[self.super_to_index[str(symbol)] for symbol in sents] for sents in tags]
for l, s in zip(labels, sents_tokenized):
super_tok = pad_sequence(l, len(s))
encoded_labels.append(super_tok)
return torch.tensor(encoded_labels)
def convert_ids_to_tags(self, tags_ids):
labels = [[self.index_to_super[int(symbol)] for symbol in sents if self.index_to_super[int(symbol)] != '<unk>']
for sents in tags_ids]
return labels
def pad_sequence(sequences, max_len=400):
padded = [0] * max_len
padded[:len(sequences)] = sequences
return padded
import torch
import transformers
from torch.nn import Module
from transformers import logging
class Tagging_bert_model(Module):
"""
"""
def __init__(self, bert_name, num_labels):
super(Tagging_bert_model, self).__init__()
self.bert_name = bert_name
self.num_labels = num_labels
config = transformers.AutoConfig.from_pretrained(bert_name, output_hidden_states=True, num_labels=num_labels)
self.bert = transformers.AutoModelForTokenClassification.from_pretrained(bert_name, config=config)
def forward(self, batch):
b_input_ids = batch[0]
b_input_mask = batch[1]
labels = batch[2]
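# With labels and output_hidden_states=True, the outputs are (loss, logits, hidden_states)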
output = self.bert(
input_ids=b_input_ids, attention_mask=b_input_mask, labels=labels)
result = {'loss': output[0],'logit': output[1], 'word_embeding': output[2][0], 'last_hidden_state': output[2][1]}
return result
def predict(self, batch):
b_input_ids = batch[0]
b_input_mask = batch[1]
output = self.bert(
input_ids=b_input_ids, attention_mask=b_input_mask)
result = {'logit' : output[0], 'word_embeding': output[1][0], 'last_hidden_state':output[1][1]}
return result
import pandas as pd
from tqdm import tqdm
def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=100):
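"""
Reads a csv file by chunks with a progress bar.
@param csv_path: path to the csv file
@param nrows: maximum number of rows to read
@param chunksize: number of rows read per chunk
@return: pandas dataframe
"""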
print("\n" + "#" * 20)
print("Loading csv...")
rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1 # minus the header
chunk_list = []
if rows > nrows:
rows = nrows
with tqdm(total=rows, desc='Rows read: ') as bar:
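# The Y1, Y2 and Z columns store Python lists as strings; pd.eval converts them back to lists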
for chunk in pd.read_csv(csv_path, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval}, chunksize=chunksize,
nrows=rows):
chunk_list.append(chunk)
bar.update(len(chunk))
df = pd.concat((f for f in chunk_list), axis=0)
print("#" * 20)
return df
def load_obj(name):
with open(name + '.pkl', 'rb') as f:
import pickle
return pickle.load(f)
def categorical_accuracy_str(preds, truth):
nb_label = 0
good_label = 0
for i in range(len(truth)):
sublist_truth = truth[i]
sublist_preds = preds[i]
for j in range(min(len(sublist_truth), len(sublist_preds))):
if str(sublist_truth[j]) == str(sublist_preds[j]):
good_label += 1
nb_label += 1
return good_label / nb_label
from .SuperTagger.SuperTagger import SuperTagger
from .SuperTagger.Utils.SentencesTokenizer import *
from .SuperTagger.Utils.SymbolTokenizer import *
from .SuperTagger.Utils.Tagging_bert_model import *
from SuperTagger.SuperTagger import SuperTagger
from SuperTagger.Utils.helpers import categorical_accuracy_str
#### DATA ####
a_s = "( 1 ) parmi les huit \" partants \" acquis ou potentiels , MM. Lacombe , Koehler et Laroze ne sont pas membres " \
"du PCF . "
tags_s = [['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)',
'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)',
'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np',
'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']]
#### MODEL ####
tagger = SuperTagger()
model = "models/flaubert_super_98%_V2_50e/flaubert_super_98%_V2_50e.pt"
tagger.load_weights(model)
#### TEST ####
_, pred_convert = tagger.predict(a_s)
print("Model : ", model)
print("\tLen Text : ", len(a_s.split()))
print("\tLen tags : ", len(tags_s[0]))
print("\tLen pred_convert : ", len(pred_convert[0]))
print()
print("\tText : ", a_s)
print()
print("\tTags : ", tags_s[0])
print()
print("\tPred_convert : ", pred_convert[0])
print()
print("\tScore :", categorical_accuracy_str(pred_convert, tags_s))
from SuperTagger.SuperTagger import SuperTagger
from SuperTagger.Utils.helpers import read_csv_pgbar, load_obj
#### DATA ####
file_path = 'Datasets/m2_dataset.csv'
df = read_csv_pgbar(file_path,100)
texts = df['X'].tolist()
tags = df['Z'].tolist()
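# Hold out the first 4 sentences to sanity-check predictions after training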
test_s = texts[:4]
tags_s = tags[:4]
texts = texts[4:]
tags = tags[4:]
index_to_super = load_obj('Datasets/index_to_super')
#### MODEL ####
tagger = SuperTagger()
tagger.create_new_model(len(index_to_super),'camembert-base',index_to_super)
# tagger.load_weights("models/model_check.pt")
tagger.train(texts,tags,batch_size=16,validation_rate=0.1,tensorboard=True,checkpoint=True)
#### TEST ####
pred = tagger.predict(test_s)
print(test_s)
print()
print(pred)