Compare revisions: pnria/global-helper/deepgrail-tagger-linker

Changes are shown as if the source revision was being merged into the target revision.

Commits on Source (2)
Showing with 48277 additions and 256 deletions
......@@ -4,7 +4,7 @@ transformers = 4.16.2
[DATASET_PARAMS]
symbols_vocab_size = 26
atom_vocab_size = 18
max_len_sentence = 290
max_len_sentence = 300
max_atoms_in_sentence = 900
max_atoms_in_one_type = 360
......@@ -20,11 +20,4 @@ dim_cat_out = 256
dim_intermediate_ffn = 128
dim_pre_sinkhorn_transfo = 32
dropout = 0.1
sinkhorn_iters = 5
[MODEL_TRAINING]
batch_size = 32
pretrain_linker_epochs = 10
epoch = 20
seed_val = 42
learning_rate = 2e-3
\ No newline at end of file
sinkhorn_iters = 5
\ No newline at end of file
......@@ -8,55 +8,16 @@ import pandas as pd
# dr = /
# dl = \
#
# def sub_tree_word(word_with_data: str):
# word = ""
# if not word_with_data.startswith("GOAL:"):
# s = word_with_data.split('|')
# word = s[0]
# tree = s[1]
# else:
# tree = word_with_data
# return word, tree
#
#
# def sub_tree_line(line_with_data: str):
# line_list = line_with_data.split()
# sentence = ""
# sub_trees = []
# for word_with_data in line_list:
# w, t = sub_tree_word(word_with_data)
# sentence += ' ' + w
# if t not in ["\\", "/", "let"] and len(t) > 0:
# sub_trees.append([t])
# """if ('ppp' in list(itertools.chain(*sub_trees))):
# print(sentence)"""
# return sentence, list(itertools.chain(*sub_trees))
#
#
# def Txt_to_csv(file_name: str, result_name):
# file = open(file_name, "r", encoding="utf8")
# text = file.readlines()
# sub = [sub_tree_line(data) for data in text]
# df = pd.DataFrame(data=sub, columns=['X', 'Y'])
# df.to_csv("../Datasets/" + result_name + "_dataset_links.csv", mode='a', index=False, header=False)
#
# def Txt_to_csv_header(file_name: str, result_name):
# file = open(file_name, "r", encoding="utf8")
# text = file.readlines()
# sub = [sub_tree_line(data) for data in text]
# df = pd.DataFrame(data=sub, columns=['X', 'Y'])
# df.to_csv("../Datasets/" + result_name + "_dataset_links.csv", index=False)
def normalize_word(orig_word):
word = orig_word.lower()
if (word is "["):
if (word == "["):
word = "("
if (word is "]"):
if (word == "]"):
word = ")"
return word
def read_maxentdata(path):
allwords = []
allsuper = []
......
import datetime
import math
import os
import sys
......@@ -11,7 +10,6 @@ from torch.nn import Sequential, LayerNorm, Module, Linear, Dropout, Transformer
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import TensorDataset, random_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from Configuration import Configuration
......@@ -21,41 +19,15 @@ from Linker.Sinkhorn import sinkhorn_fn_no_exp as sinkhorn
from Linker.atom_map import atom_map, atom_map_redux
from Linker.eval import measure_accuracy, SinkhornLoss
from Linker.utils_linker import FFN, get_axiom_links, get_GOAL, get_pos_idx, get_neg_idx, get_atoms_batch, \
find_pos_neg_idexes, get_num_atoms_batch
find_pos_neg_idexes, get_num_atoms_batch, generate_square_subsequent_mask
from SuperTagger import SuperTagger
from utils import pad_sequence
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round(elapsed))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
def output_create_dir():
"""
Create le output dir for tensorboard and checkpoint
@return: output dir, tensorboard writter
"""
from datetime import datetime
outpout_path = 'TensorBoard'
training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
logs_dir = os.path.join(training_dir, 'logs')
writer = SummaryWriter(log_dir=logs_dir)
return training_dir, writer
def generate_square_subsequent_mask(sz):
"""Generates an upper-triangular matrix of -inf, with zeros on diag."""
return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
from utils import pad_sequence, format_time, output_create_dir
class Linker(Module):
# region initialization
def __init__(self, supertagger_path_model):
super(Linker, self).__init__()
......@@ -64,11 +36,9 @@ class Linker(Module):
datasetConfig = config["DATASET_PARAMS"]
modelEncoderConfig = config["MODEL_ENCODER"]
modelLinkerConfig = config["MODEL_LINKER"]
modelTrainingConfig = config["MODEL_TRAINING"]
dim_encoder = int(modelEncoderConfig['dim_encoder'])
# atom settings
atom_vocab_size = int(datasetConfig['atom_vocab_size'])
# Transformer
self.nhead = int(modelLinkerConfig['nhead'])
self.dim_emb_atom = int(modelLinkerConfig['dim_emb_atom'])
......@@ -85,7 +55,6 @@ class Linker(Module):
self.max_len_sentence = int(datasetConfig['max_len_sentence'])
self.max_atoms_in_sentence = int(datasetConfig['max_atoms_in_sentence'])
self.max_atoms_in_one_type = int(datasetConfig['max_atoms_in_one_type'])
learning_rate = float(modelTrainingConfig['learning_rate'])
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# endregion
......@@ -113,27 +82,44 @@ class Linker(Module):
Linear(dim_cat, self.dim_cat_out),
GELU(),
Dropout(dropout),
LayerNorm(self.dim_cat_out, eps=1e-8)
)
LayerNorm(self.dim_cat_out, eps=1e-8))
# Division into positive and negative
self.pos_transformation = Sequential(
FFN(self.dim_cat_out, dim_intermediate_FFN, dropout, d_out=dim_pre_sinkhorn_transfo),
LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-8)
)
LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-8))
self.neg_transformation = Sequential(
FFN(self.dim_cat_out, dim_intermediate_FFN, dropout, d_out=dim_pre_sinkhorn_transfo),
LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-8)
)
LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-8))
# Learning
self.cross_entropy_loss = SinkhornLoss()
self.optimizer = AdamW(self.parameters(),
lr=learning_rate)
self.optimizer = AdamW(self.parameters(), lr=0.001)
self.scheduler = StepLR(self.optimizer, step_size=2, gamma=0.5)
self.to(self.device)
def load_weights(self, model_file):
print("#" * 15)
try:
params = torch.load(model_file, map_location=self.device)
self.atom_encoder.load_state_dict(params['atom_encoder'])
self.position_encoder.load_state_dict(params['position_encoder'])
self.transformer.load_state_dict(params['transformer'])
self.linker_encoder.load_state_dict(params['linker_encoder'])
self.pos_transformation.load_state_dict(params['pos_transformation'])
self.neg_transformation.load_state_dict(params['neg_transformation'])
self.cross_entropy_loss.load_state_dict(params['cross_entropy_loss'])
self.optimizer = params['optimizer']
print("\n The loading checkpoint was successful ! \n")
except Exception as e:
print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
raise e
print("#" * 15)
#endregion
# region data
def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.1):
r"""
Args:
......@@ -177,6 +163,26 @@ class Linker(Module):
print("End preprocess Data")
return training_dataloader, validation_dataloader
#endregion
# region training
def make_sinkhorn_inputs(self, bsd_tensor, positional_ids, atom_type):
"""
:param bsd_tensor:
Tensor of shape batch size \times sequence length \times feature dimensionality.
:param positional_ids:
A List of batch_size elements, each being a List of num_atoms LongTensors.
Each LongTensor in positional_ids[b][a] indexes the location of atoms of type a in sentence b.
:param atom_type:
:return:
"""
return torch.stack([torch.stack([bsd_tensor.select(0, index=i).select(0, index=int(atom)).to(self.device)
if atom != -1 else torch.zeros(self.dim_cat_out, device=self.device)
for atom in sentence])
for i, sentence in enumerate(positional_ids[:, self.atom_map_redux[atom_type], :])])
def forward(self, batch_num_atoms_per_word, batch_atoms, batch_pos_idx, batch_neg_idx, sents_embedding):
r"""
Args:
......@@ -307,7 +313,7 @@ class Linker(Module):
# Run the Linker on the atoms
logits_predictions = self(batch_num_atoms, batch_atoms_tok, batch_pos_idx, batch_neg_idx,
output['word_embeding'])
output['word_embedding'])
linker_loss = self.cross_entropy_loss(logits_predictions, batch_true_links)
# Perform a backward pass to calculate the gradients.
......@@ -332,6 +338,10 @@ class Linker(Module):
return avg_train_loss, avg_accuracy_train, training_time
#endregion
# region evaluation
def eval_batch(self, batch):
batch_num_atoms = batch[0].to(self.device)
batch_atoms_tok = batch[1].to(self.device)
......@@ -344,12 +354,13 @@ class Linker(Module):
output = self.Supertagger.forward(batch_sentences_tokens, batch_sentences_mask)
logits_predictions = self(batch_num_atoms, batch_atoms_tok, batch_pos_idx, batch_neg_idx, output[
'word_embeding']) # atom_vocab, batch_size, max atoms in one type, max atoms in one type
'word_embedding']) # atom_vocab, batch_size, max atoms in one type, max atoms in one type
axiom_links_pred = torch.argmax(logits_predictions, dim=3) # atom_vocab, batch_size, max atoms in one type
print('\n')
print("Les vrais liens de la catégorie n : ", batch_true_links[1][2][:100])
print("Les prédictions : ", axiom_links_pred[2][1][:100])
print(batch_true_links)
print("Les vrais liens de la catégorie n : ", batch_true_links[0][2][:100])
print("Les prédictions : ", axiom_links_pred[2][0][:100])
print('\n')
accuracy = measure_accuracy(batch_true_links, axiom_links_pred)
......@@ -374,6 +385,10 @@ class Linker(Module):
return loss_average / len(dataloader), accuracy_average / len(dataloader)
#endregion
#region prediction
def predict_with_categories(self, sentence, categories):
r""" Predict the links from a sentence and its categories
......@@ -406,7 +421,7 @@ class Linker(Module):
output = self.Supertagger.forward(sentences_tokens, sentences_mask)
logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embeding'])
logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embedding'])
axiom_links_pred = torch.argmax(logits_predictions, dim=3)
return axiom_links_pred
......@@ -444,28 +459,12 @@ class Linker(Module):
pos_idx = get_pos_idx(atoms, polarities, self.max_atoms_in_one_type)
neg_idx = get_neg_idx(atoms, polarities, self.max_atoms_in_one_type)
logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embeding'])
logits_predictions = self(num_atoms_per_word, atoms_tokenized, pos_idx, neg_idx, output['word_embedding'])
axiom_links_pred = torch.argmax(logits_predictions, dim=3)
return categories, axiom_links_pred
def load_weights(self, model_file):
print("#" * 15)
try:
params = torch.load(model_file, map_location=self.device)
self.atom_encoder.load_state_dict(params['atom_encoder'])
self.position_encoder.load_state_dict(params['position_encoder'])
self.transformer.load_state_dict(params['transformer'])
self.linker_encoder.load_state_dict(params['linker_encoder'])
self.pos_transformation.load_state_dict(params['pos_transformation'])
self.neg_transformation.load_state_dict(params['neg_transformation'])
self.cross_entropy_loss.load_state_dict(params['cross_entropy_loss'])
self.optimizer.load_state_dict(params['optimizer'])
print("\n The loading checkpoint was successful ! \n")
except Exception as e:
print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
raise e
print("#" * 15)
#endregion
def __checkpoint_save(self, path='/linker.pt'):
"""
......@@ -475,28 +474,12 @@ class Linker(Module):
torch.save({
'atom_encoder': self.atom_encoder.state_dict(),
'position_encoder': self.position_encoder,
'position_encoder': self.position_encoder.state_dict(),
'transformer': self.transformer.state_dict(),
'linker_encoder': self.linker_encoder.state_dict(),
'pos_transformation': self.pos_transformation.state_dict(),
'neg_transformation': self.neg_transformation.state_dict(),
'cross_entropy_loss': self.cross_entropy_loss,
'cross_entropy_loss': self.cross_entropy_loss.state_dict(),
'optimizer': self.optimizer,
}, path)
self.to(self.device)
def make_sinkhorn_inputs(self, bsd_tensor, positional_ids, atom_type):
"""
:param bsd_tensor:
Tensor of shape batch size \times sequence length \times feature dimensionality.
:param positional_ids:
A List of batch_size elements, each being a List of num_atoms LongTensors.
Each LongTensor in positional_ids[b][a] indexes the location of atoms of type a in sentence b.
:param atom_type:
:return:
"""
return torch.stack([torch.stack([bsd_tensor.select(0, index=i).select(0, index=int(atom)).to(self.device)
if atom != -1 else torch.zeros(self.dim_cat_out, device=self.device)
for atom in sentence])
for i, sentence in enumerate(positional_ids[:, self.atom_map_redux[atom_type], :])])
......@@ -25,7 +25,10 @@ class FFN(Module):
def forward(self, x):
return self.ffn(x)
def generate_square_subsequent_mask(sz):
"""Generates an upper-triangular matrix of -inf, with zeros on diag."""
return torch.triu(torch.ones(sz, sz) * float('-inf'), diagonal=1)
################################ Regex ########################################
regex_categories_axiom_links = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
regex_categories = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
......@@ -106,15 +109,6 @@ def get_atoms_links_batch(category_batch):
return batch
print("test to create links ",
get_axiom_links(20, torch.stack([torch.as_tensor(
[True, False, True, False, False, False, True, False, True, False,
False, True, False, False, False, True, False, False, True, False,
True, False, False, True, False, False, False, False, False, False])]),
[['dr(0,np_1,n_2)', 'n_2', 'dr(0,dl(0,np_1,np_3),np_4)', 'dr(0,np_4,n_5)', 'n_6', 'dl(0,n_6,n_5)',
'dr(0,dl(0,np_3,np_7),np_8)', 'dr(0,np_8,np_9)', 'np_9', 'GOAL:np_7']]))
# endregion
# region get atoms in sentence
......@@ -159,10 +153,6 @@ def get_atoms_batch(category_batch):
return batch
print(" test for get atoms in categories on ['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'let']",
get_atoms_batch([['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'let']]))
# endregion
# region calculate num atoms per category
......@@ -211,11 +201,6 @@ def get_num_atoms_batch(category_batch, max_len_sentence):
batch.append(torch.as_tensor(num_atoms_sentence))
return pad_sequence(batch, max_len=max_len_sentence, padding_value=0)
print(" test for get number of atoms in categories on ['dr(0,s,np)', 'let']",
get_num_atoms_batch([["dr(0,s,np)", "let"]], 10))
# endregion
# region get polarity
......@@ -309,11 +294,6 @@ def find_pos_neg_idexes(atoms_batch):
return list_batch
print(" test for get polarities for atoms in categories on ['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)', 'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np'] \n",
find_pos_neg_idexes([['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)',
'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np']]))
# endregion
# region get atoms and polarities with GOAL
......@@ -336,15 +316,6 @@ def get_GOAL(max_len_sentence, df_axiom_links):
return atoms_batch, polarities, num_atoms_batch
df_axiom_links = pd.DataFrame({"Z": [['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)',
'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np']],
"Y": [['dr(0,np_1,n_2)', 'n_2', 'dr(0,dl(0,np_1,np_3),np_4)', 'dr(0,np_4,n_5)', 'n_6',
'dl(0,n_6,n_5)', 'dr(0,dl(0,np_3,np_7),np_8)', 'dr(0,np_8,np_9)', 'np_9',
'GOAL:np_7']]})
print(" test for get GOAL ", get_GOAL(10, df_axiom_links))
# endregion
# region get idx for pos and neg
......@@ -370,13 +341,40 @@ def get_neg_idx(atoms_batch, atoms_polarity_batch, max_atoms_in_one_type):
return torch.stack(pos_idx).permute(1, 0, 2)
# endregion
print(" test for cut into pos neg on ['dr(0,s,np)', 's']",
get_neg_idx([['s', 's', 'np', 's', 'np', '[SEP]', 's', '[SEP]']],
torch.as_tensor(
[[True, True, False, False,
True, False, False, False,
False, False,
False, False]]), 10))
# endregion
\ No newline at end of file
if __name__ == '__main__':
print("test to create links ",
get_axiom_links(20, torch.stack([torch.as_tensor(
[True, False, True, False, False, False, True, False, True, False,
False, True, False, False, False, True, False, False, True, False,
True, False, False, True, False, False, False, False, False, False])]),
[['dr(0,np_1,n_2)', 'n_2', 'dr(0,dl(0,np_1,np_3),np_4)', 'dr(0,np_4,n_5)', 'n_6', 'dl(0,n_6,n_5)',
'dr(0,dl(0,np_3,np_7),np_8)', 'dr(0,np_8,np_9)', 'np_9', 'GOAL:np_7']]))
print(" test for get atoms in categories on ['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'let']",
get_atoms_batch([['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'let']]))
print(" test for get number of atoms in categories on ['dr(0,s,np)', 'let']",
get_num_atoms_batch([["dr(0,s,np)", "let"]], 10))
print(" test for get polarities for atoms in categories on ['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)', 'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np'] \n",
find_pos_neg_idexes([['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)',
'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np']]))
df_axiom_links = pd.DataFrame({"Z": [['dr(0,np,n)', 'n', 'dr(0,dl(0,np,np),np)', 'dr(0,np,n)', 'n', 'dl(0,n,n)',
'dr(0,dl(0,np,np),np)', 'dr(0,np,np)', 'np']],
"Y": [['dr(0,np_1,n_2)', 'n_2', 'dr(0,dl(0,np_1,np_3),np_4)', 'dr(0,np_4,n_5)', 'n_6',
'dl(0,n_6,n_5)', 'dr(0,dl(0,np_3,np_7),np_8)', 'dr(0,np_8,np_9)', 'np_9',
'GOAL:np_7']]})
print(" test for get GOAL ", get_GOAL(10, df_axiom_links))
print(" test for cut into pos neg on ['dr(0,s,np)', 's']",
get_neg_idx([['s', 's', 'np', 's', 'np', '[SEP]', 's', '[SEP]']],
torch.as_tensor(
[[True, True, False, False,
True, False, False, False,
False, False,
False, False]]), 10))
\ No newline at end of file
import os
import datetime
import os
import time
import torch
......@@ -8,7 +5,6 @@ from torch.nn import Module
from torch.optim import AdamW
from torch.optim.lr_scheduler import StepLR
from torch.utils.data import TensorDataset, random_split
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from Configuration import Configuration
......@@ -16,70 +12,39 @@ from Linker import Linker
from Linker.eval import measure_accuracy, SinkhornLoss
from Linker.utils_linker import get_axiom_links, get_GOAL, get_pos_idx, get_num_atoms_batch, get_neg_idx
from NeuralProofNet.utils_proofnet import get_info_for_tagger
from utils import pad_sequence
def format_time(elapsed):
'''
Takes a time in seconds and returns a string hh:mm:ss
'''
# Round to the nearest second.
elapsed_rounded = int(round(elapsed))
# Format as hh:mm:ss
return str(datetime.timedelta(seconds=elapsed_rounded))
def output_create_dir():
"""
Create le output dir for tensorboard and checkpoint
@return: output dir, tensorboard writter
"""
from datetime import datetime
outpout_path = 'TensorBoard'
training_dir = os.path.join(outpout_path, 'Tranning_' + datetime.today().strftime('%d-%m_%H-%M'))
logs_dir = os.path.join(training_dir, 'logs')
writer = SummaryWriter(log_dir=logs_dir)
return training_dir, writer
from utils import pad_sequence, format_time, output_create_dir
class NeuralProofNet(Module):
def __init__(self, supertagger_path_model, linker_path_model=None):
super(NeuralProofNet, self).__init__()
config = Configuration.read_config()
datasetConfig = config["DATASET_PARAMS"]
modelTrainingConfig = config["MODEL_TRAINING"]
# pretrain settings
self.pretrain_linker_epochs = int(modelTrainingConfig['pretrain_linker_epochs'])
# settings
self.max_len_sentence = int(datasetConfig['max_len_sentence'])
self.max_atoms_in_sentence = int(datasetConfig['max_atoms_in_sentence'])
self.max_atoms_in_one_type = int(datasetConfig['max_atoms_in_one_type'])
learning_rate = float(modelTrainingConfig['learning_rate'])
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.batch_size = int(modelTrainingConfig['batch_size'])
linker = Linker(supertagger_path_model)
if linker_path_model is not None:
linker.load_weights(linker_path_model)
self.pretrain_linker_epochs = 0
self.linker = linker
# Learning
self.linker_loss = SinkhornLoss()
self.linker_optimizer = AdamW(self.linker.parameters(),
lr=learning_rate)
lr=0.001)
self.linker_scheduler = StepLR(self.linker_optimizer, step_size=2, gamma=0.5)
self.to(self.device)
def __pretrain_linker__(self, df_axiom_links, checkpoint=False, tensorboard=True):
def __pretrain_linker__(self, df_axiom_links, pretrain_linker_epochs, batch_size, checkpoint=False, tensorboard=True):
print("\nLinker Pre-Training\n")
self.linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=self.pretrain_linker_epochs,
batch_size=self.batch_size,
checkpoint=checkpoint,
tensorboard=tensorboard)
self.linker.train_linker(df_axiom_links, validation_rate=0.05, epochs=pretrain_linker_epochs,
batch_size=batch_size, checkpoint=checkpoint, tensorboard=tensorboard)
print("\nEND Linker Pre-Training\n")
def __preprocess_data(self, batch_size, df_axiom_links, validation_rate=0.1):
......@@ -143,11 +108,11 @@ class NeuralProofNet(Module):
batch_neg_idx = batch_neg_idx.to(self.device)
logits_links = self.linker(batch_num_atoms_per_word, atoms_batch_tokenized, batch_pos_idx, batch_neg_idx,
output['word_embeding'])
output['word_embedding'])
return torch.log_softmax(logits_links, dim=3)
def train_neuralproofnet(self, df_axiom_links, validation_rate=0.1, epochs=20,
def train_neuralproofnet(self, df_axiom_links, validation_rate=0.1, epochs=20, pretrain_linker_epochs=0,
batch_size=32, checkpoint=True, tensorboard=False):
r"""
Args:
......@@ -161,7 +126,7 @@ class NeuralProofNet(Module):
Final accuracy and final loss
"""
# Pretrain the linker
self.__pretrain_linker__(df_axiom_links)
self.__pretrain_linker__(df_axiom_links, pretrain_linker_epochs, batch_size)
# Start learning with output from tagger
training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, df_axiom_links,
......@@ -261,8 +226,8 @@ class NeuralProofNet(Module):
dim=3) # atom_vocab, batch_size, max atoms in one type
print('\n')
print("Les vrais liens de la catégorie n : ", batch_true_links[1][2][:100])
print("Les prédictions : ", axiom_links_pred[2][1][:100])
print("Les vrais liens de la catégorie n : ", batch_true_links[0][2][:100])
print("Les prédictions : ", axiom_links_pred[2][0][:100])
print('\n')
accuracy = measure_accuracy(batch_true_links, axiom_links_pred)
......@@ -295,12 +260,12 @@ class NeuralProofNet(Module):
torch.save({
'atom_encoder': self.linker.atom_encoder.state_dict(),
'position_encoder': self.linker.position_encoder,
'position_encoder': self.linker.position_encoder.state_dict(),
'transformer': self.linker.transformer.state_dict(),
'linker_encoder': self.linker.linker_encoder.state_dict(),
'pos_transformation': self.linker.pos_transformation.state_dict(),
'neg_transformation': self.linker.neg_transformation.state_dict(),
'cross_entropy_loss': self.linker_loss,
'cross_entropy_loss': self.linker_loss.state_dict(),
'optimizer': self.linker_optimizer,
}, path)
self.to(self.device)
\ No newline at end of file
......@@ -6,7 +6,8 @@ This code was designed to work with the [DeepGrail Tagger](https://gitlab.irit.f
[DeepGrail Linker](https://gitlab.irit.fr/pnria/global-helper/deepgrail-linker).
In this version the tagger is not retrained with the linker.
In this version the tagger is not retrained with the linker: they are trained separately, but at inference time the tagger's predictions feed the linker's inputs.
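A minimal sketch of this two-stage inference, assuming pretrained checkpoints in `models/` (the paths are illustrative; see the Predicting section below):

```
from Linker import Linker

# The Linker wraps the pretrained supertagger: the tagger first predicts the
# supertags, and its predictions feed the linker.
linker = Linker("models/supertagger.pt")
linker.load_weights("models/linker.pt")

links = linker.predict_without_categories("le chat est noir")
```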
## Usage
......@@ -17,24 +18,14 @@ Clone the project locally.
### Libraries installation
Run the following script :
```bash
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
git clone https://gitlab.irit.fr/pnria/global-helper/deepgrail_tagger
mkdir Output
mkdir TensorBoard
```
Run the script init.sh
Optional : Upload the tagger.pt and linker.pt in models. (You may need to modify 'model_tagger' in train.py.)
Optional: upload the .pt files containing the model weights to the **models** directory.
### Structure
The structure should look like this:
```
.
.
......@@ -43,23 +34,28 @@ The structure should look like this :
│ └── config.ini # contains parameters
├── requirements.txt # librairies needed
├── Datasets # TLGbank data with links
├── SuperTagger # The Supertagger directory (that you need to install)
│ ├── ...
│ └── SuperTagger # Implementation of BertForTokenClassification
│ ├── SuperTagger.py # Main class
│ └── Tagging_bert_model.py # Bert model
├── SuperTagger # The Supertagger directory (that you need to install)
│ ├── Datasets # TLGbank data with supertags
│ └── SuperTagger # BertForTokenClassification
│ ├── SuperTagger.py # Main class
│ ├── Tagging_bert_model.py # Bert model
│ ├── SymbolTokenizer # Tags tokenizer
│ └── SentencesTokenizer # Words tokenizer
├── Linker # The Linker directory (that you need to install)
│ ├── ...
│ └── Linker.py # Linker class containing the neural network
├── NeuralProofNet # The NeuralProofNet directory
│ ├── ...
│ └── NeuralProofNet.py # NeuralProofNet class containing the linker and supertagger
│ ├── utils_proofnet # utils for NeuralProofNet
│ └── NeuralProofNet.py # NeuralProofNet class
├── models
│ ├── linker.pt # OPTIONAL : the pt file contaning the pretrained linker (you need to install it)
│ └── supertagger.pt # the pt file contaning the pretrained supertagger (you need to install it)
├── Output # Directory where your linker models will be saved if checkpoint=True in train
├── TensorBoard # Directory where the stats will be saved if tensorboard=True in train
└── train.py # Example of train
│ ├── linker.pt # OPTIONAL : pretrained linker
│ └── supertagger.pt # pretrained supertagger
├── Output # Directory with model backups saved during training
├── TensorBoard # Directory with training stats
├── train_neuralproofnet.py # train the linker with the pretrained supertagger
├── train_supertagger.py # train the supertagger
├── predict_supertags.py # tags predictions
└── predict_links.py # links predictions
```
......@@ -68,15 +64,72 @@ The structure should look like this :
The sentences should be in a column "X", the links (atoms with an "_x" postfix) in a column "Y", and the categories in a column "Z".
For the links, each atom_x is paired with the one and only other atom_x in the sentence.
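For instance, a single row could look like this (purely illustrative values, modelled on the test data in `Linker/utils_linker.py`):

```
import pandas as pd

# Each "_x" index appears on exactly two atoms: the two ends of one axiom link.
df = pd.DataFrame({
    "X": ["le chat noir dort"],
    "Y": [['dr(0,np_1,n_2)', 'n_3', 'dl(0,n_3,n_2)', 'dl(0,np_1,s_4)', 'GOAL:s_4']],
    "Z": [['dr(0,np,n)', 'n', 'dl(0,n,n)', 'dl(0,np,s)']],
})
```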
### Utils
In order to load **m2_dataset.csv**, you can use `utils.read_csv_pgbar(...)`. This function returns a pandas DataFrame.
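For example (the dataset path is an assumption, adjust it to where your CSV lives):

```
from utils import read_csv_pgbar

# Load the first 1000 rows of the dataset into a DataFrame.
df = read_csv_pgbar("Datasets/m2_dataset.csv", 1000)
```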
## Training
### Training of supertagger
```
df = read_csv_pgbar(file_path,1000)
texts = df['X'].tolist()
tags = df['Z'].tolist()
# Dict to convert IDs to tokens (the dict is saved with the model for prediction)
index_to_super = load_obj('Datasets/index_to_super')
tagger = SuperTagger()
bert_name = 'camembert-base'
tagger.create_new_model(len(index_to_super), bert_name, index_to_super)
# You can load an existing model to retrain it
# tagger.load_weights("your/model/path")
tagger.train(texts, tags, checkpoint=True)
pred_without_argmax, pred_convert = tagger.predict(texts[7])
```
During training, if you use `checkpoint=True`, the model is automatically saved after each epoch in a folder named Training_XX-XX_XX-XX. Use `tensorboard=True` to write logs in the same folder (`tensorboard --logdir=logs` to view them).
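Continuing the example above, a hedged call showing the full set of training options (the keyword names and defaults come from `SuperTagger.train` in this repository):

```
# checkpoint and tensorboard both write into the Training_XX-XX_XX-XX folder
# created for this run.
tagger.train(texts, tags, validation_rate=0.1, epochs=20, batch_size=16,
             tensorboard=True, checkpoint=True)
```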
`bert_name` can be any model available on [Hugging Face](https://huggingface.co/models)
### Training of linker
Launch train.py; in that script you can point to another dataset file and another tagging model.
During training, if you use `checkpoint=True`, the model is automatically saved after each epoch in a folder named Training_XX-XX_XX-XX. Use `tensorboard=True` to write logs in the same folder (`tensorboard --logdir=logs` to view them).
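A minimal sketch, assuming a pretrained supertagger checkpoint in `models/` and the linked dataset described above (the paths, import path and epoch counts are illustrative):

```
from utils import read_csv_pgbar
from NeuralProofNet.NeuralProofNet import NeuralProofNet

df_axiom_links = read_csv_pgbar("Datasets/m2_dataset.csv", 1000)

# NeuralProofNet wraps the linker together with the pretrained supertagger.
proofnet = NeuralProofNet("models/supertagger.pt")
proofnet.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=20,
                              pretrain_linker_epochs=10, batch_size=32,
                              checkpoint=True, tensorboard=True)
```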
## Predicting
### Prediction of supertags
To predict on your own data you need to load a model (saved with this code).
```
df = read_csv_pgbar(file_path,20)
texts = df['X'].tolist()
tagger = SuperTagger()
tagger.load_weights("your/model/path")
pred_without_argmax, pred_convert = tagger.predict(texts[7])
print(pred_convert)
#['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)', 'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np', 'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']
```
### Prediction of links
To predict on your own data you need to load a model (saved with this code).
```
......@@ -85,9 +138,9 @@ links = linker.predict_without_categories("le chat est noir")
print(links)
```
The file ```postprocessing.py``` will allow you to draw the prediction. (limited sentence length otherwise it will be confusing)
The file ```postprocessing.py``` lets you draw the predictions with graphviz (which you need to install). Keep the sentences short when predicting, otherwise the graph becomes unreadable.
You can also use the function ```predict_without_categories``` which only needs the sentence.
You can also use the function ```predict_without_categories```, which only needs the sentence (it uses the supertagger to predict the tags), or ```predict_with_categories```, which lets you give the categories directly (useful to check the links without bias from the supertagger).
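A hedged example of ```predict_with_categories``` (the checkpoint paths and category strings are illustrative, and the expected shape of `categories`, one supertag per word, is an assumption):

```
from Linker import Linker

linker = Linker("models/supertagger.pt")   # pretrained supertagger checkpoint
linker.load_weights("models/linker.pt")    # pretrained linker checkpoint

categories = ['dr(0,np,n)', 'n', 'dl(0,n,n)', 'dl(0,np,s)']  # one supertag per word (assumed format)
links = linker.predict_with_categories("le chat noir dort", categories)
print(links)
```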
## LICENSE
......
File added
File added
import itertools
import pickle
import re
import numpy as np
import pandas as pd
"""
Format data for training supertagger from txt to csv and tags pkl
"""
# dr = /
# dl = \
def sub_tree_word(word_with_data: str):
s = word_with_data.split('|')
word = s[0]
tree = s[2]
tree = re.sub("dr", "/", tree)
tree = re.sub("dl", "\\\\", tree)
tree = re.sub("dia", "dia,", tree)
tree = re.sub("box", "box,", tree)
tree = re.sub("dl", "\\\\,", tree)
tree = re.sub(",\(1,|,\(0,|\(1,|\(0,", ",", tree)
tree = re.sub("|\)", "", tree)
return word, tree.split(',')
def sub_tree_line(line_with_data: str):
line_list = line_with_data.split()
sentence = ""
sub_trees = []
#sub_trees.append(["[START]"])
for word_with_data in line_list:
w, t = sub_tree_word(word_with_data)
sentence += ' ' +w
t.append("[SEP]")
sub_trees.append(t)
"""if ('ppp' in list(itertools.chain(*sub_trees))):
print(sentence)"""
sub_trees.append(["[SOS]"])
return sentence, list(itertools.chain(*sub_trees))
def Txt_to_csv(file_name: str, csv_name:str = "../Datasets/m2V2_dataset.csv"):
file = open(file_name, "r", encoding="utf8")
text = file.readlines()
sub = [sub_tree_line(data) for data in text]
df = pd.DataFrame(data=sub, columns = ['Sentences', 'sub_tree'])
df.to_csv(csv_name, index=False)
def normalize_word(orig_word):
word = orig_word.lower()
if (word == "["):
word = "("
if (word == "]"):
word = ")"
return word
def read_maxentdata(file):
with open(file, 'r', encoding="UTF8") as f:
vocabulary = set()
vnorm = set()
partsofspeech1 = set()
partsofspeech2 = set()
superset = set()
sentno = 0
maxlen = 0
words = ""
postags1 = []
postags2 = []
supertags = []
allwords = []
allpos1 = []
allpos2 = []
allsuper = []
for line in f:
line = line.strip().split()
length = len(line)
if (length > maxlen):
maxlen = length
for l in range(length):
item = line[l].split('|')
if len(item) > 2:
orig_word = item[0]
word = normalize_word(orig_word)
postag = item[1]
supertag = item[2]
poslist = postag.split('-')
pos1 = poslist[0]
pos2 = poslist[1]
vocabulary.add(orig_word)
vnorm.add(word)
partsofspeech1.add(pos1)
partsofspeech2.add(pos2)
superset.add(supertag)
# words += ' ' +(str(orig_word))
words += ' ' + (str(orig_word))
postags1.append(pos1)
postags2.append(pos2)
supertags.append(supertag)
allwords.append(words)
allpos1.append(postags1)
allpos2.append(postags2)
allsuper.append(supertags)
words = ""
postags1 = []
postags2 = []
supertags = []
X = np.asarray(allwords)
Y1 = np.asarray(allpos1)
Y2 = np.asarray(allpos2)
Z = np.asarray(allsuper)
return X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen
def save_obj(obj, name):
with open(name + '.pkl', 'wb+') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
# Format from txt to csv
# Txt_to_csv("m2.txt")
X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen = read_maxentdata("SuperTagger/Datasets/m2.txt")
df = pd.DataFrame({"X":X[:-1], "Y1":Y1[:-1], "Y2":Y2[:-1], "Z":Z[:-1]})
df.to_csv("SuperTagger/Datasets/m2_dataset_V2.csv", index=False)
# Dictionary for supertags
t = np.unique(np.array(list(itertools.chain(*Z))))
dict = { i : t[i] for i in range(0, len(t) ) }
save_obj(dict,"SuperTagger/Datasets/index_to_super")
# Dictionary for grammar tags (not used)
t = np.unique(np.array(list(itertools.chain(*Y1))))
dict = { i : t[i] for i in range(0, len(t) ) }
save_obj(dict,"SuperTagger/Datasets/index_to_pos1")
\ No newline at end of file
# DeepGrail
This repository contains a Python implementation of BertForTokenClassification using TLGbank data to develop
part-of-speech taggers and supertaggers.
This code was designed to work with the [DeepGrail Linker](https://gitlab.irit.fr/pnria/global-helper/deepgrail-linker)
to provide a wide-coverage syntactic and semantic parser for French. The Tagger is nevertheless independent, and you can use it with your own tags.
## Structure
```
.
├── Datasets # TLGbank data
└── SuperTagger # BertForTokenClassification
├── SuperTagger.py # Main class
├── Tagging_bert_model.py # Bert model
├── SymbolTokenizer # Tags tokenizer
├── SentencesTokenizer # Words tokenizer
└── helpers # utils
```
class SentencesTokenizer():
"""
Tokenizer for sentences: based on a pretrained tokenizer
Attributes:
----------
tokenizer : Tokenizer
Pretrained Tokenizer
max_length :
Maximal length of a sentence (i.e maximum number of words)
"""
def __init__(self, tokenizer, max_length):
"""
Parameters :
------------
tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text
max_length : Maximal length of a sentence
"""
self.tokenizer = tokenizer
self.max_length = max_length
def fit_transform(self, sents):
"""
Tokenizes the given sentences
"""
return self.tokenizer(sents, padding=True)
def fit_transform_tensors(self, sents):
"""
Tokenizes the sentences and returns tensor
"""
temp = self.tokenizer(sents, padding='max_length', truncation=True, return_tensors = 'pt', max_length=self.max_length)
return temp["input_ids"], temp["attention_mask"]
def convert_ids_to_tokens(self, inputs_ids, skip_special_tokens=False):
"""
Decodes a sentence.
"""
return self.tokenizer.batch_decode(inputs_ids, skip_special_tokens=skip_special_tokens)
import os
import sys
import time
import torch
import transformers
from torch.optim import Adam
from torch.utils.data import TensorDataset, random_split
from tqdm import tqdm
from transformers import AutoTokenizer
from transformers import logging
from Configuration import Configuration
from .SentencesTokenizer import SentencesTokenizer
from .SymbolTokenizer import SymbolTokenizer
from .Tagging_bert_model import Tagging_bert_model
from .eval import categorical_accuracy
from utils import format_time, output_create_dir
logging.set_verbosity(logging.ERROR)
# region Class
class SuperTagger:
"""
Implements the SuperTagger to assign each word a supertag (also named symbol). A supertag is a tree of tags such as np, s, ...
Attributes:
-----------
max_len_sentence : int
Maximum length of sentence, equals to the maximum number of supertags
index_to_tags : dict
num_label : int
number of possible supertags
bert_name :
name of BERT model
sent_tokenizer : Tokenizer
Tokenize words to word_token
tags_tokenizer : Tokenizer
Tokenize supertag to supertag_token
model : TokenClassifier
Model for classification of tokens. Classify word_token to supertag_token.
optimizer : Optimizer
Optimizer used to update the parameters during training
epoch_i : int
Current number of epoch
device : Device
CPU or cuda
trainable : bool
model_load : bool
"""
# region Instantiation
def __init__(self):
"""
Python implementation of BertForTokenClassification using TLGbank data to develop supertaggers.
"""
config = Configuration.read_config()
datasetConfig = config["DATASET_PARAMS"]
self.max_len_sentence = int(datasetConfig['max_len_sentence'])
self.index_to_tags = None
self.num_label = None
self.bert_name = None
self.sent_tokenizer = None
self.tags_tokenizer = None
self.model = None
self.optimizer = None
self.epoch_i = 0
self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
self.trainable = False
self.model_load = False
def load_weights(self, model_file):
"""
Loads a SuperTagger saved with SuperTagger.__checkpoint_save() (during training) from a file.
Parameters:
-----------
model_file:
path of .pt save of model
"""
self.trainable = False
print("#" * 20)
print("\n Loading model for supertagger ...")
try:
params = torch.load(model_file, map_location=self.device)
args = params['args']
self.bert_name = args['bert_name']
self.index_to_tags = args['index_to_tags']
self.num_label = len(self.index_to_tags)
self.model = Tagging_bert_model(self.bert_name, self.num_label)
self.tags_tokenizer = SymbolTokenizer(self.index_to_tags)
self.sent_tokenizer = SentencesTokenizer(transformers.AutoTokenizer.from_pretrained(self.bert_name,do_lower_case=True),
self.max_len_sentence)
self.model.load_state_dict(params['state_dict'])
self.optimizer = params['optimizer']
# self.epoch_i = args['epoch']
print("\n The loading checkpoint was successful ! \n")
print("\tBert model : ", self.bert_name)
print("\tLast epoch : ", self.epoch_i)
print()
except Exception as e:
print("\n/!\ Can't load checkpoint model /!\ because :\n\n " + str(e), file=sys.stderr)
raise e
print("#" * 20)
self.model_load = True
self.trainable = True
def create_new_model(self, num_label, bert_name, index_to_tags):
"""
Instantiation and parameterization of a new bert model.
Parameters:
-----------
num_label:
number of different labels (tags)
bert_name:
name of model available on Hugging Face `<https://huggingface.co/models>`
index_to_tags:
Dict for convert ID to tags
"""
assert len(
index_to_tags) == num_label, f"len(index_to_tags): {len(index_to_tags)} must equal num_label: {num_label}"
self.model = Tagging_bert_model(bert_name, num_label + 1)
index_to_tags = {k + 1: v for k, v in index_to_tags.items()}
# <unk> is used for the pad AND unknown tags
index_to_tags[0] = '<unk>'
self.index_to_tags = index_to_tags
self.bert_name = bert_name
self.sent_tokenizer = SentencesTokenizer(AutoTokenizer.from_pretrained(bert_name,do_lower_case=True),
self.max_len_sentence)
self.optimizer = Adam(params=self.model.parameters(), lr=2e-4, eps=1e-8)
self.tags_tokenizer = SymbolTokenizer(index_to_tags)
self.trainable = True
self.model_load = True
# endregion Instantiation
# region Usage
def predict(self, sentences):
"""
Predict and convert sentences into tags (depends on the dictionary given when the model was created)
Parameters:
-----------
sentences: list of sentences : list[str] OR one sentences : str
Returns:
--------
tags prediction for all sentences (no argmax tags, convert tags, embedding layer of bert )
"""
assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) " \
"function before the predict, the model is not integrated "
assert type(sentences) == str or type(sentences) == list, "param sentences: list of sentences : list[" \
"str] OR one sentences : str "
sentences = [sentences] if type(sentences) == str else sentences
self.model.eval()
with torch.no_grad():
sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
self.model = self.model.cpu()
output = self.model.predict((sents_tokenized_t, sents_mask_t))
return output['logit'], self.tags_tokenizer.convert_ids_to_tags(torch.argmax(output['logit'], dim=2).detach())
def forward(self, b_sents_tokenized, b_sents_mask):
"""
Forward to the model
"""
with torch.no_grad():
output = self.model.predict((b_sents_tokenized, b_sents_mask))
return output
def train(self, sentences, tags, validation_rate=0.1, epochs=20, batch_size=16,
tensorboard=False,
checkpoint=False):
"""
Starts the training of the model, either new or previously loaded
Parameters:
-----------
sentences: list of sentences for train (X)
tags: list of tags for train (Y)
validation_rate: percentage of validation data [0-1]
epochs: number of epoch (50 recommended)
batch_size: number of sample in batch (32 recommended, attention to memory)
tensorboard: use tensorboard for see loss and accuracy
checkpoint: save the model after each epoch
"""
assert self.trainable or self.model is None, "Please use the create_new_model(...) or load_weights(...) function before the train, the model is not integrated"
assert len(sentences) == len(
tags), f" num of sentences (X): {len(sentences)} must be equals with num of labels " \
f"(Y): {len(tags)} "
if checkpoint or tensorboard:
checkpoint_dir, writer = output_create_dir()
training_dataloader, validation_dataloader = self.__preprocess_data(batch_size, sentences, tags,
1 - validation_rate)
epochs = epochs - self.epoch_i
self.model = self.model.to(self.device)
self.model.train()
for epoch_i in range(0, epochs):
print("")
print('======== Epoch {:} / {:} ========'.format(epoch_i+1, epochs))
print('Training...')
# Train
epoch_acc, epoch_loss, training_time = self.__train_epoch(training_dataloader)
# Validation
if validation_rate > 0.0:
eval_accuracy, eval_loss, nb_eval_steps = self.__eval_epoch(validation_dataloader)
print("")
print(f'Epoch: {epoch_i+1:02} | Epoch Time: {training_time}')
print(f'\tTrain Loss: {epoch_loss:.3f} | Train Acc: {epoch_acc * 100:.2f}%')
if validation_rate > 0.0:
print(f'\tVal Loss: {eval_loss:.3f} | Val Acc: {eval_accuracy * 100:.2f}%')
if tensorboard:
writer.add_scalars(f'Accuracy', {
'Train': epoch_acc}, epoch_i+1)
writer.add_scalars(f'Loss', {
'Train': epoch_loss}, epoch_i+1)
if validation_rate > 0.0:
writer.add_scalars(f'Accuracy', {
'Validation': eval_accuracy}, epoch_i+1)
writer.add_scalars(f'Loss', {
'Validation': eval_loss}, epoch_i+1)
self.epoch_i += 1
if checkpoint:
self.__checkpoint_save(path=os.path.join(checkpoint_dir, 'model_check.pt'))
# endregion Usage
# region Private
def __preprocess_data(self, batch_size, sentences, tags,
validation_rate):
"""
Create torch dataloader for training
Parameters:
-----------
batch_size: number of sample in batch
sentences: list of sentences for train (X)
tags: list of tags for train (Y)
validation_rate: percentage of validation data [0-1]
Returns:
--------
training dataloader, validation dataloader
"""
validation_dataloader = None
sents_tokenized_t, sents_mask_t = self.sent_tokenizer.fit_transform_tensors(sentences)
tags_t = self.tags_tokenizer.convert_batchs_to_ids(tags, sents_tokenized_t)
dataset = TensorDataset(sents_tokenized_t, sents_mask_t, tags_t)
train_size = int(validation_rate * len(dataset))
print('{:>5,} training samples'.format(train_size))
if validation_rate < 1:
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
print('{:>5,} validation samples'.format(val_size))
validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batch_size, shuffle=True)
else:
train_dataset = dataset
training_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
return training_dataloader, validation_dataloader
def __train_epoch(self, training_dataloader):
"""
Train on epoch
Parameters:
-----------
training_dataloader: dataloader of training data
Returns:
--------
epoch accuracy, epoch loss, training time
"""
self.model.train()
epoch_loss = 0
epoch_acc = 0
t0 = time.time()
i = 0
with tqdm(training_dataloader, unit="batch") as tepoch:
for batch in tepoch:
# Convert to device
b_sents_tokenized = batch[0].to(self.device)
b_sents_mask = batch[1].to(self.device)
targets = batch[2].to(self.device)
self.optimizer.zero_grad()
output = self.model((b_sents_tokenized, b_sents_mask, targets))
loss = output['loss']
predictions = torch.argmax(output['logit'], dim=2).detach().cpu().numpy()
label_ids = targets.cpu().numpy()
acc = categorical_accuracy(predictions, label_ids)
loss.backward()
epoch_acc += acc
epoch_loss += loss.item()
self.optimizer.step()
i += 1
# Measure how long this epoch took.
training_time = format_time(time.time() - t0)
epoch_acc = epoch_acc / i
epoch_loss = epoch_loss / i
return epoch_acc, epoch_loss, training_time
def __eval_epoch(self, validation_dataloader):
"""
Validation on epoch
Parameters:
-----------
validation_dataloader: dataloader of validation data
Returns:
--------
epoch accuracy, epoch loss, num step
"""
self.model.eval()
eval_loss = 0
eval_accuracy = 0
nb_eval_steps, nb_eval_examples = 0, 0
with torch.no_grad():
print("Start eval")
for step, batch in enumerate(validation_dataloader):
# Convert to device
b_sents_tokenized = batch[0].to(self.device)
b_sents_mask = batch[1].to(self.device)
b_symbols_tokenized = batch[2].to(self.device)
output = self.model((b_sents_tokenized, b_sents_mask, b_symbols_tokenized))
loss = output['loss']
predictions = torch.argmax(output['logit'], dim=2).detach().cpu().numpy()
label_ids = b_symbols_tokenized.cpu().numpy()
accuracy = categorical_accuracy(predictions, label_ids)
eval_loss += loss.item()
eval_accuracy += accuracy
nb_eval_examples += b_sents_tokenized.size(0)
nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
eval_accuracy = eval_accuracy / nb_eval_steps
return eval_accuracy, eval_loss, nb_eval_steps
def __checkpoint_save(self, path='/model_check.pt'):
"""
Save the model with good parameters
Parameters:
-----------
path: path and file name for the save
"""
self.model.cpu()
# print('save model parameters to [%s]' % path, file=sys.stderr)
torch.save({
'args': dict(bert_name=self.bert_name, index_to_tags=self.index_to_tags, epoch=self.epoch_i),
'state_dict': self.model.state_dict(),
'optimizer': self.optimizer,
}, path)
self.model.to(self.device)
# endregion Private
# endregion Class
\ No newline at end of file
import pickle
import numpy as np
import torch
def load_obj(name):
with open(name + '.pkl', 'rb') as f:
return pickle.load(f)
def pad_sequence(sequences, max_len=400):
padded = [0] * max_len
padded[:len(sequences)] = sequences
return padded
class SymbolTokenizer():
"""
Tokenizer for tags: based on a dictionary
Attributes:
----------
index_to_super : dict
Convert id to supertag
super_to_index : dict
Convert supertag to id
"""
def __init__(self, index_to_super):
"""
Parameters:
-----------
index_to_super: dict to convert IDs to tags """
self.index_to_super = index_to_super
self.super_to_index = {v: int(k) for k, v in self.index_to_super.items()}
def lenSuper(self):
"""Returns len of dict for convert ID to tags """
return len(self.index_to_super) + 1
def convert_batchs_to_ids(self, tags, sents_tokenized):
"""
Convert batch of tags to id
"""
encoded_labels = []
labels = [[self.super_to_index[str(symbol)] for symbol in sents] for sents in tags]
for l, s in zip(labels, sents_tokenized):
super_tok = pad_sequence(l, len(s))
encoded_labels.append(super_tok)
return torch.tensor(encoded_labels)
def convert_ids_to_tags(self, tags_ids):
labels = [[self.index_to_super[int(symbol)] for symbol in sents if self.index_to_super[int(symbol)] != '<unk>']
for sents in tags_ids]
return labels
import torch
import transformers
from torch.nn import Module
from transformers import logging
class Tagging_bert_model(Module):
""" Implements a Token Classification model with transformers library.
Attributes:
-----------
bert_name : str
Name of BERT model to upload
num_labels : int
number of possible supertags
config : transformer Config
bert : TokenClassification model
"""
def __init__(self, bert_name, num_labels):
super(Tagging_bert_model, self).__init__()
self.bert_name = bert_name
self.num_labels = num_labels
config = transformers.AutoConfig.from_pretrained(bert_name, output_hidden_states=True, num_labels=num_labels)
self.bert = transformers.AutoModelForTokenClassification.from_pretrained(bert_name, config=config)
def forward(self, batch):
"""
Forward to the model.
Parameters:
-----------
batch :
batch of tokenized sentences
Returns:
--------
result : dict containing logit, word_embedding and last_hidden_state
"""
b_input_ids = batch[0]
b_input_mask = batch[1]
labels = batch[2]
output = self.bert(
input_ids=b_input_ids, attention_mask=b_input_mask, labels=labels)
result = {'loss': output[0],'logit': output[1], 'word_embedding': output[2][0], 'last_hidden_state': output[2][1]}
return result
def predict(self, batch):
"""
Prediction of supertags for a batch of sentences
Parameters:
-----------
batch :
batch of tokenized sentences
Returns:
--------
result : dict containing logit, word_embedding and last_hidden_state
"""
b_input_ids = batch[0]
b_input_mask = batch[1]
output = self.bert(
input_ids=b_input_ids, attention_mask=b_input_mask)
result = {'logit' : output[0], 'word_embedding': output[1][0], 'last_hidden_state':output[1][1]}
return result
def categorical_accuracy(preds, truth):
"""
Calculates how often predictions match argmax labels.
preds: batch of prediction. (argmax)
truth: batch of truth label.
@return: scoring of batch prediction. (Categorical accuracy values)
"""
good_label = 0
nb_label = 0
for i in range(len(truth)):
sublist_truth = truth[i]
sublist_preds = preds[i]
for j in range(len(sublist_truth)):
if sublist_truth[j] != 0:
if sublist_truth[j] == sublist_preds[j]:
good_label += 1
nb_label += 1
return good_label / nb_label
\ No newline at end of file
from .SuperTagger.SuperTagger import SuperTagger
\ No newline at end of file
[metadata]
name = SuperTagger
version = 1.0