import configparser
import re
import torch
from Linker.atom_map import atom_map_redux
from Linker.utils_linker import get_GOAL, get_atoms_links_batch, get_atoms_batch
from SuperTagger.SuperTagger.SuperTagger import SuperTagger
from utils import read_csv_pgbar, pad_sequence
def configurate(dataset, model_tagger, nb_sentences=1000000000):
    """Scan the dataset to derive max-length parameters and write them to Configuration/config.ini."""
    print("#" * 20)
    print("#" * 20)
    print("Configuration with dataset\n")
    config = configparser.ConfigParser()
    config.read('Configuration/config.ini')

    file_path_axiom_links = dataset
    df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)

    supertagger = SuperTagger()
    supertagger.load_weights(model_tagger)

    # Longest tokenized sentence in the dataset.
    sentences_batch = df_axiom_links["X"].str.strip().tolist()
    sentences_tokens, sentences_mask = supertagger.sent_tokenizer.fit_transform_tensors(sentences_batch)
    max_len_sentence = 0
    for sentence in sentences_tokens:
        if len(sentence) > max_len_sentence:
            max_len_sentence = len(sentence)
    print("Configure parameter max len sentence to ", max_len_sentence)
    config.set('DATASET_PARAMS', 'max_len_sentence', str(max_len_sentence))

    # Largest number of atoms occurring in one sentence.
    atoms_batch, polarities, num_batch = get_GOAL(max_len_sentence, df_axiom_links)
    max_atoms_in_sentence = 0
    for sentence in atoms_batch:
        if len(sentence) > max_atoms_in_sentence:
            max_atoms_in_sentence = len(sentence)
    print("Configure parameter max atoms in categories to", max_atoms_in_sentence)
    config.set('DATASET_PARAMS', 'max_atoms_in_sentence', str(max_atoms_in_sentence))

    # Largest number of atoms of a single type (per polarity) in one sentence.
    atoms_polarity_batch = pad_sequence(
        [torch.as_tensor(polarities[i], dtype=torch.bool) for i in range(len(polarities))],
        max_len=max_atoms_in_sentence, padding_value=0)
    pos_idx = [[torch.as_tensor([i for i, x in enumerate(sentence)
                                 if bool(re.match(r"" + atom_type + r"(_{1}\w+)?\Z", atoms_batch[s_idx][i]))
                                 and atoms_polarity_batch[s_idx][i]])
                for s_idx, sentence in enumerate(atoms_batch)]
               for atom_type in list(atom_map_redux.keys())]
    max_atoms_in_one_type = 0
    for atoms_type_batch in pos_idx:
        for sentence in atoms_type_batch:
            length = sentence.size(0)
            if length > max_atoms_in_one_type:
                max_atoms_in_one_type = length
    print("Configure parameter max atoms of one type in one sentence to", max_atoms_in_one_type)
    config.set('DATASET_PARAMS', 'max_atoms_in_one_type', str(max_atoms_in_one_type * 2 + 2))

    with open('Configuration/config.ini', 'w') as configfile:  # save
        config.write(configfile)
    print("#" * 20)
    print("#" * 20)
python3 -m venv env
source env/bin/activate
pip install -r requirements.txt
repository="https://gitlab.irit.fr/pnria/global-helper/deepgrail_tagger"
git clone $repository
mkdir Output
mkdir TensorBoard
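# After setup, launch training (script name as referenced in the SLURM job below), e.g.:
# python3 train_neuralproofnet.py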
from NeuralProofNet.NeuralProofNet import NeuralProofNet
# region data
a_s = "( 1 ) parmi les huit \" partants \" acquis ou potentiels , MM. Lacombe , Koehler et Laroze ne sont pas membres " \
"du PCF . "
tags_s = ['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)',
'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)',
'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np',
'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']
# endregion
# region model
model_tagger = "models/flaubert_super_98_V2_50e.pt"
neuralproofnet = NeuralProofNet(model_tagger)
model = "Output/linker.pt"
neuralproofnet.linker.load_weights(model)
# endregion
# region prediction
linker = neuralproofnet.linker
links = linker.predict_without_categories(a_s)
#links = linker.predict_with_categories(a_s, tags_s)
print(links)
# endregion
from SuperTagger.SuperTagger.SuperTagger import SuperTagger
from SuperTagger.SuperTagger.eval import categorical_accuracy
# region data
a_s = "( 1 ) parmi les huit \" partants \" acquis ou potentiels , MM. Lacombe , Koehler et Laroze ne sont pas membres " \
"du PCF . "
tags_s = [['let', 'dr(0,s,s)', 'let', 'dr(0,dr(0,s,s),np)', 'dr(0,np,n)', 'dr(0,n,n)', 'let', 'n', 'let', 'dl(0,n,n)',
'dr(0,dl(0,dl(0,n,n),dl(0,n,n)),dl(0,n,n))', 'dl(0,n,n)', 'let', 'dr(0,np,np)', 'np', 'dr(0,dl(0,np,np),np)',
'np', 'dr(0,dl(0,np,np),np)', 'np', 'dr(0,dl(0,np,s),dl(0,np,s))', 'dr(0,dl(0,np,s),np)', 'dl(1,s,s)', 'np',
'dr(0,dl(0,np,np),n)', 'n', 'dl(0,s,txt)']]
# endregion
# region model
tagger = SuperTagger()
model = "models/flaubert_super_98_V2_50e.pt"
tagger.load_weights(model)
# endregion
# region prediction
_, pred_convert = tagger.predict(a_s)
print("Model : ", model)
print("\tLen Text : ", len(a_s.split()))
print("\tLen tags : ", len(tags_s[0]))
print("\tLen pred_convert : ", len(pred_convert[0]))
print()
print("\tText : ", a_s)
print()
print("\tTags : ", tags_s[0])
print()
print("\tPred_convert : ", pred_convert[0])
print()
print("\tScore :", f"{categorical_accuracy(pred_convert, tags_s)*100}%" )
# endregion
#!/bin/sh
#SBATCH --job-name=Deepgrail_Linker
#SBATCH --partition=RTX6000Node
#SBATCH --gres=gpu:1
#SBATCH --mem=32000
#SBATCH --gres-flags=enforce-binding
#SBATCH --error="error_rtx1.err"
#SBATCH --output="out_rtx1.out"
module purge
module load singularity/3.0.3
srun singularity exec /logiciels/containerCollections/CUDA11/pytorch-NGC-21-03-py3.sif python "train_neuralproofnet.py"
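
# Submit the job to SLURM with: sbatch <this_script>.sh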
import torch

from NeuralProofNet.NeuralProofNet import NeuralProofNet
from Configuration import Configuration
from utils import read_links_csv

torch.cuda.empty_cache()

# region config
config = Configuration.read_config()
version = config["VERSION"]
datasetConfig = config["DATASET_PARAMS"]
modelEncoderConfig = config["MODEL_ENCODER"]
modelLinkerConfig = config["MODEL_LINKER"]
modelTrainingConfig = config["MODEL_TRAINING"]
epochs = int(modelTrainingConfig['epoch'])
batch_size = int(modelTrainingConfig['batch_size'])
# endregion

# region data
file_path_axiom_links = 'Datasets/goldANDsilver_dataset_links.csv'
df_axiom_links = read_links_csv(file_path_axiom_links)
# endregion

# region model
print("#" * 20)
print("#" * 20)
model_tagger = "models/flaubert_super_98_V2_50e.pt"
neural_proof_net = NeuralProofNet(model_tagger)
neural_proof_net.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=epochs,
                                      pretrain_linker_epochs=20, batch_size=batch_size,
                                      checkpoint=True, tensorboard=True)
print("#" * 20)
print("#" * 20)
# endregion
from SuperTagger.SuperTagger.SuperTagger import SuperTagger
from utils import read_supertags_csv, load_obj
import torch
torch.cuda.empty_cache()
# region data
file_path = 'SuperTagger/Datasets/m2_dataset_V2.csv'
df = read_supertags_csv(file_path)
texts = df['X'].tolist()
tags = df['Z'].tolist()
index_to_super = load_obj('SuperTagger/Datasets/index_to_super')
# endregion
# region model
tagger = SuperTagger()
tagger.create_new_model(len(index_to_super), 'camembert-base', index_to_super)
## If you want to load a pretrained model instead:
# tagger.load_weights("models/model_check.pt")
tagger.train(texts, tags, epochs=40, batch_size=16, validation_rate=0.1,
tensorboard=True, checkpoint=True)
# endregion
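
# Training logs land in the TensorBoard/ directory created during setup;
# view them with the standard TensorBoard CLI:
# tensorboard --logdir TensorBoard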
import datetime
import os
import pickle

import pandas as pd
import torch
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
# region load data
def read_links_csv(csv_path, nrows=None, chunksize=100):
    r"""
    Load the axiom-links csv dataset, chunk by chunk, with a progress bar.

    Parameters:
    -----------
    csv_path: path to the csv file
    nrows: maximum number of rows to read (None reads the whole file)
    chunksize: number of rows read per chunk
    """
    print("\n" + "#" * 20)
    print("Loading csv...")
    chunk_list = []
    with tqdm(total=nrows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path, header=0, converters={'Y': pd.eval, 'Z': pd.eval},
                                 chunksize=chunksize, nrows=nrows):
            chunk_list.append(chunk)
            bar.update(len(chunk))
    df = pd.concat(chunk_list, axis=0)
    print("#" * 20)
    return df

def read_supertags_csv(csv_path, nrows=None, chunksize=100):
    r"""
    Load the supertag csv dataset, chunk by chunk, with a progress bar.

    Parameters:
    -----------
    csv_path: path to the csv file
    nrows: maximum number of rows to read (None reads the whole file)
    chunksize: number of rows read per chunk
    """
    print("\n" + "#" * 20)
    print("Loading csv...")
    chunk_list = []
    with tqdm(total=nrows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path, header=0, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval},
                                 chunksize=chunksize, nrows=nrows):
            chunk_list.append(chunk)
            bar.update(len(chunk))
    df = pd.concat(chunk_list, axis=0)
    print("#" * 20)
    return df

def load_obj(name):
    """Load a pickled object from '<name>.pkl'."""
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)
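
# Hypothetical write-side counterpart (not shown in this file), matching the
# '<name>.pkl' convention load_obj expects:
# def save_obj(obj, name):
#     with open(name + '.pkl', 'wb') as f:
#         pickle.dump(obj, f)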
#endregion
# region format data
def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400):
    r"""
    Pad a list of variable-length tensors to a fixed length, ready for a TensorDataset.

    :param sequences: list of tensors to pad
    :param batch_first: whether the batch dimension comes first in the output
    :param padding_value: value used to fill the padded positions
    :param max_len: fixed length every sequence is padded (or truncated) to
    :return: one stacked, padded tensor
    """
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims
    # Fixed-length variant of torch.nn.utils.rnn.pad_sequence: copy each
    # sequence into a pre-filled tensor, truncating anything beyond max_len.
    out_tensor = sequences[0].new_full(out_dims, padding_value)
    for i, tensor in enumerate(sequences):
        length = min(tensor.size(0), max_len)
        if batch_first:
            out_tensor[i, :length, ...] = tensor[:length]
        else:
            out_tensor[:length, i, ...] = tensor[:length]
    return out_tensor
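
# Example (mirroring the call in configurate above): pad a ragged list of
# boolean polarity vectors into one fixed-width tensor.
# batch = [torch.tensor([True, False]), torch.tensor([True])]
# padded = pad_sequence(batch, max_len=4, padding_value=0)
# padded.shape  # torch.Size([2, 4])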
#endregion
def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500):
    r"""
    Load a csv dataset with a progress bar, capping the number of rows read.

    :param csv_path: path to the csv file
    :param nrows: maximum number of rows to read
    :param chunksize: number of rows read per chunk
    :return: the dataset as a DataFrame
    """
    print("Loading csv...")
    rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1  # minus the header
    chunk_list = []
    if rows > nrows:
        rows = nrows
        chunksize = nrows
    with tqdm(total=rows, desc='Rows read: ') as bar:
        for chunk in pd.read_csv(csv_path, converters={'Y': pd.eval, 'Z': pd.eval}, chunksize=chunksize, nrows=rows):
            chunk_list.append(chunk)
            bar.update(len(chunk))
    df = pd.concat(chunk_list, axis=0)
    return df

# region utils training
def output_create_dir():
    """
    Create the output directory for tensorboard logs and checkpoints.
    @return: output dir, tensorboard writer
    """
    from datetime import datetime
    output_path = 'TensorBoard'
    training_dir = os.path.join(output_path, 'Training_' + datetime.today().strftime('%d-%m_%H-%M'))
    logs_dir = os.path.join(training_dir, 'logs')
    writer = SummaryWriter(log_dir=logs_dir)
    return training_dir, writer
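
# Example: log a scalar under the run directory created above
# (standard SummaryWriter API):
# training_dir, writer = output_create_dir()
# writer.add_scalar('Loss/train', 0.42, global_step=1)
# writer.flush()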
def format_time(elapsed):
    """
    Format a duration in seconds as a hh:mm:ss string.
    """
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
#endregion