Compare revisions for pnria/global-helper/deepgrail-linker (source revision: main, target revision: main).
Changes are shown as if the source revision was being merged into the target revision.
import graphviz
import numpy as np
import regex  # third-party regex module, needed for the recursive (?R) construct

from Linker.atom_map import atom_map, atom_map_redux
regex_categories = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
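# Illustrative note (added, not in the original file): the pattern relies on
# the regex module's recursive (?R) construct, so each of its four capture
# groups holds either a whole nested category or a bare atom:
#
#   regex.match(regex_categories, "dr(0,dl(0,n,np),n)").groups()
#   # -> ('dl(0,n,np)', None, None, 'n')
#
# Dropping the None groups, as recursive_linking does below, leaves the two
# immediate sub-categories: ['dl(0,n,np)', 'n'].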
def recursive_linking(links, dot, category, parent_id, word_idx, depth,
                      polarity, compt_plus, compt_neg):
    r"""
    Recursive linking between the atoms inside a category.

    :param links: predicted links, one row per atom type (see atom_map_redux)
    :param dot: graphviz graph under construction
    :param category: category to decompose, e.g. "dr(0,s,n)" or an atom like "n"
    :param parent_id: identifier of the parent node in the graph
    :param word_idx: index of the word carrying this category
    :param depth: recursion depth inside the category
    :param polarity: polarity of the current sub-category (True for +, False for -)
    :param compt_plus: per-type counters of positive atoms seen so far
    :param compt_neg: per-type counters of negative atoms seen so far
    :return: None, the graph is built in place
    """
    if category in atom_map:
        # Base case, the category is an atom: flip the polarity and index the
        # node so that positive and negative atoms can be paired afterwards.
        polarity = not polarity
        if polarity:
            atoms_idx = compt_plus[category]
            compt_plus[category] += 1
        else:
            # A negative atom takes the index of the positive atom it is
            # linked to, read from the links row of its atom type.
            idx_neg = compt_neg[category]
            compt_neg[category] += 1
            atoms_idx = np.where(links[atom_map_redux[category]] == idx_neg)[0][0]
        atom_id = category + "_" + str(polarity) + "_" + str(atoms_idx)
        dot.node(atom_id, category + " " + str("+" if polarity else "-"))
        dot.edge(parent_id, atom_id)
    else:
        # Recursive case: add a node for the complex category, then recurse
        # on the sub-categories extracted with the recursive regex.
        category_id = category + "_" + str(word_idx) + "_" + str(depth)
        dot.node(category_id, category + " " + str("+" if polarity else "-"))
        dot.edge(parent_id, category_id)
        parent_id = category_id

        # dr
        if category.startswith("dr"):
            categories_inside = regex.match(regex_categories, category).groups()
            categories_inside = [cat for cat in categories_inside if cat is not None]
            categories_inside = [categories_inside[0], categories_inside[1]]
            polarities_inside = [polarity, not polarity]
        # dl / p
        elif category.startswith("dl") or category.startswith("p"):
            categories_inside = regex.match(regex_categories, category).groups()
            categories_inside = [cat for cat in categories_inside if cat is not None]
            categories_inside = [categories_inside[0], categories_inside[1]]
            polarities_inside = [not polarity, polarity]
        # box / dia
        elif category.startswith("box") or category.startswith("dia"):
            categories_inside = regex.match(regex_categories, category).groups()
            categories_inside = [cat for cat in categories_inside if cat is not None]
            categories_inside = [categories_inside[0]]
            polarities_inside = [polarity]
        else:
            categories_inside = []
            polarities_inside = []

        for cat_id in range(len(categories_inside)):
            recursive_linking(links, dot, categories_inside[cat_id], parent_id, word_idx, depth + 1,
                              polarities_inside[cat_id], compt_plus, compt_neg)
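# Note (added for clarity): atom nodes are named "<type>_<True|False>_<index>",
# e.g. "n_True_0". draw_sentence_output relies on this convention to pair the
# i-th positive atom of each type with the i-th negative one when it draws the
# dashed red link edges.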
def draw_sentence_output(sentence, categories, links):
    r"""
    Drawing the prediction of a sentence, given its predicted categories and links.

    :param sentence: list of words
    :param categories: list of categories, one per word
    :param links: links predicted, output of predict_with/without_categories
    :return: dot source
    """
    dot = graphviz.Graph('linking', comment='Axiom linking')
    dot.graph_attr['rankdir'] = 'BT'
    dot.graph_attr['splines'] = 'ortho'
    dot.graph_attr['ordering'] = 'in'

    compt_plus = {'cl_r': 0, 'pp': 0, 'n': 0, 'np': 0, 'cl_y': 0, 'txt': 0, 's': 0}
    compt_neg = {'cl_r': 0, 'pp': 0, 'n': 0, 'np': 0, 'cl_y': 0, 'txt': 0, 's': 0}

    # One subtree per word: the word node, then its category decomposition.
    last_word_id = ""
    for word_idx in range(len(sentence)):
        word = sentence[word_idx]
        word_id = word + "_" + str(word_idx)
        dot.node(word_id, word)
        if word_idx > 0:
            # Invisible edge keeping the words in sentence order.
            dot.edge(last_word_id, word_id, constraint="false", style="invis")
        category = categories[word_idx]
        polarity = True
        parent_id = word_id
        recursive_linking(links, dot, category, parent_id, word_idx, 0, polarity, compt_plus, compt_neg)
        last_word_id = word_id

    # Dashed red edges pair the i-th positive atom of each type with the
    # i-th negative one.
    dot.attr('edge', color='red')
    dot.attr('edge', style='dashed')
    for atom_type in list(atom_map_redux.keys()):
        for id in range(compt_plus[atom_type]):
            atom_plus = atom_type + "_" + str(True) + "_" + str(id)
            atom_moins = atom_type + "_" + str(False) + "_" + str(id)
            dot.edge(atom_plus, atom_moins, constraint="false")

    dot.render(format="svg", view=True)
    return dot.source
sentence = ["Le", "chat", "est", "noir", "bleu"]
categories = ["dr(0,s,n)", "dl(0,s,n)", "dr(0,dl(0,n,np),n)", "dl(0,np,n)", "n"]
links = np.array([[0, 0, 0, 0], [0, 0, 0, 0], [1, 0, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
draw_sentence_output(sentence, categories, links)
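# Reading of the example above (added note; the row order of atom_map_redux is
# an assumption based on the counter dicts in draw_sentence_output): links has
# one row per atom type ('cl_r', 'pp', 'n', 'np', 'cl_y', 'txt', 's'), and
# links[atom_type][i] is the index, in reading order, of the negative atom that
# the i-th positive atom of that type links to. Here the 'n' row [1, 0, 2, 0]
# links the first positive n to negative n number 1, the second to number 0,
# and so on.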
numpy==1.22.2
huggingface-hub==0.4.0
pandas==1.4.1
Markdown==3.3.6
packaging==21.3
scikit-learn==1.0.2
scipy==1.8.0
sentencepiece==0.1.96
tensorflow==2.9.1
tensorboard==2.8.0
torch==1.11.0
tqdm==4.64.0
transformers==4.19.0
import torch
from Linker import *
from utils import read_csv_pgbar
from find_config import configurate
from Configuration import Configuration
torch.cuda.empty_cache()
nb_sentences = 1000000000  # effectively no limit: use the whole dataset
file_path_axiom_links = 'Datasets/goldANDsilver_dataset_links.csv'
model_tagger = "models/flaubert_super_98_V2_50e.pt"
# region config
configurate(file_path_axiom_links, model_tagger, nb_sentences=nb_sentences)
config = Configuration.read_config()
version = config["VERSION"]
datasetConfig = config["DATASET_PARAMS"]
modelEncoderConfig = config["MODEL_ENCODER"]
modelLinkerConfig = config["MODEL_LINKER"]
modelTrainingConfig = config["MODEL_TRAINING"]
epochs = int(modelTrainingConfig['epoch'])
batch_size = int(modelTrainingConfig['batch_size'])
# endregion
df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
print("#" * 20)
print("#" * 20)
print("Linker")
# Load the Linker with trained tagger
linker = Linker(model_tagger)
print("\nLinker Training\n")
linker.train_linker(df_axiom_links, validation_rate=0.1, epochs=epochs, batch_size=batch_size,
                    checkpoint=True, tensorboard=True)
print("#" * 20)
print("#" * 20)
import datetime
import pandas as pd
import torch
from tqdm import tqdm
def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400):
    r"""
    Padding sequences in preparation for a TensorDataset.

    :param sequences: list of tensors to pad
    :param batch_first: whether the batch is the first dimension of the output
    :param padding_value: value used for padding
    :param max_len: length every sequence is padded to (must be at least the
        length of the longest sequence)
    :return: a single padded tensor
    """
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims
    out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
    for i, tensor in enumerate(sequences):
        length = tensor.size(0)
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            out_tensor[i, :length, ...] = tensor
        else:
            out_tensor[:length, i, ...] = tensor
    return out_tensor
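# Minimal usage sketch (added, not in the original file):
#
#   seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
#   pad_sequence(seqs, batch_first=True, padding_value=0, max_len=5)
#   # tensor([[1, 2, 3, 0, 0],
#   #         [4, 5, 0, 0, 0]])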
def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500):
    r"""
    Reading a csv dataset with a progress bar.

    :param csv_path: path to the csv file
    :param nrows: maximum number of rows to read
    :param chunksize: number of rows read per chunk
    :return: the dataset as a pandas DataFrame
    """
    print("Loading csv...")
    rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1  # minus the header
    chunk_list = []
    if rows > nrows:
        rows = nrows
        chunksize = nrows
    with tqdm(total=rows, desc='Rows read: ') as bar:
        # The Y and Z columns hold serialized lists, parsed back with pd.eval.
        for chunk in pd.read_csv(csv_path, converters={'Y': pd.eval, 'Z': pd.eval},
                                 chunksize=chunksize, nrows=rows):
            chunk_list.append(chunk)
            bar.update(len(chunk))
    df = pd.concat(chunk_list, axis=0)
    return df
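# Usage sketch (added): the training script loads its dataset this way, e.g.
# read_csv_pgbar('Datasets/goldANDsilver_dataset_links.csv', nrows=1000); the
# pd.eval converters turn the serialized Y and Z columns back into Python lists.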
def format_time(elapsed):
    r"""
    Takes a time in seconds and returns a string hh:mm:ss.
    """
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
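# Example (added): format_time(3661.4) returns '1:01:01'.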