Compare revisions for pnria/global-helper/deepgrail-linker (source revision: main, target revision: main).
Changes are shown as if the source revision was being merged into the target revision.
import graphviz
import numpy as np
import regex  # third-party regex module, needed for the recursive (?R) construct

from Linker.atom_map import atom_map, atom_map_redux
regex_categories = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
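# Illustrative note (added, not in the original file): the pattern relies on
# the regex module's recursive (?R) construct, so each of its four capture
# groups holds either a whole nested category or a bare atom:
#
#   regex.match(regex_categories, "dr(0,dl(0,n,np),n)").groups()
#   # -> ('dl(0,n,np)', None, None, 'n')
#
# Dropping the None groups, as recursive_linking does below, leaves the two
# immediate sub-categories: ['dl(0,n,np)', 'n'].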
def recursive_linking(links, dot, category, parent_id, word_idx, depth,
                      polarity, compt_plus, compt_neg):
    r"""
    Recursive linking between the atoms inside a category.

    :param links: predicted links, one row per atom type (see atom_map_redux)
    :param dot: graphviz graph under construction
    :param category: category to decompose, e.g. "dr(0,s,n)" or an atom like "n"
    :param parent_id: identifier of the parent node in the graph
    :param word_idx: index of the word carrying this category
    :param depth: recursion depth inside the category
    :param polarity: polarity of the current sub-category (True for +, False for -)
    :param compt_plus: per-type counters of positive atoms seen so far
    :param compt_neg: per-type counters of negative atoms seen so far
    :return: None, the graph is built in place
    """
    if category in atom_map:
        # Base case, the category is an atom: flip the polarity and index the
        # node so that positive and negative atoms can be paired afterwards.
        polarity = not polarity
        if polarity:
            atoms_idx = compt_plus[category]
            compt_plus[category] += 1
        else:
            # A negative atom takes the index of the positive atom it is
            # linked to, read from the links row of its atom type.
            idx_neg = compt_neg[category]
            compt_neg[category] += 1
            atoms_idx = np.where(links[atom_map_redux[category]] == idx_neg)[0][0]
        atom_id = category + "_" + str(polarity) + "_" + str(atoms_idx)
        dot.node(atom_id, category + " " + str("+" if polarity else "-"))
        dot.edge(parent_id, atom_id)
    else:
        # Recursive case: add a node for the complex category, then recurse
        # on the sub-categories extracted with the recursive regex.
        category_id = category + "_" + str(word_idx) + "_" + str(depth)
        dot.node(category_id, category + " " + str("+" if polarity else "-"))
        dot.edge(parent_id, category_id)
        parent_id = category_id

        # dr
        if category.startswith("dr"):
            categories_inside = regex.match(regex_categories, category).groups()
            categories_inside = [cat for cat in categories_inside if cat is not None]
            categories_inside = [categories_inside[0], categories_inside[1]]
            polarities_inside = [polarity, not polarity]
        # dl / p
        elif category.startswith("dl") or category.startswith("p"):
            categories_inside = regex.match(regex_categories, category).groups()
            categories_inside = [cat for cat in categories_inside if cat is not None]
            categories_inside = [categories_inside[0], categories_inside[1]]
            polarities_inside = [not polarity, polarity]
        # box / dia
        elif category.startswith("box") or category.startswith("dia"):
            categories_inside = regex.match(regex_categories, category).groups()
            categories_inside = [cat for cat in categories_inside if cat is not None]
            categories_inside = [categories_inside[0]]
            polarities_inside = [polarity]
        else:
            categories_inside = []
            polarities_inside = []

        for cat_id in range(len(categories_inside)):
            recursive_linking(links, dot, categories_inside[cat_id], parent_id, word_idx, depth + 1,
                              polarities_inside[cat_id], compt_plus, compt_neg)
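# Note (added for clarity): atom nodes are named "<type>_<True|False>_<index>",
# e.g. "n_True_0". draw_sentence_output relies on this convention to pair the
# i-th positive atom of each type with the i-th negative one when it draws the
# dashed red link edges.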
def draw_sentence_output(sentence, categories, links):
    r"""
    Drawing the prediction of a sentence, given its predicted categories and links.

    :param sentence: list of words
    :param categories: list of categories, one per word
    :param links: links predicted, output of predict_with/without_categories
    :return: dot source
    """
    dot = graphviz.Graph('linking', comment='Axiom linking')
    dot.graph_attr['rankdir'] = 'BT'
    dot.graph_attr['splines'] = 'ortho'
    dot.graph_attr['ordering'] = 'in'

    compt_plus = {'cl_r': 0, 'pp': 0, 'n': 0, 'np': 0, 'cl_y': 0, 'txt': 0, 's': 0}
    compt_neg = {'cl_r': 0, 'pp': 0, 'n': 0, 'np': 0, 'cl_y': 0, 'txt': 0, 's': 0}

    # One subtree per word: the word node, then its category decomposition.
    last_word_id = ""
    for word_idx in range(len(sentence)):
        word = sentence[word_idx]
        word_id = word + "_" + str(word_idx)
        dot.node(word_id, word)
        if word_idx > 0:
            # Invisible edge keeping the words in sentence order.
            dot.edge(last_word_id, word_id, constraint="false", style="invis")
        category = categories[word_idx]
        polarity = True
        parent_id = word_id
        recursive_linking(links, dot, category, parent_id, word_idx, 0, polarity, compt_plus, compt_neg)
        last_word_id = word_id

    # Dashed red edges pair the i-th positive atom of each type with the
    # i-th negative one.
    dot.attr('edge', color='red')
    dot.attr('edge', style='dashed')
    for atom_type in list(atom_map_redux.keys()):
        for id in range(compt_plus[atom_type]):
            atom_plus = atom_type + "_" + str(True) + "_" + str(id)
            atom_moins = atom_type + "_" + str(False) + "_" + str(id)
            dot.edge(atom_plus, atom_moins, constraint="false")

    dot.render(format="svg", view=True)
    return dot.source
sentence = ["Le", "chat", "est", "noir", "bleu"]
categories = ["dr(0,s,n)", "dl(0,s,n)", "dr(0,dl(0,n,np),n)", "dl(0,np,n)", "n"]
links = np.array([[0, 0, 0, 0], [0, 0, 0, 0], [1, 0, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
draw_sentence_output(sentence, categories, links)
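# Reading of the example above (added note; the row order of atom_map_redux is
# an assumption based on the counter dicts in draw_sentence_output): links has
# one row per atom type ('cl_r', 'pp', 'n', 'np', 'cl_y', 'txt', 's'), and
# links[atom_type][i] is the index, in reading order, of the negative atom that
# the i-th positive atom of that type links to. Here the 'n' row [1, 0, 2, 0]
# links the first positive n to negative n number 1, the second to number 0,
# and so on.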
numpy==1.22.2
huggingface-hub==0.4.0
pandas==1.4.1
Markdown==3.3.6
packaging==21.3
scikit-learn==1.0.2
scipy==1.8.0
sentencepiece==0.1.96
tensorflow==2.9.1
tensorboard==2.8.0
torch==1.11.0
tqdm==4.64.0
transformers==4.19.0
import torch
from Linker import *
from utils import read_csv_pgbar
from find_config import configurate
from Configuration import Configuration
torch.cuda.empty_cache()
nb_sentences = 1000000000  # effectively no limit: use the whole dataset
file_path_axiom_links = 'Datasets/goldANDsilver_dataset_links.csv'
model_tagger = "models/flaubert_super_98_V2_50e.pt"
# region config
configurate(file_path_axiom_links, model_tagger, nb_sentences=nb_sentences)
config = Configuration.read_config()
version = config["VERSION"]
datasetConfig = config["DATASET_PARAMS"]
modelEncoderConfig = config["MODEL_ENCODER"]
modelLinkerConfig = config["MODEL_LINKER"]
modelTrainingConfig = config["MODEL_TRAINING"]
epochs = int(modelTrainingConfig['epoch'])
batch_size = int(modelTrainingConfig['batch_size'])
# endregion
df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
print("#" * 20)
print("#" * 20)
print("Linker")
# Load the Linker with trained tagger
linker = Linker(model_tagger)
print("\nLinker Training\n")
linker.train_linker(df_axiom_links, validation_rate=0.1, epochs=epochs, batch_size=batch_size,
                    checkpoint=True, tensorboard=True)
print("#" * 20)
print("#" * 20)
import datetime
import pandas as pd
import torch
from tqdm import tqdm
def pad_sequence(sequences, batch_first=True, padding_value=0, max_len=400):
    r"""
    Padding sequences in preparation for a TensorDataset.

    :param sequences: list of tensors to pad
    :param batch_first: whether the batch is the first dimension of the output
    :param padding_value: value used for padding
    :param max_len: length every sequence is padded to (must be at least the
        length of the longest sequence)
    :return: a single padded tensor
    """
    max_size = sequences[0].size()
    trailing_dims = max_size[1:]
    if batch_first:
        out_dims = (len(sequences), max_len) + trailing_dims
    else:
        out_dims = (max_len, len(sequences)) + trailing_dims
    out_tensor = sequences[0].data.new(*out_dims).fill_(padding_value)
    for i, tensor in enumerate(sequences):
        length = tensor.size(0)
        # use index notation to prevent duplicate references to the tensor
        if batch_first:
            out_tensor[i, :length, ...] = tensor
        else:
            out_tensor[:length, i, ...] = tensor
    return out_tensor
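# Minimal usage sketch (added, not in the original file):
#
#   seqs = [torch.tensor([1, 2, 3]), torch.tensor([4, 5])]
#   pad_sequence(seqs, batch_first=True, padding_value=0, max_len=5)
#   # tensor([[1, 2, 3, 0, 0],
#   #         [4, 5, 0, 0, 0]])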
def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500):
    r"""
    Reading a csv dataset with a progress bar.

    :param csv_path: path to the csv file
    :param nrows: maximum number of rows to read
    :param chunksize: number of rows read per chunk
    :return: the dataset as a pandas DataFrame
    """
    print("Loading csv...")
    rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1  # minus the header
    chunk_list = []
    if rows > nrows:
        rows = nrows
        chunksize = nrows
    with tqdm(total=rows, desc='Rows read: ') as bar:
        # The Y and Z columns hold serialized lists, parsed back with pd.eval.
        for chunk in pd.read_csv(csv_path, converters={'Y': pd.eval, 'Z': pd.eval},
                                 chunksize=chunksize, nrows=rows):
            chunk_list.append(chunk)
            bar.update(len(chunk))
    df = pd.concat(chunk_list, axis=0)
    return df
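# Usage sketch (added): the training script loads its dataset this way, e.g.
# read_csv_pgbar('Datasets/goldANDsilver_dataset_links.csv', nrows=1000); the
# pd.eval converters turn the serialized Y and Z columns back into Python lists.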
def format_time(elapsed):
    r"""
    Takes a time in seconds and returns a string hh:mm:ss.
    """
    # Round to the nearest second.
    elapsed_rounded = int(round(elapsed))
    # Format as hh:mm:ss
    return str(datetime.timedelta(seconds=elapsed_rounded))
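# Example (added): format_time(3661.4) returns '1:01:01'.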