From 9c45838b0e36f3d8983adac6fbe88d8dc4be87e3 Mon Sep 17 00:00:00 2001
From: PNRIA - Julien <julien.rabault@irit.fr>
Date: Thu, 19 May 2022 09:56:09 +0200
Subject: [PATCH] git ignore

---
 .gitignore                |  3 +++
 Configuration/config.ini  |  4 ++--
 Linker/Linker.py          |  3 +--
 Utils/PostpreprocesTXT.py | 38 +++++++++++++++++++++++++++++++++++---
 train.py                  | 12 +++++++++---
 5 files changed, 50 insertions(+), 10 deletions(-)

diff --git a/.gitignore b/.gitignore
index f09e371..fc9826b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,6 @@ Utils/silver
 Utils/gold
 .idea
 *.pt
+Linker/__pycache__
+Configuration/__pycache__
+__pycache__
diff --git a/Configuration/config.ini b/Configuration/config.ini
index d6c8605..3154ecf 100644
--- a/Configuration/config.ini
+++ b/Configuration/config.ini
@@ -28,7 +28,7 @@ sinkhorn_iters=3
 
 [MODEL_TRAINING]
 device=cpu
-batch_size=32
+batch_size=16
 epoch=20
 seed_val=42
-learning_rate=0.005
\ No newline at end of file
+learning_rate=2e-5
\ No newline at end of file
diff --git a/Linker/Linker.py b/Linker/Linker.py
index ce25640..f79e285 100644
--- a/Linker/Linker.py
+++ b/Linker/Linker.py
@@ -60,7 +60,6 @@ class Linker(Module):
 
         self.cross_entropy_loss = SinkhornLoss()
         self.optimizer = AdamW(self.parameters(),
-                               weight_decay=1e-5,
                                lr=learning_rate)
         self.scheduler = get_cosine_schedule_with_warmup(self.optimizer,
                                                          num_warmup_steps=0,
@@ -106,7 +105,7 @@ class Linker(Module):
         return training_dataloader, validation_dataloader
 
     def make_decoder_mask(self, atoms_token):
-        decoder_attn_mask = torch.ones_like(atoms_token, dtype=torch.float64)
+        decoder_attn_mask = torch.ones_like(atoms_token, dtype=torch.float64, device=self.device)
         decoder_attn_mask[atoms_token.eq(self.padding_id)] = 0.0
         return decoder_attn_mask.unsqueeze(1).repeat(1, atoms_token.shape[1], 1).repeat(self.nhead, 1, 1)
 
diff --git a/Utils/PostpreprocesTXT.py b/Utils/PostpreprocesTXT.py
index eaa9d30..538ab5d 100644
--- a/Utils/PostpreprocesTXT.py
+++ b/Utils/PostpreprocesTXT.py
@@ -32,12 +32,44 @@ def sub_tree_line(line_with_data: str):
     return sentence, list(itertools.chain(*sub_trees))
 
 
-def Txt_to_csv(file_name: str):
+def Txt_to_csv(file_name: str, result_name):
     file = open(file_name, "r", encoding="utf8")
     text = file.readlines()
     sub = [sub_tree_line(data) for data in text]
     df = pd.DataFrame(data=sub, columns=['Sentences', 'sub_tree'])
-    df.to_csv("../Datasets/" + file_name[:-4] + "_dataset_links.csv", index=False)
+    df.to_csv("../Datasets/" + result_name + "_dataset_links.csv", mode='a', index=False, header=False)
 
+def Txt_to_csv_header(file_name: str, result_name):
+    file = open(file_name, "r", encoding="utf8")
+    text = file.readlines()
+    sub = [sub_tree_line(data) for data in text]
+    df = pd.DataFrame(data=sub, columns=['Sentences', 'sub_tree'])
+    df.to_csv("../Datasets/" + result_name + "_dataset_links.csv", index=False)
+
+# import os
+# i = 0
+# path = "gold"
+# for filename in os.listdir(path):
+#     if i == 0:
+#         Txt_to_csv_header(os.path.join(path, filename), path)
+#     else:
+#         Txt_to_csv(os.path.join(path, filename), path)
+#     i += 1
+#
+# i = 0
+# path = "silver"
+# for filename in os.listdir(path):
+#     if i == 0:
+#         Txt_to_csv_header(os.path.join(path, filename), path)
+#     else:
+#         Txt_to_csv(os.path.join(path, filename), path)
+#     i += 1
 
-Txt_to_csv("aa1_links.txt")
+# # reading csv files
+# data1 = pd.read_csv('../Datasets/gold_dataset_links.csv')
+# data2 = pd.read_csv('../Datasets/silver_dataset_links.csv')
+#
+# # using merge function by setting how='outer'
+# df = pd.merge(data1, data2, how='outer')
+#
df.to_csv("../Datasets/goldANDsilver_dataset_links.csv", index=False) diff --git a/train.py b/train.py index 8505431..9130177 100644 --- a/train.py +++ b/train.py @@ -1,7 +1,7 @@ import torch from Configuration import Configuration from Linker import * -from Supertagger import * +from deepgrail_Tagger.SuperTagger.SuperTagger import SuperTagger from utils import read_csv_pgbar torch.cuda.empty_cache() @@ -10,15 +10,21 @@ nb_sentences = batch_size * 200 epochs = int(Configuration.modelTrainingConfig['epoch']) file_path_axiom_links = 'Datasets/goldANDsilver_dataset_links.csv' +nb_sentences = batch_size * 20 +epochs = int(Configuration.modelTrainingConfig['epoch']) + +file_path_axiom_links = 'Datasets/gold_dataset_links.csv' df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences) sentences_batch = df_axiom_links["Sentences"].tolist() supertagger = SuperTagger() -supertagger.load_weights("models/model_supertagger.pt") +supertagger.load_weights("models/flaubert_super_98%_V2_50e.pt") + + sents_tokenized, sents_mask = supertagger.sent_tokenizer.fit_transform_tensors(sentences_batch) print("Linker") linker = Linker(supertagger) linker = linker.to(torch.device("cuda" if torch.cuda.is_available() else "cpu")) print("Linker Training") -linker.train_linker(df_axiom_links, sents_tokenized, sents_mask, validation_rate=0.1, epochs=epochs, batch_size=batch_size, checkpoint=True, validate=True) +linker.train_linker(df_axiom_links, sents_tokenized, sents_mask, validation_rate=0.1, epochs=epochs, batch_size=batch_size, checkpoint=False, validate=True) -- GitLab