diff --git a/Configuration/config.ini b/Configuration/config.ini
index 28c050c124ca6383748dcbb1042d1444fbcaf491..2a7495fed35267f6d1582ca71c192dd5640bf7af 100644
--- a/Configuration/config.ini
+++ b/Configuration/config.ini
@@ -6,7 +6,7 @@ symbols_vocab_size=26
 atom_vocab_size=20
 max_len_sentence=109
 max_atoms_in_sentence=1250
-max_atoms_in_one_type=250
+max_atoms_in_one_type=324
 
 [MODEL_ENCODER]
 dim_encoder = 768
@@ -26,8 +26,7 @@ dropout=0.1
 sinkhorn_iters=3
 
 [MODEL_TRAINING]
-device=cpu
 batch_size=16
-epoch=25
+epoch=30
 seed_val=42
 learning_rate=2e-4
diff --git a/Linker/Linker.py b/Linker/Linker.py
index ff37ef83fdeec183faf6c6e24fb110c552de9d43..a3306db59234b41f443f5e38a1dd7a584504b355 100644
--- a/Linker/Linker.py
+++ b/Linker/Linker.py
@@ -22,9 +22,10 @@ from Linker.atom_map import atom_map
 from Linker.eval import mesure_accuracy, SinkhornLoss
 from Linker.utils_linker import find_pos_neg_idexes, get_atoms_batch, FFN, get_axiom_links, get_pos_encoding_for_s_idx, \
     get_neg_encoding_for_s_idx
-from SuperTagger import *
+from Supertagger import *
 from utils import pad_sequence
 
+
 def format_time(elapsed):
     '''
     Takes a time in seconds and returns a string hh:mm:ss
@@ -35,6 +36,7 @@ def format_time(elapsed):
     # Format as hh:mm:ss
     return str(datetime.timedelta(seconds=elapsed_rounded))
 
+
 def output_create_dir():
     """
     Create le output dir for tensorboard and checkpoint
@@ -99,16 +101,17 @@ class Linker(Module):
         Returns:
             the training dataloader and the validation dataloader. They contains the list of atoms, their polarities,
             the axiom links, the sentences tokenized, sentence mask
         """
-        sentences_batch = df_axiom_links["Sentences"].tolist()
+        print("Start preprocess Data")
+        sentences_batch = df_axiom_links["X"].tolist()
         sentences_tokens, sentences_mask = self.Supertagger.sent_tokenizer.fit_transform_tensors(sentences_batch)
 
-        atoms_batch = get_atoms_batch(df_axiom_links["sub_tree"])
+        atoms_batch = get_atoms_batch(df_axiom_links["Z"])
         atoms_batch_tokenized = self.atoms_tokenizer.convert_batchs_to_ids(atoms_batch)
 
-        atoms_polarity_batch = find_pos_neg_idexes(self.max_atoms_in_sentence, df_axiom_links["sub_tree"])
+        atoms_polarity_batch = find_pos_neg_idexes(self.max_atoms_in_sentence, df_axiom_links["Z"])
         truth_links_batch = get_axiom_links(self.max_atoms_in_one_type, atoms_polarity_batch,
-                                            df_axiom_links["sub_tree"])
+                                            df_axiom_links["Y"])
         truth_links_batch = truth_links_batch.permute(1, 0, 2)
 
         # Construction tensor dataset
@@ -125,6 +128,7 @@ class Linker(Module):
             train_dataset = dataset
 
         training_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
+        print("End preprocess Data")
         return training_dataloader, validation_dataloader
 
     def make_decoder_mask(self, atoms_token):
@@ -202,7 +206,6 @@ class Linker(Module):
             print('Training...')
             avg_train_loss, avg_accuracy_train, training_time = self.train_epoch(training_dataloader)
 
-            print("")
             print(f'Epoch: {epoch_i + 1:02} | Epoch Time: {training_time}')
             print(f'\tTrain Loss: {avg_train_loss:.3f} | Train Acc: {avg_accuracy_train * 100:.2f}%')
 
@@ -215,7 +218,6 @@ class Linker(Module):
                 self.__checkpoint_save(
                     path=os.path.join("Output", 'linker' + datetime.today().strftime('%d-%m_%H-%M') + '.pt'))
 
-
             if tensorboard:
                 writer.add_scalars(f'Accuracy', {
                     'Train': avg_accuracy_train}, epoch_i)
diff --git a/Linker/utils_linker.py b/Linker/utils_linker.py
index 1eacad663bd829d90b29ec7fd837ab7245a7ce2f..2ab2a2dff8cc4b5cf5deba9501c49dc6b2dc6483 100644
--- a/Linker/utils_linker.py
+++ b/Linker/utils_linker.py
@@ -23,6 +23,7 @@ class FFN(Module):
         return self.ffn(x)
 
 
+regex_categories_axiom_links = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
 regex_categories = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'
 
 
@@ -76,7 +77,7 @@ def category_to_atoms_axiom_links(category, categories_to_atoms):
     elif True in res:
         return [category]
     else:
-        category_cut = regex.match(regex_categories, category).groups()
+        category_cut = regex.match(regex_categories_axiom_links, category).groups()
         category_cut = [cat for cat in category_cut if cat is not None]
         for cat in category_cut:
             categories_to_atoms += category_to_atoms_axiom_links(cat, [])
@@ -111,14 +112,16 @@ def category_to_atoms(category, categories_to_atoms):
     Returns:
         List of atoms inside the category in prefix order
     """
-    res = [bool(re.match(r'' + atom_type + "_\d+", category)) for atom_type in atom_map.keys()]
+    res = [bool(re.match(r'' + atom_type, category)) for atom_type in atom_map.keys()]
     if category.startswith("GOAL:"):
         word, cat = category.split(':')
         return category_to_atoms(cat, categories_to_atoms)
     elif True in res:
-        category = re.match(r'([a-zA-Z|_]+)_\d+', category).group(1)
         return [category]
+    elif category == "let":
+        return []
     else:
+        print(category)
         category_cut = regex.match(regex_categories, category).groups()
         category_cut = [cat for cat in category_cut if cat is not None]
         for cat in category_cut:
@@ -155,21 +158,21 @@ def category_to_atoms_polarity(category, polarity):
         Boolean Tensor of shape max_symbols_in_word, containing 1 for pos indexes and 0 for neg indexes
     """
     category_to_polarity = []
-    res = [bool(re.match(r'' + atom_type + "_\d+", category)) for atom_type in atom_map.keys()]
+    res = [bool(re.match(r'' + atom_type, category)) for atom_type in atom_map.keys()]
 
     # mot final
     if category.startswith("GOAL:"):
         word, cat = category.split(':')
-        res = [bool(re.match(r'' + atom_type + "_\d+", cat)) for atom_type in atom_map.keys()]
+        res = [bool(re.match(r'' + atom_type, cat)) for atom_type in atom_map.keys()]
         if True in res:
             category_to_polarity.append(True)
         else:
             category_to_polarity += category_to_atoms_polarity(cat, True)
-
+    elif category == "let":
+        pass
     # le mot a une category atomique
     elif True in res or category.startswith("dia") or category.startswith("box"):
         category_to_polarity.append(not polarity)
-
     # sinon c'est une formule longue
     else:
         # dr = /
diff --git a/Supertagger b/Supertagger
new file mode 160000
index 0000000000000000000000000000000000000000..eeb4774c071e03f460f48798ab8d6820395825c9
--- /dev/null
+++ b/Supertagger
@@ -0,0 +1 @@
+Subproject commit eeb4774c071e03f460f48798ab8d6820395825c9
diff --git a/bash_GPU.sh b/bash_GPU.sh
index 99692203e0a64519649244caa479801da0500a2a..d98ae661d3d2d2e7a7f37f4f5592ff4a2a0ff0ee 100644
--- a/bash_GPU.sh
+++ b/bash_GPU.sh
@@ -1,5 +1,5 @@
 #!/bin/sh
-#SBATCH --job-name=Deepgrail_Linker
+#SBATCH --job-name=Deepgrail_Linker_9000
 #SBATCH --partition=RTX6000Node
 #SBATCH --gres=gpu:1
 #SBATCH --mem=32000
diff --git a/command_line.txt b/command_line.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7abcb351f1aa129d614ca8f9e6bb0bfea94def85
--- /dev/null
+++ b/command_line.txt
@@ -0,0 +1,4 @@
+scp -r cdepourt@osirim-slurm.irit.fr:projets/deepgrail2/deepgrail_RNN_with_linker/TensorBoard/Tranning_19-05_09-49/logs /home/cdepourt/Bureau/deepgrail_RNN_with_linker/logs
+
+rsync -av -e ssh --exclude="__pycache__" --exclude="venv" --exclude=".git" --exclude=".idea" -r /home/cdepourt/Bureau/deepgrail_RNN_with_linker cdepourt@osirim-slurm.irit.fr:projets/deepgrail2
+
diff --git a/utils.py b/utils.py
index 205d21d31f5e254e60dd01e4effdcb3dffcd6d8f..0433510b2838731d38fd2e42e16a4a7b94ecf3b8 100644
--- a/utils.py
+++ b/utils.py
@@ -37,7 +37,7 @@ def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500):
         chunksize = nrows
 
     with tqdm(total=rows, desc='Rows read: ') as bar:
-        for chunk in pd.read_csv(csv_path, converters={'sub_tree': pd.eval, 'Z': pd.eval}, chunksize=chunksize, nrows=rows):
+        for chunk in pd.read_csv(csv_path, converters={'Y': pd.eval, 'Z': pd.eval}, chunksize=chunksize, nrows=rows):
             chunk_list.append(chunk)
             bar.update(len(chunk))
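Note (not part of the patch): after this change the Linker reads its dataset through three renamed columns, X for the raw sentences, Y for the axiom-link trees and Z for the category sub-trees, with Y and Z parsed by pd.eval in read_csv_pgbar. A minimal sketch of that column convention, assuming a hypothetical dataset path:

    import pandas as pd

    # "dataset.csv" is a hypothetical path; the Y and Z columns store
    # Python-literal trees as strings, hence the pd.eval converters.
    df = pd.read_csv("dataset.csv", converters={'Y': pd.eval, 'Z': pd.eval}, nrows=5)

    sentences = df["X"].tolist()  # fed to sent_tokenizer.fit_transform_tensors
    links = df["Y"]               # fed to get_axiom_links
    subtrees = df["Z"]            # fed to get_atoms_batch / find_pos_neg_idexes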
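Note (not part of the patch): the new regex_categories_axiom_links constant is currently identical to regex_categories; both rely on the third-party regex package, whose (?R) construct matches nested categories recursively, and the .groups() filtering in category_to_atoms keeps the non-None captures as sub-categories. A minimal sketch with a hypothetical flat category:

    import regex  # third-party package; the stdlib re module has no (?R)

    regex_categories = r'\w+\(\d+,(?:((?R))|(\w+))*,?(?:((?R))|(\w+))*\)'

    # Split "dr(0,np,n)" into its argument sub-categories, as category_to_atoms does.
    groups = regex.match(regex_categories, "dr(0,np,n)").groups()
    print([g for g in groups if g is not None])  # ['np', 'n']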