diff --git a/Datasets/processingTXT.py b/Datasets/originals/processingTXT.py
similarity index 98%
rename from Datasets/processingTXT.py
rename to Datasets/originals/processingTXT.py
index 9fc9eca6509399be7b2d7015bce23c1e6ac201c0..cdcacc3f77a926e61dc6202727fd942b47b751ff 100644
--- a/Datasets/processingTXT.py
+++ b/Datasets/originals/processingTXT.py
@@ -11,9 +11,9 @@ import pandas as pd
 
 def normalize_word(orig_word):
     word = orig_word.lower()
-    if (word is "["):
+    if (word == "["):
         word = "("
-    if (word is "]"):
+    if (word == "]"):
         word = ")"
     return word
 
diff --git a/SuperTagger/Datasets/m2.txt b/SuperTagger/Datasets/originals/m2.txt
similarity index 100%
rename from SuperTagger/Datasets/m2.txt
rename to SuperTagger/Datasets/originals/m2.txt
diff --git a/SuperTagger/Datasets/processingTXT.py b/SuperTagger/Datasets/originals/processingTXT.py
similarity index 100%
rename from SuperTagger/Datasets/processingTXT.py
rename to SuperTagger/Datasets/originals/processingTXT.py
diff --git a/find_config.py b/find_config.py
new file mode 100644
index 0000000000000000000000000000000000000000..42d5e19aeb43b42164f663261a0d1c9fd6e6b2bd
--- /dev/null
+++ b/find_config.py
@@ -0,0 +1,61 @@
+import configparser
+import re
+
+import torch
+
+from Linker.atom_map import atom_map_redux
+from Linker.utils_linker import get_GOAL, get_atoms_links_batch, get_atoms_batch
+from SuperTagger.SuperTagger.SuperTagger import SuperTagger
+from utils import read_csv_pgbar, pad_sequence
+
+
+def configurate(dataset, model_tagger, nb_sentences=1000000000):
+    print("#" * 20)
+    print("#" * 20)
+    print("Configuration with dataset\n")
+    config = configparser.ConfigParser()
+    config.read('Configuration/config.ini')
+
+    file_path_axiom_links = dataset
+    df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
+
+    supertagger = SuperTagger()
+    supertagger.load_weights(model_tagger)
+    sentences_batch = df_axiom_links["X"].str.strip().tolist()
+    sentences_tokens, sentences_mask = supertagger.sent_tokenizer.fit_transform_tensors(sentences_batch)
+    max_len_sentence = 0
+    for sentence in sentences_tokens:
+        if len(sentence) > max_len_sentence:
+            max_len_sentence = len(sentence)
+    print("Configure parameter max len sentence to ", max_len_sentence)
+    config.set('DATASET_PARAMS', 'max_len_sentence', str(max_len_sentence))
+
+    atoms_batch, polarities, num_batch = get_GOAL(max_len_sentence, df_axiom_links)
+    max_atoms_in_sentence = 0
+    for sentence in atoms_batch:
+        if len(sentence) > max_atoms_in_sentence:
+            max_atoms_in_sentence = len(sentence)
+    print("Configure parameter max atoms in categories to", max_atoms_in_sentence)
+    config.set('DATASET_PARAMS', 'max_atoms_in_sentence', str(max_atoms_in_sentence))
+
+    atoms_polarity_batch = pad_sequence([torch.as_tensor(polarities[i], dtype=torch.bool) for i in range(len(polarities))],
+                                        max_len=max_atoms_in_sentence, padding_value=0)
+    pos_idx = [[torch.as_tensor([i for i, x in enumerate(sentence) if
+                                 bool(re.match(r"" + atom_type + "(_{1}\w+)?\Z", atoms_batch[s_idx][i]))
+                                 and atoms_polarity_batch[s_idx][i]])
+                for s_idx, sentence in enumerate(atoms_batch)]
+               for atom_type in list(atom_map_redux.keys())]
+    max_atoms_in_on_type = 0
+    for atoms_type_batch in pos_idx:
+        for sentence in atoms_type_batch:
+            length = sentence.size(0)
+            if length > max_atoms_in_on_type:
+                max_atoms_in_on_type = length
+    print("Configure parameter max atoms of one type in one sentence to", max_atoms_in_on_type)
+    config.set('DATASET_PARAMS', 'max_atoms_in_one_type', str(max_atoms_in_on_type * 2+2))
+
+    with open('Configuration/config.ini', 'w') as configfile:  # save
+        config.write(configfile)
+
+    print("#" * 20)
+    print("#" * 20)
diff --git a/slurm.sh b/slurm.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b2f521c72e4c0ffe8dafb66def0f8e398e12cac1
--- /dev/null
+++ b/slurm.sh
@@ -0,0 +1,13 @@
+#!/bin/sh
+#SBATCH --job-name=Deepgrail_Linker
+#SBATCH --partition=RTX6000Node
+#SBATCH --gres=gpu:1
+#SBATCH --mem=32000
+#SBATCH --gres-flags=enforce-binding
+#SBATCH --error="error_rtx1.err"
+#SBATCH --output="out_rtx1.out"
+
+module purge
+module load singularity/3.0.3
+
+srun singularity exec /logiciels/containerCollections/CUDA11/pytorch-NGC-21-03-py3.sif python "train_neuralproofnet.py"
diff --git a/train_neuralproofnet.py b/train_neuralproofnet.py
index daa6f651c783e10d7b909c8c839be603c1b9aad0..ce393adfa39c6bfa8b5b52e3d3888453ec892e52 100644
--- a/train_neuralproofnet.py
+++ b/train_neuralproofnet.py
@@ -16,7 +16,7 @@ print("#" * 20)
 print("#" * 20)
 model_tagger = "models/flaubert_super_98_V2_50e.pt"
 neural_proof_net = NeuralProofNet(model_tagger)
-neural_proof_net.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=5, batch_size=16,
+neural_proof_net.train_neuralproofnet(df_axiom_links, validation_rate=0.1, epochs=25, pretrain_linker_epochs=20, batch_size=16,
                                       checkpoint=True, tensorboard=True)
 print("#" * 20)
 print("#" * 20)
diff --git a/train_supertagger.py b/train_supertagger.py
index edba002387343ab034e4a1bc69488f5a28b1c2a8..2ea2aee9ae9e23fd5cc2091d51aa0afff0e3a5b8 100644
--- a/train_supertagger.py
+++ b/train_supertagger.py
@@ -1,5 +1,7 @@
 from SuperTagger.SuperTagger.SuperTagger import SuperTagger
 from utils import read_supertags_csv, load_obj
+import torch
+torch.cuda.empty_cache()
 
 # region data
 file_path = 'SuperTagger/Datasets/m2_dataset_V2.csv'
@@ -16,7 +18,7 @@ tagger = SuperTagger()
 tagger.create_new_model(len(index_to_super),'camembert-base',index_to_super)
 ## If you want to upload a pretrained model
 # tagger.load_weights("models/model_check.pt")
-tagger.train(texts, tags, epochs=2, batch_size=16, validation_rate=0.1,
+tagger.train(texts, tags, epochs=40, batch_size=16, validation_rate=0.1,
              tensorboard=True, checkpoint=True)
 
 # endregion