From 843cc70b6d8ab8e991053d1cfd57505979cfdd52 Mon Sep 17 00:00:00 2001
From: Caroline DE POURTALES <caroline.de-pourtales@irit.fr>
Date: Tue, 28 Mar 2023 18:10:27 +0200
Subject: [PATCH] easier to configure

---
 Configuration/config.ini                      |  9 +++--
 SuperTagger/SuperTagger/SentencesTokenizer.py |  4 +-
 SuperTagger/SuperTagger/SuperTagger.py        |  9 ++++-
 SuperTagger/SuperTagger/SymbolTokenizer.py    |  1 +
 SuperTagger/SuperTagger/eval.py               |  1 -
 find_config.py                                | 38 +++++++++++++++++--
 train_supertagger.py                          | 18 ++++++---
 utils.py                                      | 12 ++----
 8 files changed, 67 insertions(+), 25 deletions(-)

diff --git a/Configuration/config.ini b/Configuration/config.ini
index a695829..4f253f4 100644
--- a/Configuration/config.ini
+++ b/Configuration/config.ini
@@ -4,9 +4,9 @@ transformers = 4.16.2
 [DATASET_PARAMS]
 symbols_vocab_size = 26
 atom_vocab_size = 18
-max_len_sentence = 300
-max_atoms_in_sentence = 900
-max_atoms_in_one_type = 360
+max_len_sentence = 290
+max_atoms_in_sentence = 440
+max_atoms_in_one_type = 180
 
 [MODEL_ENCODER]
 dim_encoder = 768
@@ -20,4 +20,5 @@ dim_cat_out = 256
 dim_intermediate_ffn = 128
 dim_pre_sinkhorn_transfo = 32
 dropout = 0.1
-sinkhorn_iters = 5
\ No newline at end of file
+sinkhorn_iters = 5
+
diff --git a/SuperTagger/SuperTagger/SentencesTokenizer.py b/SuperTagger/SuperTagger/SentencesTokenizer.py
index 104577f..dab638a 100644
--- a/SuperTagger/SuperTagger/SentencesTokenizer.py
+++ b/SuperTagger/SuperTagger/SentencesTokenizer.py
@@ -25,7 +25,9 @@ class SentencesTokenizer():
         """
         Tokenizes the given sentences
         """
-        return self.tokenizer(sents, padding=True)
+        temp = self.tokenizer(sents, padding=True)
+
+        return temp["input_ids"], temp["attention_mask"]
 
     def fit_transform_tensors(self, sents):
         """
diff --git a/SuperTagger/SuperTagger/SuperTagger.py b/SuperTagger/SuperTagger/SuperTagger.py
index 670849d..cabef35 100644
--- a/SuperTagger/SuperTagger/SuperTagger.py
+++ b/SuperTagger/SuperTagger/SuperTagger.py
@@ -5,6 +5,7 @@ import time
 import torch
 import transformers
 from torch.optim import Adam
+from torch.optim.lr_scheduler import StepLR
 from torch.utils.data import TensorDataset, random_split
 from tqdm import tqdm
 from transformers import AutoTokenizer
@@ -70,6 +71,7 @@ class SuperTagger:
 
         self.model = None
         self.optimizer = None
+        self.scheduler = None
         self.epoch_i = 0
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -104,6 +106,7 @@
                                   self.max_len_sentence)
         self.model.load_state_dict(params['state_dict'])
         self.optimizer = params['optimizer']
+        self.scheduler = StepLR(self.optimizer, step_size=10, gamma=0.5)
         # self.epoch_i = args['epoch']
         print("\n The loading checkpoint was successful ! \n")
         print("\tBert model : ", self.bert_name)
@@ -144,7 +147,9 @@
             bert_name, do_lower_case=True), self.max_len_sentence)
 
 
-        self.optimizer = Adam(params=self.model.parameters(), lr=1e-3, eps=1e-8)
+
+        self.optimizer = Adam(params=self.model.parameters(), lr=1e-5, eps=1e-8)
+        self.scheduler = StepLR(self.optimizer, step_size=10, gamma=0.9)
         self.tags_tokenizer = SymbolTokenizer(index_to_tags)
         self.trainable = True
         self.model_load = True
@@ -333,6 +338,8 @@
             self.optimizer.step()
             i += 1
 
+
+        self.scheduler.step()
 
         # Measure how long this epoch took.
         training_time = format_time(time.time() - t0)
diff --git a/SuperTagger/SuperTagger/SymbolTokenizer.py b/SuperTagger/SuperTagger/SymbolTokenizer.py
index dafcc12..032da97 100644
--- a/SuperTagger/SuperTagger/SymbolTokenizer.py
+++ b/SuperTagger/SuperTagger/SymbolTokenizer.py
@@ -56,3 +56,4 @@ class SymbolTokenizer():
 
         return labels
 
+
diff --git a/SuperTagger/SuperTagger/eval.py b/SuperTagger/SuperTagger/eval.py
index 1fd1b97..84e2c6c 100644
--- a/SuperTagger/SuperTagger/eval.py
+++ b/SuperTagger/SuperTagger/eval.py
@@ -16,4 +16,3 @@
                 good_label += 1
             nb_label += 1
     return good_label / nb_label
-    
\ No newline at end of file
diff --git a/find_config.py b/find_config.py
index 42d5e19..050ad80 100644
--- a/find_config.py
+++ b/find_config.py
@@ -6,10 +6,38 @@ import torch
 from Linker.atom_map import atom_map_redux
 from Linker.utils_linker import get_GOAL, get_atoms_links_batch, get_atoms_batch
 from SuperTagger.SuperTagger.SuperTagger import SuperTagger
-from utils import read_csv_pgbar, pad_sequence
+from utils import read_links_csv, read_supertags_csv, pad_sequence, load_obj
 
+def configurate_supertagger(dataset, index_to_super_path, model_tagger, nb_sentences=1000000000):
+    print("#" * 20)
+    print("#" * 20)
+    print("Configuration with dataset\n")
+    config = configparser.ConfigParser()
+    config.read('Configuration/config.ini')
+
+    df = read_supertags_csv(dataset, nb_sentences)
+    index_to_super = load_obj(index_to_super_path)
 
-def configurate(dataset, model_tagger, nb_sentences=1000000000):
+    supertagger = SuperTagger()
+    supertagger.create_new_model(len(index_to_super),model_tagger,index_to_super)
+
+    sentences_batch = df["X"].str.strip().tolist()
+    sentences_tokens, sentences_mask = supertagger.sent_tokenizer.fit_transform(sentences_batch)
+    max_len_sentence = 0
+    for sentence in sentences_tokens:
+        if len(sentence) > max_len_sentence:
+            max_len_sentence = len(sentence)
+    print("Configure parameter max len sentence to ", max_len_sentence)
+    config.set('DATASET_PARAMS', 'max_len_sentence', str(max_len_sentence))
+
+    with open('Configuration/config.ini', 'w') as configfile:  # save
+        config.write(configfile)
+
+    print("#" * 20)
+    print("#" * 20)
+
+
+def configurate_linker(dataset, model_tagger, nb_sentences=1000000000):
     print("#" * 20)
     print("#" * 20)
     print("Configuration with dataset\n")
@@ -17,12 +45,13 @@
     config.read('Configuration/config.ini')
 
     file_path_axiom_links = dataset
-    df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
+    df_axiom_links = read_links_csv(file_path_axiom_links, nb_sentences)
     supertagger = SuperTagger()
     supertagger.load_weights(model_tagger)
+
    sentences_batch = df_axiom_links["X"].str.strip().tolist()
-    sentences_tokens, sentences_mask = supertagger.sent_tokenizer.fit_transform_tensors(sentences_batch)
+    sentences_tokens, sentences_mask = supertagger.sent_tokenizer.fit_transform(sentences_batch)
     max_len_sentence = 0
     for sentence in sentences_tokens:
         if len(sentence) > max_len_sentence:
             max_len_sentence = len(sentence)
@@ -59,3 +88,4 @@
 
     print("#" * 20)
     print("#" * 20)
+
diff --git a/train_supertagger.py b/train_supertagger.py
index 571edc6..67d8d7c 100644
--- a/train_supertagger.py
+++ b/train_supertagger.py
@@ -1,24 +1,30 @@
 from SuperTagger.SuperTagger.SuperTagger import SuperTagger
 from utils import read_supertags_csv, load_obj
+from find_config import configurate_supertagger
+
 import torch
 
 torch.cuda.empty_cache()
 
+dataset = 'SuperTagger/Datasets/m2_dataset_V2.csv'
+index_to_super_path = 'SuperTagger/Datasets/index_to_super'
+bert_model = "flaubert/flaubert_base_cased"
+
+configurate_supertagger(dataset, index_to_super_path, bert_model, nb_sentences=1000000000)
+
 # region data
-file_path = 'SuperTagger/Datasets/m2_dataset_V2.csv'
-df = read_supertags_csv(file_path)
+df = read_supertags_csv(dataset)
 texts = df['X'].tolist()
 tags = df['Z'].tolist()
 
-index_to_super = load_obj('SuperTagger/Datasets/index_to_super')
+index_to_super = load_obj(index_to_super_path)
 # endregion
 
-
 # region model
 tagger = SuperTagger()
-tagger.create_new_model(len(index_to_super),"flaubert/flaubert_base_cased",index_to_super)
+tagger.create_new_model(len(index_to_super),bert_model,index_to_super)
 ## If you want to upload a pretrained model
 # tagger.load_weights("models/model_check.pt")
-tagger.train(texts, tags, epochs=60, batch_size=16, validation_rate=0.1,
+tagger.train(texts, tags, epochs=70, batch_size=16, validation_rate=0.1,
              tensorboard=True, checkpoint=True)
 # endregion
diff --git a/utils.py b/utils.py
index b1c9f85..95217aa 100644
--- a/utils.py
+++ b/utils.py
@@ -20,13 +20,9 @@ def read_links_csv(csv_path, nrows=float('inf'), chunksize=100):
     print("\n" + "#" * 20)
     print("Loading csv...")
 
-    rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1  # minus the header
     chunk_list = []
 
-    if rows > nrows:
-        rows = nrows
-
-    with tqdm(total=nrows, desc='Rows read: ') as bar:
+    with tqdm(total=rows, desc='Rows read: ') as bar:
         for chunk in pd.read_csv(csv_path, header=0, converters={'Y': pd.eval, 'Z': pd.eval},
                                  chunksize=chunksize, nrows=nrows):
             chunk_list.append(chunk)
@@ -55,9 +51,9 @@ def read_supertags_csv(csv_path, nrows=float('inf'), chunksize=100):
     if rows > nrows:
         rows = nrows
 
-    with tqdm(total=nrows, desc='Rows read: ') as bar:
-        for chunk in pd.read_csv(csv_path, header=0, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval},
-                                 chunksize=chunksize, nrows=nrows):
+    with tqdm(total=rows, desc='Rows read: ') as bar:
+        for chunk in pd.read_csv(csv_path, converters={'Y1': pd.eval, 'Y2': pd.eval, 'Z': pd.eval}, chunksize=chunksize,
+                                 nrows=rows):
             chunk_list.append(chunk)
             bar.update(len(chunk))
 
-- 
GitLab
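
Note on the utils.py hunk: as patched, read_links_csv drops the line that computed `rows` while the new progress bar still passes `total=rows`, so calling it would raise a NameError. Below is a minimal sketch of the helper with the row count kept, mirroring what the patch does in read_supertags_csv; the concatenation and return at the end are assumptions, since the tail of the function is not shown in this patch.

    import pandas as pd
    from tqdm import tqdm

    def read_links_csv(csv_path, nrows=float('inf'), chunksize=100):
        """Read the axiom-links CSV in chunks, with a progress bar sized to the real row count."""
        print("\n" + "#" * 20)
        print("Loading csv...")

        # Count data rows once (minus the header) so tqdm can show a true total.
        rows = sum(1 for _ in open(csv_path, 'r', encoding="utf8")) - 1
        if rows > nrows:
            rows = nrows

        chunk_list = []
        with tqdm(total=rows, desc='Rows read: ') as bar:
            for chunk in pd.read_csv(csv_path, header=0, converters={'Y': pd.eval, 'Z': pd.eval},
                                     chunksize=chunksize, nrows=rows):
                chunk_list.append(chunk)
                bar.update(len(chunk))

        # Assumed tail: stitch the chunks back into a single DataFrame.
        df = pd.concat(chunk_list, axis=0)
        print("#" * 20 + "\n")
        return df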
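A second note, on the scheduler added to SuperTagger: load_weights rebuilds a fresh StepLR(self.optimizer, step_size=10, gamma=0.5) on every load, so a resumed run restarts the decay schedule from step 0 (and with a different gamma than the 0.9 used in create_new_model). If resuming the schedule matters, a small sketch of carrying it through the checkpoint follows; the 'scheduler' key is an assumption, not the repository's current checkpoint format.

    import torch
    from torch.optim.lr_scheduler import StepLR

    def save_checkpoint(model, optimizer, scheduler, path):
        # Store the scheduler state next to the model and optimizer, as an extra key.
        torch.save({
            'state_dict': model.state_dict(),
            'optimizer': optimizer,               # kept as a whole object, matching the patch
            'scheduler': scheduler.state_dict()   # assumed additional key
        }, path)

    def load_checkpoint(model, path, device='cpu'):
        params = torch.load(path, map_location=device)
        model.load_state_dict(params['state_dict'])
        optimizer = params['optimizer']
        # Rebuild StepLR around the restored optimizer, then restore its step counter.
        scheduler = StepLR(optimizer, step_size=10, gamma=0.5)
        if 'scheduler' in params:
            scheduler.load_state_dict(params['scheduler'])
        return model, optimizer, scheduler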