diff --git a/Configuration/config.ini b/Configuration/config.ini
index d05a21b070107c11f74e81f627e8accee93e4c22..28c050c124ca6383748dcbb1042d1444fbcaf491 100644
--- a/Configuration/config.ini
+++ b/Configuration/config.ini
@@ -12,13 +12,15 @@ max_atoms_in_one_type=250
 dim_encoder = 768
 
 [MODEL_DECODER]
-nhead=4
+nhead=8
+num_layers=1
 dropout=0.1
 dim_feedforward=512
 layer_norm_eps=1e-5
 
 [MODEL_LINKER]
 dim_embedding_atoms=256
+dim_pre_sinkhorn_transfo=32
 dim_polarity_transfo=256
 dropout=0.1
 sinkhorn_iters=3
@@ -26,6 +28,6 @@ sinkhorn_iters=3
 [MODEL_TRAINING]
 device=cpu
 batch_size=16
-epoch=20
+epoch=25
 seed_val=42
-learning_rate=2e-5
+learning_rate=2e-4
diff --git a/Linker/Linker.py b/Linker/Linker.py
index c88e880e1d1e17fa8e94423c7cc2cfb7ce42e0ea..0f40f1e8dbcf599a91db408ec27f557fc07b223e 100644
--- a/Linker/Linker.py
+++ b/Linker/Linker.py
@@ -39,15 +39,14 @@ class Linker(Module):
     def __init__(self, supertagger_path_model):
         super(Linker, self).__init__()
 
-        self.dim_encoder = int(Configuration.modelEncoderConfig['dim_encoder'])
-        self.dim_polarity_transfo = int(Configuration.modelLinkerConfig['dim_polarity_transfo'])
         self.dim_embedding_atoms = int(Configuration.modelLinkerConfig['dim_embedding_atoms'])
-        self.sinkhorn_iters = int(Configuration.modelLinkerConfig['sinkhorn_iters'])
         self.nhead = int(Configuration.modelDecoderConfig['nhead'])
-        self.max_len_sentence = int(Configuration.datasetConfig['max_len_sentence'])
+        dim_pre_sinkhorn_transfo = int(Configuration.modelLinkerConfig['dim_pre_sinkhorn_transfo'])
+        dim_polarity_transfo = int(Configuration.modelLinkerConfig['dim_polarity_transfo'])
+        self.sinkhorn_iters = int(Configuration.modelLinkerConfig['sinkhorn_iters'])
         self.max_atoms_in_sentence = int(Configuration.datasetConfig['max_atoms_in_sentence'])
         self.max_atoms_in_one_type = int(Configuration.datasetConfig['max_atoms_in_one_type'])
-        self.atom_vocab_size = int(Configuration.datasetConfig['atom_vocab_size'])
+        atom_vocab_size = int(Configuration.datasetConfig['atom_vocab_size'])
         learning_rate = float(Configuration.modelTrainingConfig['learning_rate'])
         self.dropout = Dropout(0.1)
         self.device = "cpu"
@@ -59,25 +58,22 @@ class Linker(Module):
         self.atom_map = atom_map
         self.padding_id = self.atom_map['[PAD]']
         self.atoms_tokenizer = AtomTokenizer(atom_map, self.max_atoms_in_sentence)
-        self.atoms_embedding = AtomEmbedding(self.dim_embedding_atoms, self.atom_vocab_size, self.padding_id)
+        self.atoms_embedding = AtomEmbedding(self.dim_embedding_atoms, atom_vocab_size, self.padding_id)
 
         self.linker_encoder = AttentionDecoderLayer()
 
         self.pos_transformation = Sequential(
-            FFN(self.dim_embedding_atoms, self.dim_polarity_transfo, 0.1),
-            LayerNorm(self.dim_embedding_atoms, eps=1e-12)
+            FFN(self.dim_embedding_atoms, dim_polarity_transfo, 0.1, d_out=dim_pre_sinkhorn_transfo),
+            LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-12)
         )
 
         self.neg_transformation = Sequential(
-            FFN(self.dim_embedding_atoms, self.dim_polarity_transfo, 0.1),
-            LayerNorm(self.dim_embedding_atoms, eps=1e-12)
+            FFN(self.dim_embedding_atoms, dim_polarity_transfo, 0.1, d_out=dim_pre_sinkhorn_transfo),
+            LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-12)
         )
 
         self.cross_entropy_loss = SinkhornLoss()
         self.optimizer = AdamW(self.parameters(), lr=learning_rate)
-        self.scheduler = get_cosine_schedule_with_warmup(self.optimizer,
-                                                         num_warmup_steps=0,
-                                                         num_training_steps=float(Configuration.modelTrainingConfig['epoch']))
 
         self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 
@@ -160,7 +156,7 @@ class Linker(Module):
             neg_encoding = self.neg_transformation(neg_encoding)
 
             weights = torch.bmm(pos_encoding, neg_encoding.transpose(2, 1))
-            link_weights.append(sinkhorn(weights, iters=3))
+            link_weights.append(sinkhorn(weights, iters=self.sinkhorn_iters))
 
         total_link_weights = torch.stack(link_weights)
         link_weights_per_batch = total_link_weights.permute(1, 0, 2, 3)
@@ -260,7 +256,6 @@ class Linker(Module):
 
                 # Update parameters and take a step using the computed gradient.
                 self.optimizer.step()
-                self.scheduler.step()
 
                 pred_axiom_links = torch.argmax(logits_predictions, dim=3)
                 accuracy_train += mesure_accuracy(batch_true_links, pred_axiom_links)
diff --git a/train.py b/train.py
index e1d6389c4efdab689e8792bfefa44859820cb50b..4c6645ea9b8e93faa610e8e10a9b6387f7ea1f02 100644
--- a/train.py
+++ b/train.py
@@ -1,19 +1,17 @@
 import torch
 from Configuration import Configuration
 from Linker import *
-from deepgrail_Tagger.SuperTagger.SuperTagger import SuperTagger
 from utils import read_csv_pgbar
 
 torch.cuda.empty_cache()
 
 batch_size = int(Configuration.modelTrainingConfig['batch_size'])
-nb_sentences = batch_size * 400
+nb_sentences = batch_size * 40
 epochs = int(Configuration.modelTrainingConfig['epoch'])
-
 file_path_axiom_links = 'Datasets/gold_dataset_links.csv'
 df_axiom_links = read_csv_pgbar(file_path_axiom_links, nb_sentences)
 
 print("Linker")
 linker = Linker("models/model_supertagger.pt")
 print("Linker Training")
-linker.train_linker(df_axiom_links, validation_rate=0.1, epochs=epochs, batch_size=batch_size, checkpoint=True, tensorboard=True)
+linker.train_linker(df_axiom_links, validation_rate=0.1, epochs=epochs, batch_size=batch_size, checkpoint=False, tensorboard=True)
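
Note on the Linker change: the pos/neg transformations now project atom embeddings from `dim_embedding_atoms` down to `dim_pre_sinkhorn_transfo` before the Sinkhorn step, and the iteration count is read from the config instead of being hard-coded. The sketch below is not the repository's code; the `FFN` signature with `d_out`, the log-space `sinkhorn`, and the dummy tensor shapes are assumptions used only to illustrate how the pieces wired up by this diff fit together with the values from the updated config.ini.

```python
# Hedged sketch (not the repo's implementation): pos/neg FFN projections
# followed by Sinkhorn normalization, using the dimensions from config.ini.
import torch
from torch import nn


class FFN(nn.Module):
    # Assumed shape of the repo's FFN after it gained a `d_out` argument.
    def __init__(self, d_in, d_hidden, dropout, d_out):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_in, d_hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(d_hidden, d_out),
        )

    def forward(self, x):
        return self.net(x)


def sinkhorn(log_weights, iters):
    # Assumed log-space Sinkhorn: alternately normalize rows and columns so
    # the matrix converges toward a doubly stochastic (permutation-like) one.
    for _ in range(iters):
        log_weights = log_weights - torch.logsumexp(log_weights, dim=2, keepdim=True)
        log_weights = log_weights - torch.logsumexp(log_weights, dim=1, keepdim=True)
    return log_weights


if __name__ == "__main__":
    # Values taken from the updated config.ini.
    dim_embedding_atoms, dim_polarity_transfo = 256, 256
    dim_pre_sinkhorn_transfo, sinkhorn_iters = 32, 3

    pos_transformation = nn.Sequential(
        FFN(dim_embedding_atoms, dim_polarity_transfo, 0.1, d_out=dim_pre_sinkhorn_transfo),
        nn.LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-12),
    )
    neg_transformation = nn.Sequential(
        FFN(dim_embedding_atoms, dim_polarity_transfo, 0.1, d_out=dim_pre_sinkhorn_transfo),
        nn.LayerNorm(dim_pre_sinkhorn_transfo, eps=1e-12),
    )

    # One atom type: batch of 16 sentences, up to 250 atoms per polarity
    # (batch_size and max_atoms_in_one_type from config.ini).
    pos_encoding = torch.randn(16, 250, dim_embedding_atoms)
    neg_encoding = torch.randn(16, 250, dim_embedding_atoms)

    weights = torch.bmm(pos_transformation(pos_encoding),
                        neg_transformation(neg_encoding).transpose(2, 1))
    link_weights = sinkhorn(weights, iters=sinkhorn_iters)
    print(link_weights.shape)  # torch.Size([16, 250, 250])
```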