diff --git a/SuperTagger/Linker/AttentionLayer.py b/SuperTagger/Linker/AttentionLayer.py
deleted file mode 100644
index 150df88dec9c2ec815e1e9ea3c2659154e06d636..0000000000000000000000000000000000000000
--- a/SuperTagger/Linker/AttentionLayer.py
+++ /dev/null
@@ -1,109 +0,0 @@
-from torch import Tensor
-import torch
-from torch.nn import (GELU, Dropout, LayerNorm, Linear, Module, MultiheadAttention,
-                      Sequential)
-
-from Configuration import Configuration
-from SuperTagger.Symbol.SymbolEmbedding import SymbolEmbedding
-
-
-class FFN(Module):
-    "Implements FFN equation."
-
-    def __init__(self, d_model, d_ff, dropout=0.1):
-        super(FFN, self).__init__()
-        self.ffn = Sequential(
-            Linear(d_model, d_ff, bias=False),
-            GELU(),
-            Dropout(dropout),
-            Linear(d_ff, d_model, bias=False)
-        )
-
-    def forward(self, x):
-        return self.ffn(x)
-
-
-class AttentionLayer(Module):
-    r"""TransformerDecoderLayer is made up of self-attn, multi-head-attn and feedforward network.
-    This standard decoder layer is based on the paper "Attention Is All You Need".
-    Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N Gomez,
-    Lukasz Kaiser, and Illia Polosukhin. 2017. Attention is all you need. In Advances in
-    Neural Information Processing Systems, pages 6000-6010. Users may modify or implement
-    in a different way during application.
-
-    Args:
-        dim_model: the number of expected features in the input (required).
-        nhead: the number of heads in the multiheadattention models (required).
-        dim_feedforward: the dimension of the feedforward network model (default=2048).
-        dropout: the dropout value (default=0.1).
-        activation: the activation function of the intermediate layer, can be a string
-            ("relu" or "gelu") or a unary callable. Default: relu
-        layer_norm_eps: the eps value in layer normalization components (default=1e-5).
-        batch_first: If ``True``, then the input and output tensors are provided
-            as (batch, seq, feature). Default: ``False``.
-        norm_first: if ``True``, layer norm is done prior to self attention, multihead
-            attention and feedforward operations, respectively. Otherwise it's done after.
-            Default: ``False`` (after).
- """ - __constants__ = ['batch_first', 'norm_first'] - - def __init__(self) -> None: - super(AttentionLayer, self).__init__() - - # init params - dim_encoder = int(Configuration.modelEncoderConfig['dim_encoder']) - dim_embedding_atoms = int(Configuration.modelLinkerConfig['dim_embedding_atoms']) - dim_feedforward = int(Configuration.modelLinkerConfig['dim_feedforward']) - dropout = float(Configuration.modelLinkerConfig['dropout']) - layer_norm_eps = float(Configuration.modelLinkerConfig['layer_norm_eps']) - self.nhead = int(Configuration.modelLinkerConfig['nhead']) - self.max_symbols_in_sentence = int(Configuration.datasetConfig['max_symbols_in_sentence']) - - self.symbols_embedder = SymbolEmbedding(self.dim_embedding_atoms, self.symbols_vocab_size) - - # layers - self.dropout = Dropout(dropout) - self.self_attn = MultiheadAttention(dim_embedding_atoms, self.nhead, dropout=dropout, batch_first=True, - kdim=dim_embedding_atoms, vdim=dim_embedding_atoms) - self.norm1 = LayerNorm(dim_embedding_atoms, eps=layer_norm_eps) - self.multihead_attn = MultiheadAttention(dim_embedding_atoms, self.nhead, dropout=dropout, - kdim=dim_encoder, vdim=dim_encoder, - batch_first=True) - self.norm2 = LayerNorm(dim_embedding_atoms, eps=layer_norm_eps) - self.ffn = FFN(d_model=dim_embedding_atoms, d_ff=dim_feedforward, dropout=dropout) - self.norm3 = LayerNorm(dim_embedding_atoms, eps=layer_norm_eps) - - def forward(self, atoms_embeddings, sents_embedding, encoder_mask, decoder_mask): - r"""Pass the inputs through the decoder layer. - - Args: - atoms: the sequence to the decoder layer (required). - sents: the sequence from the last layer of the encoder (required). - """ - x = atoms_embeddings - x = self.norm1(x + self._mask_mha_block(x, decoder_mask)) - x = self.norm2(x + self._mha_block(x, sents_embedding, encoder_mask)) - x = self.norm3(x + self._ff_block(x)) - - return x - - # self-attention block - def _mask_mha_block(self, x: Tensor, decoder_mask: Tensor) -> Tensor: - if decoder_mask is not None: - # Same mask applied to all h heads. - decoder_mask = decoder_mask.repeat(self.nhead, 1, 1) - x = self.self_attn(x, x, x, attn_mask=decoder_mask)[0] - return x - - # multihead attention block - def _mha_block(self, x: Tensor, sents_embs: Tensor, encoder_mask: Tensor) -> Tensor: - if encoder_mask is not None: - # Same mask applied to all h heads. 
-            encoder_mask = encoder_mask.repeat(self.nhead, 1, 1)
-        x = self.multihead_attn(x, sents_embs, sents_embs, attn_mask=encoder_mask)[0]
-        return x
-
-    # feed forward block
-    def _ff_block(self, x: Tensor) -> Tensor:
-        x = self.ffn.forward(x)
-        return x
diff --git a/SuperTagger/Linker/Linker.py b/SuperTagger/Linker/Linker.py
index 208e8d4137a26b0b2d5822408437f88d334e17d7..181b33ff4955f2958c4074c7d788117a21910d15 100644
--- a/SuperTagger/Linker/Linker.py
+++ b/SuperTagger/Linker/Linker.py
@@ -3,17 +3,34 @@ from itertools import chain
 import torch
 from torch.nn import Sequential, LayerNorm, Linear, Dropout, GELU
 from torch.nn import Module
+import torch.nn.functional as F
 
 from Configuration import Configuration
 from SuperTagger.Linker.AtomEmbedding import AtomEmbedding
 from SuperTagger.Linker.AtomTokenizer import AtomTokenizer
 from SuperTagger.Linker.atom_map import atom_map
 from SuperTagger.Linker.Sinkhorn import sinkhorn_fn_no_exp as sinkhorn
-from SuperTagger.Linker.utils import find_pos_neg_idexes, get_atoms_batch, mesure_accuracy
-from SuperTagger.Linker.AttentionLayer import FFN, AttentionLayer
+from SuperTagger.Linker.utils import find_pos_neg_idexes, get_atoms_batch
+from SuperTagger.eval import mesure_accuracy
 from SuperTagger.utils import pad_sequence
 
 
+class FFN(Module):
+    "Implements FFN equation."
+
+    def __init__(self, d_model, d_ff, dropout=0.1):
+        super(FFN, self).__init__()
+        self.ffn = Sequential(
+            Linear(d_model, d_ff, bias=False),
+            GELU(),
+            Dropout(dropout),
+            Linear(d_ff, d_model, bias=False)
+        )
+
+    def forward(self, x):
+        return self.ffn(x)
+
+
 class Linker(Module):
     def __init__(self):
         super(Linker, self).__init__()
@@ -33,7 +50,7 @@ class Linker(Module):
         self.atom_embedding = AtomEmbedding(self.dim_embedding_atoms, self.atom_vocab_size, self.padding_id)
 
         # to do: define an encoding
-        self.linker_encoder = AttentionLayer()
+        # self.linker_encoder =
 
         self.pos_transformation = Sequential(
             FFN(self.dim_polarity_transfo, self.dim_polarity_transfo, 0.1),
@@ -49,7 +66,7 @@ class Linker(Module):
         decoder_attn_mask[atoms_batch.eq(self.padding_id)] = 0.0
         return decoder_attn_mask.unsqueeze(1).repeat(1, atoms_batch.shape[1], 1)
 
-    def forward(self, category_batch, sents_embedding, sents_mask):
+    def forward(self, category_batch, sents_embedding):
         '''
         Parameters :
         category_batch : batch of size (batch_size, sequence_length) = output of decoder
@@ -96,26 +113,26 @@ class Linker(Module):
             weights = torch.bmm(pos_encoding, neg_encoding.transpose(2, 1))
             link_weights.append(sinkhorn(weights, iters=3))
 
-        return link_weights
+        return torch.cat([link_weights[i].unsqueeze(0) for i in range(len(link_weights))])
 
-    def predict_axiom_links(self, b_sents_tokenized, b_sents_mask):
-        return None
+    def eval_batch(self, supertagger, batch, cross_entropy_loss):
+        batch_categories = batch[0].to("cuda" if torch.cuda.is_available() else "cpu")
+        batch_sentences = batch[1].to("cuda" if torch.cuda.is_available() else "cpu")
+        batch_axiom_links = batch[2].to("cuda" if torch.cuda.is_available() else "cpu")
 
-    def eval_batch(self, batch, cross_entropy_loss):
-        b_sents_tokenized = batch[0].to("cuda" if torch.cuda.is_available() else "cpu")
-        b_sents_mask = batch[1].to("cuda" if torch.cuda.is_available() else "cpu")
-        b_category = batch[2].to("cuda" if torch.cuda.is_available() else "cpu")
+        batch_sentences_embedding = supertagger(batch_sentences, batch_sentences)
 
-        logits_axiom_links_pred = self.predict_axiom_links(b_sents_tokenized, b_sents_mask)
+        logits_axiom_links_pred = self.forward(batch_categories, batch_sentences_embedding)
 
         # Softmax and argmax
-        axiom_links_pred = torch.argmax(torch.nn.functional.softmax(logits_axiom_links_pred, dim=2), dim=2)
-        accuracy = mesure_accuracy(b_category, axiom_links_pred)
-        loss = float(cross_entropy_loss(axiom_links_pred, b_category))
+        axiom_links_pred = torch.argmax(F.softmax(logits_axiom_links_pred, dim=2), dim=2)
+
+        accuracy = mesure_accuracy(batch_axiom_links, axiom_links_pred)
+        loss = float(cross_entropy_loss(axiom_links_pred, batch_axiom_links))
 
         return accuracy, loss
 
-    def eval_epoch(self, dataloader, cross_entropy_loss):
+    def eval_epoch(self, supertagger, dataloader, cross_entropy_loss):
         r"""Average the evaluation of all the batch.
 
         Args:
@@ -126,7 +143,7 @@ class Linker(Module):
         compt = 0
         for step, batch in enumerate(dataloader):
             compt += 1
-            accuracy, loss = self.eval_batch(batch, cross_entropy_loss)
+            accuracy, loss = self.eval_batch(supertagger, batch, cross_entropy_loss)
             accuracy_average += accuracy
             loss_average += loss
 
diff --git a/SuperTagger/Linker/utils.py b/SuperTagger/Linker/utils.py
index ce569de05524c89f8af9f0170bbb4a0de0c94d5f..d13f5dc23d90d81aae2c893b3968090f1c52f58d 100644
--- a/SuperTagger/Linker/utils.py
+++ b/SuperTagger/Linker/utils.py
@@ -94,8 +94,3 @@ def find_pos_neg_idexes(batch_symbols):
 
     return list_batch
 
-def mesure_accuracy(b_category, axiom_links_pred):
-
-    # Convert b_category into
-
-    return 0
\ No newline at end of file
diff --git a/SuperTagger/eval.py b/SuperTagger/eval.py
index 7a14ac5e20c2e15d723c978c02382dc7ee5ad72c..426f5e6d3827b6c7641eeb419896db7f983fb952 100644
--- a/SuperTagger/eval.py
+++ b/SuperTagger/eval.py
@@ -11,3 +11,23 @@ class SinkhornLoss(Module):
     def forward(self, predictions, truths):
         return sum(nll_loss(link.flatten(0, 1), perm.flatten(), reduction='mean')
                    for link, perm in zip(predictions, truths))
+
+
+def mesure_accuracy(batch_axiom_links, axiom_links_pred):
+    r"""
+    batch_axiom_links : (batch_size, ...)
+    axiom_links_pred : (batch_size, max_atoms_type_polarity)
+    """
+    # Convert batch_axiom_links into a list of atoms (batch_size, max_atoms_in_sentence)
+
+    # then convert into atom_vocab_size lists of (batch_size, max atoms in one category) with a prefix traversal of the graph
+
+    axiom_links_true = ""
+
+    # match axiom_links_pred against the true data
+
+    correct_links = torch.ones(axiom_links_pred.size())
+    correct_links[axiom_links_pred != axiom_links_true] = 0
+    num_correct_links = correct_links.sum().item()
+
+    return num_correct_links
\ No newline at end of file
diff --git a/train.py b/train.py
index 9287436a5c86f2cd4c2c1fc3548b4a0c46d304b3..2b8b4fffb04073d0d91fb165f2d7c7b49e2f24e2 100644
--- a/train.py
+++ b/train.py
@@ -127,13 +127,13 @@ def run_epochs(epochs):
             optimizer_linker.zero_grad()
 
             # Find the prediction of categories to feed the linker and the sentences embedding
-            category_logits_pred, sents_embedding, sents_mask = supertagger(batch_categories, batch_sentences)
+            category_logits_pred, sents_embedding = supertagger(batch_categories, batch_sentences)
 
             # Predict the categories from prediction with argmax and softmax
            category_batch = torch.argmax(torch.nn.functional.softmax(category_logits_pred, dim=2), dim=2)
 
             # Run the linker on the category predictions
-            logits_predictions = linker(category_batch, sents_embedding, sents_mask)
+            logits_predictions = linker(category_batch, sents_embedding)
 
             linker_loss = cross_entropy_loss(logits_predictions, batch_axiom_links)
             # Perform a backward pass to calculate the gradients.
@@ -145,7 +145,6 @@ def run_epochs(epochs):
 
             # Update parameters and take a step using the computed gradient.
             optimizer_linker.step()
-            scheduler_linker.step()
 
         avg_train_loss = total_train_loss / len(training_dataloader)
 
@@ -157,7 +156,7 @@ def run_epochs(epochs):
         linker.eval()
         with torch.no_grad():
             print("Start eval")
-            accuracy_sents, accuracy_atom, v_loss = linker.eval_epoch(validation_dataloader, cross_entropy_loss)
+            accuracy_sents, accuracy_atom, v_loss = linker.eval_epoch(supertagger, validation_dataloader, cross_entropy_loss)
             print("")
             print(" Average accuracy sents on epoch: {0:.2f}".format(accuracy_sents))
             print(" Average accuracy atom on epoch: {0:.2f}".format(accuracy_atom))
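
Note (not part of the patch): a minimal standalone sketch of the shape contract between the new Linker.forward output and SinkhornLoss in SuperTagger/eval.py. The sizes num_atom_types, batch_size and n, and the random stand-in tensors, are illustrative assumptions only, not values taken from the repository.

import torch
from torch.nn.functional import log_softmax, nll_loss

# Assumed toy sizes: 2 atom types, a batch of 3 sentences, 4 atoms per polarity.
num_atom_types, batch_size, n = 2, 3, 4

# Stand-in for the tensor returned by Linker.forward: one (batch_size, n, n)
# matrix of log link weights per atom type, stacked along dim 0.
predictions = log_softmax(torch.randn(num_atom_types, batch_size, n, n), dim=3)

# Stand-in for batch_axiom_links: for each atom type and sentence, the index
# of the negative atom that each positive atom links to (a permutation of 0..n-1).
truths = torch.stack([torch.stack([torch.randperm(n) for _ in range(batch_size)])
                      for _ in range(num_atom_types)])

# Same reduction as SinkhornLoss.forward: iterate over atom types, flatten the
# batch and positive-atom dimensions, and apply nll_loss against the permutation.
loss = sum(nll_loss(link.flatten(0, 1), perm.flatten(), reduction='mean')
           for link, perm in zip(predictions, truths))
print(loss)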