Commit ba429805 authored by Julien Rabault

add encoding label

parent bc19cbf9
1 merge request: !1 Draft: Master
File added
File added
File added
import random

import torch
import torch.nn.functional as F
from torch.nn import (Dropout, Embedding, Module, ModuleList, Linear, LSTM, GRU)

from Configuration import Configuration


class RNNDecoderLayer(Module):
    def __init__(self, symbols_map):
        super(RNNDecoderLayer, self).__init__()

        # init params
        self.dim_encoder = int(Configuration.modelDecoderConfig['dim_encoder'])
        self.dim_decoder = int(Configuration.modelDecoderConfig['dim_decoder'])
        self.max_symbols_per_word = int(Configuration.modelDecoderConfig['max_symbols_per_word'])
        self.max_len_sentence = int(Configuration.modelDecoderConfig['max_len_sentence'])
        self.symbols_vocab_size = int(Configuration.modelDecoderConfig['symbols_vocab_size'])
        dropout = float(Configuration.modelDecoderConfig['dropout'])
        self.num_rnn_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
        self.teacher_forcing = float(Configuration.modelDecoderConfig['teacher_forcing'])

        self.bidirectional = False
        self.use_attention = True
        self.symbols_map = symbols_map
        self.symbols_padding_id = self.symbols_map["[PAD]"]
        self.symbols_sep_id = self.symbols_map["[SEP]"]
        self.symbols_start_id = self.symbols_map["[START]"]
        self.symbols_sos_id = self.symbols_map["[SOS]"]

        # Different layers
        # Symbols embedding (assumed definition: the layer is used below as self.symbols_embedder
        # but was missing from the original __init__)
        self.symbols_embedder = Embedding(self.symbols_vocab_size, self.dim_decoder,
                                          padding_idx=self.symbols_padding_id)
        # For hidden_state
        self.dropout = Dropout(dropout)
        # rnn layer
        if self.use_attention:
            self.rnn = LSTM(input_size=self.dim_encoder, hidden_size=self.dim_encoder, num_layers=self.num_rnn_layers,
                            dropout=dropout,
                            bidirectional=self.bidirectional, batch_first=True)
        else:
            self.rnn = LSTM(input_size=self.dim_decoder, hidden_size=self.dim_encoder, num_layers=self.num_rnn_layers,
                            dropout=dropout,
                            bidirectional=self.bidirectional, batch_first=True)

        # Projection on vocab_size
        if self.bidirectional:
            self.proj = Linear(self.dim_encoder * 2, self.symbols_vocab_size)
        else:
            self.proj = Linear(self.dim_encoder, self.symbols_vocab_size)

        self.attn_combine = Linear(self.dim_decoder + self.dim_encoder, self.dim_encoder)

    def sos_mask(self, y):
        return torch.eq(y, self.symbols_sos_id)

    def forward(self, symbols_tokenized_batch, last_hidden_state, pooler_output):
        r"""Trains the translation from encoded sentences to symbols.

        Args:
            symbols_tokenized_batch: [batch_size, max_len_sentence, max_symbols_per_word] the true symbols for each word of each sentence.
            last_hidden_state: [batch_size, max_len_sentence, dim_encoder] sequence of hidden states at the output of the last layer of the encoder.
            pooler_output: [batch_size, dim_encoder] last-layer hidden state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task.
        """
        batch_size, sequence_length, hidden_size = last_hidden_state.shape
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # y_hat[batch_size, max_len_sentence, max_symbols_per_word, symbols_vocab_size], initialised with probability 1 on PAD
        y_hat = torch.zeros(batch_size, self.max_len_sentence, self.max_symbols_per_word, self.symbols_vocab_size,
                            dtype=torch.float, device=device)
        y_hat[:, :, :, self.symbols_padding_id] = 1

        decoded_ij = torch.ones(batch_size, 1, dtype=torch.long, device=device) * self.symbols_start_id
        sos_mask = torch.zeros(batch_size, dtype=torch.bool, device=device)

        # hidden_state is initialised from the pooler output, repeated for each RNN layer (and direction)
        hidden_state = pooler_output.unsqueeze(0).repeat(self.num_rnn_layers * (1 + self.bidirectional), 1, 1)
        c_state = torch.zeros(self.num_rnn_layers * (1 + self.bidirectional), batch_size, hidden_size,
                              dtype=torch.float, device=device)

        # teacher forcing is disabled here; self.teacher_forcing is read from the configuration but not used yet
        use_teacher_forcing = False

        # for each word, then for each symbol of that word
        for i in range(self.max_len_sentence):
            for j in range(self.max_symbols_per_word):
                symbols_embedding = self.symbols_embedder(decoded_ij)
                symbols_embedding = self.dropout(symbols_embedding)

                output = symbols_embedding
                if self.use_attention:
                    output = torch.cat((symbols_embedding, last_hidden_state[:, i, :].unsqueeze(1)), 2)
                    output = self.attn_combine(output)
                    output = F.relu(output)

                # rnn layer
                output, (hidden_state, c_state) = self.rnn(output, (hidden_state, c_state))

                # Projection of the rnn output, omitting the last two entries so we do not predict PAD
                proj = self.proj(output)[:, :, :-2]

                if use_teacher_forcing:
                    decoded_ij = symbols_tokenized_batch[:, i, j].unsqueeze(1)
                else:
                    decoded_ij = torch.argmax(F.softmax(proj, dim=2), dim=2)

                # Update the SOS mask and overwrite the PAD initialisation for still-active sentences
                sos_mask_ij = self.sos_mask(torch.argmax(F.softmax(proj, dim=2), dim=2)[:, -1])
                y_hat[~sos_mask, i, j, self.symbols_padding_id] = 0
                y_hat[~sos_mask, i, j, :-2] = proj[~sos_mask, -1, :]
                sos_mask = sos_mask_ij | sos_mask

            # Stop if every sentence has emitted SOS or if we are full
            if not torch.any(~sos_mask):
                break

        return y_hat

    def predict_rnn(self, last_hidden_state, pooler_output):
        r"""Predicts the symbols from the output of the encoder.

        Args:
            last_hidden_state: [batch_size, max_len_sentence, dim_encoder] the output of the encoder.
            pooler_output: [batch_size, dim_encoder] last-layer hidden state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task.
        """
        batch_size, sequence_length, hidden_size = last_hidden_state.shape
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # y_hat[batch_size, max_len_sentence, max_symbols_per_word, symbols_vocab_size], initialised with probability 1 on PAD
        y_hat = torch.zeros(batch_size, self.max_len_sentence, self.max_symbols_per_word, self.symbols_vocab_size,
                            dtype=torch.float, device=device)
        y_hat[:, :, :, self.symbols_padding_id] = 1

        decoded_ij = torch.ones(batch_size, 1, dtype=torch.long, device=device) * self.symbols_start_id
        sos_mask = torch.zeros(batch_size, dtype=torch.bool, device=device)

        # hidden_state is initialised from the pooler output, repeated for each RNN layer (and direction)
        hidden_state = pooler_output.unsqueeze(0).repeat(self.num_rnn_layers * (1 + self.bidirectional), 1, 1)
        c_state = torch.zeros(self.num_rnn_layers * (1 + self.bidirectional), batch_size, hidden_size,
                              dtype=torch.float, device=device)

        symbols_embedding = self.symbols_embedder(decoded_ij)
        symbols_embedding = self.dropout(symbols_embedding)

        # for each word, then for each symbol of that word
        for i in range(self.max_len_sentence):
            output = symbols_embedding
            if self.use_attention:
                output = torch.cat((symbols_embedding, last_hidden_state[:, i, :].unsqueeze(1)), 2)
                output = self.attn_combine(output)
                output = F.relu(output)

            for j in range(self.max_symbols_per_word):
                symbols_embedding = self.symbols_embedder(decoded_ij)
                symbols_embedding = self.dropout(symbols_embedding)

                # rnn layer
                output, (hidden_state, c_state) = self.rnn(output, (hidden_state, c_state))

                # Projection of the rnn output, omitting the last two entries so we do not predict PAD
                proj_softmax = F.softmax(self.proj(output)[:, :, :-2], dim=2)
                decoded_ij = torch.argmax(proj_softmax, dim=2)

                # Update the SOS mask and overwrite the PAD initialisation for still-active sentences
                sos_mask_ij = self.sos_mask(decoded_ij[:, -1])
                y_hat[~sos_mask, i, j, self.symbols_padding_id] = 0
                y_hat[~sos_mask, i, j, :-2] = proj_softmax[~sos_mask, -1, :]
                sos_mask = sos_mask_ij | sos_mask

            # Stop if every sentence has emitted SOS or if we are full
            if not torch.any(~sos_mask):
                break

        return y_hat
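
Below is a hypothetical usage sketch (not part of the commit) showing how RNNDecoderLayer.predict_rnn could be exercised in isolation. The Configuration values, symbol map and encoder outputs are stand-ins chosen only to make the shapes line up; the real project fills Configuration from its config file.

import torch
from Configuration import Configuration
from SuperTagger.Decoder.RNNDecoderLayer import RNNDecoderLayer

# Stub configuration (assumed values, normally read from the project config file)
Configuration.modelDecoderConfig = {
    'dim_encoder': '768', 'dim_decoder': '32', 'max_symbols_per_word': '4',
    'max_len_sentence': '16', 'symbols_vocab_size': '30', 'dropout': '0.1',
    'num_rnn_layers': '2', 'teacher_forcing': '0.5',
}

device = "cuda" if torch.cuda.is_available() else "cpu"
symbols_map = {"[PAD]": 0, "[SEP]": 1, "[START]": 2, "[SOS]": 3}
decoder = RNNDecoderLayer(symbols_map).to(device)

last_hidden_state = torch.rand(2, 16, 768, device=device)  # fake encoder sequence output
pooler_output = torch.rand(2, 768, device=device)          # fake encoder pooled output

with torch.no_grad():
    y_hat = decoder.predict_rnn(last_hidden_state, pooler_output)
print(y_hat.shape)  # torch.Size([2, 16, 4, 30])
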
import sys

import torch
from torch import nn

from Configuration import Configuration


class EncoderLayer(nn.Module):
    """Encoder class, input of the supertagger."""

    def __init__(self, model):
        super(EncoderLayer, self).__init__()
        self.name = "Encoder"
        self.bert = model
        self.hidden_size = self.bert.config.hidden_size

    def forward(self, batch):
        r"""
        Args:
            batch: (input_ids, attention_mask) tensors produced by the sentence tokenizer.
        Returns:
            last_hidden_state: [batch_size, max_len_sentence, dim_encoder] sequence of hidden states at the output of the last layer of the model.
            pooler_output: [batch_size, dim_encoder] last-layer hidden state of the first token of the sequence (classification token) after further processing through the layers used for the auxiliary pretraining task.
        """
        b_input_ids = batch[0]
        b_input_mask = batch[1]

        outputs = self.bert(
            input_ids=b_input_ids, attention_mask=b_input_mask)

        return outputs[0], outputs[1]

    @staticmethod
    def load(model_path: str):
        r"""Load the model from a file.

        Args:
            model_path (str): path to the model
        Returns:
            model (nn.Module): model with saved parameters
        """
        params = torch.load(
            model_path, map_location=lambda storage, loc: storage)
        args = params['args']
        model = EncoderLayer(**args)
        model.load_state_dict(params['state_dict'])
        return model

    def save(self, path: str):
        r"""Save the model to a file.

        Args:
            path (str): path to the model
        """
        print('save model parameters to [%s]' % path, file=sys.stderr)
        # NOTE: self.dropout_rate is not defined in __init__, and load() passes these args
        # back to EncoderLayer(model); save/load are not yet consistent with each other.
        params = {
            'args': dict(bert_config=self.bert.config, dropout_rate=self.dropout_rate),
            'state_dict': self.state_dict()
        }
        torch.save(params, path)

    def to_dict(self):
        return {}
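
A hypothetical usage sketch (not part of the commit): wrapping CamemBERT in EncoderLayer and running a small batch through it, using the (input_ids, attention_mask) layout that forward() expects. Sentences and shapes are illustrative only.

import torch
from transformers import AutoTokenizer, CamembertModel
from SuperTagger.Encoder.EncoderLayer import EncoderLayer

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
encoder = EncoderLayer(CamembertModel.from_pretrained("camembert-base"))

# Tokenize two French sentences and pad them to the same length
enc = tokenizer(["Le chat dort.", "Il pleut beaucoup."], padding=True, return_tensors="pt")

with torch.no_grad():
    last_hidden_state, pooler_output = encoder((enc["input_ids"], enc["attention_mask"]))
print(last_hidden_state.shape, pooler_output.shape)  # [2, seq_len, 768] and [2, 768]
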
......@@ -4,9 +4,7 @@ from torch.nn import Dropout, LSTM
from torch.nn import Module
from Configuration import Configuration
from SuperTagger.Decoder.RNNDecoderLayer import RNNDecoderLayer
from torch.nn.utils.rnn import pack_padded_sequence
from SuperTagger.Encoder.EncoderLayer import EncoderLayer
from SuperTagger.eval import measure_supertagging_accuracy
......@@ -21,6 +19,8 @@ class EncoderDecoder(Module):
def __init__(self, BASE_MODEL, numPos1Classes, numPos2Classes, numSuperClasses):
super(EncoderDecoder, self).__init__()
self.bert = BASE_MODEL
self.dim_encoder = int(Configuration.modelDecoderConfig['dim_encoder'])
self.dim_decoder = int(Configuration.modelDecoderConfig['dim_decoder'])
self.num_rnn_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
......@@ -28,18 +28,18 @@ class EncoderDecoder(Module):
dropout = float(Configuration.modelDecoderConfig['dropout'])
self.dropout = Dropout(dropout)
self.encoder = EncoderLayer(BASE_MODEL)
self.bert = BASE_MODEL
self.lstm_shared = LSTM(input_size=self.dim_encoder, hidden_size=self.dim_encoder, num_layers=self.num_rnn_layers,
dropout=dropout,
bidirectional=self.bidirectional, batch_first=True, )
#Pos1
self.pos1_1 = nn.Linear(self.dim_encoder*2,self.dim_decoder)
self.pos1_1 = nn.Linear(self.dim_encoder,self.dim_decoder)
self.pos1_2 = nn.Linear(self.dim_decoder, numPos1Classes)
#Pos2
self.pos2_1 = nn.Linear(self.dim_encoder*2, self.dim_decoder)
self.pos2_1 = nn.Linear(self.dim_encoder, self.dim_decoder)
self.pos2_2 = nn.Linear(self.dim_decoder, numPos2Classes)
#super
......@@ -47,24 +47,27 @@ class EncoderDecoder(Module):
num_layers=self.num_rnn_layers,
dropout=dropout,
bidirectional=self.bidirectional, batch_first=True, )
self.pos_super_1 = nn.Linear(self.dim_encoder*2,self.dim_decoder)
self.pos_super_1 = nn.Linear(self.dim_encoder,self.dim_decoder)
self.pos_super_2 = nn.Linear(self.dim_decoder, numSuperClasses)
def forward(self, batch):
encoded_layers, pooled_output = self.encoder(batch)
encoded_layers = self.dropout(encoded_layers)
# encoded_layers = encoded_layers.permute(1, 0, 2)
print("encoded_layers : ", encoded_layers.size())
b_input_ids = batch[0]
b_input_mask = batch[1]
encoded_layers, _ = self.bert(
input_ids=b_input_ids, attention_mask=b_input_mask, return_dict=False)
lstm_output, (h, c) = self.lstm_shared(encoded_layers) ## extract the 1st token's embeddings
print("last_hidden : ", lstm_output.size())
# output_shared = torch.cat((lstm_output[:, :, :self.dim_encoder], lstm_output[:, :, self.dim_encoder:]), dim=2)
lstm_output = self.dropout(encoded_layers)
print("encoded_layers : ", encoded_layers.size())
print("output_shared : ", lstm_output.size())
# lstm_output, _ = self.lstm_shared(encoded_layers) ## extract the 1st token's embeddings
# print("last_hidden : ", lstm_output.size())
#
# print("output_shared : ", lstm_output.size())
# Pos1
pos_1_output= self.pos1_1(lstm_output)
......@@ -77,9 +80,8 @@ class EncoderDecoder(Module):
pos_2_output = self.pos2_2(pos_2_output)
# super
enc_hiddens, (last_hidden_super, last_cell_super) = self.lstm_super(lstm_output)
print(enc_hiddens.size())
super_output = self.pos_super_1(enc_hiddens)
# enc_hiddens, _ = self.lstm_super(lstm_output)
super_output = self.pos_super_1(lstm_output)
super_output = self.dropout(super_output)
super_output = self.pos_super_2(super_output)
......
import torch
class EncoderInput():
class EncoderTokenizer():
def __init__(self, tokenizer):
"""@params tokenizer (PretrainedTokenizer): Tokenizer that tokenizes text """
......
import pickle

import numpy as np
import torch


def load_obj(name):
    with open(name + '.pkl', 'rb') as f:
        return pickle.load(f)


class SymbolTokenizer():

    def __init__(self):
        """Maps POS1, POS2 and supertag labels to integer ids (and back) using the index files in Datasets/."""
        self.index_to_super = load_obj('Datasets/index_to_super')
        self.index_to_pos1 = load_obj('Datasets/index_to_pos1')
        self.index_to_pos2 = load_obj('Datasets/index_to_pos2')
        self.super_to_index = {v: int(k) for k, v in self.index_to_super.items()}
        self.pos1_to_index = {v: int(k) for k, v in self.index_to_pos1.items()}
        self.pos2_to_index = {v: int(k) for k, v in self.index_to_pos2.items()}

    def lenPOS1(self):
        print(self.pos1_to_index)
        return len(self.index_to_pos1) + 1

    def lenPOS2(self):
        return len(self.index_to_pos2) + 1

    def lenSuper(self):
        return len(self.index_to_super) + 1

    def convert_batchs_to_ids(self, Y1, Y2, Super):
        max_len_Y1 = max(len(elem) for elem in Y1)
        max_len_Y2 = max(len(elem) for elem in Y2)
        max_len_S = max(len(elem) for elem in Super)
        Y1_tok = torch.as_tensor(pad_sequence([[self.pos1_to_index[str(symbol)] for symbol in sents] for sents in Y1]))
        Y2_tok = torch.as_tensor(pad_sequence(
            [[self.pos2_to_index[str(symbol)] for symbol in sents] for sents in Y2]))
        super_tok = torch.as_tensor(pad_sequence(
            [[self.super_to_index[str(symbol)] for symbol in sents] for sents in Super]))
        return Y1_tok, Y2_tok, super_tok

    # def convert_ids_to_symbols(self, ids):
    #     return [self.inverse_symbol_map[int(i)] for i in ids]


def pad_sequence(sequences, max_len=400):
    sequences_pad = []
    for s in sequences:
        padded = [0] * max_len
        padded[:len(s)] = s
        sequences_pad.append(padded)
    return sequences_pad
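
A quick, hypothetical illustration (not part of the commit) of the padding helper above: each label sequence is right-padded with 0 up to max_len, shortened here for readability.

from SuperTagger.SymbolTokenizer import pad_sequence

batch = [[3, 7, 2], [5, 1]]
print(pad_sequence(batch, max_len=5))  # [[3, 7, 2, 0, 0], [5, 1, 0, 0, 0]]
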
......@@ -33,12 +33,14 @@ class NormCrossEntropy(Module):
r"""Loss based on cross entropy; it takes the number of words into account and ignores the padding.
"""
def __init__(self, ignore_index, sep_id, weights=None):
def __init__(self, ignore_index, weights=None):
super(NormCrossEntropy, self).__init__()
self.ignore_index = ignore_index
self.sep_id = sep_id
self.weights = weights
def forward(self, predictions, truths):
return cross_entropy(predictions.flatten(0, -2), truths.flatten(), weight=self.weights,
reduction='sum', ignore_index=self.ignore_index) / count_sep(truths.flatten(), self.sep_id)
print()
print("predictions : ", predictions.size())
print("truths : ", truths.size())
return cross_entropy(predictions.flatten(0, -2), truths.flatten(), weight=torch.tensor(self.weights,device="cuda" if torch.cuda.is_available() else "cpu"),
reduction='sum', ignore_index=self.ignore_index)
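
A minimal, hypothetical sketch (not part of the commit) of how the reworked loss might be called, assuming the updated forward shown above and a uniform class weighting; all values are illustrative.

import torch
from SuperTagger.eval import NormCrossEntropy

device = "cuda" if torch.cuda.is_available() else "cpu"
num_classes, pad_id = 5, 0
loss_fn = NormCrossEntropy(ignore_index=pad_id, weights=[1.0] * num_classes)

predictions = torch.randn(2, 4, num_classes, device=device)   # [batch, seq_len, classes]
truths = torch.tensor([[1, 3, 2, pad_id], [4, 2, pad_id, pad_id]], device=device)
print(float(loss_fn(predictions, truths)))  # summed cross entropy, padding ignored
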
......@@ -5,7 +5,7 @@ import torch
from tqdm import tqdm
def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=500):
def read_csv_pgbar(csv_path, nrows=float('inf'), chunksize=100):
print("\n" + "#" * 20)
print("Loading csv...")
......
import itertools
import pickle
import re
import numpy as np
......@@ -115,17 +116,15 @@ def read_maxentdata(file):
Z = np.asarray(allsuper)
return X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen
def save_obj(obj, name):
with open(name + '.pkl', 'wb+') as f:
pickle.dump(obj, f, pickle.HIGHEST_PROTOCOL)
# Txt_to_csv("m2.txt")
X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen = read_maxentdata("m2.txt")
def load_obj(name):
with open(name + '.pkl', 'rb') as f:
return pickle.load(f)
print("X[17] (", np.array(X[17]).shape ,") : ")
print(X[17])
print("Y1[17] (", np.array(Y1[17]).shape ,") : ")
print(Y1[17])
print("Y2[17] (", np.array(Y2[17]).shape ,") : ")
print(Y2[17])
# Txt_to_csv("m2.txt")
X, Y1, Y2, Z, vocabulary, vnorm, partsofspeech1, partsofspeech2, superset, maxlen = read_maxentdata("m2.txt")
......@@ -136,6 +135,13 @@ df['Y1'] = Y1
df['Y2'] = Y2
df['Z'] = Z
df.to_csv("../Datasets/m2_dataset_V2.csv", index=False)
t = np.unique(np.array(list(itertools.chain(*Z))))
print(t.size)
dict = { i : t[i] for i in range(0, len(t) ) }
df.to_csv("../Datasets/m2_dataset_V2.csv", index=False)
\ No newline at end of file
save_obj(dict,"../Datasets/index_to_super")
\ No newline at end of file
......@@ -6,16 +6,18 @@ import numpy as np
import torch
import torch.nn.functional as F
import transformers
from torch.optim import SGD, Adam
from torch.optim import Adam, RMSprop
from torch.utils.data import Dataset, TensorDataset, random_split
from transformers import (AutoTokenizer, get_cosine_schedule_with_warmup)
from transformers import (CamembertModel)
from Configuration import Configuration
from SuperTagger.Encoder.EncoderInput import EncoderInput
from SuperTagger.EncoderDecoder import EncoderDecoder
from SuperTagger.EncoderTokenizer import EncoderTokenizer
from SuperTagger.SymbolTokenizer import SymbolTokenizer
from SuperTagger.eval import NormCrossEntropy
from SuperTagger.utils import format_time, read_csv_pgbar, checkpoint_save, checkpoint_load
from SuperTagger.utils import format_time, read_csv_pgbar
from torch.utils.tensorboard import SummaryWriter
......@@ -24,9 +26,7 @@ torch.cuda.empty_cache()
# region ParamsModel
# max_symbols_per_word = int(Configuration.modelDecoderConfig['max_symbols_per_word'])
# max_len_sentence = int(Configuration.modelDecoderConfig['max_len_sentence'])
# symbol_vocab_size = int(Configuration.modelDecoderConfig['symbols_vocab_size'])
num_gru_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
# endregion ParamsModel
......@@ -35,7 +35,7 @@ num_gru_layers = int(Configuration.modelDecoderConfig['num_rnn_layers'])
file_path = 'Datasets/m2_dataset_V2.csv'
batch_size = int(Configuration.modelTrainingConfig['batch_size'])
nb_sentences = batch_size * 300
nb_sentences = batch_size * 50
epochs = int(Configuration.modelTrainingConfig['epoch'])
seed_val = int(Configuration.modelTrainingConfig['seed_val'])
learning_rate = float(Configuration.modelTrainingConfig['learning_rate'])
......@@ -115,9 +115,8 @@ BASE_TOKENIZER = AutoTokenizer.from_pretrained(
'camembert-base',
do_lower_case=True)
BASE_MODEL = CamembertModel.from_pretrained("camembert-base")
sents_tokenizer = EncoderInput(BASE_TOKENIZER)
model = EncoderDecoder(BASE_MODEL, 20,20,20)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
sents_tokenizer = EncoderTokenizer(BASE_TOKENIZER)
symbol_tokenizer = SymbolTokenizer()
# endregion Model
......@@ -126,7 +125,9 @@ df = read_csv_pgbar(file_path, nb_sentences)
sents_tokenized, sents_mask = sents_tokenizer.fit_transform_tensors(df['X'].tolist())
dataset = TensorDataset(sents_tokenized, sents_mask)
y1, y2, super = symbol_tokenizer.convert_batchs_to_ids(df['Y1'].tolist(),df['Y2'].tolist(),df['Z'].tolist())
dataset = TensorDataset(sents_tokenized, sents_mask, y1, y2, super)
# , torch.tensor(df['Y1'].tolist()), torch.tensor(df['Y2'].tolist()), torch.tensor(df['Z'].tolist())
# Calculate the number of samples to include in each set.
......@@ -144,13 +145,14 @@ validation_dataloader = torch.utils.data.DataLoader(val_dataset, batch_size=batc
# endregion Data loader
model = EncoderDecoder(BASE_MODEL, symbol_tokenizer.lenPOS1(),symbol_tokenizer.lenPOS2(),symbol_tokenizer.lenSuper())
model = model.to("cuda" if torch.cuda.is_available() else "cpu")
# region Fit tuning
# Optimizer
# optimizer_encoder = Adam(model.encoder.parameters(),
# lr=5e-5)
# optimizer_decoder = Adam(model.decoder.parameters(),
# lr=learning_rate)
optimizer = RMSprop(model.parameters())
# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
......@@ -165,15 +167,9 @@ total_steps = len(training_dataloader) * epochs
# num_training_steps=total_steps)
# # Loss
# if loss_scaled_by_freq:
# weights = torch.as_tensor(
# [6.9952, 1.0763, 1.0317, 43.274, 16.5276, 11.8821, 28.2416, 2.7548, 1.0728, 3.1847, 8.4521, 6.77, 11.1887,
# 6.6692, 23.1277, 11.8821, 4.4338, 1.2303, 5.0238, 8.4376, 1.0656, 4.6886, 1.028, 4.273, 4.273, 0],
# device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))
# cross_entropy_loss = NormCrossEntropy(symbols_tokenizer.pad_token_id, symbols_tokenizer.sep_token_id,
# weights=weights)
# else:
# cross_entropy_loss = NormCrossEntropy(symbols_tokenizer.pad_token_id, symbols_tokenizer.sep_token_id)
cross_entropy_loss_Y1 = NormCrossEntropy(0,0.15)
cross_entropy_loss_Y2 = NormCrossEntropy(0,.35)
cross_entropy_loss_S = NormCrossEntropy(0,.5)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
......@@ -211,7 +207,9 @@ def run_epochs(epochs):
t0 = time.time()
# Reset the total loss for this epoch.
total_train_loss = 0
total_train_loss_Y1 =0
total_train_loss_Y2 =0
total_train_loss_S =0
model.train()
......@@ -221,8 +219,8 @@ def run_epochs(epochs):
# if epoch_i == 0 and step == 0:
# writer.add_graph(model, input_to_model=batch[0], verbose=False)
# Progress update every 40 batches.
if step % 40 == 0 and not step == 0:
# Progress update every 10 batches.
if step % 10 == 0 and not step == 0:
# Calculate elapsed time in minutes.
elapsed = format_time(time.time() - t0)
# Report progress.
......@@ -231,23 +229,36 @@ def run_epochs(epochs):
# Unpack this training batch from our dataloader.
b_sents_tokenized = batch[0].to("cuda" if torch.cuda.is_available() else "cpu")
b_sents_mask = batch[1].to("cuda" if torch.cuda.is_available() else "cpu")
b_symbols_tokenized = batch[2].to("cuda" if torch.cuda.is_available() else "cpu")
# optimizer_encoder.zero_grad()
# optimizer_decoder.zero_grad()
optimizer.zero_grad()
logits_predictions = model((b_sents_tokenized, b_sents_mask))
output_dim_Y1 = logits_predictions[0].shape[1]
print(output_dim_Y1)
# output_Y1 = logits_predictions[0][1:].view(-1, output_dim_Y1)
output_dim_Y2 = logits_predictions[1].shape[1]
# output_Y2 = logits_predictions[1][1:].view(-1, output_dim_Y2)
output_dim_S = logits_predictions[2].shape[1]
# output_S = logits_predictions[2][1:].view(-1, output_dim_S)
logits_predictions = model(b_sents_tokenized, b_sents_mask, b_symbols_tokenized)
loss_Y1 = cross_entropy_loss_Y1(logits_predictions[0], batch[2][:output_dim_Y1])
loss_Y2 = cross_entropy_loss_Y2(logits_predictions[1], batch[3][:output_dim_Y2])
loss_S = cross_entropy_loss_S(logits_predictions[2], batch[4][:output_dim_S])
# loss = cross_entropy_loss(logits_predictions, b_symbols_tokenized)
# total_train_loss += float(loss)
# loss.backward()
total_train_loss_Y1 += float(loss_Y1)
total_train_loss_Y2 += float(loss_Y2)
total_train_loss_S += float(loss_S)
loss_Y1.backward()
loss_Y2.backward()
loss_S.backward()
# This is to help prevent the "exploding gradients" problem.
#torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0, norm_type=2)
# Update parameters and take a step using the computed gradient.
# optimizer_encoder.step()
# optimizer_decoder.step()
optimizer.step()
#
# scheduler_encoder.step()
# scheduler_decoder.step()
......@@ -257,7 +268,9 @@ def run_epochs(epochs):
# if use_checkpoint_SAVE:
# checkpoint_save(model, optimizer_decoder, epoch_i, checkpoint_dir, loss)
avg_train_loss = total_train_loss / len(training_dataloader)
avg_train_loss_Y1 = total_train_loss_Y1 / len(training_dataloader)
avg_train_loss_Y2 = total_train_loss_Y2 / len(training_dataloader)
avg_train_loss_S = total_train_loss_S / len(training_dataloader)
# Measure how long this epoch took.
training_time = format_time(time.time() - t0)
......@@ -274,7 +287,9 @@ def run_epochs(epochs):
# writer.add_scalar('Accuracy/symbol', accuracy_symbol, epoch_i + 1)
print("")
print(" Average training loss: {0:.2f}".format(avg_train_loss))
print(" Average training loss Y1: {0:.2f}".format(avg_train_loss_Y1))
print(" Average training loss Y2: {0:.2f}".format(avg_train_loss_Y2))
print(" Average training loss super: {0:.2f}".format(avg_train_loss_S))
print(" Training epoch took: {:}".format(training_time))
# writer.add_scalar('Loss/train', total_train_loss, epoch_i+1)
......@@ -287,10 +302,13 @@ def run_epochs(epochs):
# run_epochs(epochs)
# endregion Train
b = next(iter(training_dataloader))
# , y1,y2,y3
a = model(b)
print(len(b))
print(a[0].size(),a[1].size(),a[2].size())
# b1, b2 , y1,y2,y3 = next(iter(training_dataloader))
# b =(b1, b2)
# # , y1,y2,y3
# a = model(b)
# print(len(b))
# print(a[0].size(),a[1].size(),a[2].size())
print(symbol_tokenizer.lenPOS1())