diff --git a/classifier_bare_huggingface.py b/classifier_bare_huggingface.py
index 2897b5b92e449192e49f215244bb3bf06be85dfb..6c3c93d758b57d581ec275a6fcc42b3b1ae688df 100644
--- a/classifier_bare_huggingface.py
+++ b/classifier_bare_huggingface.py
@@ -26,7 +26,7 @@ substitutions_file = 'mappings/substitutions.txt'
 mapping_classes = args.mappings_file[:-4].split('-')[-1]
 set_seed(42)
 
-print('Model:', args.transformer_model)
+print('\nModel:', args.transformer_model)
 print('Batch size:', args.batch_size * args.gradient_accumulation_steps)
 print('Num epochs:', args.num_epochs)
 
@@ -38,7 +38,8 @@ print('Num epochs:', args.num_epochs)
 mappings, inv_mappings = open_mappings(args.mappings_file)
 
 # Open sentences
-train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
+# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
+train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
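+# (open_sentences_with_lang would additionally prefix each example with its
+# language, corpus name, and framework; the plain loader is used here)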
 
 
 print('\nCheck encodings:\n')
@@ -119,11 +120,11 @@ training_args = TrainingArguments(
     remove_unused_columns = False,
     warmup_steps = 1000,  # number of warmup steps for learning rate  
 #     save_steps = (len(train_sentences)/(args.batch_size * args.gradient_accumulation_steps)) / 1368,
-    save_total_limit = args.num_epochs,
-    load_best_model_at_end = True,
+#     save_total_limit = args.num_epochs,
+#     load_best_model_at_end = True,
     weight_decay = 0.01,  # strength of weight decay
-    save_strategy='epoch', 
-    evaluation_strategy='epoch'
+#     save_strategy='epoch', 
+#     evaluation_strategy='epoch'
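+    # note: if the options above are re-enabled, load_best_model_at_end
+    # requires save_strategy and evaluation_strategy to match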
     
 )
 
@@ -183,13 +184,13 @@ for corpus in encoded_dev_dataset:
 
 # Test results
 
-# print('\ntest results:')
-# for corpus in encoded_test_dataset:
-#     print()
-#     test_results = get_predictions_huggingface(trainer, 
-#                                                corpus, 
-#                                                framework_labels[corpus.split('.')[1]],
-#                                                encoded_test_dataset[corpus])
+print('\n\nTest results:')
+for corpus in encoded_test_dataset:
+    print()
+    test_results = get_predictions_huggingface(trainer, 
+                                               corpus, 
+                                               framework_labels[corpus.split('.')[1]],
+                                               encoded_test_dataset[corpus])
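+    # (framework_labels, keyed by the middle field of the corpus name, is
+    # presumably used to restrict predictions to that framework's label set)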
     
     
 #     path_results = 'results/test/' + args.transformer_model + '_' + str(args.num_epochs)
diff --git a/classifier_bare_pytorch-conf-matrix.py b/classifier_bare_pytorch-conf-matrix.py
new file mode 100644
index 0000000000000000000000000000000000000000..289e5c52908e5f080cb7d01170e680875e231d8b
--- /dev/null
+++ b/classifier_bare_pytorch-conf-matrix.py
@@ -0,0 +1,310 @@
+#!/usr/bin/env python
+# coding: utf-8
+
+import torch
+import numpy as np
+from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
+from torch import nn
+from torch.optim import AdamW
+from torch.utils.data import DataLoader
+import torch.nn.functional as F
+from torch.autograd import Variable
+from tqdm import tqdm
+import os
+from time import sleep
+from datetime import datetime
+import sys
+from sklearn.metrics import classification_report, accuracy_score
+from configure import parse_args
+from utils import *
+
+args = parse_args()
+now = datetime.now()
+dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
+layers_to_freeze = args.freeze_layers.split(";")
+substitutions_file = 'mappings/substitutions.txt'
+# mapping_classes = args.mappings_file[:-4].split('-')[-1]
+# specific_results = open_specific_results('mappings/specific_results.txt')['B']
+
+print('\nLangs to use: ' + args.langs_to_use)
+print('Mappings file: ' + args.mappings_file, flush=True)
+set_seed(42)
+torch.manual_seed(42)
+
+# ===============
+# Dataset class
+# ===============
+
+class Dataset(torch.utils.data.Dataset):
+
+    def __init__(self, sentences):
+
+        self.labels = [sent[-1] for sent in sentences]
+        self.texts = [tokenizer(sent[-2], 
+                                is_split_into_words=True,                              
+                                padding='max_length', 
+                                max_length = 512, 
+                                truncation=True,
+                                return_tensors="pt") 
+                                for sent in sentences]
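+        # each item pairs a fixed-length (512-token) encoding with its integer
+        # label: sent[-2] holds the token list, sent[-1] the encoded label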
+
+    def classes(self):
+        return self.labels
+
+    def __len__(self):
+        return len(self.labels)
+    
+    def get_batch_labels(self, idx):
+        # Fetch a batch of labels
+        return np.array(self.labels[idx])
+
+    def get_batch_texts(self, idx):
+        # Fetch a batch of inputs
+        return self.texts[idx]
+
+    def __getitem__(self, idx):
+
+        batch_texts = self.get_batch_texts(idx)
+        batch_y = self.get_batch_labels(idx)
+
+        return batch_texts, batch_y
+
+# ===============
+# Load datasets
+# ===============
+
+# Open mappings
+mappings, inv_mappings = open_mappings(args.mappings_file)
+batch_size = args.batch_size
+tokenizer  = AutoTokenizer.from_pretrained(args.transformer_model)
+
+# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
+train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
+
+print('\nCheck encodings:\n')
+print(train_sentences[0])
+
+
+# Determine linear size (= number of classes in the sets + 1)
+num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
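+# (the +1 presumably reserves an index for labels unseen in train, e.g. 'unk')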
+
+# make train/dev datasets
+train_dataset = Dataset(train_sentences)
+dev_dataset   = {corpus: Dataset(s) for corpus, s in dev_dict_sentences.items()}
+test_dataset  = {corpus: Dataset(s) for corpus, s in test_dict_sentences.items()}
+
+# Make batched dataloaders from the datasets
+train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
+dev_dict_dataloader = {corpus: DataLoader(dev_data, batch_size) 
+                        for corpus, dev_data in dev_dataset.items()}
+test_dict_dataloader = {corpus: DataLoader(test_data, batch_size) 
+                        for corpus, test_data in test_dataset.items()}
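+# dataloaders are keyed by corpus name, assumed to follow the pattern
+# 'lang.framework.corpus' (e.g. 'eng.rst.gum'): corpus.split('.')[1] is used
+# below to look up each framework's label set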
+
+
+# ===============
+# Model setup
+# ===============
+
+class TransformerClassifier(nn.Module):
+
+    def __init__(self, dropout=args.dropout):
+
+        super(TransformerClassifier, self).__init__()
+
+        self.tr_model = AutoModel.from_pretrained(args.transformer_model)
+        self.dropout = nn.Dropout(dropout)
+        self.linear = nn.Linear(768, num_labels)  # hidden size (768 for base models) x number of classes
+        self.relu = nn.ReLU()
+
+    def forward(self, input_id, mask):
+        
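+        # use the final hidden state of the first ([CLS]) token as the
+        # sequence representation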
+        outputs = self.tr_model(input_ids = input_id, 
+                                attention_mask = mask,
+                                return_dict = True)['last_hidden_state'][:, 0, :]
+        dropout_output = self.dropout(outputs)
+        linear_output = self.linear(dropout_output)
+        final_layer = self.relu(linear_output)
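+        # (note: the ReLU zeroes out negative logits before CrossEntropyLoss)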
+
+        return final_layer
+
+
+model = TransformerClassifier()
+
+
+def train(model, 
+          train_dataloader, 
+          dev_dict_dataloader, 
+          test_dict_sentences, 
+          test_dict_dataloader,
+          epochs, 
+          #specific_results
+         ):
+
+    device = torch.device("cuda" if args.use_cuda else "cpu")
+
+    criterion = nn.CrossEntropyLoss()
+    optimizer = AdamW(model.parameters(), #Adam
+                      lr = 2e-5, #1e-6
+                      eps = 1e-8
+                    )
+
+    if args.use_cuda:
+        model = model.cuda()
+        criterion = criterion.cuda()
+    
+    gradient_accumulation_steps = args.gradient_accumulation_steps
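+    # the effective batch size is args.batch_size * gradient_accumulation_steps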
+    total_steps = len(train_dataloader) * epochs
+    scheduler = get_linear_schedule_with_warmup(optimizer, 
+                                                num_warmup_steps = 0,
+                                                num_training_steps = total_steps)
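+    # note: scheduler.step() only runs on optimizer updates, so with gradient
+    # accumulation the schedule spans fewer steps than total_steps and the
+    # learning rate never fully decays to zero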
+    
+    seed_val = 42
+    torch.manual_seed(seed_val)
+    torch.cuda.manual_seed_all(seed_val)
+    
+    # freeze layers, see argument in configure.py
+    if args.freeze_layers != '':
+        for name, param in model.named_parameters():
+            if any(x in name for x in layers_to_freeze):
+                param.requires_grad = False
+
+    for epoch_num in range(0, epochs):
+        print('\n=== Epoch {:} / {:} ==='.format(epoch_num + 1, epochs))
+        
+        model.train()
+
+        total_acc_train = 0
+        total_loss_train = 0
+        batch_counter = 0
+        
+        for train_input, train_label in tqdm(train_dataloader):
+            batch_counter += 1
+            train_label = train_label.to(device)
+            mask = train_input['attention_mask'].to(device)
+            input_id = train_input['input_ids'].squeeze(1).to(device)
+
+            output = model(input_id, mask)
+            
+            # Compute the loss and backpropagate
+            loss = criterion(output, train_label.long())
+
+            # Scale the loss so accumulated gradients average over the window
+            loss = loss / gradient_accumulation_steps
+            loss.backward()
+
+            
+            if batch_counter % gradient_accumulation_steps == 0:
+                # clip gradients before the optimizer step, then advance the
+                # schedule and reset gradients for the next accumulation window
+                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
+                optimizer.step()
+                scheduler.step()
+                optimizer.zero_grad()
+            
+        # ------ Validation --------
+        
+        print('\nValidation for epoch:', epoch_num + 1)
+        
+        # Dev and test results for each corpus. We don't need to save the results.
+        for corpus in dev_dict_dataloader:
+            dev_results = get_predictions(
+                                model, 
+                                corpus, 
+                                dev_dict_dataloader[corpus]
+                                )
+            better_dev_results = get_better_predictions(
+                                    model, 
+                                    corpus, 
+                                    dev_dict_dataloader[corpus], 
+                                    framework_labels[corpus.split('.')[1]], 
+                                    inv_mappings,
+                                    epoch_num+1,
+                                    save_conf_matrix=True
+                                    )
+            
+#             path_results = 'results/dev/language_' + mapping_classes + '_' + str(epoch_num+1)
+#             if not os.path.exists(path_results):
+#                 os.makedirs(path_results)
+                
+#             print_results_to_file(corpus, 
+#                                 dev_dict_sentences[corpus], 
+#                                 dev_results,
+#                                 inv_mappings, #substitutions_file, 
+#                                 path_results)
+            
+        # ------ Test --------
+        
+        print('\nTest results for epoch:', epoch_num + 1)
+        
+        for corpus in test_dict_dataloader:
+            test_results = get_predictions(
+                                model, 
+                                corpus, 
+                                test_dict_dataloader[corpus]
+                                )
+            better_test_results = get_better_predictions(
+                                    model, 
+                                    corpus, 
+                                    test_dict_dataloader[corpus], 
+                                    framework_labels[corpus.split('.')[1]], 
+                                    inv_mappings,
+                                    epoch_num+1,
+                                    save_conf_matrix=False
+                                    )
+            
+#             path_results = 'results/test/language_' + mapping_classes + '_' + str(epoch_num+1)
+#             if not os.path.exists(path_results):
+#                 os.makedirs(path_results)
+                
+#             print_results_to_file(corpus, 
+#                                 test_dict_sentences[corpus], 
+#                                 test_results,
+#                                 inv_mappings, #substitutions_file, 
+#                                 path_results)
+            
+            
+#         # we want the results of specific epochs for specific corpora. 
+#         # we define the epochs and the corpora and we save only these results.
+        
+#         if epoch_num+1 in specific_results:
+#             for corpus in specific_results[epoch_num+1]:
+#                 test_results = get_predictions(model, 
+#                                                corpus, 
+#                                                test_dict_dataloader[corpus], 
+#                                                print_results=False)
+
+
+        # TODO: save the per-epoch dev and test results
+
+
+# ------- Start the training -------   
+
+print('\nModel: ', args.transformer_model)
+print('Batch size: ', args.batch_size * args.gradient_accumulation_steps)
+print('\nStart training...\n')
+train(model, 
+      train_dataloader,
+      dev_dict_dataloader, 
+      test_dict_sentences, 
+      test_dict_dataloader,
+      args.num_epochs, 
+#       specific_results
+     )
+print('\nTraining Done!')
+
diff --git a/classifier_bare_pytorch.py b/classifier_bare_pytorch.py
index 9cfd03eead463acbd032f1f27667e26d46f89273..81c9bde055cd813d92f3c3328b837b87556f0af9 100644
--- a/classifier_bare_pytorch.py
+++ b/classifier_bare_pytorch.py
@@ -79,6 +79,11 @@ batch_size = args.batch_size
 tokenizer  = AutoTokenizer.from_pretrained(args.transformer_model)
 
 train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
+# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
+
+print('\nCheck encodings:\n')
+print(train_sentences[0])
+
 
 # Determine linear size (= number of classes in the sets + 1)
 num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
@@ -222,9 +227,13 @@ def train(model,
                                     model, 
                                     corpus, 
                                     dev_dict_dataloader[corpus], 
-                                    framework_labels[corpus.split('.')[1]]
+                                    framework_labels[corpus.split('.')[1]], 
+                                    inv_mappings,
+                                    epoch_num+1,
+                                    save_conf_matrix=False
                                     )
             
 #             path_results = 'results/dev/language_' + mapping_classes + '_' + str(epoch_num+1)
 #             if not os.path.exists(path_results):
 #                 os.makedirs(path_results)
@@ -249,7 +258,10 @@ def train(model,
                                     model, 
                                     corpus, 
                                     test_dict_dataloader[corpus], 
-                                    framework_labels[corpus.split('.')[1]]
+                                    framework_labels[corpus.split('.')[1]], 
+                                    inv_mappings,
+                                    epoch_num+1,
+                                    save_conf_matrix=False
                                     )
             
 #             path_results = 'results/test/language_' + mapping_classes + '_' + str(epoch_num+1)
diff --git a/classifier_with_adapter.py b/classifier_with_adapter.py
index 79916e54bdf33bc89946742f69c1dcaf19df9a94..a8377675e4dedcfcb5fe45b98c02fd768e5a14f8 100644
--- a/classifier_with_adapter.py
+++ b/classifier_with_adapter.py
@@ -125,11 +125,11 @@ training_args = TrainingArguments(
     remove_unused_columns = False,
     warmup_steps = 1000,  # number of warmup steps for learning rate  
 #     save_steps = (len(train_sentences)/(args.batch_size * args.gradient_accumulation_steps)) / 1368,
-    save_total_limit = args.num_epochs,
-    load_best_model_at_end = True,
+#     save_total_limit = args.num_epochs,
+#     load_best_model_at_end = True,
     weight_decay = 0.01,  # strength of weight decay
-    save_strategy='epoch', 
-    evaluation_strategy='epoch'
+#     save_strategy='epoch', 
+#     evaluation_strategy='epoch'
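+    # note: load_best_model_at_end requires matching save_strategy and
+    # evaluation_strategy if these options are re-enabled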
 )
 
 
@@ -175,18 +175,18 @@ for corpus in encoded_dev_dataset:
                                                  )
 
     
-    print('\nTest results:')
-for corpus in encoded_test_dataset:
-    print()
+#     print('\nTest results:')
+# for corpus in encoded_test_dataset:
+#     print()
     
-    dev_results_ = get_predictions_huggingface(trainer, 
-                                                  corpus, 
-                                                  encoded_test_dataset[corpus]
-                                                 )
+#     dev_results_ = get_predictions_huggingface(trainer, 
+#                                                   corpus, 
+#                                                   encoded_test_dataset[corpus]
+#                                                  )
 
     
-    dev_results = better_predictions_huggingface(trainer, 
-                                                  corpus, 
-                                                  encoded_test_dataset[corpus],
-                                                  framework_labels[corpus.split('.')[1]]
-                                                 )
+#     dev_results = better_predictions_huggingface(trainer, 
+#                                                   corpus, 
+#                                                   encoded_test_dataset[corpus],
+#                                                   framework_labels[corpus.split('.')[1]]
+#                                                  )
diff --git a/utils.py b/utils.py
index e1cf997f375cc7c01a21d341f74def18f2a4d779..ec1f3f6e431a0017318335bf20255fefe13e02b8 100644
--- a/utils.py
+++ b/utils.py
@@ -6,24 +6,18 @@ import torch
 from transformers import AutoConfig, AutoTokenizer
 from configure import parse_args
 import numpy as np
-from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
 import matplotlib.pyplot as plt
+import seaborn as sns
+from time import sleep
+from datetime import datetime
 
-args = parse_args()
+now = datetime.now()
+dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
 
 
-def switch_dimensions(vector_list):
-    
-    target_dim_len = len(vector_list[0])
-    new_vector = []
-    
-    for n in range(target_dim_len):
-        temp = []
-        for x in vector_list:
-            temp.append(x[n])
-        new_vector.append(temp)
-        
-    return new_vector
+args = parse_args()
+
 
 
 def open_mappings(mappings_file):
@@ -156,6 +150,7 @@ def open_file_with_lang(filename, mappings_dict):
                 # flip them if different direction
                 if args.normalize_direction == 'yes':
                     if l[9] == '1>2':
+                        # prefix the example with [lang, fullname, framework]
                         lines.append(l + [[lang, fullname, framework] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
                     else:
                         lines.append(l + [[lang, fullname, framework] + sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)])
@@ -257,6 +252,10 @@ def open_sentences(path_to_corpora, mappings_dict):
         all_labels[framework] += [l[-1] for l in temp]  
 
     corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels}
+    # delete unk as a sanity check
+    for framework in corpus_labels:
+        if 'unk' in corpus_labels[framework]:
+            corpus_labels[framework].remove('unk')
 
     return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
 
@@ -330,18 +329,14 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
         all_labels[framework] += [l[-1] for l in temp]  
 
     corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels}
+    # delete unk as a sanity check
+    for framework in corpus_labels:
+        if 'unk' in corpus_labels[framework]:
+            corpus_labels[framework].remove('unk')
     
     return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
 
 
-def show_confusion(predictions, labels, save_name):
-    
-    cm = confusion_matrix(y_test, y_pred)
-    cm_display = ConfusionMatrixDisplay(cm).plot()
-    
-    plt.savefig(save_name, dpi=300)
-    
-    return None
 
 # ===============
 # Testing functions
@@ -451,11 +446,64 @@ def better_predictions_huggingface(trainer,
     return best_labels
 
 
+def make_confusion_matrices(y_test,
+                            y_pred, 
+                            corpus_name,
+                            inv_mappings,
+                            epoch):
+    
+    save_path = 'conf_matrix/' + dt_string
+    os.makedirs(save_path, exist_ok=True)
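+    # figures land under e.g. conf_matrix/05.01.24-14:30:00/eng.rst.gum_3.png;
+    # the ':' characters in the timestamp are invalid in Windows paths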
+        
+    print(classification_report(
+        y_test,
+        y_pred,
+        # target_names=sorted({inv_mappings[x] for x in y_test})
+    ))
+    
+
+    cm = confusion_matrix(y_test,
+                          y_pred, 
+                          labels = list(inv_mappings.keys())
+                         )
+    print(cm)
+    
+    xticklabels = list(inv_mappings.values())
+    yticklabels = list(inv_mappings.values())    
+    
+    cmap = sns.color_palette("cubehelix", as_cmap=True)
+
+    # Plot the confusion matrix.
+    fig, ax = plt.subplots()
+#     ax.tick_params(axis='both', which='major', labelsize=6)
+#     ax.tick_params(axis='both', which='minor', labelsize=6)
+    ax = sns.heatmap(cm,
+                     cmap=cmap,
+                     # annot=True,
+                     xticklabels=xticklabels,
+                     yticklabels=yticklabels
+                    )
+    # confusion_matrix puts true labels on rows, predictions on columns
+    plt.ylabel('Corpus (true) label')
+    plt.xlabel('Predicted label')
+    plt.xticks(fontsize=2)
+    plt.yticks(fontsize=2)
+#     plt.xticks(x, labels, rotation='vertical')
+#     plt.margins(0.5)
+    plt.subplots_adjust(bottom=0.5, left=0.5)
+    plt.title('Confusion Matrix: '+corpus_name+' (epoch:'+ str(epoch) + ')')
+    plt.savefig(save_path + '/' + corpus_name + '_' + str(epoch) + '.png', 
+                dpi=300)
+    plt.clf()
+
+
 def get_better_predictions(model,
                             corpus, 
                             test_dataloader, 
                             corpus_labels,
-                            print_results=True):
+                            inv_mappings,
+                            epoch,
+                            print_results=True, 
+                            save_conf_matrix=False):
     
     device = torch.device("cuda" if args.use_cuda else "cpu")
 
@@ -504,9 +552,25 @@ def get_better_predictions(model,
     if print_results:
         print('better:\t' + str(test_acc), flush='True')
         
+        print(classification_report(all_labels, top_preds))
+    
+    if save_conf_matrix:
+        make_confusion_matrices(all_labels,
+                                top_preds,
+                                corpus,
+                                inv_mappings,
+                                epoch)
+    print()
+    print('----')
     
     return all_labels, all_preds
 
+
     
 def print_results_to_file(corpus, 
                           test_sentences,