diff --git a/classifier_bare_pytorch.py b/classifier_bare_pytorch.py
index e2794ca34da51f66823d94cf87cbd25c332a535c..9cfd03eead463acbd032f1f27667e26d46f89273 100644
--- a/classifier_bare_pytorch.py
+++ b/classifier_bare_pytorch.py
@@ -26,8 +26,8 @@ substitutions_file = 'mappings/substitutions.txt'
 # mapping_classes = args.mappings_file[:-4].split('-')[-1]
 # specific_results = open_specific_results('mappings/specific_results.txt')['B']
 
-print('\nlangs to use: '+ args.langs_to_use + '\n', flush='True')
-
+print('\nlangs to use: '+ args.langs_to_use)
+print('mappings file: ' + args.mappings_file, flush=True)
 set_seed(42)
 torch.manual_seed(42)
 
diff --git a/classifier_with_adapter.py b/classifier_with_adapter.py
index 1fcc3780058a6883bf3e26a686b8093b440043cd..79916e54bdf33bc89946742f69c1dcaf19df9a94 100644
--- a/classifier_with_adapter.py
+++ b/classifier_with_adapter.py
@@ -29,17 +29,8 @@ mappings, inv_mappings = open_mappings(args.mappings_file)
 substitutions_file = 'mappings/substitutions.txt'
 tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
 
-# we are saving the test results of specific epochs
-# specific_results = open_specific_results('mappings/specific_results.txt')
-# if '1-2-3' in adapter_name or 'layer1;layer2;layer3' in adapter_name:
-#     specific_results = list(specific_results['A1_3'][args.num_epochs])
-# else:
-#     specific_results = list(specific_results['A1'][args.num_epochs])
-
 set_seed(42)
 
-            
-
 print('Train classifier with adapter\n')
 print('Adapter name:', adapter_name)
 print('Model:', args.transformer_model)
@@ -50,7 +41,12 @@ print('Num epochs:', args.num_epochs)
 mappings, inv_mappings = open_mappings(args.mappings_file)
 
 # Open sentences
-train_sentences, dev_dict_sentences, test_dict_sentences = open_sentences(args.data_path, mappings)
+train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
+
+
+print('\nCheck encodings:\n')
+print(train_sentences[0])
+
 
 # make pandas dataframes
 file_header = ['text', 'labels']
@@ -59,6 +55,13 @@ train_df = pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in train_sentences],
                         columns =file_header)
 train_df = train_df.sample(frac = 1) # shuffle the train
 
+# make a joint dev dataset in order to save models
+
+eval_df = pd.DataFrame([[' '.join(x[-2]), x[-1]] 
+                        for corpus, sents in dev_dict_sentences.items()
+                        for x in sents], 
+                        columns =file_header)
+
 dev_dict_df = {corpus : pd.DataFrame([[' '.join(x[-2]), x[-1]] 
                                       for x in sents], 
                                      columns = file_header)
@@ -71,6 +74,7 @@ test_dict_df = {corpus : pd.DataFrame([[' '.join(x[-2]), x[-1]]
 
 #Make datasets from dataframes
 train_dataset = datasets.Dataset.from_pandas(train_df)
+eval_dataset =  datasets.Dataset.from_pandas(eval_df)
 dev_dict_dataset  = {corpus:datasets.Dataset.from_pandas(dev_df) 
                      for corpus, dev_df in dev_dict_df.items()}
 test_dict_dataset = {corpus:datasets.Dataset.from_pandas(dev_df) 
@@ -84,6 +88,9 @@ num_labels = len(set([int(x.strip())
 train_dataset = train_dataset.map(encode_batch, batched=True)
 train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
 
+eval_dataset = eval_dataset.map(encode_batch, batched=True)
+eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
+
 encoded_dev_dataset = {}
 for corpus in dev_dict_dataset:
     temp = dev_dict_dataset[corpus].map(encode_batch, batched=True)
@@ -129,7 +136,8 @@ training_args = TrainingArguments(
 trainer = Trainer(
     model = model,
     args  = training_args,
-    train_dataset = train_dataset
+    train_dataset = train_dataset, 
+    eval_dataset = eval_dataset
 )
 
 
@@ -153,72 +161,32 @@ trainer.train()
 print('\nDev results:')
 for corpus in encoded_dev_dataset:
     print()
-    dev_results = get_predictions_huggingface(trainer, corpus, 
-                                    encoded_dev_dataset[corpus])
     
+    dev_results_ = get_predictions_huggingface(trainer, 
+                                                  corpus, 
+                                                  encoded_dev_dataset[corpus]
+                                                 )
+
     
-    path_results = 'results/dev/' + adapter_name + '_' + str(args.num_epochs)
-    if not os.path.exists(path_results):
-        os.makedirs(path_results)
-                
-    print_results_to_file(corpus, 
-                          dev_dict_sentences[corpus], 
-                          dev_results,
-                          inv_mappings, 
-                          #substitutions_file, 
-                          path_results)
-
-# Test results
-
-print('\ntest results:')
+    dev_results = better_predictions_huggingface(trainer, 
+                                                  corpus, 
+                                                  encoded_dev_dataset[corpus],
+                                                  framework_labels[corpus.split('.')[1]]
+                                                 )
+
+
+print('\nTest results:')
 for corpus in encoded_test_dataset:
     print()
-    test_results = get_predictions_huggingface(trainer, 
-                                               corpus, 
-                                               encoded_test_dataset[corpus])
-    
-    
-    path_results = 'results/test/' + adapter_name + '_' + str(args.num_epochs)
-    if not os.path.exists(path_results):
-        os.makedirs(path_results)
-                
-    print_results_to_file(corpus, 
-                          test_dict_sentences[corpus], 
-                          test_results,
-                          inv_mappings, 
-                          #substitutions_file, 
-                          path_results)
-
-
-
-#         for corpus in test_dict_dataloader:
-#             test_results = get_predictions(model, 
-#                                 corpus, 
-#                                 test_dict_dataloader[corpus])
-            
-#             path_results = 'results/test/pytorch' + str(epoch_num+1)
-#             if not os.path.exists(path_results):
-#                 os.makedirs(path_results)
-                
-#             print_results_to_file(corpus, 
-#                                 test_dict_sentences[corpus], 
-#                                 test_results,
-#                                 inv_mappings, substitutions_file, 
-#                                 path_results)    
-    
-    
-    
-    
-    
     
+    test_results_ = get_predictions_huggingface(trainer, 
+                                                  corpus, 
+                                                  encoded_test_dataset[corpus]
+                                                 )

-# Save specific test results
-
-# print('\nTest results:')
-# for corpus in encoded_test_dataset:
-#     print()
-#     test_results = get_predictions_huggingface(trainer, corpus, 
-#                                     encoded_test_dataset[corpus])
-# 
-#     print_results_to_file(corpus, test_dict_sentences[corpus], test_results, 
-#                           inv_mappings, substitutions_file)
\ No newline at end of file
+    
+    test_results = better_predictions_huggingface(trainer, 
+                                                  corpus, 
+                                                  encoded_test_dataset[corpus],
+                                                  framework_labels[corpus.split('.')[1]]
+                                                 )
diff --git a/utils.py b/utils.py
index bdac43a63495b81191bc316514018accd0d4b6bb..e1cf997f375cc7c01a21d341f74def18f2a4d779 100644
--- a/utils.py
+++ b/utils.py
@@ -6,7 +6,8 @@ import torch
 from transformers import AutoConfig, AutoTokenizer
 from configure import parse_args
 import numpy as np
-from sklearn.metrics import accuracy_score
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
+import matplotlib.pyplot as plt
 
 args = parse_args()
 
@@ -333,6 +334,14 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
     return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
 
 
+def show_confusion(predictions, labels, save_name):
+    """Plot the confusion matrix of predictions vs. gold labels and save it to save_name."""
+    cm = confusion_matrix(labels, predictions)
+    fig, ax = plt.subplots()
+    ax.matshow(cm)
+    fig.savefig(save_name, dpi=300)
+    plt.close(fig)
+    return None
 
 # ===============
 # Testing functions
@@ -417,9 +426,6 @@ def better_predictions_huggingface(trainer,
    
     results = trainer.predict(test_set)
     orig_labels = results.label_ids.tolist()
-    print('len sentences', len(orig_labels))
-    print('shape of preds', results.predictions.shape)
-    
     results_per_sent = results.predictions.tolist()
     
     # try to make the better prediction bit