Commit 23d2149f authored by emetheni

update before server shutdown

parent b14b8d54
@@ -16,7 +16,6 @@ from utils import *

 device = torch.device("cuda")

-# print('\n\nwith Language token - eng + Corpus (no framework) \n')
 # ---------------------------------------------------------------------------------------------------
 args = parse_args()
@@ -41,6 +40,11 @@ mappings, inv_mappings = open_mappings(args.mappings_file)

 # Open sentences
 train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)

 print('\nCheck encodings:\n')
+print(train_sentences[0])

 # make pandas dataframes
 file_header = ['text', 'labels']
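The "make pandas dataframes" step itself is outside this hunk. A minimal sketch of what it presumably does, assuming each entry of `train_sentences` ends with `[token_list, encoded_label]` as built by `open_file_with_lang` (the exact column construction here is a guess, not the repo's code):

```python
# Hypothetical reconstruction: build the training DataFrame with the
# columns named in file_header.
import pandas as pd

train_df = pd.DataFrame(
    [(' '.join(s[-2]), s[-1]) for s in train_sentences],
    columns=file_header,  # ['text', 'labels']
)
```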
@@ -152,18 +156,23 @@ trainer.train()

 print('\nDev results:')
 for corpus in encoded_dev_dataset:
     print()
     dev_results_ = get_predictions_huggingface(trainer,
                                                corpus,
                                                encoded_dev_dataset[corpus]
                                                )
+    dev_results = better_predictions_huggingface(trainer,
+                                                 corpus,
+                                                 encoded_dev_dataset[corpus],
+                                                 framework_labels[corpus.split('.')[1]]
+                                                 )
+    print(dev_results)

-    path_results = 'results/dev/' + args.transformer_model + '_' + str(args.num_epochs)
-    if not os.path.exists(path_results):
-        os.makedirs(path_results)
+    # path_results = 'results/dev/' + args.transformer_model + '_' + str(args.num_epochs)
+    # if not os.path.exists(path_results):
+    #     os.makedirs(path_results)

     # print_results_to_file(corpus,
     #                       dev_dict_sentences[corpus],
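The new `better_predictions_huggingface` call restricts predictions to the labels attested for the corpus's framework. The framework key is taken from the corpus ID, which in the DISRPT-style naming these corpora use ("lang.framework.name") sits at index 1 after splitting on dots:

```python
# Assuming DISRPT-style corpus IDs, e.g. "eng.rst.gum":
corpus = 'eng.rst.gum'
framework = corpus.split('.')[1]  # 'rst', the key into framework_labels
```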
@@ -172,8 +172,8 @@ def train(model,

         total_loss_train = 0
         batch_counter = 0

-        # for train_input, train_label in tqdm(train_dataloader):
-        for train_input, train_label in train_dataloader:
+        for train_input, train_label in tqdm(train_dataloader):
+        # for train_input, train_label in train_dataloader:

             batch_counter += 1
             train_label = train_label.to(device)
             mask = train_input['attention_mask'].to(device)
@@ -32,7 +32,7 @@ print('Frozen layers:', args.freeze_layers.replace(';', ', '))

 mappings, inv_mappings = open_mappings(args.mappings_file)

 # Open sentences
-train_sentences, dev_dict_sentences, _ = open_sentences_with_lang(args.data_path, mappings)
+train_sentences, dev_dict_sentences, _, framework_labels = open_sentences_with_lang(args.data_path, mappings)

 print('\nCheck encodings:\n')
@@ -11,6 +11,20 @@ from sklearn.metrics import accuracy_score

 args = parse_args()

+def switch_dimensions(vector_list):
+    target_dim_len = len(vector_list[0])
+    new_vector = []
+    for n in range(target_dim_len):
+        temp = []
+        for x in vector_list:
+            temp.append(x[n])
+        new_vector.append(temp)
+    return new_vector
+

 def open_mappings(mappings_file):
     ''' Open the mappings file into a dictionary.'''
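The added `switch_dimensions` helper transposes a list of lists (rows become columns). A sketch of an equivalent, more idiomatic form, with the caveat that `zip` truncates to the shortest row while the loop version assumes every row is at least as long as the first:

```python
def switch_dimensions_zip(vector_list):
    # zip(*rows) yields the columns; materialize each one as a list
    return [list(col) for col in zip(*vector_list)]

assert switch_dimensions_zip([[1, 2, 3], [4, 5, 6]]) == [[1, 4], [2, 5], [3, 6]]
```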
@@ -21,19 +35,18 @@ def open_mappings(mappings_file):

     for line in f:
         l = line.strip().split('\t')
         mappings[l[0]] = int(l[-1])

-    inv_mappings = []
-    # this cannot be a dictionary! it has to be tuples
-    # because we have some labels which are replaced, e.g.
-    # joint-list is replaced
+    # reject the converted labels
+    inv_mappings = {}
     for k, v in mappings.items():
-        inv_mappings.append((v, k))
+        if v not in inv_mappings:
+            inv_mappings[v] = k

     return mappings, inv_mappings


 def encode_label(og_label, mappings_dict):
-    label = og_label.lower()
+    label = og_label.lower().strip()
     if label in mappings_dict:
         return mappings_dict[label]
     else:
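The inverse-mapping change replaces the tuple list with a first-wins dictionary: when several label strings share an id (the removed comment's "joint-list" case), only the first one seen is kept, so the converted labels are rejected on inversion. A small example of that behavior (the ids here are illustrative, not the real mappings file):

```python
mappings = {'joint': 3, 'joint-list': 3, 'contrast': 7}  # illustrative ids

inv_mappings = {}
for k, v in mappings.items():
    if v not in inv_mappings:  # first-wins: later aliases are skipped
        inv_mappings[v] = k

assert inv_mappings == {3: 'joint', 7: 'contrast'}
```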
@@ -72,11 +85,11 @@ def open_file(filename, mappings_dict):

             # flip them if different direction
             if args.normalize_direction == 'yes':
                 if l[9] == '1>2':
-                    lines.append(l + [sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
+                    lines.append(l + [sent_1 + [SEP_token] + sent_2, encode_label(l[-1], mappings_dict)])
                 else:
-                    lines.append(l + [sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)])
+                    lines.append(l + [sent_2 + [SEP_token] + sent_1, encode_label(l[-1], mappings_dict)])
             else:
-                lines.append(l + [sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
+                lines.append(l + [sent_1 + [SEP_token] + sent_2, encode_label(l[-1], mappings_dict)])

     return lines
@@ -142,20 +155,22 @@ def open_file_with_lang(filename, mappings_dict):

             # flip them if different direction
             if args.normalize_direction == 'yes':
                 if l[9] == '1>2':
-                    lines.append(l + [[lang, fullname] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
+                    lines.append(l + [[lang, fullname, framework] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
                 else:
-                    lines.append(l + [[lang, fullname] + sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)])
+                    lines.append(l + [[lang, fullname, framework] + sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)])
             else:
-                lines.append(l + [[lang, fullname] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
+                lines.append(l + [[lang, fullname, framework] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])

     return lines


 def encode_batch(batch):
     """ Encodes a batch of input data using the model tokenizer.
     Works for a pandas DF column, instead of a list.
     """
-    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
+    tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
     return tokenizer(batch["text"],
                      max_length=512,
                      truncation=True,
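One side effect of keeping the `from_pretrained` call inside `encode_batch` is that the tokenizer is re-instantiated on every call. A possible refinement, not part of this commit, is to cache it once per model name:

```python
from functools import lru_cache
from transformers import AutoTokenizer

@lru_cache(maxsize=1)
def get_tokenizer(model_name):
    # Loaded once per model name; later calls return the cached instance.
    return AutoTokenizer.from_pretrained(model_name)
```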
@@ -289,7 +304,7 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):

                               for x in os.listdir(path_to_corpora + '/' + corpus)
                               if 'train' in x and 'rels' in x
                               ][0]
-                temp = open_file(train_file, mappings_dict)
+                temp = open_file_with_lang(train_file, mappings_dict)
                 train_sentences += temp
                 all_labels[framework] += [l[-1] for l in temp]
             except: # some of them don't have train
@@ -373,7 +388,7 @@ def get_predictions_huggingface(trainer,

                                 test_set,
                                 print_results=True):
-    ''' SPECIFI FUNCTION FOR THE HUGGINGFACE TRAINER.
+    ''' SPECIFIC FUNCTION FOR THE HUGGINGFACE TRAINER.
     Function to get the model's predictions for one corpus' test set.
     Can print accuracy using scikit-learn.
     Also works with dev sets -- just don't save the outputs.
@@ -381,7 +396,6 @@ def get_predictions_huggingface(trainer,

     '''

     results = trainer.predict(test_set)
-    preds = np.softmax(results.predictions, axis=1)
     top_preds = np.argmax(results.predictions, axis=1)
     results = results.label_ids
     test_acc = round(accuracy_score(top_preds, results), 4)
@@ -389,7 +403,7 @@ def get_predictions_huggingface(trainer,

     if print_results:
         print(corpus, '\t', test_acc, '\n')

-    return preds
+    return top_preds


 def better_predictions_huggingface(trainer,
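The removed `preds` line would have failed anyway: NumPy has no `np.softmax`, so it raises `AttributeError`. If probabilities are ever needed alongside `top_preds`, a numerically stable manual softmax over the logits is one option (a sketch, not code from this repo):

```python
import numpy as np

def softmax(logits, axis=1):
    z = logits - logits.max(axis=axis, keepdims=True)  # subtract max for stability
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)
```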
@@ -398,33 +412,33 @@ def better_predictions_huggingface(trainer,

+                                   corpus_labels,
                                    print_results=True):
     ''' SPECIFI FUNCTION FOR THE HUGGINGFACE TRAINER.
     Function to get the model's predictions for one corpus' test set.
     Can print accuracy using scikit-learn.
     Also works with dev sets -- just don't save the outputs.
     Returns: list of predictions that match test file's lines.
     '''

     results = trainer.predict(test_set)
     preds = np.argmax(results.predictions, axis=1)
-    orig_labels = results.label_ids
-    test_acc = round(accuracy_score(top_preds, orig_labels), 4)
+    orig_labels = results.label_ids.tolist()
+    print('len sentences', len(orig_labels))
+    print('shape of preds', results.predictions.shape)

-    if print_results:
-        print(corpus + '\t' + str(test_acc) + '\n', flush='True')
+    results_per_sent = results.predictions.tolist()
+    print(type(results.predictions))

     # try to make the better prediction bit
     best_labels = []
-    for n, result in enumerate(results.predictions.tolist()):
-        orig_label = results.label_ids[n]
-        best_prob = -1000
-        best_label = -1
-        if orig_label in corpus_labels:
-            if result > best_prob:
-                best_prob = result
-                best_label = n
-                best_labels.append(n)
+    for sent, sent_results in enumerate(results_per_sent):
+        best_prob = -1000
+        best_label = -1
+        #assert len(sent_results) == len(orig_labels)
+        for n, prob in enumerate(sent_results):
+            if n in corpus_labels:
+                if prob > best_prob:
+                    best_prob = prob
+                    best_label = n
+        best_labels.append(best_label)
+
+    test_acc = round(accuracy_score(best_labels, orig_labels), 4)
+    print('better:\t' + str(test_acc) + '\n', flush='True')
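The rewritten nested loop picks, per sentence, the highest-scoring label among those valid for the corpus's framework. The same restriction can be done in one vectorized step; a sketch, assuming `corpus_labels` is a collection of valid label ids:

```python
import numpy as np

def restricted_argmax(logits, corpus_labels):
    # Mask out ids outside the framework's label set, then argmax per row.
    masked = np.full_like(logits, -np.inf)
    valid = list(corpus_labels)
    masked[:, valid] = logits[:, valid]
    return masked.argmax(axis=1)
```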