Commit 3dffe4fc authored by emetheni

huggingface finally works

parent 30190c63
@@ -16,7 +16,7 @@ from utils import *
 device = torch.device("cuda")
-print('\n\nwith Language token - eng + Corpus (no framework) \n')
+# print('\n\nwith Language token - eng + Corpus (no framework) \n')
 # ---------------------------------------------------------------------------------------------------
 args = parse_args()
@@ -41,10 +41,6 @@ mappings, inv_mappings = open_mappings(args.mappings_file)
 # Open sentences
 train_sentences, dev_dict_sentences, test_dict_sentences, _ = open_sentences(args.data_path, mappings)
-print('\nCheck encodings:\n')
-print(train_sentences[0])
 # make pandas dataframes
 file_header = ['text', 'labels']
...
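Just below this hunk the script turns the sentence tuples into pandas DataFrames using file_header = ['text', 'labels']. A minimal sketch of that step, assuming each training tuple is a (text, label-id) pair; the actual tuple layout is not visible in this diff:

import pandas as pd

# Hypothetical sentence tuples; the real ones come from open_sentences().
train_sentences = [('the cause [SEP] the effect', 3),
                   ('one span [SEP] another span', 7)]

file_header = ['text', 'labels']
train_df = pd.DataFrame(train_sentences, columns=file_header)
print(train_df.head())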
@@ -26,7 +26,7 @@ substitutions_file = 'mappings/substitutions.txt'
 # mapping_classes = args.mappings_file[:-4].split('-')[-1]
 # specific_results = open_specific_results('mappings/specific_results.txt')['B']
-print('ZERO-SHOT LANG: '+ args.langs_to_use)
+print('\nlangs to use: '+ args.langs_to_use + '\n')
 set_seed(42)
 torch.manual_seed(42)
@@ -80,8 +80,6 @@ tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
 train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
-print(framework_labels, flush=True)
 # Determine linear size (= number of classes in the sets + 1)
 num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
...
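The last line of this hunk sizes the classifier head: per the inline comment, it is the number of distinct labels in the training sentences plus one. A toy check of that arithmetic, with a made-up sentence list whose last element is the label id:

# Each sentence tuple ends with its integer label id (assumed layout).
train_sentences = [('text a', 0), ('text b', 2), ('text c', 2)]

# Distinct training labels: {0, 2} -> linear layer of size 3.
num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
print(num_labels)  # 3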
@@ -53,7 +53,7 @@ def parse_args():
         help="Change order of sentences when the direction of relations is 1<2 to 2>1.")
     # only specific languages/corpora
-    parser.add_argument("--langs_to_use", default='', type=str,
+    parser.add_argument("--langs_to_use", default='@', type=str,
         help="List of languages/corpora to use, a str separated by ;")
...
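The default for --langs_to_use moves from '' to '@'. Combined with the substring filter this commit introduces elsewhere (any(l in folder for l in languages)), that flips the default behaviour: the empty string is a substring of every folder name, so '' selected every corpus, while '@' never occurs in a corpus name and selects none. A minimal sketch of that filter; the folder names are made up:

import os  # in the real script the folders come from os.listdir(args.data_path)

folders = ['eng.rst.gum', 'por.rst.cstn', 'deu.rst.pcc']  # hypothetical corpora

def select(langs_to_use):
    languages = langs_to_use.split(';')
    return [f for f in folders if any(l in f for l in languages)]

print(select('@'))        # [] -- the new default selects nothing
print(select('eng;por'))  # ['eng.rst.gum', 'por.rst.cstn']
print(select(''))         # all folders: '' is a substring of every name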
#!/usr/bin/env bash
#SBATCH --job-name=model-LC
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --partition=GPUNodes
#SBATCH --gres=gpu:1
# tests
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 xml-roberta-classifier.py \
# --batch_size 4 \
# --gradient_accumulation_steps 32 \
# --num_epochs 6 \
# --data_path '/users/melodi/emetheni/clean_data' \
# --mappings_file 'mappings/mappings_substitutions.tsv' \
# --transformer_model "xlm-roberta-base"
srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 open_finetuned_model.py \
--data_path '/users/melodi/emetheni/clean_data' \
--mappings_file 'mappings/mappings_substitutions.tsv' \
--transformer_model 'results/models/run_xlm-roberta-base/checkpoint-13500'
@@ -29,23 +29,23 @@ for label, num in mappings.items():
 # -----------------------------------
-# define which language to NOT use with the arguments
-not_language = args.langs_to_use
+# define which language to use with the arguments
+languages = args.langs_to_use.split(';')
 
-corpora = [folder for folder in os.listdir(args.data_path)
-           if not not_language in folder]
+corpora = [folder
+           for folder in os.listdir(args.data_path)
+           if any(l in folder for l in languages)]
 
 files = ['/'.join([args.data_path, corpus, f])
          for corpus in corpora
-         for f in os.listdir(args.data_path + '/' + corpus)
-         ]
+         for f in os.listdir(args.data_path + '/' + corpus)]
 
 # open the files
 def read_file(file):
     ''' Open the relations file. '''
     relations = []
+    sub_rels = []
     with open(file, 'r', encoding='utf-8') as f:
         next(f)
         for line in f:
@@ -53,50 +53,48 @@ def read_file(file):
             l = line.strip().split('\t')
             if not l[11].lower() in subs:
                 relations.append(l[11].lower())
+            else:
+                sub_rels.append(l[11].lower())
         except IndexError:
             pass
-    return relations
+    return relations, sub_rels
 
 rel_files = [f for f in files if any (x in f for x in ['train', 'test', 'dev'])]
 
-rels = []
+good_rels = []
+sub_rels = []
 
 for f in rel_files:
-    temp = read_file(f)
-    if temp != []:
-        rels += temp
+    x, y = read_file(f)
+    good_rels += x
+    sub_rels += y
 
-dict_labels = dict(enumerate(list(set(rels))))
-inv_labels = {v:k for k, v in dict_labels.items()}
-
-leftovers = []
-for sub in subs:
-    if sub not in inv_labels:
-        try:
-            inv_labels[sub] = inv_labels[subs[sub]]
-        except KeyError:
-            leftovers.append(sub)
-    else:
-        leftovers.append(sub)
-
-for mapping in mappings:
-    if mapping not in inv_labels:
-        leftovers.append(mapping)
-
-counter = len(inv_labels) -1
-for i in leftovers:
-    counter += 1
-    inv_labels[i] = counter
-
-# # save the new labels
-print('-'*20)
-print(not_language)
-print(len(inv_labels))
+dict_labels = dict(enumerate(list(set(good_rels))))
+corpora_labels = {v:k for k, v in dict_labels.items()}
+
+leftovers = []
+# for mapping in mappings:
+#     if mapping not in corpora_labels and mapping not in subs:
+#         leftovers.append(mapping)
+
+# counter = max(list(corpora_labels.values())) -1
+# for i in leftovers:
+#     counter += 1
+#     corpora_labels[i] = counter
+
+for sub in sub_rels:
+    try:
+        corpora_labels[sub] = corpora_labels[subs[sub]]
+    except KeyError:
+        corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
+        corpora_labels[sub] = corpora_labels[subs[sub]]
+
+# print(corpora_labels)
 
-with open('mappings/zero-shot/' + not_language + '_zero-shot.tsv', 'w') as f:
+with open('mappings/jaccard/' + 'por.rst' + '.tsv', 'w') as f:
     f.write('LABEL\tMAPPING\n')
-    for k, v in inv_labels.items():
+    for k, v in corpora_labels.items():
         f.write(k + '\t' + str(v) + '\n')
\ No newline at end of file
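The rewritten loop above aliases each substituted label to the id of its canonical label, minting a fresh id when the canonical label never appeared on its own. A self-contained sketch with toy data; in the real script, subs (variant -> canonical) is read from mappings/substitutions.txt:

# Toy stand-ins for the script's data structures.
subs = {'cause-effect': 'cause', 'restatement-repetition': 'restatement'}
corpora_labels = {'cause': 0, 'elaboration': 1}        # ids of non-substituted labels
sub_rels = ['cause-effect', 'restatement-repetition']  # substituted labels seen in files

for sub in sub_rels:
    try:
        # The canonical label already has an id: reuse it for the variant.
        corpora_labels[sub] = corpora_labels[subs[sub]]
    except KeyError:
        # Canonical label unseen: give it the next free id, then alias the variant.
        corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
        corpora_labels[sub] = corpora_labels[subs[sub]]

print(corpora_labels)
# {'cause': 0, 'elaboration': 1, 'cause-effect': 0,
#  'restatement': 2, 'restatement-repetition': 2}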
@@ -18,8 +18,9 @@ def open_mappings(mappings_file):
     mappings = {}
     with open(mappings_file, 'r') as f:
         next(f)
-        for l in f:
-            mappings[l.split('\t')[0]] = int(l.strip().split('\t')[-1])
+        for line in f:
+            l = line.strip().split('\t')
+            mappings[l[0]] = int(l[-1])
 
     inv_mappings = {}
     # for label, num in mappings.items():
@@ -163,10 +164,13 @@ def open_sentences(path_to_corpora, mappings_dict):
         - dict of sentences for TEST: each test set categorized per corpus
         - ** NEW ** : dict of labels per framework
     '''
+    langs_to_use = args.langs_to_use.split(';')
+
     corpora = [folder for folder in os.listdir(path_to_corpora)
                if not any(i in folder for i in ['.md', 'DS_', 'utils', 'ipynb'])
+               # langs here
+               # if any(l in folder for l in langs_to_use)
               ]
 
     # ---------------------
     train_sentences = []
@@ -186,8 +190,9 @@ def open_sentences(path_to_corpora, mappings_dict):
         train_file = ['/'.join([path_to_corpora, corpus, x])
                       for x in os.listdir(path_to_corpora + '/' + corpus)
                       if 'train' in x and 'rels' in x
-                      # attention! we block training for some languages if we want HERE
-                      if not args.langs_to_use in x][0]
+                      # attention! we ALLOW training for some corpora if we want HERE
+                      # if any(l in x for l in langs_to_use)
+                      ][0]
 
         temp = open_file(train_file, mappings_dict)
         # train_sentences += open_file_with_lang(train_file, mappings_dict)
         train_sentences += temp
@@ -216,9 +221,9 @@ def open_sentences(path_to_corpora, mappings_dict):
             all_labels[framework] += [l[-1] for l in temp]
             # test_dict_sentences[corpus] += open_file_with_lang(test_file, mappings_dict)
 
-    labels = {framework:set(all_labels[framework]) for framework in all_labels}
+    corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels}
 
-    return train_sentences, dev_dict_sentences, test_dict_sentences, labels
+    return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
 
 def open_sentences_with_lang(path_to_corpora, mappings_dict):
@@ -237,7 +242,8 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
     train_sentences = []
     dev_dict_sentences = {}
     test_dict_sentences = {}
+    corpus_labels = []
 
     for corpus in corpora:
         try:
@@ -267,7 +273,7 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
             # test_dict_sentences[corpus] += open_file_with_lang(test_file, mappings_dict)
 
-    return train_sentences, dev_dict_sentences, test_dict_sentences
+    return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
@@ -333,7 +339,7 @@ def get_predictions_huggingface(trainer,
     '''
 
     results = trainer.predict(test_set)
-    preds = np.softmax(results.predictions, axis=1)
+    # preds = np.softmax(results.predictions, axis=1)
     top_preds = np.argmax(results.predictions, axis=1)
     results = results.label_ids
     test_acc = round(accuracy_score(top_preds, results), 4)
@@ -341,7 +347,7 @@ def get_predictions_huggingface(trainer,
     if print_results:
         print(corpus, '\t', test_acc, '\n')
 
-    return preds
+    # return preds
 
 def get_better_predictions(model,
...
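A plausible reason the preds line is commented out in this commit: NumPy has no softmax function (np.softmax does not exist; scipy.special.softmax is the usual replacement). If class probabilities are needed again, a numerically stable stand-alone version could look like the sketch below; the predictions array is a stand-in for results.predictions:

import numpy as np

def softmax(logits, axis=1):
    # Subtract the row max before exponentiating for numerical stability.
    shifted = logits - logits.max(axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=axis, keepdims=True)

predictions = np.array([[2.0, 0.5, -1.0],   # stand-in logits
                        [0.1, 0.2, 3.0]])
preds = softmax(predictions, axis=1)        # rows sum to 1
top_preds = np.argmax(predictions, axis=1)  # same argmax as the logits
print(preds.round(3), top_preds)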