diff --git a/classifier_bare_huggingface.py b/classifier_bare_huggingface.py
index 22c0509031653b1a9978438fa2d78758ab5b7a5f..bf1e7203cb8a0c1e10a6de3021546907259d03b7 100644
--- a/classifier_bare_huggingface.py
+++ b/classifier_bare_huggingface.py
@@ -16,7 +16,7 @@ from utils import *
 
 device = torch.device("cuda")
 
-print('\n\nwith Language token - eng + Corpus (no framework) \n')
+# print('\n\nwith Language token - eng + Corpus (no framework) \n')
 
 # ---------------------------------------------------------------------------------------------------
 args = parse_args()
@@ -41,10 +41,6 @@ mappings, inv_mappings = open_mappings(args.mappings_file)
 
 # Open sentences
 train_sentences, dev_dict_sentences, test_dict_sentences, _ = open_sentences(args.data_path, mappings)
 
-print('\nCheck encodings:\n')
-print(train_sentences[0])
-
-
 # make pandas dataframes
 file_header = ['text', 'labels']
 
diff --git a/classifier_bare_pytorch.py b/classifier_bare_pytorch.py
index b1ee85aa96ec57acf0ccd90cac805908b146d2b5..d802a097c6e1a086d27dfb9603945fb9caa870c0 100644
--- a/classifier_bare_pytorch.py
+++ b/classifier_bare_pytorch.py
@@ -26,7 +26,7 @@ substitutions_file = 'mappings/substitutions.txt'
 # mapping_classes = args.mappings_file[:-4].split('-')[-1]
 # specific_results = open_specific_results('mappings/specific_results.txt')['B']
 
-print('ZERO-SHOT LANG: '+ args.langs_to_use)
+print('\nlangs to use: '+ args.langs_to_use + '\n')
 
 set_seed(42)
 torch.manual_seed(42)
@@ -80,8 +80,6 @@ tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
 
 train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
 
-
-print(framework_labels, flush=True)
 
 # Determine linear size (= number of classes in the sets + 1)
 num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
diff --git a/configure.py b/configure.py
index 0eb23b91d67ac8a936c1305be187fc29ab18d8a0..52ecdbdfc3af336e742e2c848482049c7e619bdf 100644
--- a/configure.py
+++ b/configure.py
@@ -53,7 +53,7 @@ def parse_args():
                         help="Change order of sentences when the direction of relations is 1<2 to 2>1.")
 
     # only specific languages/corpora
-    parser.add_argument("--langs_to_use", default='', type=str,
+    parser.add_argument("--langs_to_use", default='@', type=str,
                         help="List of languages/corpora to use, a str separated by ;")
 
diff --git a/get_predictions.sh b/get_predictions.sh
deleted file mode 100644
index 8c5f7ddf38399331949c3787fe395c9506883863..0000000000000000000000000000000000000000
--- a/get_predictions.sh
+++ /dev/null
@@ -1,26 +0,0 @@
-#!/usr/bin/env bash
-
-#SBATCH --job-name=model-LC
-
-#SBATCH --ntasks=1
-#SBATCH --cpus-per-task=4
-#SBATCH --partition=GPUNodes
-#SBATCH --gres=gpu:1
-
-
-# tests tests
-
-
-# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 xml-roberta-classifier.py \
-#     --batch_size 4 \
-#     --gradient_accumulation_steps 32 \
-#     --num_epochs 6 \
-#     --data_path '/users/melodi/emetheni/clean_data' \
-#     --mappings_file 'mappings/mappings_substitutions.tsv' \
-#     --transformer_model "xlm-roberta-base"
-
-
-srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 open_finetuned_model.py \
-    --data_path '/users/melodi/emetheni/clean_data' \
-    --mappings_file 'mappings/mappings_substitutions.tsv' \
-    --transformer_model 'results/models/run_xlm-roberta-base/checkpoint-13500'
diff --git a/make_mappings_zero-shot.py b/make_mappings_zero-shot.py
index 14b05a6b31b50e80c52394a7f32391f6aa16f124..c2fd034201d102175663a529dbb948217577094d 100644
--- a/make_mappings_zero-shot.py
+++ b/make_mappings_zero-shot.py
@@ -29,23 +29,23 @@ for label, num in mappings.items():
 
 # -----------------------------------
 
-# define which language to NOT use with the arguments
-not_language = args.langs_to_use
+# define which language to use with the arguments
+languages = args.langs_to_use.split(';')
 
-corpora = [folder for folder in os.listdir(args.data_path)
-           if not not_language in folder]
+corpora = [folder
+           for folder in os.listdir(args.data_path)
+           if any(l in folder for l in languages)]
 
 files = ['/'.join([args.data_path, corpus, f])
          for corpus in corpora
-         for f in os.listdir(args.data_path + '/' + corpus)
-         ]
+         for f in os.listdir(args.data_path + '/' + corpus)]
 
 # open the files
 
 def read_file(file):
     ''' Open the relations file. '''
     relations = []
-
+    sub_rels = []
     with open(file, 'r', encoding='utf-8') as f:
         next(f)
         for line in f:
@@ -53,50 +53,48 @@ def read_file(file):
             l = line.strip().split('\t')
             if not l[11].lower() in subs:
                 relations.append(l[11].lower())
+            else:
+                sub_rels.append(l[11].lower())
         except IndexError:
             pass
-    return relations
+    return relations, sub_rels
 
 rel_files = [f for f in files if any (x in f for x in ['train', 'test', 'dev'])]
 
-rels = []
+good_rels = []
+sub_rels = []
 for f in rel_files:
-    temp = read_file(f)
-    if temp != []:
-        rels += temp
+    x, y = read_file(f)
+    good_rels += x
+    sub_rels += y
 
-dict_labels = dict(enumerate(list(set(rels))))
-inv_labels = {v:k for k, v in dict_labels.items()}
+dict_labels = dict(enumerate(list(set(good_rels))))
+corpora_labels = {v:k for k, v in dict_labels.items()}
 
-leftovers = []
-for sub in subs:
-    if sub not in inv_labels:
-        try:
-            inv_labels[sub] = inv_labels[subs[sub]]
-        except KeyError:
-            leftovers.append(sub)
-    else:
-        leftovers.append(sub)
-for mapping in mappings:
-    if mapping not in inv_labels:
-        leftovers.append(mapping)
-counter = len(inv_labels) -1
-for i in leftovers:
-    counter += 1
-    inv_labels[i] = counter
-
+leftovers = []
+
+# for mapping in mappings:
+#     if mapping not in corpora_labels and mapping not in subs:
+#         leftovers.append(mapping)
 
-# # save the new labels
-print('-'*20)
-print(not_language)
-print(len(inv_labels))
+# counter = max(list(corpora_labels.values())) -1
+# for i in leftovers:
+#     counter += 1
+#     corpora_labels[i] = counter
+
+for sub in sub_rels:
+    try:
+        corpora_labels[sub] = corpora_labels[subs[sub]]
+    except KeyError:
+        corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
+        corpora_labels[sub] = corpora_labels[subs[sub]]
+
+# print(corpora_labels)
 
 
-with open('mappings/zero-shot/' + not_language + '_zero-shot.tsv', 'w') as f:
+with open('mappings/jaccard/' + 'por.rst' + '.tsv', 'w') as f:
     f.write('LABEL\tMAPPING\n')
-    for k, v in inv_labels.items():
+    for k, v in corpora_labels.items():
         f.write(k + '\t' + str(v) + '\n')
\ No newline at end of file
diff --git a/utils.py b/utils.py
index 3880be60ded41aef92cdfabed33a6aa261e613d4..393bfcb9727bfe1765fd5f60c5d94062daacd6bb 100644
--- a/utils.py
+++ b/utils.py
@@ -18,8 +18,9 @@ def open_mappings(mappings_file):
     mappings = {}
     with open(mappings_file, 'r') as f:
         next(f)
-        for l in f:
-            mappings[l.split('\t')[0]] = int(l.strip().split('\t')[-1])
+        for line in f:
+            l = line.strip().split('\t')
+            mappings[l[0]] = int(l[-1])
 
     inv_mappings = {}
     # for label, num in mappings.items():
@@ -163,10 +164,13 @@ def open_sentences(path_to_corpora, mappings_dict):
        - dict of sentences for TEST: each test set categorized per corpus
        - ** NEW ** : dict of labels per framework
    '''
-
+    langs_to_use = args.langs_to_use.split(';')
    corpora = [folder for folder in os.listdir(path_to_corpora)
              if not any(i in folder for i in ['.md', 'DS_', 'utils', 'ipynb'])
+              # langs here
+              #if any(l in folder for l in langs_to_use)
              ]
+
    # ---------------------
 
    train_sentences = []
@@ -186,8 +190,9 @@ def open_sentences(path_to_corpora, mappings_dict):
        train_file = ['/'.join([path_to_corpora, corpus, x])
                      for x in os.listdir(path_to_corpora + '/' + corpus)
                      if 'train' in x and 'rels' in x
-                     # attention! we block training for some languages if we want HERE
-                     if not args.langs_to_use in x][0]
+                     # attention! we ALLOW training for some corpora if we want HERE
+#                      if any(l in x for l in langs_to_use)
+                     ][0]
        temp = open_file(train_file, mappings_dict)
        # train_sentences += open_file_with_lang(train_file, mappings_dict)
        train_sentences += temp
@@ -216,9 +221,9 @@
            all_labels[framework] += [l[-1] for l in temp]
        # test_dict_sentences[corpus] += open_file_with_lang(test_file, mappings_dict)
 
-    labels = {framework:set(all_labels[framework]) for framework in all_labels}
+    corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels}
 
-    return train_sentences, dev_dict_sentences, test_dict_sentences, labels
+    return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
 
 
 def open_sentences_with_lang(path_to_corpora, mappings_dict):
@@ -237,7 +242,8 @@
    train_sentences = []
    dev_dict_sentences = {}
    test_dict_sentences = {}
-
+    corpus_labels = []
+
    for corpus in corpora:
 
        try:
@@ -267,7 +273,7 @@
        # test_dict_sentences[corpus] += open_file_with_lang(test_file, mappings_dict)
 
 
-    return train_sentences, dev_dict_sentences, test_dict_sentences
+    return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
 
 
 
@@ -333,7 +339,7 @@ def get_predictions_huggingface(trainer,
    '''
 
    results = trainer.predict(test_set)
-    preds = np.softmax(results.predictions, axis=1)
+#     preds = np.softmax(results.predictions, axis=1)
    top_preds = np.argmax(results.predictions, axis=1)
    results = results.label_ids
    test_acc = round(accuracy_score(top_preds, results), 4)
@@ -341,7 +347,7 @@
    if print_results:
        print(corpus, '\t', test_acc, '\n')
 
-    return preds
+#     return preds
 
 
 def get_better_predictions(model,
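
For reference, below is a minimal runnable sketch of the substitution-resolution step that the make_mappings_zero-shot.py hunk introduces: a substituted label inherits the numeric ID of its canonical label, and a fresh ID is minted when the canonical label never appeared in the selected corpora. The toy values for corpora_labels, subs and sub_rels are invented for illustration (the real ones come from mappings/substitutions.txt and the *_rels files); only the loop mirrors the patch.

# Toy stand-ins; only the loop below mirrors the patched script.
corpora_labels = {'elaboration': 0, 'contrast': 1}   # canonical label -> id
subs = {'elab': 'elaboration', 'antithesis': 'contrast', 'bg': 'background'}
sub_rels = ['elab', 'antithesis', 'bg']              # variants seen in the data

for sub in sub_rels:
    try:
        # the variant inherits the id of its canonical label
        corpora_labels[sub] = corpora_labels[subs[sub]]
    except KeyError:
        # canonical label absent from these corpora: mint a fresh id,
        # then point the variant at it
        corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
        corpora_labels[sub] = corpora_labels[subs[sub]]

print(corpora_labels)
# {'elaboration': 0, 'contrast': 1, 'elab': 0, 'antithesis': 1,
#  'background': 2, 'bg': 2}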