Commit 3dffe4fc authored by emetheni

huggingface finally works

parent 30190c63
@@ -16,7 +16,7 @@ from utils import *
 device = torch.device("cuda")
-print('\n\nwith Language token - eng + Corpus (no framework) \n')
+# print('\n\nwith Language token - eng + Corpus (no framework) \n')
 # ---------------------------------------------------------------------------------------------------
 args = parse_args()
@@ -41,10 +41,6 @@ mappings, inv_mappings = open_mappings(args.mappings_file)
 # Open sentences
 train_sentences, dev_dict_sentences, test_dict_sentences, _ = open_sentences(args.data_path, mappings)
-print('\nCheck encodings:\n')
-print(train_sentences[0])
 # make pandas dataframes
 file_header = ['text', 'labels']
...
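Just below this hunk the script turns the sentence tuples into pandas DataFrames using file_header = ['text', 'labels']. A minimal sketch of that step, assuming each training tuple is a (text, label-id) pair; the actual tuple layout is not visible in this diff:

import pandas as pd

# Hypothetical sentence tuples; the real ones come from open_sentences().
train_sentences = [('the cause [SEP] the effect', 3),
                   ('one span [SEP] another span', 7)]

file_header = ['text', 'labels']
train_df = pd.DataFrame(train_sentences, columns=file_header)
print(train_df.head())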
@@ -26,7 +26,7 @@ substitutions_file = 'mappings/substitutions.txt'
 # mapping_classes = args.mappings_file[:-4].split('-')[-1]
 # specific_results = open_specific_results('mappings/specific_results.txt')['B']
-print('ZERO-SHOT LANG: '+ args.langs_to_use)
+print('\nlangs to use: '+ args.langs_to_use + '\n')
 set_seed(42)
 torch.manual_seed(42)
@@ -80,8 +80,6 @@ tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
 train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
-print(framework_labels, flush=True)
 # Determine linear size (= number of classes in the sets + 1)
 num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
...
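The last line of this hunk sizes the classifier head: per the inline comment, it is the number of distinct labels in the training sentences plus one. A toy check of that arithmetic, with a made-up sentence list whose last element is the label id:

# Each sentence tuple ends with its integer label id (assumed layout).
train_sentences = [('text a', 0), ('text b', 2), ('text c', 2)]

# Distinct training labels: {0, 2} -> linear layer of size 3.
num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
print(num_labels)  # 3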
@@ -53,7 +53,7 @@ def parse_args():
         help="Change order of sentences when the direction of relations is 1<2 to 2>1.")
     # only specific languages/corpora
-    parser.add_argument("--langs_to_use", default='', type=str,
+    parser.add_argument("--langs_to_use", default='@', type=str,
         help="List of languages/corpora to use, a str separated by ;")
...
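The default for --langs_to_use moves from '' to '@'. Combined with the substring filter this commit introduces elsewhere (any(l in folder for l in languages)), that flips the default behaviour: the empty string is a substring of every folder name, so '' selected every corpus, while '@' never occurs in a corpus name and selects none. A minimal sketch of that filter; the folder names are made up:

import os  # in the real script the folders come from os.listdir(args.data_path)

folders = ['eng.rst.gum', 'por.rst.cstn', 'deu.rst.pcc']  # hypothetical corpora

def select(langs_to_use):
    languages = langs_to_use.split(';')
    return [f for f in folders if any(l in f for l in languages)]

print(select('@'))        # [] -- the new default selects nothing
print(select('eng;por'))  # ['eng.rst.gum', 'por.rst.cstn']
print(select(''))         # all folders: '' is a substring of every name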
#!/usr/bin/env bash
#SBATCH --job-name=model-LC
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --partition=GPUNodes
#SBATCH --gres=gpu:1
# tests
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 xml-roberta-classifier.py \
# --batch_size 4 \
# --gradient_accumulation_steps 32 \
# --num_epochs 6 \
# --data_path '/users/melodi/emetheni/clean_data' \
# --mappings_file 'mappings/mappings_substitutions.tsv' \
# --transformer_model "xlm-roberta-base"
srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 open_finetuned_model.py \
--data_path '/users/melodi/emetheni/clean_data' \
--mappings_file 'mappings/mappings_substitutions.tsv' \
--transformer_model 'results/models/run_xlm-roberta-base/checkpoint-13500'
@@ -29,23 +29,23 @@ for label, num in mappings.items():
 # -----------------------------------
-# define which language to NOT use with the arguments
-not_language = args.langs_to_use
+# define which language to use with the arguments
+languages = args.langs_to_use.split(';')
 
-corpora = [folder for folder in os.listdir(args.data_path)
-           if not not_language in folder]
+corpora = [folder
+           for folder in os.listdir(args.data_path)
+           if any(l in folder for l in languages)]
 
 files = ['/'.join([args.data_path, corpus, f])
          for corpus in corpora
-         for f in os.listdir(args.data_path + '/' + corpus)
-         ]
+         for f in os.listdir(args.data_path + '/' + corpus)]
 
 # open the files
 def read_file(file):
     ''' Open the relations file. '''
     relations = []
+    sub_rels = []
     with open(file, 'r', encoding='utf-8') as f:
         next(f)
         for line in f:
@@ -53,50 +53,48 @@ def read_file(file):
             l = line.strip().split('\t')
             if not l[11].lower() in subs:
                 relations.append(l[11].lower())
+            else:
+                sub_rels.append(l[11].lower())
         except IndexError:
             pass
-    return relations
+    return relations, sub_rels
 
 rel_files = [f for f in files if any (x in f for x in ['train', 'test', 'dev'])]
 
-rels = []
+good_rels = []
+sub_rels = []
 
 for f in rel_files:
-    temp = read_file(f)
-    if temp != []:
-        rels += temp
+    x, y = read_file(f)
+    good_rels += x
+    sub_rels += y
 
-dict_labels = dict(enumerate(list(set(rels))))
-inv_labels = {v:k for k, v in dict_labels.items()}
-
-leftovers = []
-for sub in subs:
-    if sub not in inv_labels:
-        try:
-            inv_labels[sub] = inv_labels[subs[sub]]
-        except KeyError:
-            leftovers.append(sub)
-    else:
-        leftovers.append(sub)
-
-for mapping in mappings:
-    if mapping not in inv_labels:
-        leftovers.append(mapping)
-
-counter = len(inv_labels) -1
-for i in leftovers:
-    counter += 1
-    inv_labels[i] = counter
-
-# # save the new labels
-print('-'*20)
-print(not_language)
-print(len(inv_labels))
+dict_labels = dict(enumerate(list(set(good_rels))))
+corpora_labels = {v:k for k, v in dict_labels.items()}
+
+leftovers = []
+# for mapping in mappings:
+#     if mapping not in corpora_labels and mapping not in subs:
+#         leftovers.append(mapping)
+
+# counter = max(list(corpora_labels.values())) -1
+# for i in leftovers:
+#     counter += 1
+#     corpora_labels[i] = counter
+
+for sub in sub_rels:
+    try:
+        corpora_labels[sub] = corpora_labels[subs[sub]]
+    except KeyError:
+        corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
+        corpora_labels[sub] = corpora_labels[subs[sub]]
+
+# print(corpora_labels)
 
-with open('mappings/zero-shot/' + not_language + '_zero-shot.tsv', 'w') as f:
+with open('mappings/jaccard/' + 'por.rst' + '.tsv', 'w') as f:
     f.write('LABEL\tMAPPING\n')
-    for k, v in inv_labels.items():
+    for k, v in corpora_labels.items():
         f.write(k + '\t' + str(v) + '\n')
\ No newline at end of file
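The rewritten loop above aliases each substituted label to the id of its canonical label, minting a fresh id when the canonical label never appeared on its own. A self-contained sketch with toy data; in the real script, subs (variant -> canonical) is read from mappings/substitutions.txt:

# Toy stand-ins for the script's data structures.
subs = {'cause-effect': 'cause', 'restatement-repetition': 'restatement'}
corpora_labels = {'cause': 0, 'elaboration': 1}        # ids of non-substituted labels
sub_rels = ['cause-effect', 'restatement-repetition']  # substituted labels seen in files

for sub in sub_rels:
    try:
        # The canonical label already has an id: reuse it for the variant.
        corpora_labels[sub] = corpora_labels[subs[sub]]
    except KeyError:
        # Canonical label unseen: give it the next free id, then alias the variant.
        corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
        corpora_labels[sub] = corpora_labels[subs[sub]]

print(corpora_labels)
# {'cause': 0, 'elaboration': 1, 'cause-effect': 0,
#  'restatement': 2, 'restatement-repetition': 2}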
@@ -18,8 +18,9 @@ def open_mappings(mappings_file):
     mappings = {}
     with open(mappings_file, 'r') as f:
         next(f)
-        for l in f:
-            mappings[l.split('\t')[0]] = int(l.strip().split('\t')[-1])
+        for line in f:
+            l = line.strip().split('\t')
+            mappings[l[0]] = int(l[-1])
 
     inv_mappings = {}
     # for label, num in mappings.items():
@@ -163,10 +164,13 @@ def open_sentences(path_to_corpora, mappings_dict):
         - dict of sentences for TEST: each test set categorized per corpus
         - ** NEW ** : dict of labels per framework
     '''
+    langs_to_use = args.langs_to_use.split(';')
+
     corpora = [folder for folder in os.listdir(path_to_corpora)
                if not any(i in folder for i in ['.md', 'DS_', 'utils', 'ipynb'])
+               # langs here
+               # if any(l in folder for l in langs_to_use)
               ]
 
     # ---------------------
     train_sentences = []
@@ -186,8 +190,9 @@ def open_sentences(path_to_corpora, mappings_dict):
         train_file = ['/'.join([path_to_corpora, corpus, x])
                       for x in os.listdir(path_to_corpora + '/' + corpus)
                       if 'train' in x and 'rels' in x
-                      # attention! we block training for some languages if we want HERE
-                      if not args.langs_to_use in x][0]
+                      # attention! we ALLOW training for some corpora if we want HERE
+                      # if any(l in x for l in langs_to_use)
+                      ][0]
 
         temp = open_file(train_file, mappings_dict)
         # train_sentences += open_file_with_lang(train_file, mappings_dict)
         train_sentences += temp
@@ -216,9 +221,9 @@ def open_sentences(path_to_corpora, mappings_dict):
             all_labels[framework] += [l[-1] for l in temp]
             # test_dict_sentences[corpus] += open_file_with_lang(test_file, mappings_dict)
 
-    labels = {framework:set(all_labels[framework]) for framework in all_labels}
+    corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels}
 
-    return train_sentences, dev_dict_sentences, test_dict_sentences, labels
+    return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
 
 def open_sentences_with_lang(path_to_corpora, mappings_dict):
@@ -237,7 +242,8 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
     train_sentences = []
     dev_dict_sentences = {}
     test_dict_sentences = {}
+    corpus_labels = []
 
     for corpus in corpora:
         try:
@@ -267,7 +273,7 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
             # test_dict_sentences[corpus] += open_file_with_lang(test_file, mappings_dict)
 
-    return train_sentences, dev_dict_sentences, test_dict_sentences
+    return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
@@ -333,7 +339,7 @@ def get_predictions_huggingface(trainer,
     '''
 
     results = trainer.predict(test_set)
-    preds = np.softmax(results.predictions, axis=1)
+    # preds = np.softmax(results.predictions, axis=1)
     top_preds = np.argmax(results.predictions, axis=1)
     results = results.label_ids
     test_acc = round(accuracy_score(top_preds, results), 4)
@@ -341,7 +347,7 @@ def get_predictions_huggingface(trainer,
     if print_results:
         print(corpus, '\t', test_acc, '\n')
 
-    return preds
+    # return preds
 
 def get_better_predictions(model,
...
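A plausible reason the preds line is commented out in this commit: NumPy has no softmax function (np.softmax does not exist; scipy.special.softmax is the usual replacement). If class probabilities are needed again, a numerically stable stand-alone version could look like the sketch below; the predictions array is a stand-in for results.predictions:

import numpy as np

def softmax(logits, axis=1):
    # Subtract the row max before exponentiating for numerical stability.
    shifted = logits - logits.max(axis=axis, keepdims=True)
    exp = np.exp(shifted)
    return exp / exp.sum(axis=axis, keepdims=True)

predictions = np.array([[2.0, 0.5, -1.0],   # stand-in logits
                        [0.1, 0.2, 3.0]])
preds = softmax(predictions, axis=1)        # rows sum to 1
top_preds = np.argmax(predictions, axis=1)  # same argmax as the logits
print(preds.round(3), top_preds)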