Skip to content
Snippets Groups Projects
Commit 3dffe4fc authored by emetheni's avatar emetheni
Browse files

huggingface finally works

parent 30190c63
Branches
No related tags found
No related merge requests found
......@@ -16,7 +16,7 @@ from utils import *
device = torch.device("cuda")
print('\n\nwith Language token - eng + Corpus (no framework) \n')
# print('\n\nwith Language token - eng + Corpus (no framework) \n')
# ---------------------------------------------------------------------------------------------------
args = parse_args()
......@@ -41,10 +41,6 @@ mappings, inv_mappings = open_mappings(args.mappings_file)
# Open sentences
train_sentences, dev_dict_sentences, test_dict_sentences, _ = open_sentences(args.data_path, mappings)
print('\nCheck encodings:\n')
print(train_sentences[0])
# make pandas dataframes
file_header = ['text', 'labels']
......
......@@ -26,7 +26,7 @@ substitutions_file = 'mappings/substitutions.txt'
# mapping_classes = args.mappings_file[:-4].split('-')[-1]
# specific_results = open_specific_results('mappings/specific_results.txt')['B']
print('ZERO-SHOT LANG: '+ args.langs_to_use)
print('\nlangs to use: '+ args.langs_to_use + '\n')
set_seed(42)
torch.manual_seed(42)
......@@ -80,8 +80,6 @@ tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
print(framework_labels, flush=True)
# Determine linear size (= number of classes in the sets + 1)
num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
......
......@@ -53,7 +53,7 @@ def parse_args():
help="Change order of sentences when the direction of relations is 1<2 to 2>1.")
# only specific languages/corpora
parser.add_argument("--langs_to_use", default='', type=str,
parser.add_argument("--langs_to_use", default='@', type=str,
help="List of languages/corpora to use, a str separated by ;")
......
#!/usr/bin/env bash
# Slurm batch script: runs open_finetuned_model.py inside the PyTorch
# Singularity container, requesting a single task with 4 CPUs and 1 GPU
# on the GPUNodes partition.
#SBATCH --job-name=model-LC
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --partition=GPUNodes
#SBATCH --gres=gpu:1
# Disabled invocation of xml-roberta-classifier.py with xlm-roberta-base, kept
# for reference (presumably the training run that produced the checkpoint
# loaded below -- TODO confirm).
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 xml-roberta-classifier.py \
# --batch_size 4 \
# --gradient_accumulation_steps 32 \
# --num_epochs 6 \
# --data_path '/users/melodi/emetheni/clean_data' \
# --mappings_file 'mappings/mappings_substitutions.tsv' \
# --transformer_model "xlm-roberta-base"
# Active command: run open_finetuned_model.py on the clean_data corpora with the
# substitutions mapping file; --transformer_model points at a local checkpoint
# directory (checkpoint-13500) rather than a model name.
srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 open_finetuned_model.py \
--data_path '/users/melodi/emetheni/clean_data' \
--mappings_file 'mappings/mappings_substitutions.tsv' \
--transformer_model 'results/models/run_xlm-roberta-base/checkpoint-13500'
......@@ -29,23 +29,23 @@ for label, num in mappings.items():
# -----------------------------------
# define which language to NOT use with the arguments
not_language = args.langs_to_use
# define which language to use with the arguments
languages = args.langs_to_use.split(';')
corpora = [folder for folder in os.listdir(args.data_path)
if not not_language in folder]
corpora = [folder
for folder in os.listdir(args.data_path)
if any(l in folder for l in languages)]
files = ['/'.join([args.data_path, corpus, f])
for corpus in corpora
for f in os.listdir(args.data_path + '/' + corpus)
]
for f in os.listdir(args.data_path + '/' + corpus)]
# open the files
def read_file(file):
''' Open the relations file. '''
relations = []
sub_rels = []
with open(file, 'r', encoding='utf-8') as f:
next(f)
for line in f:
......@@ -53,50 +53,48 @@ def read_file(file):
l = line.strip().split('\t')
if not l[11].lower() in subs:
relations.append(l[11].lower())
else:
sub_rels.append(l[11].lower())
except IndexError:
pass
return relations
return relations, sub_rels
rel_files = [f for f in files if any (x in f for x in ['train', 'test', 'dev'])]
rels = []
good_rels = []
sub_rels = []
for f in rel_files:
temp = read_file(f)
if temp != []:
rels += temp
x, y = read_file(f)
good_rels += x
sub_rels += y
dict_labels = dict(enumerate(list(set(rels))))
inv_labels = {v:k for k, v in dict_labels.items()}
dict_labels = dict(enumerate(list(set(good_rels))))
corpora_labels = {v:k for k, v in dict_labels.items()}
leftovers = []
for sub in subs:
if sub not in inv_labels:
try:
inv_labels[sub] = inv_labels[subs[sub]]
except KeyError:
leftovers.append(sub)
else:
leftovers.append(sub)
for mapping in mappings:
if mapping not in inv_labels:
leftovers.append(mapping)
counter = len(inv_labels) -1
for i in leftovers:
counter += 1
inv_labels[i] = counter
leftovers = []
# for mapping in mappings:
# if mapping not in corpora_labels and mapping not in subs:
# leftovers.append(mapping)
# # save the new labels
print('-'*20)
print(not_language)
print(len(inv_labels))
# counter = max(list(corpora_labels.values())) -1
# for i in leftovers:
# counter += 1
# corpora_labels[i] = counter
for sub in sub_rels:
try:
corpora_labels[sub] = corpora_labels[subs[sub]]
except KeyError:
corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
corpora_labels[sub] = corpora_labels[subs[sub]]
# print(corpora_labels)
with open('mappings/zero-shot/' + not_language + '_zero-shot.tsv', 'w') as f:
with open('mappings/jaccard/' + 'por.rst' + '.tsv', 'w') as f:
f.write('LABEL\tMAPPING\n')
for k, v in inv_labels.items():
for k, v in corpora_labels.items():
f.write(k + '\t' + str(v) + '\n')
\ No newline at end of file
......@@ -18,8 +18,9 @@ def open_mappings(mappings_file):
mappings = {}
with open(mappings_file, 'r') as f:
next(f)
for l in f:
mappings[l.split('\t')[0]] = int(l.strip().split('\t')[-1])
for line in f:
l = line.strip().split('\t')
mappings[l[0]] = int(l[-1])
inv_mappings = {}
# for label, num in mappings.items():
......@@ -163,10 +164,13 @@ def open_sentences(path_to_corpora, mappings_dict):
- dict of sentences for TEST: each test set categorized per corpus
- ** NEW ** : dict of labels per framework
'''
langs_to_use = args.langs_to_use.split(';')
corpora = [folder for folder in os.listdir(path_to_corpora)
if not any(i in folder for i in ['.md', 'DS_', 'utils', 'ipynb'])
# langs here
#if any(l in folder for l in langs_to_use)
]
# ---------------------
train_sentences = []
......@@ -186,8 +190,9 @@ def open_sentences(path_to_corpora, mappings_dict):
train_file = ['/'.join([path_to_corpora, corpus, x])
for x in os.listdir(path_to_corpora + '/' + corpus)
if 'train' in x and 'rels' in x
# attention! we block training for some languages if we want HERE
if not args.langs_to_use in x][0]
# attention! we ALLOW training for some corpora if we want HERE
# if any(l in x for l in langs_to_use)
][0]
temp = open_file(train_file, mappings_dict)
# train_sentences += open_file_with_lang(train_file, mappings_dict)
train_sentences += temp
......@@ -216,9 +221,9 @@ def open_sentences(path_to_corpora, mappings_dict):
all_labels[framework] += [l[-1] for l in temp]
# test_dict_sentences[corpus] += open_file_with_lang(test_file, mappings_dict)
labels = {framework:set(all_labels[framework]) for framework in all_labels}
corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels}
return train_sentences, dev_dict_sentences, test_dict_sentences, labels
return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
def open_sentences_with_lang(path_to_corpora, mappings_dict):
......@@ -237,7 +242,8 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
train_sentences = []
dev_dict_sentences = {}
test_dict_sentences = {}
corpus_labels = []
for corpus in corpora:
try:
......@@ -267,7 +273,7 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
# test_dict_sentences[corpus] += open_file_with_lang(test_file, mappings_dict)
return train_sentences, dev_dict_sentences, test_dict_sentences
return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
......@@ -333,7 +339,7 @@ def get_predictions_huggingface(trainer,
'''
results = trainer.predict(test_set)
preds = np.softmax(results.predictions, axis=1)
# preds = np.softmax(results.predictions, axis=1)
top_preds = np.argmax(results.predictions, axis=1)
results = results.label_ids
test_acc = round(accuracy_score(top_preds, results), 4)
......@@ -341,7 +347,7 @@ def get_predictions_huggingface(trainer,
if print_results:
print(corpus, '\t', test_acc, '\n')
return preds
# return preds
def get_better_predictions(model,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment