Commit 23d2149f authored by emetheni

update before server shutdown

parent b14b8d54
@@ -16,7 +16,6 @@ from utils import *

 device = torch.device("cuda")

-# print('\n\nwith Language token - eng + Corpus (no framework) \n')
 # ---------------------------------------------------------------------------------------------------
 args = parse_args()
@@ -41,6 +40,11 @@ mappings, inv_mappings = open_mappings(args.mappings_file)

 # Open sentences
 train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)

 print('\nCheck encodings:\n')
+print(train_sentences[0])

 # make pandas dataframes
 file_header = ['text', 'labels']
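The "make pandas dataframes" step itself is outside this hunk. A minimal sketch of what it presumably does, assuming each entry of `train_sentences` ends with `[token_list, encoded_label]` as built by `open_file_with_lang` (the exact column construction here is a guess, not the repo's code):

```python
# Hypothetical reconstruction: build the training DataFrame with the
# columns named in file_header.
import pandas as pd

train_df = pd.DataFrame(
    [(' '.join(s[-2]), s[-1]) for s in train_sentences],
    columns=file_header,  # ['text', 'labels']
)
```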
@@ -152,18 +156,23 @@ trainer.train()

 print('\nDev results:')
 for corpus in encoded_dev_dataset:
     print()
     dev_results_ = get_predictions_huggingface(trainer,
                                                corpus,
                                                encoded_dev_dataset[corpus]
                                                )
+    dev_results = better_predictions_huggingface(trainer,
+                                                 corpus,
+                                                 encoded_dev_dataset[corpus],
+                                                 framework_labels[corpus.split('.')[1]]
+                                                 )
+    print(dev_results)

-    path_results = 'results/dev/' + args.transformer_model + '_' + str(args.num_epochs)
-    if not os.path.exists(path_results):
-        os.makedirs(path_results)
+    # path_results = 'results/dev/' + args.transformer_model + '_' + str(args.num_epochs)
+    # if not os.path.exists(path_results):
+    #     os.makedirs(path_results)

     # print_results_to_file(corpus,
     #                       dev_dict_sentences[corpus],
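The new `better_predictions_huggingface` call restricts predictions to the labels attested for the corpus's framework. The framework key is taken from the corpus ID, which in the DISRPT-style naming these corpora use ("lang.framework.name") sits at index 1 after splitting on dots:

```python
# Assuming DISRPT-style corpus IDs, e.g. "eng.rst.gum":
corpus = 'eng.rst.gum'
framework = corpus.split('.')[1]  # 'rst', the key into framework_labels
```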
@@ -172,8 +172,8 @@ def train(model,

         total_loss_train = 0
         batch_counter = 0

-        # for train_input, train_label in tqdm(train_dataloader):
-        for train_input, train_label in train_dataloader:
+        for train_input, train_label in tqdm(train_dataloader):
+        # for train_input, train_label in train_dataloader:

             batch_counter += 1
             train_label = train_label.to(device)
             mask = train_input['attention_mask'].to(device)
@@ -32,7 +32,7 @@ print('Frozen layers:', args.freeze_layers.replace(';', ', '))

 mappings, inv_mappings = open_mappings(args.mappings_file)

 # Open sentences
-train_sentences, dev_dict_sentences, _ = open_sentences_with_lang(args.data_path, mappings)
+train_sentences, dev_dict_sentences, _, framework_labels = open_sentences_with_lang(args.data_path, mappings)

 print('\nCheck encodings:\n')
@@ -11,6 +11,20 @@ from sklearn.metrics import accuracy_score

 args = parse_args()

+def switch_dimensions(vector_list):
+    target_dim_len = len(vector_list[0])
+    new_vector = []
+    for n in range(target_dim_len):
+        temp = []
+        for x in vector_list:
+            temp.append(x[n])
+        new_vector.append(temp)
+    return new_vector
+

 def open_mappings(mappings_file):
     ''' Open the mappings file into a dictionary.'''
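The added `switch_dimensions` helper transposes a list of lists (rows become columns). A sketch of an equivalent, more idiomatic form, with the caveat that `zip` truncates to the shortest row while the loop version assumes every row is at least as long as the first:

```python
def switch_dimensions_zip(vector_list):
    # zip(*rows) yields the columns; materialize each one as a list
    return [list(col) for col in zip(*vector_list)]

assert switch_dimensions_zip([[1, 2, 3], [4, 5, 6]]) == [[1, 4], [2, 5], [3, 6]]
```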
@@ -21,19 +35,18 @@ def open_mappings(mappings_file):

     for line in f:
         l = line.strip().split('\t')
         mappings[l[0]] = int(l[-1])

-    inv_mappings = []
-    # this cannot be a dictionary! it has to be tuples
-    # because we have some labels which are replaced, e.g.
-    # joint-list is replaced
+    # reject the converted labels
+    inv_mappings = {}
     for k, v in mappings.items():
-        inv_mappings.append((v, k))
+        if v not in inv_mappings:
+            inv_mappings[v] = k

     return mappings, inv_mappings


 def encode_label(og_label, mappings_dict):
-    label = og_label.lower()
+    label = og_label.lower().strip()
     if label in mappings_dict:
         return mappings_dict[label]
     else:
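The inverse-mapping change replaces the tuple list with a first-wins dictionary: when several label strings share an id (the removed comment's "joint-list" case), only the first one seen is kept, so the converted labels are rejected on inversion. A small example of that behavior (the ids here are illustrative, not the real mappings file):

```python
mappings = {'joint': 3, 'joint-list': 3, 'contrast': 7}  # illustrative ids

inv_mappings = {}
for k, v in mappings.items():
    if v not in inv_mappings:  # first-wins: later aliases are skipped
        inv_mappings[v] = k

assert inv_mappings == {3: 'joint', 7: 'contrast'}
```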
@@ -72,11 +85,11 @@ def open_file(filename, mappings_dict):

             # flip them if different direction
             if args.normalize_direction == 'yes':
                 if l[9] == '1>2':
-                    lines.append(l + [sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
+                    lines.append(l + [sent_1 + [SEP_token] + sent_2, encode_label(l[-1], mappings_dict)])
                 else:
-                    lines.append(l + [sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)])
+                    lines.append(l + [sent_2 + [SEP_token] + sent_1, encode_label(l[-1], mappings_dict)])
             else:
-                lines.append(l + [sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
+                lines.append(l + [sent_1 + [SEP_token] + sent_2, encode_label(l[-1], mappings_dict)])

     return lines
@@ -142,20 +155,22 @@ def open_file_with_lang(filename, mappings_dict):

             # flip them if different direction
             if args.normalize_direction == 'yes':
                 if l[9] == '1>2':
-                    lines.append(l + [[lang, fullname] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
+                    lines.append(l + [[lang, fullname, framework] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
                 else:
-                    lines.append(l + [[lang, fullname] + sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)])
+                    lines.append(l + [[lang, fullname, framework] + sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)])
             else:
-                lines.append(l + [[lang, fullname] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
+                lines.append(l + [[lang, fullname, framework] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])

     return lines


 def encode_batch(batch):
     """ Encodes a batch of input data using the model tokenizer.
     Works for a pandas DF column, instead of a list.
     """
-    tokenizer = AutoTokenizer.from_pretrained('xlm-roberta-base')
+    tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
     return tokenizer(batch["text"],
                      max_length=512,
                      truncation=True,
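One side effect of keeping the `from_pretrained` call inside `encode_batch` is that the tokenizer is re-instantiated on every call. A possible refinement, not part of this commit, is to cache it once per model name:

```python
from functools import lru_cache
from transformers import AutoTokenizer

@lru_cache(maxsize=1)
def get_tokenizer(model_name):
    # Loaded once per model name; later calls return the cached instance.
    return AutoTokenizer.from_pretrained(model_name)
```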
@@ -289,7 +304,7 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):

                               for x in os.listdir(path_to_corpora + '/' + corpus)
                               if 'train' in x and 'rels' in x
                               ][0]
-                temp = open_file(train_file, mappings_dict)
+                temp = open_file_with_lang(train_file, mappings_dict)
                 train_sentences += temp
                 all_labels[framework] += [l[-1] for l in temp]
             except: # some of them don't have train
@@ -373,7 +388,7 @@ def get_predictions_huggingface(trainer,

                                 test_set,
                                 print_results=True):
-    ''' SPECIFI FUNCTION FOR THE HUGGINGFACE TRAINER.
+    ''' SPECIFIC FUNCTION FOR THE HUGGINGFACE TRAINER.
     Function to get the model's predictions for one corpus' test set.
     Can print accuracy using scikit-learn.
     Also works with dev sets -- just don't save the outputs.
@@ -381,7 +396,6 @@ def get_predictions_huggingface(trainer,

     '''

     results = trainer.predict(test_set)
-    preds = np.softmax(results.predictions, axis=1)
     top_preds = np.argmax(results.predictions, axis=1)
     results = results.label_ids
     test_acc = round(accuracy_score(top_preds, results), 4)
@@ -389,7 +403,7 @@ def get_predictions_huggingface(trainer,

     if print_results:
         print(corpus, '\t', test_acc, '\n')

-    return preds
+    return top_preds


 def better_predictions_huggingface(trainer,
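The removed `preds` line would have failed anyway: NumPy has no `np.softmax`, so it raises `AttributeError`. If probabilities are ever needed alongside `top_preds`, a numerically stable manual softmax over the logits is one option (a sketch, not code from this repo):

```python
import numpy as np

def softmax(logits, axis=1):
    z = logits - logits.max(axis=axis, keepdims=True)  # subtract max for stability
    e = np.exp(z)
    return e / e.sum(axis=axis, keepdims=True)
```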
@@ -398,33 +412,33 @@ def better_predictions_huggingface(trainer,

+                                   corpus_labels,
                                    print_results=True):
     ''' SPECIFI FUNCTION FOR THE HUGGINGFACE TRAINER.
     Function to get the model's predictions for one corpus' test set.
     Can print accuracy using scikit-learn.
     Also works with dev sets -- just don't save the outputs.
     Returns: list of predictions that match test file's lines.
     '''

     results = trainer.predict(test_set)
     preds = np.argmax(results.predictions, axis=1)
-    orig_labels = results.label_ids
-    test_acc = round(accuracy_score(top_preds, orig_labels), 4)
+    orig_labels = results.label_ids.tolist()
+    print('len sentences', len(orig_labels))
+    print('shape of preds', results.predictions.shape)

-    if print_results:
-        print(corpus + '\t' + str(test_acc) + '\n', flush='True')
+    results_per_sent = results.predictions.tolist()
+    print(type(results.predictions))

     # try to make the better prediction bit
     best_labels = []
-    for n, result in enumerate(results.predictions.tolist()):
-        orig_label = results.label_ids[n]
-        best_prob = -1000
-        best_label = -1
-        if orig_label in corpus_labels:
-            if result > best_prob:
-                best_prob = result
-                best_label = n
-                best_labels.append(n)
+    for sent, sent_results in enumerate(results_per_sent):
+        best_prob = -1000
+        best_label = -1
+        #assert len(sent_results) == len(orig_labels)
+        for n, prob in enumerate(sent_results):
+            if n in corpus_labels:
+                if prob > best_prob:
+                    best_prob = prob
+                    best_label = n
+        best_labels.append(best_label)
+
+    test_acc = round(accuracy_score(best_labels, orig_labels), 4)
+    print('better:\t' + str(test_acc) + '\n', flush='True')
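The rewritten nested loop picks, per sentence, the highest-scoring label among those valid for the corpus's framework. The same restriction can be done in one vectorized step; a sketch, assuming `corpus_labels` is a collection of valid label ids:

```python
import numpy as np

def restricted_argmax(logits, corpus_labels):
    # Mask out ids outside the framework's label set, then argmax per row.
    masked = np.full_like(logits, -np.inf)
    valid = list(corpus_labels)
    masked[:, valid] = logits[:, valid]
    return masked.argmax(axis=1)
```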