Skip to content
Snippets Groups Projects
Commit 4679630e authored by emetheni's avatar emetheni
Browse files

update files

parent 23d2149f
Branches
No related tags found
No related merge requests found
......@@ -26,8 +26,8 @@ substitutions_file = 'mappings/substitutions.txt'
# mapping_classes = args.mappings_file[:-4].split('-')[-1]
# specific_results = open_specific_results('mappings/specific_results.txt')['B']
print('\nlangs to use: '+ args.langs_to_use + '\n', flush='True')
print('\nlangs to use: '+ args.langs_to_use)
print('mappings file: ' + args.mappings_file, flush='True')
set_seed(42)
torch.manual_seed(42)
......
......@@ -29,17 +29,8 @@ mappings, inv_mappings = open_mappings(args.mappings_file)
substitutions_file = 'mappings/substitutions.txt'
tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
# we are saving the test results of specific epochs
# specific_results = open_specific_results('mappings/specific_results.txt')
# if '1-2-3' in adapter_name or 'layer1;layer2;layer3' in adapter_name:
# specific_results = list(specific_results['A1_3'][args.num_epochs])
# else:
# specific_results = list(specific_results['A1'][args.num_epochs])
set_seed(42)
print('Train classifier with adapter\n')
print('Adapter name:', adapter_name)
print('Model:', args.transformer_model)
......@@ -50,7 +41,12 @@ print('Num epochs:', args.num_epochs)
mappings, inv_mappings = open_mappings(args.mappings_file)
# Open sentences
train_sentences, dev_dict_sentences, test_dict_sentences = open_sentences(args.data_path, mappings)
train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
print('\nCheck encodings:\n')
print(train_sentences[0])
# make pandas dataframes
file_header = ['text', 'labels']
......@@ -59,6 +55,13 @@ train_df = pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in train_sentences],
columns =file_header)
train_df = train_df.sample(frac = 1) # shuffle the train
# make a joint dev dataset in order to save models
eval_df = pd.DataFrame([[' '.join(x[-2]), x[-1]]
for corpus, sents in dev_dict_sentences.items()
for x in sents],
columns =file_header)
dev_dict_df = {corpus : pd.DataFrame([[' '.join(x[-2]), x[-1]]
for x in sents],
columns = file_header)
......@@ -71,6 +74,7 @@ test_dict_df = {corpus : pd.DataFrame([[' '.join(x[-2]), x[-1]]
#Make datasets from dataframes
train_dataset = datasets.Dataset.from_pandas(train_df)
eval_dataset = datasets.Dataset.from_pandas(eval_df)
dev_dict_dataset = {corpus:datasets.Dataset.from_pandas(dev_df)
for corpus, dev_df in dev_dict_df.items()}
test_dict_dataset = {corpus:datasets.Dataset.from_pandas(dev_df)
......@@ -84,6 +88,9 @@ num_labels = len(set([int(x.strip())
train_dataset = train_dataset.map(encode_batch, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset = eval_dataset.map(encode_batch, batched=True)
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
encoded_dev_dataset = {}
for corpus in dev_dict_dataset:
temp = dev_dict_dataset[corpus].map(encode_batch, batched=True)
......@@ -129,7 +136,8 @@ training_args = TrainingArguments(
trainer = Trainer(
model = model,
args = training_args,
train_dataset = train_dataset
train_dataset = train_dataset,
eval_dataset = eval_dataset
)
......@@ -153,72 +161,32 @@ trainer.train()
print('\nDev results:')
for corpus in encoded_dev_dataset:
print()
dev_results = get_predictions_huggingface(trainer, corpus,
encoded_dev_dataset[corpus])
dev_results_ = get_predictions_huggingface(trainer,
corpus,
encoded_dev_dataset[corpus]
)
path_results = 'results/dev/' + adapter_name + '_' + str(args.num_epochs)
if not os.path.exists(path_results):
os.makedirs(path_results)
print_results_to_file(corpus,
dev_dict_sentences[corpus],
dev_results,
inv_mappings,
#substitutions_file,
path_results)
# Test results
print('\ntest results:')
dev_results = better_predictions_huggingface(trainer,
corpus,
encoded_dev_dataset[corpus],
framework_labels[corpus.split('.')[1]]
)
print('\nTest results:')
for corpus in encoded_test_dataset:
print()
test_results = get_predictions_huggingface(trainer,
corpus,
encoded_test_dataset[corpus])
path_results = 'results/test/' + adapter_name + '_' + str(args.num_epochs)
if not os.path.exists(path_results):
os.makedirs(path_results)
print_results_to_file(corpus,
test_dict_sentences[corpus],
test_results,
inv_mappings,
#substitutions_file,
path_results)
# for corpus in test_dict_dataloader:
# test_results = get_predictions(model,
# corpus,
# test_dict_dataloader[corpus])
# path_results = 'results/test/pytorch' + str(epoch_num+1)
# if not os.path.exists(path_results):
# os.makedirs(path_results)
# print_results_to_file(corpus,
# test_dict_sentences[corpus],
# test_results,
# inv_mappings, substitutions_file,
# path_results)
dev_results_ = get_predictions_huggingface(trainer,
corpus,
encoded_test_dataset[corpus]
)
# Save specific test results
# print('\nTest results:')
# for corpus in encoded_test_dataset:
# print()
# test_results = get_predictions_huggingface(trainer, corpus,
# encoded_test_dataset[corpus])
#
# print_results_to_file(corpus, test_dict_sentences[corpus], test_results,
# inv_mappings, substitutions_file)
\ No newline at end of file
dev_results = better_predictions_huggingface(trainer,
corpus,
encoded_test_dataset[corpus],
framework_labels[corpus.split('.')[1]]
)
......@@ -6,7 +6,8 @@ import torch
from transformers import AutoConfig, AutoTokenizer
from configure import parse_args
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
args = parse_args()
......@@ -333,6 +334,14 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
def show_confusion(predictions, labels, save_name):
    """Plot the confusion matrix of predicted vs. gold labels and save it as an image.

    Args:
        predictions: sequence of predicted label ids.
        labels: sequence of gold label ids (same length/order as predictions).
        save_name: path of the image file to write (format inferred from extension).

    Returns:
        None. Side effect: writes the figure to ``save_name``.
    """
    # BUG FIX: the original referenced undefined names y_test / y_pred instead of
    # the function's own parameters, raising NameError on first call.
    # sklearn convention: confusion_matrix(y_true, y_pred) -> gold first.
    cm = confusion_matrix(labels, predictions)
    ConfusionMatrixDisplay(cm).plot()
    plt.savefig(save_name, dpi=300)
    # Close the figure so repeated calls (one per corpus) don't leak figures.
    plt.close()
    return None
# ===============
# Testing functions
......@@ -417,9 +426,6 @@ def better_predictions_huggingface(trainer,
results = trainer.predict(test_set)
orig_labels = results.label_ids.tolist()
print('len sentences', len(orig_labels))
print('shape of preds', results.predictions.shape)
results_per_sent = results.predictions.tolist()
# try to make the better prediction bit
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment