diff --git a/classifier_bare_huggingface.py b/classifier_bare_huggingface.py index 2897b5b92e449192e49f215244bb3bf06be85dfb..6c3c93d758b57d581ec275a6fcc42b3b1ae688df 100644 --- a/classifier_bare_huggingface.py +++ b/classifier_bare_huggingface.py @@ -26,7 +26,7 @@ substitutions_file = 'mappings/substitutions.txt' mapping_classes = args.mappings_file[:-4].split('-')[-1] set_seed(42) -print('Model:', args.transformer_model) +print('\nModel:', args.transformer_model) print('Batch size:', args.batch_size * args.gradient_accumulation_steps) print('Num epochs:', args.num_epochs) @@ -38,7 +38,8 @@ print('Num epochs:', args.num_epochs) mappings, inv_mappings = open_mappings(args.mappings_file) # Open sentences -train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings) +# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings) +train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings) print('\nCheck encodings:\n') @@ -119,11 +120,11 @@ training_args = TrainingArguments( remove_unused_columns = False, warmup_steps = 1000, # number of warmup steps for learning rate # save_steps = (len(train_sentences)/(args.batch_size * args.gradient_accumulation_steps)) / 1368, - save_total_limit = args.num_epochs, - load_best_model_at_end = True, +# save_total_limit = args.num_epochs, +# load_best_model_at_end = True, weight_decay = 0.01, # strength of weight decay - save_strategy='epoch', - evaluation_strategy='epoch' +# save_strategy='epoch', +# evaluation_strategy='epoch' ) @@ -183,13 +184,13 @@ for corpus in encoded_dev_dataset: # Test results -# print('\ntest results:') -# for corpus in encoded_test_dataset: -# print() -# test_results = get_predictions_huggingface(trainer, -# corpus, -# framework_labels[corpus.split('.')[1]], -# encoded_test_dataset[corpus]) +print('\n\ntest results:') +for corpus in encoded_test_dataset: + print() + test_results = get_predictions_huggingface(trainer, + corpus, + framework_labels[corpus.split('.')[1]], + encoded_test_dataset[corpus]) # path_results = 'results/test/' + args.transformer_model + '_' + str(args.num_epochs) diff --git a/classifier_bare_pytorch-conf-matrix.py b/classifier_bare_pytorch-conf-matrix.py new file mode 100644 index 0000000000000000000000000000000000000000..289e5c52908e5f080cb7d01170e680875e231d8b --- /dev/null +++ b/classifier_bare_pytorch-conf-matrix.py @@ -0,0 +1,310 @@ +#!/usr/bin/env python +# coding: utf-8 + +import torch +import numpy as np +from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, set_seed +from torch import nn +from torch.optim import AdamW +from torch.utils.data import DataLoader +import torch.nn.functional as F +from torch.autograd import Variable +from tqdm import tqdm +import os +from time import sleep +from datetime import datetime +import sys +from sklearn.metrics import classification_report, accuracy_score +from configure import parse_args +from utils import * + +args = parse_args() +now = datetime.now() +dt_string = now.strftime("%d.%m.%y-%H:%M:%S") +layers_to_freeze = args.freeze_layers.split(";") +substitutions_file = 'mappings/substitutions.txt' +# mapping_classes = args.mappings_file[:-4].split('-')[-1] +# specific_results = open_specific_results('mappings/specific_results.txt')['B'] + +print('\nlangs to use: '+ args.langs_to_use) +print('mappings file: ' + args.mappings_file, flush='True') 
+set_seed(42) +torch.manual_seed(42) + +# =============== +# Dataset class +# =============== + +class Dataset(torch.utils.data.Dataset): + + def __init__(self, sentences): + + self.labels = [sent[-1] for sent in sentences] + self.texts = [tokenizer(sent[-2], + is_split_into_words=True, + padding='max_length', + max_length = 512, + truncation=True, + return_tensors="pt") + for sent in sentences] + + def classes(self): + return self.labels + + def __len__(self): + return len(self.labels) + + def get_batch_labels(self, idx): + # Fetch a batch of labels + return np.array(self.labels[idx]) + + def get_batch_texts(self, idx): + # Fetch a batch of inputs + return self.texts[idx] + + def __getitem__(self, idx): + + batch_texts = self.get_batch_texts(idx) + batch_y = self.get_batch_labels(idx) + + return batch_texts, batch_y + +# =============== +# Load datasets +# =============== + +# Open mappings +mappings, inv_mappings = open_mappings(args.mappings_file) +batch_size = args.batch_size +tokenizer = AutoTokenizer.from_pretrained(args.transformer_model) + +# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings) +train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings) + +print('\nCheck encodings:\n') +print(train_sentences[0]) + + +# Determine linear size (= number of classes in the sets + 1) +num_labels = len(set(sent[-1] for sent in train_sentences)) + 1 + +# make train/dev datasets +train_dataset = Dataset(train_sentences) +dev_dataset = {corpus: Dataset(s) for corpus, s in dev_dict_sentences.items()} +test_dataset = {corpus: Dataset(s) for corpus, s in test_dict_sentences.items()} + +# Make dasets with batches and dataloader +train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True) +dev_dict_dataloader = {corpus: DataLoader(dev_data, batch_size) + for corpus, dev_data in dev_dataset.items()} +test_dict_dataloader = {corpus: DataLoader(test_data, batch_size) + for corpus, test_data in test_dataset.items()} + + +# =============== +# Model setup +# =============== + +class TransformerClassifier(nn.Module): + + def __init__(self, dropout=args.dropout): + + super(TransformerClassifier, self).__init__() + + self.tr_model = AutoModel.from_pretrained(args.transformer_model) + self.dropout = nn.Dropout(dropout) + self.linear = nn.Linear(768, num_labels) # bert input x num of classes + self.relu = nn.ReLU() + + def forward(self, input_id, mask): + + outputs = self.tr_model(input_ids = input_id, + attention_mask = mask, + return_dict = True)['last_hidden_state'][:, 0, :] + dropout_output = self.dropout(outputs) + linear_output = self.linear(dropout_output) + final_layer = self.relu(linear_output) + + return final_layer + + +model = TransformerClassifier() + + +def train(model, + train_dataloader, + dev_dict_dataloader, + test_dict_sentences, + test_dict_dataloader, + epochs, + #specific_results + ): + + device = torch.device("cuda" if args.use_cuda else "cpu") + + criterion = nn.CrossEntropyLoss() + optimizer = AdamW(model.parameters(), #Adam + lr = 2e-5, #1e-6 + eps = 1e-8 + ) + + if args.use_cuda: + model = model.cuda() + criterion = criterion.cuda() + + gradient_accumulation_steps = args.gradient_accumulation_steps + total_steps = len(train_dataloader) * epochs + scheduler = get_linear_schedule_with_warmup(optimizer, + num_warmup_steps = 0, + num_training_steps = total_steps) + + seed_val = 42 + torch.manual_seed(seed_val) + 
torch.cuda.manual_seed_all(seed_val) + + # freeze layers, see argument in configure.py + if args.freeze_layers != '': + for name, param in model.named_parameters(): + if any(x in name for x in layers_to_freeze): + param.requires_grad = False + + for epoch_num in range(0, epochs): + print('\n=== Epoch {:} / {:} ==='.format(epoch_num + 1, epochs)) + + model.train() + + total_acc_train = 0 + total_loss_train = 0 + batch_counter = 0 + + for train_input, train_label in tqdm(train_dataloader): +# for train_input, train_label in train_dataloader: + batch_counter += 1 + train_label = train_label.to(device) + mask = train_input['attention_mask'].to(device) + input_id = train_input['input_ids'].squeeze(1).to(device) + + output = model(input_id, mask) + +# batch_loss = criterion(output, train_label.long()) +# total_loss_train += batch_loss.item() + +# acc = (output.argmax(dim=1) == train_label).sum().item() +# total_acc_train += acc + + # Compute Loss and Perform Back-propagation + loss = criterion(output, train_label.long()) + + + # Normalize the Gradients + loss = loss / gradient_accumulation_steps + loss.backward() + + + if (batch_counter % gradient_accumulation_steps == 0): + # Update Optimizer + optimizer.step() # or flip them? + optimizer.zero_grad() + + model.zero_grad() +# loss.backward() + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) +# optimizer.step() + scheduler.step() + + # ------ Validation -------- + + print('\nValidation for epoch:', epoch_num + 1) + + # Dev and test results for each corpus. We don't need to save the results. + for corpus in dev_dict_dataloader: + dev_results = get_predictions( + model, + corpus, + dev_dict_dataloader[corpus] + ) + better_dev_results = get_better_predictions( + model, + corpus, + dev_dict_dataloader[corpus], + framework_labels[corpus.split('.')[1]], + inv_mappings, + epoch_num+1, + save_conf_matrix=True + + ) + +# path_results = 'results/dev/language_' + mapping_classes + '_' + str(epoch_num+1) +# if not os.path.exists(path_results): +# os.makedirs(path_results) + +# print_results_to_file(corpus, +# dev_dict_sentences[corpus], +# dev_results, +# inv_mappings, #substitutions_file, +# path_results) + + # ------ Test -------- + + print('\nTest results for epoch:', epoch_num + 1) + + for corpus in test_dict_dataloader: + test_results = get_predictions( + model, + corpus, + test_dict_dataloader[corpus] + ) + better_test_results = get_better_predictions( + model, + corpus, + test_dict_dataloader[corpus], + framework_labels[corpus.split('.')[1]], + inv_mappings, + epoch_num+1, + save_conf_matrix=False + + ) + +# path_results = 'results/test/language_' + mapping_classes + '_' + str(epoch_num+1) +# if not os.path.exists(path_results): +# os.makedirs(path_results) + +# print_results_to_file(corpus, +# test_dict_sentences[corpus], +# test_results, +# inv_mappings, #substitutions_file, +# path_results) + + +# # we want the results of specific epochs for specific corpora. +# # we define the epochs and the corpora and we save only these results. + +# if epoch_num+1 in specific_results: +# for corpus in specific_results[epoch_num+1]: +# test_results = get_predictions(model, +# corpus, +# test_dict_dataloader[corpus], +# print_results=False) + + + # ========= New Code! 
============= + # Save for each epoch the dev and test results + + + + +# ------- Start the training ------- + +print('\nModel: ', args.transformer_model) +print('Batch size: ', args.batch_size * args.gradient_accumulation_steps) +print('\nStart training...\n') +train(model, + train_dataloader, + dev_dict_dataloader, + test_dict_sentences, + test_dict_dataloader, + args.num_epochs, +# specific_results + ) +print('\nTraining Done!') + diff --git a/classifier_bare_pytorch.py b/classifier_bare_pytorch.py index 9cfd03eead463acbd032f1f27667e26d46f89273..81c9bde055cd813d92f3c3328b837b87556f0af9 100644 --- a/classifier_bare_pytorch.py +++ b/classifier_bare_pytorch.py @@ -79,6 +79,11 @@ batch_size = args.batch_size tokenizer = AutoTokenizer.from_pretrained(args.transformer_model) train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings) +# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings) + +print('\nCheck encodings:\n') +print(train_sentences[0]) + # Determine linear size (= number of classes in the sets + 1) num_labels = len(set(sent[-1] for sent in train_sentences)) + 1 @@ -222,9 +227,13 @@ def train(model, model, corpus, dev_dict_dataloader[corpus], - framework_labels[corpus.split('.')[1]] + framework_labels[corpus.split('.')[1]], + inv_mappings, + epoch_num+1, + save_conf_matrix=False ) + # path_results = 'results/dev/language_' + mapping_classes + '_' + str(epoch_num+1) # if not os.path.exists(path_results): # os.makedirs(path_results) @@ -249,7 +258,10 @@ def train(model, model, corpus, test_dict_dataloader[corpus], - framework_labels[corpus.split('.')[1]] + framework_labels[corpus.split('.')[1]], + inv_mappings, + epoch_num+1, + save_conf_matrix=False ) # path_results = 'results/test/language_' + mapping_classes + '_' + str(epoch_num+1) diff --git a/classifier_with_adapter.py b/classifier_with_adapter.py index 79916e54bdf33bc89946742f69c1dcaf19df9a94..a8377675e4dedcfcb5fe45b98c02fd768e5a14f8 100644 --- a/classifier_with_adapter.py +++ b/classifier_with_adapter.py @@ -125,11 +125,11 @@ training_args = TrainingArguments( remove_unused_columns = False, warmup_steps = 1000, # number of warmup steps for learning rate # save_steps = (len(train_sentences)/(args.batch_size * args.gradient_accumulation_steps)) / 1368, - save_total_limit = args.num_epochs, - load_best_model_at_end = True, +# save_total_limit = args.num_epochs, +# load_best_model_at_end = True, weight_decay = 0.01, # strength of weight decay - save_strategy='epoch', - evaluation_strategy='epoch' +# save_strategy='epoch', +# evaluation_strategy='epoch' ) @@ -175,18 +175,18 @@ for corpus in encoded_dev_dataset: ) - print('\nTest results:') -for corpus in encoded_test_dataset: - print() +# print('\nTest results:') +# for corpus in encoded_test_dataset: +# print() - dev_results_ = get_predictions_huggingface(trainer, - corpus, - encoded_test_dataset[corpus] - ) +# dev_results_ = get_predictions_huggingface(trainer, +# corpus, +# encoded_test_dataset[corpus] +# ) - dev_results = better_predictions_huggingface(trainer, - corpus, - encoded_test_dataset[corpus], - framework_labels[corpus.split('.')[1]] - ) +# dev_results = better_predictions_huggingface(trainer, +# corpus, +# encoded_test_dataset[corpus], +# framework_labels[corpus.split('.')[1]] +# ) diff --git a/utils.py b/utils.py index e1cf997f375cc7c01a21d341f74def18f2a4d779..ec1f3f6e431a0017318335bf20255fefe13e02b8 100644 --- a/utils.py +++ 
b/utils.py @@ -6,24 +6,18 @@ import torch from transformers import AutoConfig, AutoTokenizer from configure import parse_args import numpy as np -from sklearn.metrics import accuracy_score, confusion_matrix, classification_report +from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay import matplotlib.pyplot as plt +import seaborn as sns +from time import sleep +from datetime import datetime -args = parse_args() +now = datetime.now() +dt_string = now.strftime("%d.%m.%y-%H:%M:%S") -def switch_dimensions(vector_list): - - target_dim_len = len(vector_list[0]) - new_vector = [] - - for n in range(target_dim_len): - temp = [] - for x in vector_list: - temp.append(x[n]) - new_vector.append(temp) - - return new_vector +args = parse_args() + def open_mappings(mappings_file): @@ -156,6 +150,7 @@ def open_file_with_lang(filename, mappings_dict): # flip them if different direction if args.normalize_direction == 'yes': if l[9] == '1>2': + #lang, fullname, framework lines.append(l + [[lang, fullname, framework] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)]) else: lines.append(l + [[lang, fullname, framework] + sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)]) @@ -257,6 +252,10 @@ def open_sentences(path_to_corpora, mappings_dict): all_labels[framework] += [l[-1] for l in temp] corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels} + # delete unk as a sanity check + for framework in corpus_labels: + if 'unk' in corpus_labels[framework]: + corpus_labels[framework].remove('unk') return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels @@ -330,18 +329,14 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict): all_labels[framework] += [l[-1] for l in temp] corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels} + # delete unk as a sanity check + for framework in corpus_labels: + if 'unk' in corpus_labels[framework]: + corpus_labels[framework].remove('unk') return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels -def show_confusion(predictions, labels, save_name): - - cm = confusion_matrix(y_test, y_pred) - cm_display = ConfusionMatrixDisplay(cm).plot() - - plt.savefig(save_name, dpi=300) - - return None # =============== # Testing functions @@ -451,11 +446,64 @@ def better_predictions_huggingface(trainer, return best_labels +def make_confusion_matrices(y_test, + y_pred, + corpus_name, + inv_mappings, + epoch): + + save_path = 'conf_matrix/' + dt_string + if not os.path.exists(save_path): + os.makedirs(save_path) + + print(classification_report(y_test, + y_pred, + #target_names= sorted(list(set([inv_mappings[x] for x in y_test]))) + ) + ) + + + cm = confusion_matrix(y_test, + y_pred, + labels = list(inv_mappings.keys()) + ) + print(cm) + + xticklabels = list(inv_mappings.values()) + yticklabels = list(inv_mappings.values()) + + sns.color_palette("cubehelix", as_cmap=True) + # Plot the confusion matrix. 
+    
+    fig, ax = plt.subplots()
+#     ax.tick_params(axis='both', which='major', labelsize=6)
+#     ax.tick_params(axis='both', which='minor', labelsize=6)
+    ax = sns.heatmap(cm, 
+                     #annot=True
+                     xticklabels=xticklabels, 
+                     yticklabels=yticklabels
+                    )
+    # rows of the sklearn confusion matrix are the gold (corpus) labels,
+    # columns are the model predictions
+    plt.ylabel('Corpus label')
+    plt.xlabel('Predicted label')
+    plt.xticks(fontsize=2)
+    plt.yticks(fontsize=2)
+#     plt.xticks(x, labels, rotation='vertical')
+#     plt.margins(0.5)
+    plt.subplots_adjust(bottom=0.5, left=0.5)
+    plt.title('Confusion Matrix: '+corpus_name+' (epoch:'+ str(epoch) + ')')
+    plt.savefig(save_path + '/' + corpus_name + '_' + str(epoch) + '.png', 
+                dpi=300)
+    plt.clf()
+    
+
 def get_better_predictions(model, 
                            corpus, 
                            test_dataloader, 
                            corpus_labels, 
-                           print_results=True):
+                           inv_mappings,
+                           epoch,
+                           print_results=True,
+                           save_conf_matrix=False):
     
     device = torch.device("cuda" if args.use_cuda else "cpu")
@@ -504,9 +552,25 @@ def get_better_predictions(model,
     
     if print_results:
         print('better:\t' + str(test_acc), flush='True')
+        print(classification_report(all_labels, top_preds))
+        
+    if save_conf_matrix:
+        #try:
+        make_confusion_matrices(all_labels, 
+                                top_preds, 
+                                corpus, 
+                                inv_mappings,
+                                epoch)
+#         except ValueError:
+#             print('matrix failed to print')
+#             pass
+        
     print()
+#     print(all_preds)
     print('----')
     
     return all_labels, all_preds
 
+
 def print_results_to_file(corpus, 
                           test_sentences,