Commit 2daf5dc9 authored by emetheni

update the classifiers

parent 4679630e
@@ -26,7 +26,7 @@ substitutions_file = 'mappings/substitutions.txt'
mapping_classes = args.mappings_file[:-4].split('-')[-1]
set_seed(42)
print('Model:', args.transformer_model)
print('\nModel:', args.transformer_model)
print('Batch size:', args.batch_size * args.gradient_accumulation_steps)
print('Num epochs:', args.num_epochs)
@@ -38,7 +38,8 @@ print('Num epochs:', args.num_epochs)
mappings, inv_mappings = open_mappings(args.mappings_file)
# Open sentences
train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
print('\nCheck encodings:\n')
@@ -119,11 +120,11 @@ training_args = TrainingArguments(
remove_unused_columns = False,
warmup_steps = 1000, # number of warmup steps for learning rate
# save_steps = (len(train_sentences)/(args.batch_size * args.gradient_accumulation_steps)) / 1368,
save_total_limit = args.num_epochs,
load_best_model_at_end = True,
# save_total_limit = args.num_epochs,
# load_best_model_at_end = True,
weight_decay = 0.01, # strength of weight decay
save_strategy='epoch',
evaluation_strategy='epoch'
# save_strategy='epoch',
# evaluation_strategy='epoch'
)
@@ -183,13 +184,13 @@ for corpus in encoded_dev_dataset:
# Test results
# print('\ntest results:')
# for corpus in encoded_test_dataset:
# print()
# test_results = get_predictions_huggingface(trainer,
# corpus,
# framework_labels[corpus.split('.')[1]],
# encoded_test_dataset[corpus])
print('\n\ntest results:')
for corpus in encoded_test_dataset:
print()
test_results = get_predictions_huggingface(trainer,
corpus,
framework_labels[corpus.split('.')[1]],
encoded_test_dataset[corpus])
# path_results = 'results/test/' + args.transformer_model + '_' + str(args.num_epochs)
#!/usr/bin/env python
# coding: utf-8
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm
import os
from time import sleep
from datetime import datetime
import sys
from sklearn.metrics import classification_report, accuracy_score
from configure import parse_args
from utils import *
args = parse_args()
now = datetime.now()
dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
layers_to_freeze = args.freeze_layers.split(";")
substitutions_file = 'mappings/substitutions.txt'
# mapping_classes = args.mappings_file[:-4].split('-')[-1]
# specific_results = open_specific_results('mappings/specific_results.txt')['B']
print('\nlangs to use: ' + args.langs_to_use)
print('mappings file: ' + args.mappings_file, flush=True)
set_seed(42)
torch.manual_seed(42)
# ===============
# Dataset class
# ===============
class Dataset(torch.utils.data.Dataset):
def __init__(self, sentences):
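        # Each sentence entry keeps the token list at index -2 and the encoded label at index -1.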
self.labels = [sent[-1] for sent in sentences]
self.texts = [tokenizer(sent[-2],
is_split_into_words=True,
padding='max_length',
max_length = 512,
truncation=True,
return_tensors="pt")
for sent in sentences]
def classes(self):
return self.labels
def __len__(self):
return len(self.labels)
def get_batch_labels(self, idx):
# Fetch a batch of labels
return np.array(self.labels[idx])
def get_batch_texts(self, idx):
# Fetch a batch of inputs
return self.texts[idx]
def __getitem__(self, idx):
batch_texts = self.get_batch_texts(idx)
batch_y = self.get_batch_labels(idx)
return batch_texts, batch_y
# ===============
# Load datasets
# ===============
# Open mappings
mappings, inv_mappings = open_mappings(args.mappings_file)
batch_size = args.batch_size
tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
print('\nCheck encodings:\n')
print(train_sentences[0])
# Determine linear size (= number of classes in the sets + 1)
num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
# make train/dev datasets
train_dataset = Dataset(train_sentences)
dev_dataset = {corpus: Dataset(s) for corpus, s in dev_dict_sentences.items()}
test_dataset = {corpus: Dataset(s) for corpus, s in test_dict_sentences.items()}
# Make datasets with batches and dataloaders
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
dev_dict_dataloader = {corpus: DataLoader(dev_data, batch_size)
for corpus, dev_data in dev_dataset.items()}
test_dict_dataloader = {corpus: DataLoader(test_data, batch_size)
for corpus, test_data in test_dataset.items()}
# ===============
# Model setup
# ===============
class TransformerClassifier(nn.Module):
def __init__(self, dropout=args.dropout):
super(TransformerClassifier, self).__init__()
self.tr_model = AutoModel.from_pretrained(args.transformer_model)
self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_labels) # transformer hidden size (768) x number of classes
self.relu = nn.ReLU()
def forward(self, input_id, mask):
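        # Use the hidden state of the first token ([CLS]) as the sequence representation.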
outputs = self.tr_model(input_ids = input_id,
attention_mask = mask,
return_dict = True)['last_hidden_state'][:, 0, :]
dropout_output = self.dropout(outputs)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
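        # Note: ReLU is applied to the output scores, so the logits passed to CrossEntropyLoss are non-negative.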
return final_layer
model = TransformerClassifier()
def train(model,
train_dataloader,
dev_dict_dataloader,
test_dict_sentences,
test_dict_dataloader,
epochs,
#specific_results
):
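    # Trains with cross-entropy and gradient accumulation; after every epoch the model
    # is evaluated on each dev and test corpus.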
device = torch.device("cuda" if args.use_cuda else "cpu")
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), #Adam
lr = 2e-5, #1e-6
eps = 1e-8
)
if args.use_cuda:
model = model.cuda()
criterion = criterion.cuda()
gradient_accumulation_steps = args.gradient_accumulation_steps
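    # Gradients are accumulated over several batches, so the effective batch size is batch_size * gradient_accumulation_steps.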
total_steps = len(train_dataloader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer,
num_warmup_steps = 0,
num_training_steps = total_steps)
seed_val = 42
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)
# freeze layers, see argument in configure.py
if args.freeze_layers != '':
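        # Freeze every parameter whose name contains one of the substrings in args.freeze_layers (split on ';').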
for name, param in model.named_parameters():
if any(x in name for x in layers_to_freeze):
param.requires_grad = False
for epoch_num in range(0, epochs):
print('\n=== Epoch {:} / {:} ==='.format(epoch_num + 1, epochs))
model.train()
total_acc_train = 0
total_loss_train = 0
batch_counter = 0
for train_input, train_label in tqdm(train_dataloader):
# for train_input, train_label in train_dataloader:
batch_counter += 1
train_label = train_label.to(device)
mask = train_input['attention_mask'].to(device)
input_id = train_input['input_ids'].squeeze(1).to(device)
output = model(input_id, mask)
# batch_loss = criterion(output, train_label.long())
# total_loss_train += batch_loss.item()
# acc = (output.argmax(dim=1) == train_label).sum().item()
# total_acc_train += acc
# Compute Loss and Perform Back-propagation
loss = criterion(output, train_label.long())
# Normalize the Gradients
loss = loss / gradient_accumulation_steps
loss.backward()
            if batch_counter % gradient_accumulation_steps == 0:
                # Clip gradients before the optimizer step, then update and reset the gradients.
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()
                model.zero_grad()
# ------ Validation --------
print('\nValidation for epoch:', epoch_num + 1)
# Dev and test results for each corpus. We don't need to save the results.
for corpus in dev_dict_dataloader:
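            # framework_labels is keyed by the framework field of the corpus id (corpus.split('.')[1]).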
dev_results = get_predictions(
model,
corpus,
dev_dict_dataloader[corpus]
)
better_dev_results = get_better_predictions(
model,
corpus,
dev_dict_dataloader[corpus],
framework_labels[corpus.split('.')[1]],
inv_mappings,
epoch_num+1,
save_conf_matrix=True
)
# path_results = 'results/dev/language_' + mapping_classes + '_' + str(epoch_num+1)
# if not os.path.exists(path_results):
# os.makedirs(path_results)
# print_results_to_file(corpus,
# dev_dict_sentences[corpus],
# dev_results,
# inv_mappings, #substitutions_file,
# path_results)
# ------ Test --------
print('\nTest results for epoch:', epoch_num + 1)
for corpus in test_dict_dataloader:
test_results = get_predictions(
model,
corpus,
test_dict_dataloader[corpus]
)
better_test_results = get_better_predictions(
model,
corpus,
test_dict_dataloader[corpus],
framework_labels[corpus.split('.')[1]],
inv_mappings,
epoch_num+1,
save_conf_matrix=False
)
# path_results = 'results/test/language_' + mapping_classes + '_' + str(epoch_num+1)
# if not os.path.exists(path_results):
# os.makedirs(path_results)
# print_results_to_file(corpus,
# test_dict_sentences[corpus],
# test_results,
# inv_mappings, #substitutions_file,
# path_results)
# # we want the results of specific epochs for specific corpora.
# # we define the epochs and the corpora and we save only these results.
# if epoch_num+1 in specific_results:
# for corpus in specific_results[epoch_num+1]:
# test_results = get_predictions(model,
# corpus,
# test_dict_dataloader[corpus],
# print_results=False)
# ========= New Code! =============
# Save for each epoch the dev and test results
# ------- Start the training -------
print('\nModel: ', args.transformer_model)
print('Batch size: ', args.batch_size * args.gradient_accumulation_steps)
print('\nStart training...\n')
train(model,
train_dataloader,
dev_dict_dataloader,
test_dict_sentences,
test_dict_dataloader,
args.num_epochs,
# specific_results
)
print('\nTraining Done!')
@@ -79,6 +79,11 @@ batch_size = args.batch_size
tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
# train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences(args.data_path, mappings)
print('\nCheck encodings:\n')
print(train_sentences[0])
# Determine linear size (= number of classes in the sets + 1)
num_labels = len(set(sent[-1] for sent in train_sentences)) + 1
@@ -222,9 +227,13 @@ def train(model,
model,
corpus,
dev_dict_dataloader[corpus],
framework_labels[corpus.split('.')[1]]
framework_labels[corpus.split('.')[1]],
inv_mappings,
epoch_num+1,
save_conf_matrix=False
)
# path_results = 'results/dev/language_' + mapping_classes + '_' + str(epoch_num+1)
# if not os.path.exists(path_results):
# os.makedirs(path_results)
@@ -249,7 +258,10 @@ def train(model,
model,
corpus,
test_dict_dataloader[corpus],
framework_labels[corpus.split('.')[1]]
framework_labels[corpus.split('.')[1]],
inv_mappings,
epoch_num+1,
save_conf_matrix=False
)
# path_results = 'results/test/language_' + mapping_classes + '_' + str(epoch_num+1)
@@ -125,11 +125,11 @@ training_args = TrainingArguments(
remove_unused_columns = False,
warmup_steps = 1000, # number of warmup steps for learning rate
# save_steps = (len(train_sentences)/(args.batch_size * args.gradient_accumulation_steps)) / 1368,
save_total_limit = args.num_epochs,
load_best_model_at_end = True,
# save_total_limit = args.num_epochs,
# load_best_model_at_end = True,
weight_decay = 0.01, # strength of weight decay
save_strategy='epoch',
evaluation_strategy='epoch'
# save_strategy='epoch',
# evaluation_strategy='epoch'
)
@@ -175,18 +175,18 @@ for corpus in encoded_dev_dataset:
)
print('\nTest results:')
for corpus in encoded_test_dataset:
print()
# print('\nTest results:')
# for corpus in encoded_test_dataset:
# print()
dev_results_ = get_predictions_huggingface(trainer,
corpus,
encoded_test_dataset[corpus]
)
# dev_results_ = get_predictions_huggingface(trainer,
# corpus,
# encoded_test_dataset[corpus]
# )
dev_results = better_predictions_huggingface(trainer,
corpus,
encoded_test_dataset[corpus],
framework_labels[corpus.split('.')[1]]
)
# dev_results = better_predictions_huggingface(trainer,
# corpus,
# encoded_test_dataset[corpus],
# framework_labels[corpus.split('.')[1]]
# )
@@ -6,24 +6,18 @@ import torch
from transformers import AutoConfig, AutoTokenizer
from configure import parse_args
import numpy as np
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, ConfusionMatrixDisplay
import matplotlib.pyplot as plt
import seaborn as sns
from time import sleep
from datetime import datetime
args = parse_args()
now = datetime.now()
dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
def switch_dimensions(vector_list):
target_dim_len = len(vector_list[0])
new_vector = []
for n in range(target_dim_len):
temp = []
for x in vector_list:
temp.append(x[n])
new_vector.append(temp)
return new_vector
args = parse_args()
def open_mappings(mappings_file):
@@ -156,6 +150,7 @@ def open_file_with_lang(filename, mappings_dict):
# flip them if different direction
if args.normalize_direction == 'yes':
if l[9] == '1>2':
#lang, fullname, framework
lines.append(l + [[lang, fullname, framework] + sent_1 + [SEP_token] + sent_2, encode_label(l[11], mappings_dict)])
else:
lines.append(l + [[lang, fullname, framework] + sent_2 + [SEP_token] + sent_1, encode_label(l[11], mappings_dict)])
@@ -257,6 +252,10 @@ def open_sentences(path_to_corpora, mappings_dict):
all_labels[framework] += [l[-1] for l in temp]
corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels}
# delete unk as a sanity check
for framework in corpus_labels:
if 'unk' in corpus_labels[framework]:
corpus_labels[framework].remove('unk')
return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
@@ -330,18 +329,14 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
all_labels[framework] += [l[-1] for l in temp]
corpus_labels = {framework:set(all_labels[framework]) for framework in all_labels}
# delete unk as a sanity check
for framework in corpus_labels:
if 'unk' in corpus_labels[framework]:
corpus_labels[framework].remove('unk')
return train_sentences, dev_dict_sentences, test_dict_sentences, corpus_labels
def show_confusion(predictions, labels, save_name):
    cm = confusion_matrix(labels, predictions)  # confusion_matrix expects (y_true, y_pred)
cm_display = ConfusionMatrixDisplay(cm).plot()
plt.savefig(save_name, dpi=300)
return None
# ===============
# Testing functions
@@ -451,11 +446,64 @@ def better_predictions_huggingface(trainer,
return best_labels
def make_confusion_matrices(y_test,
y_pred,
corpus_name,
inv_mappings,
epoch):
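    # Prints a classification report and saves a per-corpus, per-epoch confusion-matrix heatmap
    # under conf_matrix/<run timestamp>/.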
save_path = 'conf_matrix/' + dt_string
if not os.path.exists(save_path):
os.makedirs(save_path)
print(classification_report(y_test,
y_pred,
#target_names= sorted(list(set([inv_mappings[x] for x in y_test])))
)
)
cm = confusion_matrix(y_test,
y_pred,
labels = list(inv_mappings.keys())
)
print(cm)
xticklabels = list(inv_mappings.values())
yticklabels = list(inv_mappings.values())
sns.color_palette("cubehelix", as_cmap=True)
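    # Note: the returned colormap is not passed to sns.heatmap below, so this call does not change the plot colours.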
# Plot the confusion matrix.
fig, ax = plt.subplots()
# ax.tick_params(axis='both', which='major', labelsize=6)
# ax.tick_params(axis='both', which='minor', labelsize=6)
ax = sns.heatmap(cm,
                     # annot=True
xticklabels=xticklabels,
yticklabels=yticklabels
)
    plt.ylabel('Corpus (true) label')
    plt.xlabel('Predicted label')
plt.xticks(fontsize=2)
plt.yticks(fontsize=2)
# plt.xticks(x, labels, rotation='vertical')
# plt.margins(0.5)
plt.subplots_adjust(bottom=0.5, left=0.5)
plt.title('Confusion Matrix: '+corpus_name+' (epoch:'+ str(epoch) + ')')
plt.savefig(save_path + '/' + corpus_name + '_' + str(epoch) + '.png',
dpi=300)
plt.clf()
def get_better_predictions(model,
corpus,
test_dataloader,
corpus_labels,
print_results=True):
inv_mappings,
epoch,
print_results=True,
save_conf_matrix=False):
device = torch.device("cuda" if args.use_cuda else "cpu")
@@ -504,9 +552,25 @@ def get_better_predictions(model,
if print_results:
        print('better:\t' + str(test_acc), flush=True)
print(classification_report(all_labels, top_preds))
if save_conf_matrix:
#try:
make_confusion_matrices(all_labels,
top_preds,
corpus,
inv_mappings,
epoch)
# except ValueError:
# print('matrix failed to print')
# pass
print()
# print(all_preds)
print('----')
return all_labels, all_preds
def print_results_to_file(corpus,
test_sentences,