Commit 571902b3 authored by emetheni

format code to black

parent 7cec7e97
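The files were presumably reformatted with the Black code formatter; a typical invocation (an assumption, since the exact command is not recorded in the commit) would be `pip install black` followed by `black .` at the repository root.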
@@ -18,7 +18,7 @@ The full list of datasets with statistics: [here](https://github.com/disrpt/shar
* transformers
* scikit-learn

Install requirements with ```pip install -r requirements.txt```.

## Run
@@ -3,7 +3,12 @@
import torch
import numpy as np
from transformers import (
    AutoModel,
    AutoTokenizer,
    get_linear_schedule_with_warmup,
    set_seed,
)
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
@@ -23,33 +28,36 @@ now = datetime.now()
dt_string = now.strftime("%d.%m.%y-%H:%M:%S")

layers_to_freeze = args.freeze_layers.split(";")

print("\nTraining with datasets: " + args.langs_to_use)
print("Mappings file: " + args.mappings_file, flush=True)

# ===============
# Dataset class
# ===============


class Dataset(torch.utils.data.Dataset):
    def __init__(self, sentences):
        self.labels = [sent[-1] for sent in sentences]
        self.texts = [
            tokenizer(
                sent[-2],
                is_split_into_words=True,
                padding="max_length",
                max_length=512,
                truncation=True,
                return_tensors="pt",
            )
            for sent in sentences
        ]

    def classes(self):
        return self.labels

    def __len__(self):
        return len(self.labels)

    def get_batch_labels(self, idx):
        # Fetch a batch of labels
        return np.array(self.labels[idx])
@@ -59,12 +67,12 @@ class Dataset(torch.utils.data.Dataset):
        return self.texts[idx]

    def __getitem__(self, idx):
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y


# ===============
# Load datasets
# ===============
@@ -72,24 +80,32 @@ class Dataset(torch.utils.data.Dataset):

# Open mappings
mappings, inv_mappings = open_mappings(args.mappings_file)
batch_size = args.batch_size
tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)

(
    train_sentences,
    dev_dict_sentences,
    test_dict_sentences,
    framework_labels,
) = open_sentences_with_lang(args.data_path, mappings)

# Determine linear size (= number of classes in the sets + 1)
num_labels = len(set(sent[-1] for sent in train_sentences)) + 1

# make train/dev datasets
train_dataset = Dataset(train_sentences)
dev_dataset = {corpus: Dataset(s) for corpus, s in dev_dict_sentences.items()}
test_dataset = {corpus: Dataset(s) for corpus, s in test_dict_sentences.items()}

# Make datasets with batches and dataloader
train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
dev_dict_dataloader = {
    corpus: DataLoader(dev_data, batch_size) for corpus, dev_data in dev_dataset.items()
}
test_dict_dataloader = {
    corpus: DataLoader(test_data, batch_size)
    for corpus, test_data in test_dataset.items()
}

print("\nDatasets loaded!\n")
@@ -97,22 +113,20 @@ print("\nDatasets loaded!\n")
# Model setup
# ===============


class TransformerClassifier(nn.Module):
    def __init__(self, dropout=args.dropout):
        super(TransformerClassifier, self).__init__()

        self.tr_model = AutoModel.from_pretrained(args.transformer_model)
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_labels)  # bert input x num of classes
        self.relu = nn.ReLU()

    def forward(self, input_id, mask):
        outputs = self.tr_model(
            input_ids=input_id, attention_mask=mask, return_dict=True
        )["last_hidden_state"][:, 0, :]
        dropout_output = self.dropout(outputs)
        linear_output = self.linear(dropout_output)
        final_layer = self.relu(linear_output)
@@ -123,133 +137,120 @@ class TransformerClassifier(nn.Module):

model = TransformerClassifier()


def train(
    model,
    train_dataloader,
    dev_dict_dataloader,
    test_dict_sentences,
    test_dict_dataloader,
    epochs,
    # specific_results
):
    device = torch.device("cpu")
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)  # Adam  # 1e-6

    if args.use_cuda == "yes":
        device = torch.device("cuda")
        model = model.cuda()
        criterion = criterion.cuda()

    gradient_accumulation_steps = args.gradient_accumulation_steps
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=total_steps
    )

    seed_val = 42
    set_seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # Freeze layers if you want
    if args.freeze_layers != "":
        for name, param in model.named_parameters():
            if any(x in name for x in layers_to_freeze):
                param.requires_grad = False

    for epoch_num in range(0, epochs):
        print("\n=== Epoch {:} / {:} ===".format(epoch_num + 1, epochs))

        model.train()

        total_acc_train = 0
        total_loss_train = 0
        batch_counter = 0

        for train_input, train_label in tqdm(train_dataloader):
            batch_counter += 1
            train_label = train_label.to(device)
            mask = train_input["attention_mask"].to(device)
            input_id = train_input["input_ids"].squeeze(1).to(device)
            output = model(input_id, mask)

            # Compute Loss and Perform Back-propagation
            loss = criterion(output, train_label.long())

            # Normalize the Gradients
            loss = loss / gradient_accumulation_steps
            loss.backward()

            if batch_counter % gradient_accumulation_steps == 0:
                # Update Optimizer
                optimizer.step()
                optimizer.zero_grad()
                model.zero_grad()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scheduler.step()

        # ------ Validation --------
        print("\nValidation for epoch:", epoch_num + 1)

        # Dev and test results for each corpus. We don't need to save the results.
        for corpus in dev_dict_dataloader:
            dev_results = get_predictions(model, corpus, dev_dict_dataloader[corpus])

            better_dev_results = get_better_predictions(
                model,
                corpus,
                dev_dict_dataloader[corpus],
                framework_labels[corpus.split(".")[1]],
                inv_mappings,
                epoch_num + 1,
                save_conf_matrix=False,
            )

        # ------ Test --------
        print("\nTest results for epoch:", epoch_num + 1)
        for corpus in test_dict_dataloader:
            test_results = get_predictions(model, corpus, test_dict_dataloader[corpus])

            better_test_results = get_better_predictions(
                model,
                corpus,
                test_dict_dataloader[corpus],
                framework_labels[corpus.split(".")[1]],
                inv_mappings,
                epoch_num + 1,
                save_conf_matrix=False,
            )


# ------- Start the training -------
print("\nModel: ", args.transformer_model)
print("Batch size: ", args.batch_size * args.gradient_accumulation_steps)
print("\nStart training...\n")

train(
    model,
    train_dataloader,
    dev_dict_dataloader,
    test_dict_sentences,
    test_dict_dataloader,
    args.num_epochs,
)

print("\nTraining Done!")
import argparse
import sys


def parse_args():
    """
    Parse input arguments.
    """
    parser = argparse.ArgumentParser()

    # path to data
    parser.add_argument(
        "--data_path",
        default="./data",
        type=str,
        help="The path to the shared task data file from Github.",
    )

    # label mappings to integers
    parser.add_argument(
        "--mappings_file",
        default="mappings/mappings_substitutions.tsv",
        type=str,
        help="The mappings file for all relations.",
    )

    # transformer model
    parser.add_argument(
        "--transformer_model",
        default="bert-base-multilingual-cased",
        type=str,
        help="Model used. Default: bert-base-multilingual-cased",
    )

    # Number of training epochs
    parser.add_argument(
        "--num_epochs",
        default=10,
        type=int,
        help="Number of training epochs. Default: 10",
    )

    # Number of gradient accumulation steps
    parser.add_argument(
        "--gradient_accumulation_steps",
        default=16,
        type=int,
        help="Number of gradient accumulation steps. Default: 16",
    )

    # Dropout
    parser.add_argument("--dropout", default=0.1, type=float, help="Dropout.")

    # Batch size
    parser.add_argument(
        "--batch_size",
        default=8,
        type=int,
        help="With CUDA: max. 8, without: max. 16. Default: 8",
    )

    # Use CUDA
    parser.add_argument(
        "--use_cuda",
        default="yes",
        type=str,
        help="Use CUDA [yes/no]. Careful of batch size!",
    )

    # freeze layers
    parser.add_argument(
        "--freeze_layers",
        default="",
        type=str,
        help="List of layer(s) to freeze, a str separated by ;. Example: 'layer.1;layer.2'",
    )

    # normalize direction
    parser.add_argument(
        "--normalize_direction",
        default="yes",
        type=str,
        help="Change order of sentences when the direction of relations is 1<2 to 2>1.",
    )

    # only specific languages/corpora
    parser.add_argument(
        "--langs_to_use",
        default="@",
        type=str,
        help="List of languages/corpora to use, a str separated by ;",
    )

    args = parser.parse_args()
    return args
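As a usage sketch only (the training script's filename and the flag values below are illustrative assumptions, since the diff view omits file names), these arguments would be passed on the command line, e.g. `python train.py --data_path ./data --transformer_model bert-base-multilingual-cased --num_epochs 10 --batch_size 8 --use_cuda yes --langs_to_use "eng"`.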
@@ -11,13 +11,13 @@ args = parse_args()

# -----------------------------------
# open substitutions per file

mappings = {}
with open("mappings/mappings_substitutions.tsv", "r", encoding="utf-8") as f:
    next(f)
    for line in f:
        l = line.strip().split("\t")
        mappings[l[0]] = l[1]

# find the labels that were changed
inv_mappings = {}
subs = {}

@@ -26,31 +26,36 @@ for label, num in mappings.items():
        inv_mappings[num] = label
    else:
        subs[label] = inv_mappings[num]

# -----------------------------------
# define which language to use with the arguments

languages = args.langs_to_use.split(";")

corpora = [
    folder
    for folder in os.listdir(args.data_path)
    if any(l in folder for l in languages)
]

files = [
    "/".join([args.data_path, corpus, f])
    for corpus in corpora
    for f in os.listdir(args.data_path + "/" + corpus)
]


# open the files
def read_file(file):
    """Open the relations file."""
    relations = []
    sub_rels = []
    with open(file, "r", encoding="utf-8") as f:
        next(f)
        for line in f:
            try:
                l = line.strip().split("\t")
                if not l[11].lower() in subs:
                    relations.append(l[11].lower())
                else:

@@ -60,8 +65,7 @@ def read_file(file):
    return relations, sub_rels


rel_files = [f for f in files if any(x in f for x in ["train"])]

good_rels = []
sub_rels = []

@@ -71,7 +75,7 @@ for f in rel_files:
    sub_rels += y

dict_labels = dict(enumerate(list(set(good_rels))))
corpora_labels = {v: k for k, v in dict_labels.items()}

leftovers = []

@@ -80,12 +84,12 @@ for sub in sub_rels:
    try:
        corpora_labels[sub] = corpora_labels[subs[sub]]
    except KeyError:
        corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
        corpora_labels[sub] = corpora_labels[subs[sub]]

corpora_labels["unk"] = max(list(corpora_labels.values())) + 1

with open("mappings/" + args.mappings_file, "w") as f:
    f.write("LABEL\tMAPPING\n")
    for k, v in corpora_labels.items():
        f.write(k + "\t" + str(v) + "\n")
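For reference, the file written above is a two-column tab-separated table: a LABEL/MAPPING header row, then one relation label per line mapped to an integer id. A hypothetical excerpt (label names invented for illustration, except "unk", which the script adds explicitly) might look like:

LABEL	MAPPING
elaboration	0
contrast	1
unk	2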