# classifier_with_adapter.py
#!/usr/bin/env python
# coding: utf-8
import torch
import pandas as pd
import datasets
from datetime import datetime
from transformers import (AutoTokenizer, AutoAdapterModel, TrainingArguments,
                          Trainer, set_seed)
from configure import parse_args
from utils import *
args = parse_args()
now = datetime.now()
dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
adapter_name = args.adapter_name
substitutions_file = 'mappings/substitutions.txt'
tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
set_seed(42)
print('Train classifier with adapter\n')
print('Adapter name:', adapter_name)
print('Model:', args.transformer_model)
print('Effective batch size:', args.batch_size * args.gradient_accumulation_steps)
print('Num epochs:', args.num_epochs)
# Open mappings
mappings, inv_mappings = open_mappings(args.mappings_file)
# Open sentences
train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
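# Each entry returned by open_sentences_with_lang is assumed to end with a
# token list and a (mapped) label id; those are what x[-2] and x[-1] pick out
# when the dataframes are built below.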
print('\nCheck encodings:\n')
print(train_sentences[0])
# Make pandas dataframes
file_header = ['text', 'labels']

train_df = pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in train_sentences],
                        columns=file_header)
train_df = train_df.sample(frac=1)  # shuffle the training data

# Make a joint dev dataset in order to save models
eval_df = pd.DataFrame([[' '.join(x[-2]), x[-1]]
                        for corpus, sents in dev_dict_sentences.items()
                        for x in sents],
                       columns=file_header)

dev_dict_df = {corpus: pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in sents],
                                    columns=file_header)
               for corpus, sents in dev_dict_sentences.items()}

test_dict_df = {corpus: pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in sents],
                                     columns=file_header)
                for corpus, sents in test_dict_sentences.items()}
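# Dev/test data are kept as one dataframe per corpus so that results can be
# reported separately for each corpus and, further below, restricted to the
# label set of that corpus's framework.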
# Make datasets from dataframes
train_dataset = datasets.Dataset.from_pandas(train_df)
eval_dataset = datasets.Dataset.from_pandas(eval_df)
dev_dict_dataset = {corpus: datasets.Dataset.from_pandas(dev_df)
                    for corpus, dev_df in dev_dict_df.items()}
test_dict_dataset = {corpus: datasets.Dataset.from_pandas(test_df)
                     for corpus, test_df in test_dict_df.items()}
# Get the number of labels
num_labels = len(set(train_df['labels'].tolist())) + 1
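# encode_batch is imported from utils; a minimal sketch of what it is assumed
# to do (tokenize the 'text' column with the tokenizer loaded above), e.g.:
#   def encode_batch(batch):
#       return tokenizer(batch['text'], truncation=True,
#                        padding='max_length', max_length=128)
# The exact arguments (max_length, padding strategy) live in utils.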
# Encode the data
train_dataset = train_dataset.map(encode_batch, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
eval_dataset = eval_dataset.map(encode_batch, batched=True)
eval_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
encoded_dev_dataset = {}
for corpus in dev_dict_dataset:
    temp = dev_dict_dataset[corpus].map(encode_batch, batched=True)
    temp.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    encoded_dev_dataset[corpus] = temp

encoded_test_dataset = {}
for corpus in test_dict_dataset:
    temp = test_dict_dataset[corpus].map(encode_batch, batched=True)
    temp.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    encoded_test_dataset[corpus] = temp
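# set_format(type="torch", columns=[...]) makes the datasets return only the
# listed columns as torch tensors; any extra columns produced by encode_batch
# are hidden from the Trainer rather than deleted.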
# ===============================
# Training params
# ===============================
model = AutoAdapterModel.from_pretrained(args.transformer_model)
active_adapter = model.load_adapter(adapter_name,
                                    config=adapter_name + "/adapter_config.json")
model.set_active_adapters(active_adapter)
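# Note: set_active_adapters only activates the adapter for the forward pass;
# it does not freeze the base model. With this setup the full model remains
# trainable unless layers are frozen below. To update only the adapter
# weights, adapter-transformers offers model.train_adapter(active_adapter).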
training_args = TrainingArguments(
    learning_rate = 2e-5, #1e-4,
    num_train_epochs = args.num_epochs,
    per_device_train_batch_size = args.batch_size,
    per_device_eval_batch_size = args.batch_size,
    gradient_accumulation_steps = args.gradient_accumulation_steps,
    # log roughly once per epoch (logging_steps must be an integer step count)
    logging_steps = max(1, len(train_sentences) // (args.batch_size * args.gradient_accumulation_steps)),
    output_dir = "./results/models/run_" + args.transformer_model,
    overwrite_output_dir = True,
    remove_unused_columns = False,
    warmup_steps = 1000,  # number of warmup steps for the learning rate
    # save_steps = (len(train_sentences)/(args.batch_size * args.gradient_accumulation_steps)) / 1368,
    save_total_limit = args.num_epochs,
    load_best_model_at_end = True,
    weight_decay = 0.01,  # strength of weight decay
    save_strategy = 'epoch',
    evaluation_strategy = 'epoch'
)
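# load_best_model_at_end requires matching save/eval strategies (both 'epoch'
# here) and, with no metric_for_best_model set, selects by eval loss on the
# joint dev set. The effective train batch size per device is
# per_device_train_batch_size * gradient_accumulation_steps, as printed above.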
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = eval_dataset
)
# Freeze selected layers if requested: freeze_layers is a semicolon-separated
# list of substrings matched against parameter names
if args.freeze_layers != '':
    layers_to_freeze = args.freeze_layers.split(';')
    for name, param in model.named_parameters():
        if any(x in name for x in layers_to_freeze):
            param.requires_grad = False
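# Illustrative only: for a BERT-style encoder, passing e.g.
# --freeze_layers "embeddings;encoder.layer.0;encoder.layer.1"
# would freeze the embeddings and the first two transformer layers, since
# those substrings occur in the corresponding parameter names.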
# ===============================
# Start the training 🚀
# ===============================
print('Start training...')
trainer.train()
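# Both prediction helpers come from utils: get_predictions_huggingface is
# assumed to report scores over the full label set, while
# better_predictions_huggingface additionally restricts predictions to the
# labels valid for the corpus's framework, looked up via the second
# dot-separated field of the corpus name.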
# Dev results
print('\nDev results:')
for corpus in encoded_dev_dataset:
    print()
    dev_results_ = get_predictions_huggingface(trainer,
                                               corpus,
                                               encoded_dev_dataset[corpus])
    dev_results = better_predictions_huggingface(trainer,
                                                 corpus,
                                                 encoded_dev_dataset[corpus],
                                                 framework_labels[corpus.split('.')[1]])
# Test results
print('\nTest results:')
for corpus in encoded_test_dataset:
    print()
    test_results_ = get_predictions_huggingface(trainer,
                                                corpus,
                                                encoded_test_dataset[corpus])
    test_results = better_predictions_huggingface(trainer,
                                                  corpus,
                                                  encoded_test_dataset[corpus],
                                                  framework_labels[corpus.split('.')[1]])