#!/usr/bin/env python
# coding: utf-8
import pandas as pd
import datasets
from datetime import datetime
from transformers import (AutoAdapterModel, AutoTokenizer, Trainer,
                          TrainingArguments, set_seed)
from configure import parse_args
# utils provides open_mappings, open_sentences, open_specific_results,
# encode_batch, get_predictions_huggingface and print_results_to_file
from utils import *
args = parse_args()
now = datetime.now()
dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
adapter_name = args.adapter_name

# Open label mappings and the label-substitution table
mappings, inv_mappings = open_mappings(args.mappings_file)
substitutions_file = 'mappings/substitutions.txt'

tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)

# We only save the test results of specific epochs
specific_results = open_specific_results('mappings/specific_results.txt')
if '1-2-3' in adapter_name or 'layer1;layer2;layer3' in adapter_name:
    specific_results = list(specific_results['A1_3'][args.num_epochs])
else:
    specific_results = list(specific_results['A1'][args.num_epochs])
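# specific_results now holds the names of the corpora whose test scores
# we report for this number of epochs (used to filter the test sets below)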
set_seed(42)

print('Train classifier with adapter\n')
print('Adapter name:', adapter_name)
print('Model:', args.transformer_model)
print('Effective batch size:', args.batch_size * args.gradient_accumulation_steps)
print('Num epochs:', args.num_epochs)
# Open sentences
train_sentences, dev_dict_sentences, test_dict_sentences = open_sentences(args.data_path, mappings)
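# Each sentence row returned by open_sentences carries the token list in its
# second-to-last field and the mapped label in its last field (see x[-2], x[-1])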
# Make pandas dataframes
file_header = ['text', 'labels']

train_df = pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in train_sentences],
                        columns=file_header)
# Shuffle the train set; reset the index so that Dataset.from_pandas
# does not pick up the shuffled index as an extra column
train_df = train_df.sample(frac=1).reset_index(drop=True)

dev_dict_df = {corpus: pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in sents],
                                    columns=file_header)
               for corpus, sents in dev_dict_sentences.items()}

test_dict_df = {corpus: pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in sents],
                                     columns=file_header)
                for corpus, sents in test_dict_sentences.items()
                if corpus in specific_results}
# Make datasets from dataframes
train_dataset = datasets.Dataset.from_pandas(train_df)
dev_dict_dataset = {corpus: datasets.Dataset.from_pandas(dev_df)
                    for corpus, dev_df in dev_dict_df.items()}
test_dict_dataset = {corpus: datasets.Dataset.from_pandas(test_df)
                     for corpus, test_df in test_dict_df.items()}
# Get the number of labels (+1 presumably reserves an extra index for
# labels that only occur in the dev/test sets)
num_labels = len(set(int(x) for x in train_df['labels'])) + 1
# Encode the data
train_dataset = train_dataset.map(encode_batch, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

encoded_dev_dataset = {}
for corpus in dev_dict_dataset:
    temp = dev_dict_dataset[corpus].map(encode_batch, batched=True)
    temp.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    encoded_dev_dataset[corpus] = temp

encoded_test_dataset = {}
for corpus in test_dict_dataset:
    temp = test_dict_dataset[corpus].map(encode_batch, batched=True)
    temp.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
    encoded_test_dataset[corpus] = temp
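# `encode_batch` is imported from utils; a minimal sketch of what it
# presumably does (padding/truncation settings are assumptions, the actual
# implementation lives in utils.py):
#
#     def encode_batch(batch):
#         return tokenizer(batch["text"], padding="max_length",
#                          truncation=True, max_length=128)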
# ===============================
# ## Training params
# ===============================
model = AutoAdapterModel.from_pretrained(args.transformer_model)

# Load the pre-trained adapter from its directory and activate it
active_adapter = model.load_adapter(adapter_name,
                                    config=adapter_name + "/adapter_config.json")
model.set_active_adapters(active_adapter)
training_args = TrainingArguments(
    learning_rate = 2e-5,
    num_train_epochs = args.num_epochs,
    per_device_train_batch_size = args.batch_size,
    per_device_eval_batch_size = args.batch_size,
    gradient_accumulation_steps = args.gradient_accumulation_steps,
    # Log once per epoch: steps per epoch = examples / effective batch size
    logging_steps = max(1, len(train_sentences) //
                        (args.batch_size * args.gradient_accumulation_steps)),
    output_dir = "./training_output",
    overwrite_output_dir = True,
    # Do not let the Trainer drop dataset columns automatically
    remove_unused_columns = False,
)
trainer = Trainer(
model = model,
args = training_args,
train_dataset = train_dataset
)
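# No eval_dataset is passed to the Trainer: the dev and test sets are
# evaluated manually, per corpus, with get_predictions_huggingface below.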
# Freeze layers in the classifier if desired
if args.freeze_layers != '':
    layers_to_freeze = args.freeze_layers.split(';')
    for name, param in model.named_parameters():
        if any(x in name for x in layers_to_freeze):
            param.requires_grad = False
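# Note: freezing here, after the Trainer is constructed, still takes effect,
# since the Trainer only builds its optimizer lazily inside train().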
# ===============================
# Start the training 🚀
# ===============================
print('Start training...')
trainer.train()
# Dev results
print('\nDev results:')
for corpus in encoded_dev_dataset:
    print()
    _ = get_predictions_huggingface(trainer, corpus,
                                    encoded_dev_dataset[corpus])

# Save specific test results
print('\nTest results:')
for corpus in encoded_test_dataset:
    print()
    test_results = get_predictions_huggingface(trainer, corpus,
                                               encoded_test_dataset[corpus])
    print_results_to_file(corpus, test_dict_sentences[corpus], test_results,
                          inv_mappings, substitutions_file)