Commit 7dcc02fb authored by emetheni

clone discret files

parent 6fc7ee0f
@@ -30,11 +30,11 @@ substitutions_file = 'mappings/substitutions.txt'
tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
# we are saving the test results of specific epochs
specific_results = open_specific_results('mappings/specific_results.txt')
if '1-2-3' in adapter_name or 'layer1;layer2;layer3' in adapter_name:
specific_results = list(specific_results['A1_3'][args.num_epochs])
else:
specific_results = list(specific_results['A1'][args.num_epochs])
# specific_results = open_specific_results('mappings/specific_results.txt')
# if '1-2-3' in adapter_name or 'layer1;layer2;layer3' in adapter_name:
# specific_results = list(specific_results['A1_3'][args.num_epochs])
# else:
# specific_results = list(specific_results['A1'][args.num_epochs])
set_seed(42)
@@ -65,8 +65,7 @@ dev_dict_df = {corpus : pd.DataFrame([[' '.join(x[-2]), x[-1]]
test_dict_df = {corpus : pd.DataFrame([[' '.join(x[-2]), x[-1]]
for x in sents],
columns = file_header)
for corpus, sents in test_dict_sentences.items()
if corpus in specific_results}
for corpus, sents in test_dict_sentences.items()}
# Make datasets from dataframes
train_dataset = datasets.Dataset.from_pandas(train_df)
@@ -96,7 +95,7 @@ for corpus in test_dict_dataset:
encoded_test_dataset[corpus] = temp
# ===============================
# ## Training params
# Training params
# ===============================
model = AutoAdapterModel.from_pretrained(args.transformer_model)
@@ -144,15 +143,72 @@ trainer.train()
print('\nDev results:')
for corpus in encoded_dev_dataset:
print()
_ = get_predictions_huggingface(trainer, corpus,
dev_results = get_predictions_huggingface(trainer, corpus,
encoded_dev_dataset[corpus])
# Save specific test results
print('\nTest results:')
path_results = 'results/dev/' + adapter_name + '_' + str(args.num_epochs)
if not os.path.exists(path_results):
os.makedirs(path_results)
print_results_to_file(corpus,
dev_dict_sentences[corpus],
dev_results,
inv_mappings,
substitutions_file,
path_results)
# Test results
print('\nTest results:')
for corpus in encoded_test_dataset:
print()
test_results = get_predictions_huggingface(trainer, corpus,
encoded_test_dataset[corpus])
test_results = get_predictions_huggingface(trainer,
corpus,
encoded_test_dataset[corpus])
path_results = 'results/test/' + adapter_name + '_' + str(args.num_epochs)
if not os.path.exists(path_results):
os.makedirs(path_results)
print_results_to_file(corpus,
test_dict_sentences[corpus],
test_results,
inv_mappings,
substitutions_file,
path_results)
# for corpus in test_dict_dataloader:
# test_results = get_predictions(model,
# corpus,
# test_dict_dataloader[corpus])
# path_results = 'results/test/pytorch' + str(epoch_num+1)
# if not os.path.exists(path_results):
# os.makedirs(path_results)
# print_results_to_file(corpus,
# test_dict_sentences[corpus],
# test_results,
# inv_mappings, substitutions_file,
# path_results)
# Save specific test results
print_results_to_file(corpus, test_dict_sentences[corpus], test_results,
inv_mappings, substitutions_file)
\ No newline at end of file
# print('\nTest results:')
# for corpus in encoded_test_dataset:
# print()
# test_results = get_predictions_huggingface(trainer, corpus,
# encoded_test_dataset[corpus])
#
# print_results_to_file(corpus, test_dict_sentences[corpus], test_results,
# inv_mappings, substitutions_file)
\ No newline at end of file
%% Cell type:code id: tags:
``` python
import os
```
%% Cell type:code id: tags:
``` python
num_labels = 134 + 1
temp = {}
mappings = {}
subs = {}
# The first num_labels rows of the file are canonical label -> id mappings;
# the rows after that map substitute label variants to the same ids.
with open('mappings/mappings_substitutions.tsv', 'r') as f:
    for counter, line in enumerate(f):
        parts = line.strip().split('\t')
        if counter < num_labels:
            mappings[parts[0]] = int(parts[1])
        else:
            temp[parts[0]] = int(parts[1])
inv_mappings = {v:k for k, v in mappings.items()}
subs = {k:inv_mappings[v] for k, v in temp.items()}
```
%% Cell type:code id: tags:
``` python
def read_corpus(file):
labels = []
with open(file, 'r') as f:
next(f)
for line in f:
labels.append(line.strip().split('\t')[-1])
return labels
```
%% Cell type:code id: tags:
``` python
path = '/users/melodi/emetheni/clean_data'
list_corpora = [x for x in os.listdir(path)]
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
for corpus in list_corpora:
labels = read_corpus(path + '/' + corpus + '/' + corpus + '_dev.rels')
labels = read_corpus(path + '/' + corpus + '/' + corpus + '_test.rels')
# prefer the train split's labels when it exists; some corpora have no train file
try:
    labels = read_corpus(path + '/' + corpus + '/' + corpus + '_train.rels')
except FileNotFoundError:
    pass
labels = set(labels)
print('\n')
print("==" + corpus + "==")
print(sorted(labels))
# for label in labels:
# if label not in mappings:
# if label.lower() in mappings:
# print(label + '\t' + corpus + '\t' + label.lower())
# elif label in subs:
# print(label + '\t' + corpus + '\t' + subs[label])
# elif label.lower() in subs:
# print(label + '\t' + corpus + '\t' + subs[label.lower()])
# else:
# print('AAAAAAAAAAAAAAAAAAAA', label, corpus)
test_labels = read_corpus('results/test/A_15-epochs_frozen-1_3/' + corpus +'.tsv')
test_labels = set(test_labels)
for l in test_labels:
if l not in labels:
temp = ''
if l.lower() in labels:
temp = l.lower()
elif l.lower() in inv_mappings:
temp = inv_mappings[l.lower()]
elif l in subs:
temp = subs[l]
elif l.lower() in subs:
temp = subs[l.lower()]
print(l + ' ' + corpus + ' ' + temp)
```
%% Output
==spa.rst.sctb==
['antithesis', 'attribution', 'background', 'cause', 'circumstance', 'concession', 'condition', 'conjunction', 'contrast', 'disjunction', 'elaboration', 'evaluation', 'evidence', 'interpretation', 'justify', 'list', 'means', 'motivation', 'preparation', 'purpose', 'restatement', 'result', 'sequence', 'summary']
==tha.pdtb.tdtb==
['Comparison.Concession', 'Comparison.Contrast', 'Comparison.Similarity', 'Contingency.Cause', 'Contingency.Cause+Belief', 'Contingency.Cause+SpeechAct', 'Contingency.Condition', 'Contingency.Negative-Condition', 'Contingency.Negative-Condition+SpeechAct', 'Contingency.Purpose', 'Expansion.Conjunction', 'Expansion.Disjunction', 'Expansion.Equivalence', 'Expansion.Exception', 'Expansion.GenExpansion', 'Expansion.Instantiation', 'Expansion.Level-of-detail', 'Expansion.Substitution', 'Temporal.Asynchronous', 'Temporal.Synchronous']
==rus.rst.rrt==
['antithesis', 'attribution', 'background', 'cause', 'cause-effect', 'comparison', 'concession', 'conclusion', 'condition', 'contrast', 'effect', 'elaboration', 'evaluation', 'evidence', 'interpretation-evaluation', 'joint', 'motivation', 'preparation', 'purpose', 'restatement', 'sequence', 'solutionhood']
==zho.rst.gcdt==
['adversative-antithesis', 'adversative-concession', 'adversative-contrast', 'attribution-negative', 'attribution-positive', 'causal-cause', 'causal-result', 'context-background', 'context-circumstance', 'contingency-condition', 'elaboration-additional', 'elaboration-attribute', 'evaluation-comment', 'explanation-evidence', 'explanation-justify', 'explanation-motivation', 'joint-disjunction', 'joint-list', 'joint-other', 'joint-sequence', 'mode-manner', 'mode-means', 'organization-heading', 'organization-phatic', 'organization-preparation', 'purpose-attribute', 'purpose-goal', 'restatement-partial', 'restatement-repetition', 'topic-question', 'topic-solutionhood']
==fra.sdrt.annodis==
['alternation', 'attribution', 'background', 'comment', 'conditional', 'continuation', 'contrast', 'e-elaboration', 'elaboration', 'explanation', 'explanation*', 'flashback', 'frame', 'goal', 'narration', 'parallel', 'result', 'temploc']
==por.rst.cstn==
['antithesis', 'attribution', 'background', 'circumstance', 'comparison', 'concession', 'conclusion', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'evidence', 'explanation', 'interpretation', 'joint', 'justify', 'list', 'means', 'motivation', 'nonvolitional-cause', 'nonvolitional-cause-e', 'nonvolitional-result', 'nonvolitional-result-e', 'otherwise', 'parenthetical', 'purpose', 'restatement', 'sequence', 'solutionhood', 'volitional-cause', 'volitional-result']
==eng.sdrt.stac==
['Acknowledgement', 'Alternation', 'Background', 'Clarification_question', 'Comment', 'Conditional', 'Continuation', 'Contrast', 'Correction', 'Elaboration', 'Explanation', 'Narration', 'Parallel', 'Q_Elab', 'Question_answer_pair', 'Result']
==eus.rst.ert==
['antithesis', 'background', 'cause', 'circumstance', 'concession', 'condition', 'conjunction', 'contrast', 'disjunction', 'elaboration', 'enablement', 'evaluation', 'evidence', 'interpretation', 'joint', 'justify', 'list', 'means', 'motivation', 'otherwise', 'preparation', 'purpose', 'restatement', 'result', 'sequence', 'solutionhood', 'summary', 'unconditional', 'unless']
==zho.dep.scidtb==
['ROOT', 'attribution', 'bg-compare', 'bg-general', 'bg-goal', 'cause', 'comparison', 'condition', 'contrast', 'elab-addition', 'elab-aspect', 'elab-enumember', 'elab-process_step', 'enablement', 'evaluation', 'exp-evidence', 'exp-reason', 'joint', 'manner-means', 'progression', 'result', 'summary', 'temporal']
==eng.pdtb.pdtb==
['Comparison.Concession', 'Comparison.Concession+SpeechAct', 'Comparison.Contrast', 'Comparison.Similarity', 'Contingency.Cause', 'Contingency.Cause+Belief', 'Contingency.Cause+SpeechAct', 'Contingency.Condition', 'Contingency.Condition+SpeechAct', 'Contingency.Negative-cause', 'Contingency.Negative-condition', 'Contingency.Purpose', 'Expansion.Conjunction', 'Expansion.Disjunction', 'Expansion.Equivalence', 'Expansion.Exception', 'Expansion.Instantiation', 'Expansion.Level-of-detail', 'Expansion.Manner', 'Expansion.Substitution', 'Hypophora', 'Temporal.Asynchronous', 'Temporal.Synchronous']
topic eng.pdtb.pdtb
==deu.rst.pcc==
['antithesis', 'background', 'cause', 'circumstance', 'concession', 'condition', 'conjunction', 'contrast', 'disjunction', 'e-elaboration', 'elaboration', 'evaluation-n', 'evaluation-s', 'evidence', 'interpretation', 'joint', 'list', 'means', 'preparation', 'purpose', 'reason', 'restatement', 'result', 'sequence', 'solutionhood', 'summary']
==eng.rst.rstdt==
['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'summary', 'temporal', 'textual-organization', 'topic-change', 'topic-comment']
acknowledgement eng.rst.rstdt
==zho.rst.sctb==
['antithesis', 'attribution', 'background', 'cause', 'circumstance', 'concession', 'condition', 'conjunction', 'contrast', 'disjunction', 'elaboration', 'enablement', 'evaluation', 'evidence', 'interpretation', 'justify', 'list', 'means', 'motivation', 'preparation', 'purpose', 'restatement', 'result', 'sequence', 'solutionhood', 'summary']
==nld.rst.nldt==
['antithesis', 'background', 'circumstance', 'concession', 'condition', 'conjunction', 'contrast', 'disjunction', 'elaboration', 'enablement', 'evaluation', 'evidence', 'interpretation', 'joint', 'justify', 'list', 'means', 'motivation', 'nonvolitional-cause', 'nonvolitional-result', 'otherwise', 'preparation', 'purpose', 'restatement', 'restatement-mn', 'sequence', 'solutionhood', 'summary', 'unconditional', 'unless', 'volitional-cause', 'volitional-result']
==tur.pdtb.tdb==
['Comparison.Concession', 'Comparison.Concession+SpeechAct', 'Comparison.Contrast', 'Comparison.Degree', 'Comparison.Similarity', 'Contingency.Cause', 'Contingency.Cause+Belief', 'Contingency.Cause+SpeechAct', 'Contingency.Condition', 'Contingency.Negative-condition', 'Contingency.Purpose', 'Expansion.Conjunction', 'Expansion.Correction', 'Expansion.Disjunction', 'Expansion.Equivalence', 'Expansion.Exception', 'Expansion.Instantiation', 'Expansion.Level-of-detail', 'Expansion.Manner', 'Expansion.Substitution', 'Hypophora', 'Temporal.Asynchronous', 'Temporal.Synchronous']
==spa.rst.rststb==
['alternative', 'antithesis', 'background', 'cause', 'circumstance', 'concession', 'condition', 'conjunction', 'contrast', 'disjunction', 'elaboration', 'enablement', 'evaluation', 'evidence', 'interpretation', 'joint', 'justify', 'list', 'means', 'motivation', 'preparation', 'purpose', 'restatement', 'result', 'sequence', 'solutionhood', 'summary', 'unless']
==por.pdtb.tedm==
['Comparison.Concession', 'Comparison.Contrast', 'Comparison.Similarity', 'Contingency.Cause', 'Contingency.Cause+Belief', 'Contingency.Condition', 'Contingency.Condition+SpeechAct', 'Contingency.Purpose', 'Expansion.Conjunction', 'Expansion.Disjunction', 'Expansion.Equivalence', 'Expansion.Instantiation', 'Expansion.Level-of-detail', 'Expansion.Manner', 'Expansion.Substitution', 'Hypophora', 'Temporal.Asynchronous', 'Temporal.Synchronous']
==ita.pdtb.luna==
['', 'Comparison', 'Comparison.Concession', 'Comparison.Contrast', 'Contingency.Cause', 'Contingency.Condition', 'Contingency.Goal', 'Expansion.Alternative', 'Expansion.Conjunction', 'Expansion.Instantiation', 'Expansion.Restatement', 'Interrupted', 'Repetition', 'Temporal.Asynchronous', 'Temporal.Synchrony']
parallel ita.pdtb.luna
==fas.rst.prstc==
['attribution', 'background', 'cause', 'comparison', 'condition', 'contrast', 'elaboration', 'enablement', 'evaluation', 'explanation', 'joint', 'manner-means', 'summary', 'temporal', 'topic-change', 'topic-comment', 'topic-drift']
==por.pdtb.crpc==
['Comparison', 'Comparison.Concession', 'Comparison.Contrast', 'Comparison.Similarity', 'Contingency.Cause', 'Contingency.Condition', 'Contingency.Negative', 'Contingency.Purpose', 'Expansion.Conjunction', 'Expansion.Disjunction', 'Expansion.Equivalence', 'Expansion.Exception', 'Expansion.Instantiation', 'Expansion.Level', 'Expansion.Manner', 'Expansion.Substitution', 'Hypophora', 'QAP', 'QAP.Hypophora', 'Temporal', 'Temporal.Asynchronous', 'Temporal.Synchronous']
==zho.pdtb.cdtb==
['Alternative', 'Causation', 'Conditional', 'Conjunction', 'Contrast', 'Expansion', 'Progression', 'Purpose', 'Temporal']
preparation zho.pdtb.cdtb
==eng.pdtb.tedm==
['Comparison.Concession', 'Comparison.Contrast', 'Comparison.Similarity', 'Contingency.Cause', 'Contingency.Cause+Belief', 'Contingency.Cause+SpeechAct', 'Contingency.Condition', 'Contingency.Purpose', 'Expansion.Conjunction', 'Expansion.Disjunction', 'Expansion.Equivalence', 'Expansion.Instantiation', 'Expansion.Level-of-detail', 'Expansion.Manner', 'Expansion.Substitution', 'Hypophora', 'Temporal.Asynchronous', 'Temporal.Synchronous']
parallel eng.pdtb.tedm
==eng.rst.gum==
['adversative', 'attribution', 'causal', 'context', 'contingency', 'elaboration', 'evaluation', 'explanation', 'joint', 'mode', 'organization', 'purpose', 'restatement', 'topic']
attribution-positive eng.rst.gum
==eng.dep.covdtb==
['ATTRIBUTION', 'BACKGROUND', 'CAUSE-RESULT', 'COMPARISON', 'CONDITION', 'ELABORATION', 'ENABLEMENT', 'FINDINGS', 'JOINT', 'MANNER-MEANS', 'TEMPORAL', 'TEXTUAL-ORGANIZATION']
interpretation eng.dep.covdtb
==tur.pdtb.tedm==
['Comparison.Concession', 'Comparison.Concession+SpeechAct', 'Comparison.Contrast', 'Comparison.Similarity', 'Contingency.Cause', 'Contingency.Cause+Belief', 'Contingency.Cause+SpeechAct', 'Contingency.Condition', 'Contingency.Negative-condition', 'Contingency.Purpose', 'Expansion', 'Expansion.Conjunction', 'Expansion.Disjunction', 'Expansion.Equivalence', 'Expansion.Exception', 'Expansion.Instantiation', 'Expansion.Level-of-detail', 'Expansion.Manner', 'Expansion.Substitution', 'Hypophora', 'Temporal.Asynchronous', 'Temporal.Synchronous']
==eng.dep.scidtb==
['attribution', 'bg-compare', 'bg-general', 'bg-goal', 'cause', 'comparison', 'condition', 'contrast', 'elab-addition', 'elab-aspect', 'elab-definition', 'elab-enumember', 'elab-example', 'elab-process_step', 'enablement', 'evaluation', 'exp-evidence', 'exp-reason', 'joint', 'manner-means', 'progression', 'result', 'summary', 'temporal']
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
%% Cell type:code id: tags:
``` python
```
#!/usr/bin/env python
# coding: utf-8
import torch
import numpy as np
from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, AutoAdapterModel, AutoModelWithHeads, AutoConfig, TrainingArguments, Trainer, EvalPrediction, set_seed
from torch import nn
from torch.optim import AdamW
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torch.autograd import Variable
from tqdm import tqdm
import os
from time import sleep
from datetime import datetime
import sys
from sklearn.metrics import classification_report, accuracy_score
from utils import open_file
import pandas as pd
import datasets
from configure import parse_args
from utils import *
args = parse_args()
now = datetime.now()
dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
adapter_name = args.adapter_name
mappings, inv_mappings = open_mappings(args.mappings_file)
substitutions_file = 'mappings/substitutions.txt'
tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)
# we are saving the test results of specific epochs
# specific_results = open_specific_results('mappings/specific_results.txt')
# if '1-2-3' in adapter_name or 'layer1;layer2;layer3' in adapter_name:
# specific_results = list(specific_results['A1_3'][args.num_epochs])
# else:
# specific_results = list(specific_results['A1'][args.num_epochs])
set_seed(42)
print('Train classifier with adapter\n')
print('Adapter name:', adapter_name)
print('Model:', args.transformer_model)
print('Effective batch size:', args.batch_size * args.gradient_accumulation_steps)
print('Num epochs:', args.num_epochs)
# Open mappings
mappings, inv_mappings = open_mappings(args.mappings_file)
# Open sentences
train_sentences, dev_dict_sentences, test_dict_sentences = open_sentences(args.data_path, mappings)
# make pandas dataframes
file_header = ['text', 'labels']
train_df = pd.DataFrame([[' '.join(x[-2]), x[-1]] for x in train_sentences],
columns = file_header)
train_df = train_df.sample(frac = 1) # shuffle the train
dev_dict_df = {corpus : pd.DataFrame([[' '.join(x[-2]), x[-1]]
for x in sents],
columns = file_header)
for corpus, sents in dev_dict_sentences.items()}
test_dict_df = {corpus : pd.DataFrame([[' '.join(x[-2]), x[-1]]
for x in sents],
columns = file_header)
for corpus, sents in test_dict_sentences.items()}
# Make datasets from dataframes
train_dataset = datasets.Dataset.from_pandas(train_df)
dev_dict_dataset = {corpus:datasets.Dataset.from_pandas(dev_df)
for corpus, dev_df in dev_dict_df.items()}
test_dict_dataset = {corpus:datasets.Dataset.from_pandas(dev_df)
for corpus, dev_df in test_dict_df.items()}
# get number of labels
num_labels = train_df['labels'].nunique() + 1
# Encode the data
train_dataset = train_dataset.map(encode_batch, batched=True)
train_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
encoded_dev_dataset = {}
for corpus in dev_dict_dataset:
temp = dev_dict_dataset[corpus].map(encode_batch, batched=True)
temp.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
encoded_dev_dataset[corpus] = temp
encoded_test_dataset = {}
for corpus in test_dict_dataset:
temp = test_dict_dataset[corpus].map(encode_batch, batched=True)
temp.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
encoded_test_dataset[corpus] = temp
# ===============================
# Training params
# ===============================
model = AutoAdapterModel.from_pretrained(args.transformer_model)
active_adapter = model.load_adapter(adapter_name,
config = adapter_name + "/adapter_config.json")
model.set_active_adapters(active_adapter)
training_args = TrainingArguments(
learning_rate = 2e-5, #1e-4,
num_train_epochs = args.num_epochs,
per_device_train_batch_size = args.batch_size,
per_device_eval_batch_size = args.batch_size,
gradient_accumulation_steps = args.gradient_accumulation_steps,
logging_steps = len(train_sentences) // (args.batch_size * args.gradient_accumulation_steps),  # log roughly once per epoch
output_dir = "./training_output",
overwrite_output_dir =True,
remove_unused_columns=False,
)
trainer = Trainer(
model = model,
args = training_args,
train_dataset = train_dataset
)
# Freeze layers in the classifier if desired
if args.freeze_layers != '':
layers_to_freeze = args.freeze_layers.split(';')
for name, param in model.named_parameters():
if any(x in name for x in layers_to_freeze):
param.requires_grad = False
# ===============================
# Start the training 🚀
# ===============================
print('Start training...')
trainer.train()
# Dev results
print('\nDev results:')
for corpus in encoded_dev_dataset:
print()
dev_results = get_predictions_huggingface(trainer, corpus,
encoded_dev_dataset[corpus])
path_results = 'results/dev/' + adapter_name + '_' + str(args.num_epochs)
if not os.path.exists(path_results):
os.makedirs(path_results)
print_results_to_file(corpus,
dev_dict_sentences[corpus],
dev_results,
inv_mappings,
substitutions_file,
path_results)
# Test results
print('\nTest results:')
for corpus in encoded_test_dataset:
print()
test_results = get_predictions_huggingface(trainer,
corpus,
encoded_test_dataset[corpus])
path_results = 'results/test/' + adapter_name + '_' + str(args.num_epochs)
if not os.path.exists(path_results):
os.makedirs(path_results)
print_results_to_file(corpus,
test_dict_sentences[corpus],
test_results,
inv_mappings,
substitutions_file,
path_results)
# for corpus in test_dict_dataloader:
# test_results = get_predictions(model,
# corpus,
# test_dict_dataloader[corpus])
# path_results = 'results/test/pytorch' + str(epoch_num+1)
# if not os.path.exists(path_results):
# os.makedirs(path_results)
# print_results_to_file(corpus,
# test_dict_sentences[corpus],
# test_results,
# inv_mappings, substitutions_file,
# path_results)
# Save specific test results
# print('\nTest results:')
# for corpus in encoded_test_dataset:
# print()
# test_results = get_predictions_huggingface(trainer, corpus,
# encoded_test_dataset[corpus])
#
# print_results_to_file(corpus, test_dict_sentences[corpus], test_results,
# inv_mappings, substitutions_file)
\ No newline at end of file
@@ -20,7 +20,7 @@ set_seed(42)
batch_size = args.batch_size
# Set name for adapter
adapter_name = 'adapter_' + str(args.num_epochs) + '-epochs_frozen' + args.freeze_layers.replace('layer.', '-').replace(';', '')
adapter_name = 'A_' + str(args.num_epochs) + '-epochs_frozen' + args.freeze_layers.replace('layer.', '-').replace(';', '')
print('Create classifier adapter\n')
print('Name:', adapter_name)
......
@@ -166,7 +166,8 @@ def train(model,
total_loss_train = 0
batch_counter = 0
for train_input, train_label in tqdm(train_dataloader):
# for train_input, train_label in tqdm(train_dataloader):
for train_input, train_label in train_dataloader:
batch_counter += 1
train_label = train_label.to(device)
mask = train_input['attention_mask'].to(device)
@@ -201,27 +202,60 @@ scheduler.step()
scheduler.step()
# ------ Validation --------
print('\nValidation for epoch:', epoch_num + 1)
# Dev results for each corpus. We don't need to save the results.
# Dev and test results for each corpus. We don't need to save the results.
for corpus in dev_dict_dataloader:
_ = get_predictions(model,
dev_results = get_predictions(model,
corpus,
dev_dict_dataloader[corpus])
# we want the results of specific epochs for specific corpora.
# we define the epochs and the corpora and we save only these results.
path_results = 'results/dev/pytorch-' + str(epoch_num+1)
if not os.path.exists(path_results):
os.makedirs(path_results)
print_results_to_file(corpus,
dev_dict_sentences[corpus],
dev_results,
inv_mappings, substitutions_file,
path_results)
# ------ Test --------
print('\nTest results for epoch:', epoch_num + 1)
for corpus in test_dict_dataloader:
test_results = get_predictions(model,
corpus,
test_dict_dataloader[corpus])
path_results = 'results/test/pytorch' + str(epoch_num+1)
if not os.path.exists(path_results):
os.makedirs(path_results)
print_results_to_file(corpus,
test_dict_sentences[corpus],
test_results,
inv_mappings, substitutions_file,
path_results)
# # we want the results of specific epochs for specific corpora.
# # we define the epochs and the corpora and we save only these results.
# if epoch_num+1 in specific_results:
# for corpus in specific_results[epoch_num+1]:
# test_results = get_predictions(model,
# corpus,
# test_dict_dataloader[corpus],
# print_results=False)
# ========= New Code! =============
# Save for each epoch the dev and test results
if epoch_num+1 in specific_results:
for corpus in specific_results[epoch_num+1]:
test_results = get_predictions(model,
corpus,
test_dict_dataloader[corpus],
print_results=False)
print_results_to_file(corpus,
test_dict_sentences[corpus],
test_results,
inv_mappings, substitutions_file)
# ------- Start the training -------
@@ -241,12 +275,12 @@ print('\nTraining Done!')
# ------- Testing ---------
print('Testing...')
for corpus in test_dict_dataloader:
test_results = get_predictions(model,
corpus,
test_dict_dataloader[corpus]
)
# print('Testing...')
# for corpus in test_dict_dataloader:
# test_results = get_predictions(model,
# corpus,
# test_dict_dataloader[corpus]
# )
# print_results_to_file(corpus,
# test_dict_sentences[corpus],
# test_results,
......
adapter-transformers==3.0.1
aiohttp==3.8.4
aiosignal==1.3.1
async-timeout==4.0.2
attrs==23.1.0
certifi==2023.5.7
charset-normalizer==3.1.0
click==8.1.3
charset-normalizer
cmake==3.26.3
datasets==2.4.0
dill==0.3.5.1
filelock==3.12.0
frozenlist==1.3.3
fsspec==2023.5.0
fsspec
huggingface-hub==0.14.1
idna==3.4
Jinja2==3.1.2
@@ -22,18 +14,6 @@ mpmath==1.3.0
multidict==6.0.4
multiprocess==0.70.13
networkx==3.1
numpy==1.24.3
nvidia-cublas-cu11==11.10.3.66
nvidia-cuda-cupti-cu11==11.7.101
nvidia-cuda-nvrtc-cu11==11.7.99
nvidia-cuda-runtime-cu11==11.7.99
nvidia-cudnn-cu11==8.5.0.96
nvidia-cufft-cu11==10.9.0.58
nvidia-curand-cu11==10.2.10.91
nvidia-cusolver-cu11==11.4.0.1
nvidia-cusparse-cu11==11.7.4.91
nvidia-nccl-cu11==2.14.3
nvidia-nvtx-cu11==11.7.91
packaging==23.1
pandas==2.0.1
Pillow==9.5.0
@@ -53,7 +33,7 @@ threadpoolctl==3.1.0
tokenizers==0.12.1
torch==2.0.1
torchaudio==2.0.2
torchvision==0.15.2
torchvision
tqdm==4.65.0
transformers==4.18.0
triton==2.0.0
......
#!/usr/bin/env bash
#SBATCH --job-name=finals
#SBATCH --ntasks=1
#SBATCH --cpus-per-task=4
#SBATCH --partition=GPUNodes
#SBATCH --gres=gpu:1
# tests tests
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 pytorch_classifier.py --batch_size 8 --num_epochs 6 --data_path '/users/melodi/emetheni/clean_data'
# Train the adapter:
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 make_adapter.py --batch_size 8 --num_epochs 15 --data_path '/users/melodi/emetheni/sharedtask2023/data' --freeze_layers 'layer.1;layer.2;layer.3'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 make_adapter.py --batch_size 8 --num_epochs 15 --data_path '/users/melodi/emetheni/sharedtask2023/data' --freeze_layers 'layer.1'
# Run classifier with adapter for corpora:
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 1 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1-2-3'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 2 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1-2-3'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 3 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1-2-3'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 4 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1-2-3'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 5 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1-2-3'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 6 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1-2-3'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 1 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 2 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1'
srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 3 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 4 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 5 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1'
# srun singularity exec /logiciels/containerCollections/CUDA10/pytorch.sif python3 adapter_classifier.py --batch_size 8 --num_epochs 6 --data_path '/users/melodi/emetheni/clean_data' --adapter_name 'A_15-epochs_frozen-1'
%% Cell type:code id: tags:
``` python
from sklearn.metrics import accuracy_score
import os, io
from collections import OrderedDict, Counter
```
%% Cell type:code id: tags:
``` python
connectives = {"elaboration": ["and", "also", "besides", "further", "furthermore", "too", "moreover", "in addition", "then", "of equal importance", "equally important", "another", "additionally", "also", "moreover", "furthermore", "again", "further", "then", "besides", "too", "similarly", "correspondingly", "indeed", "regarding"],
"time": ["next", "afterward", "finally", "later", "last", "lastly", "at last", "now", "subsequently", "then", "when", "soon", "thereafter", "after a short time", "the next week", "a minute later", "in the meantime", "meanwhile", "on the following day", "at length", "ultimately", "presently"],
"sequence": ["first", "second", "third", "finally", "hence", "next", "then", "from here on", "to begin with", "last of all", "after", "before", "as soon as", "in the end", "gradually", "when", "after", "after that", "afterwards", "next", "subsequently", "later (on)", "followed by", "to go on to", "finally", "another", "additionally", "finally moreover", "also", "subsequently", "eventually", "next", "then"],
"example": ["for example", "to illustrate", "for instance", "to be specific", "such as", "moreover", "furthermore", "just as important", "similarly", "in the same way", "for example", "for instance", "namely", "such as", "as follows", "as exemplified by", "such as", "including", "especially", "particularly", "in particular", "notably", "mainly"],
"result": ["as a result", "hence", "so", "accordingly", "as a consequence", "consequently", "thus", "since", "therefore", "for this reason", "because of this", "therefore", "accordingly", "as a result of", "the result is/results are", "the consequence is", "resulting from", "consequently", "it can be seen", "evidence illustrates that", "because of this", "thus", "hence", "for this reason", "owing to x", "this suggests that", "it follows that", "otherwise", "in that case", "that implies", "As a result", "therefore", "thus"],
"purpose": ["for this purpose", "with this in mind", "for this reason"],
"comparison": ["like", "in the same manner", "as so", "similarly"],
"contrast": ["but", "in contrast", "conversely", "however", "still", "nevertheless", "nonetheless", "yet", "and yet", "on the other hand", "on the contrary", "or", "in spite of this", "actually", "in fact", "whereas", "conversely", "in comparison", "by contrast", "in contrast", "contrasting", "alternatively", "although", "otherwise", "instead"],
"summary": ["in summary", "to sum up", "to repeat", "briefly", "in short", "finally", "on the whole", "therefore", "as I have said", "in conclusion", "as seen", "in conclusion", "therefore", "to conclude", "on the whole", "hence", "thus to summarise", "altogether", "overall"],
"rephrasing": ["in other terms", "rather", "or", "better", "in view of this", "in contrast"]}
```
%% Cell type:code id: tags:
``` python
def parse_data(infile, string_input=False) -> list:
"""
Read a gold or a pred .rels file and extract the sentence pairs and the label column for accuracy calculation.
:param infile: shared task .rels file (or its contents, if string_input is True)
:param string_input: if True, infile is a string holding the file contents (for import inside other scripts)
:return: a list of (unit1_txt, unit2_txt, direction) tuples and a list of lowercased labels
"""
if not string_input:
data = io.open(infile, encoding="utf-8").read().strip().replace("\r", "")
else:
data = infile.strip()
labels = [line.split("\t")[-1].lower()
for i, line in enumerate(data.split("\n")) if "\t" in line and i>0]
sentences = [(line.split("\t")[3], line.split("\t")[4], line.split("\t")[-3])
for i, line in enumerate(data.split("\n")) if "\t" in line and i>0]
return sentences, labels
def get_accuracy_score(gold_file, pred_file, string_input=False) -> dict:
_, gold_labels = parse_data(gold_file, string_input)
_, pred_labels = parse_data(pred_file, string_input)
filename = gold_file.split(os.sep)[-1]
assert len(gold_labels) == len(pred_labels), "FATAL: different number of labels detected in gold and pred"
acc = accuracy_score(gold_labels, pred_labels)
score_dict = {"filename": filename,
"acc_score": round(acc, 4),
"gold_rel_count": len(gold_labels),
"pred_rel_count": len(pred_labels)}
return score_dict
def separate_right_wrong(gold_file, pred_file, string_input=False):
rights = []
wrongs = []
gold_sents, gold_labels = parse_data(gold_file, string_input)
pred_sents, pred_labels = parse_data(pred_file, string_input)
for n in range(len(gold_sents)):
if gold_labels[n] == pred_labels[n]:
rights.append([gold_sents[n], gold_labels[n], pred_labels[n]])
else:
wrongs.append([gold_sents[n], gold_labels[n], pred_labels[n]])
return rights, wrongs
```
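%% Cell type:markdown id: tags:
A quick sanity check for `parse_data` (the same label lists feed `get_accuracy_score`, which additionally expects real file paths for its `filename` field). The two inline "files" below are hypothetical `.rels` contents passed with `string_input=True`; only the tab-separated layout and the final label column matter.
%% Cell type:code id: tags:
``` python
# Hypothetical gold/pred .rels contents: a header row plus two relation rows.
gold = ('doc\tunit1_toks\tunit2_toks\tunit1_txt\tunit2_txt\tdir\tlabel\n'
        'd1\t1-2\t3-4\tHello\tworld\t1<2\telaboration\n'
        'd1\t5-6\t7-8\tFoo\tbar\t1>2\tcontrast')
pred = gold.replace('contrast', 'joint')  # one of the two labels is wrong
_, gold_labels = parse_data(gold, string_input=True)
_, pred_labels = parse_data(pred, string_input=True)
print(gold_labels, pred_labels)  # ['elaboration', 'contrast'] ['elaboration', 'joint']
print(accuracy_score(gold_labels, pred_labels))  # 0.5
```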
%% Cell type:code id: tags:
``` python
# Print accuracies
model = 'A_15-epochs_frozen-1_2'
corpus = 'eng.dep.covdtb'
gold_path = '/users/melodi/emetheni/clean_data/'
results_path = 'results/test/' + model + '/'
corpora = sorted([x[:-4] for x in os.listdir('results/test/' + model)
                  if 'DS' not in x and 'ipy' not in x])
# for corpus in corpora:
# score = get_accuracy_score(gold_path + corpus + '/' + corpus + '_test.rels',
# results_path + corpus + '.tsv')
# print(corpus, '\t', score['acc_score'])
```
%% Cell type:code id: tags:
``` python
# Separate
# model = 'A_15-epochs_frozen-1_2'
# corpus = 'eng.dep.covdtb'
model = 'A_15-epochs_frozen-1-2-3_3'
corpus = 'eng.rst.gum'
gold_path = '/users/melodi/emetheni/clean_data/'
results_path = 'results/test/' + model + '/'
corpora = sorted([x[:-4] for x in os.listdir('results/test/' + model)
                  if 'DS' not in x and 'ipy' not in x])
rights, wrongs = separate_right_wrong(gold_path + corpus + '/' + corpus + '_test.rels',
results_path + corpus + '.tsv')
rights_count = dict(OrderedDict(Counter([x[-1] for x in rights])))
wrongs_count = dict(OrderedDict(Counter([x[-1] for x in wrongs])))
# for label in sorted(set(list(rights_count.keys()) + list(wrongs_count.keys())), reverse=False):
# if label in rights_count:
# r = rights_count[label]
# else:
# r = 0
# if label in wrongs_count:
# w = wrongs_count[label]
# else:
# w = 0
# print(label, '\t', r, '\t', w)
```
%% Cell type:code id: tags:
``` python
# Presence of connectives in right/wrong sents
counter = 0
for sent in rights:
sentence = (sent[0][0] + ' ' + sent[0][1]).lower()
if sent[1] in connectives:
if any(x in sentence for x in connectives[sent[1]]):
# print(sent)
counter += 1
print('rights', counter, '/', len(rights), counter/len(rights))
counter = 0
for sent in wrongs:
sentence = (sent[0][0] + ' ' + sent[0][1]).lower()
if sent[1] in connectives:
if any(x in sentence for x in connectives[sent[1]]):
# print(sent)
counter += 1
print('wrongs', counter, '/', len(wrongs), counter/len(wrongs))
```
%% Output
rights 203 / 1657 0.12251056125528063
wrongs 71 / 918 0.07734204793028322
%% Cell type:code id: tags:
``` python
# See direction
counter = 0
for sent in rights:
if sent[0][2] == '1<2':
counter += 1
print('rights', counter, '/', len(rights), counter/len(rights))
counter = 0
for sent in wrongs:
if sent[0][2] == '1<2':
counter += 1
print('wrongs', counter, '/', len(wrongs), counter/len(wrongs))
```
%% Output
rights 1253 / 1657 0.756185878092939
wrongs 735 / 918 0.8006535947712419
%% Cell type:code id: tags:
``` python
rights[:10]
```
%% Output
[[('The prevalence of discrimination across racial groups in contemporary America :',
'The current study seeks to build on this research',
'1>2'),
'organization',
'organization'],
[('The prevalence of discrimination across racial groups in contemporary America :',
'Results from a nationally representative sample of adults',
'1<2'),
'elaboration',
'elaboration'],
[('Introduction .',
'The current study seeks to build on this research',
'1>2'),
'organization',
'organization'],
[('Personal experiences of discrimination and bias have been the focus of much social science research .',
'In many respects , researchers already possess a wealth of knowledge',
'1>2'),
'context',
'context'],
[('Personal experiences of discrimination and bias have been the focus of much social science research .',
'[ 1 - 3 ]',
'1<2'),
'explanation',
'explanation'],
[('Sociologists have explored the adverse consequences of discrimination',
'[ 3 – 5 ] ;',
'1<2'),
'explanation',
'explanation'],
[('Sociologists have explored the adverse consequences of discrimination',
'psychologists have examined the mental processes',
'1<2'),
'joint',
'joint'],
[('psychologists have examined the mental processes',
'that underpin conscious and unconscious biases',
'1<2'),
'elaboration',
'elaboration'],
[('psychologists have examined the mental processes', '[ 6 ] ;', '1<2'),
'explanation',
'explanation'],
[('Sociologists have explored the adverse consequences of discrimination',
'neuroscientists have examined the neurobiological underpinnings of discrimination',
'1<2'),
'joint',
'joint']]
%% Cell type:code id: tags:
``` python
subs = {"Attribution": ["attribution", "attribution-negative"],
"Background": ["background", "circumstance"],
"Cause": ["cause", "result", "consequence"],
"Comparison": ["comparison", "preference", "analogy", "proportion"],
"Condition": ["condition", "hypothetical", "contingency", "otherwise"],
"Contrast": ["contrast", "concession", "antithesis"],
"Elaboration": ["elaboration-additional", "elaboration-general-specific", "elaboration-part-whole", "elaboration-process-step", "elaboration-object-attribute", "elaboration-set-member", "example", "definition"],
"Enablement": ["purpose", "enablement"],
"Evaluation": ["evaluation", "interpretation", "conclusion", "comment"],
"Explanation": ["evidence", "explanation-argumentative", "reason"],
"Joint": ["list", "disjunction"],
"Manner-Means": ["manner", "means"],
"Topic-Comment": ["problem-solution", "question-answer", "statement-response", "topic-comment", "comment-topic", "rhetorical-question"],
"Summary": ["summary", "restatement"],
"Temporal": ["temporal-before", "temporal-after", "temporal-same-time", "sequence", "inverted-sequence"],
"Topic Change": ["topic-shift", "topic-drift"]}
```
%% Cell type:code id: tags:
``` python
rst = ["bg-general", "elab-addition", "manner-means", "attribution", "evaluation", "enablement", "elab-aspect", "joint", "temporal", "result", "bg-goal", "progression", "contrast", "elab-process_step", "elab-enumember", "comparison", "cause", "exp-reason", "exp-evidence", "condition", "summary", "bg-compare", "elab-example", "elab-definition", "cause-result", "findings"]
```
%% Cell type:code id: tags:
``` python
for label in rst:
temp = ''
for k, v in subs.items():
if label in v:
temp = k.lower()
elif '-' in label:
for l in label.split('-'):
if l in v:
temp = k.lower()
elif '.' in label:
for l in label.split('.'):
if l in v:
temp = k.lower()
print(label, '\t', temp)
```
%% Output
bg-general
elab-addition
manner-means manner-means
attribution attribution
evaluation evaluation
enablement enablement
elab-aspect
joint
temporal
result cause
bg-goal
progression
contrast contrast
elab-process_step
elab-enumember
comparison comparison
cause cause
exp-reason explanation
exp-evidence explanation
condition condition
summary summary
bg-compare
elab-example elaboration
elab-definition elaboration
cause-result cause
findings
%% Cell type:code id: tags:
``` python
```
@@ -213,13 +213,14 @@ def print_results_to_file(corpus,
test_sentences,
test_results,
inv_mappings_dict,
substitutions_file):
substitutions_file,
output_folder):
''' Function to print a new file with the test predictions per
the specifications of the Shared task.
Returns: one file per corpus with predictions.
'''
output_folder = 'results'
# output_folder = 'results'
header = '\t'.join(['doc',
'unit1_toks',
'unit2_toks',
@@ -260,6 +261,8 @@ def print_results_to_file(corpus,
pass
temp = sent[:11] + [label]
results_to_write.append(temp)
assert len(results_to_write) == len(test_sentences)
with open(output_folder + '/' + corpus + '.tsv', 'a+', encoding='utf-8') as f:
f.write(header + '\n')
......
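For reference, a minimal sketch of a call with the updated signature (output_folder is the new argument); the corpus name and paths below are placeholders, mirroring how the training scripts above call it:

``` python
print_results_to_file('eng.rst.gum',                          # corpus
                      test_dict_sentences['eng.rst.gum'],     # gold sentences for that corpus
                      test_results,                           # predicted labels
                      inv_mappings,                           # id -> label string mapping
                      'mappings/substitutions.txt',           # substitutions file
                      'results/test/A_15-epochs_frozen-1_3')  # output_folder (created by the caller)
```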