Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
DiscReT-Zero-Shot
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
MELODI
AnDiAMO
DiscReT-Zero-Shot
Commits
23d2149f
Commit
23d2149f
authored
1 year ago
by
emetheni
Browse files
Options
Downloads
Patches
Plain Diff
update before server shutdown
parent
b14b8d54
No related branches found
No related tags found
No related merge requests found
Changes
4
Hide whitespace changes
Inline
Side-by-side
Showing
4 changed files
classifier_bare_huggingface.py
+16
-7
16 additions, 7 deletions
classifier_bare_huggingface.py
classifier_bare_pytorch.py
+2
-2
2 additions, 2 deletions
classifier_bare_pytorch.py
make_adapter.py
+1
-1
1 addition, 1 deletion
make_adapter.py
utils.py
+53
-39
53 additions, 39 deletions
utils.py
with
72 additions
and
49 deletions
classifier_bare_huggingface.py
+
16
−
7
View file @
23d2149f
...
...
@@ -16,7 +16,6 @@ from utils import *
device
=
torch
.
device
(
"
cuda
"
)
# print('\n\nwith Language token - eng + Corpus (no framework) \n')
# ---------------------------------------------------------------------------------------------------
args
=
parse_args
()
...
...
@@ -41,6 +40,11 @@ mappings, inv_mappings = open_mappings(args.mappings_file)
# Open sentences
train_sentences
,
dev_dict_sentences
,
test_dict_sentences
,
framework_labels
=
open_sentences_with_lang
(
args
.
data_path
,
mappings
)
print
(
'
\n
Check encodings:
\n
'
)
print
(
train_sentences
[
0
])
# make pandas dataframes
file_header
=
[
'
text
'
,
'
labels
'
]
...
...
@@ -152,18 +156,23 @@ trainer.train()
print
(
'
\n
Dev results:
'
)
for
corpus
in
encoded_dev_dataset
:
print
()
dev_results_
=
get_predictions_huggingface
(
trainer
,
corpus
,
encoded_dev_dataset
[
corpus
]
)
dev_results
=
better_predictions_huggingface
(
trainer
,
corpus
,
encoded_dev_dataset
[
corpus
],
framework_labels
[
corpus
.
split
(
'
.
'
)[
1
]]
)
print
(
dev_results
)
path_results
=
'
results/dev/
'
+
args
.
transformer_model
+
'
_
'
+
str
(
args
.
num_epochs
)
if
not
os
.
path
.
exists
(
path_results
):
os
.
makedirs
(
path_results
)
# path_results = 'results/dev/' + args.transformer_model + '_' + str(args.num_epochs)
# if not os.path.exists(path_results):
# os.makedirs(path_results)
# print_results_to_file(corpus,
# dev_dict_sentences[corpus],
...
...
This diff is collapsed.
Click to expand it.
classifier_bare_pytorch.py
+
2
−
2
View file @
23d2149f
...
...
@@ -172,8 +172,8 @@ def train(model,
total_loss_train
=
0
batch_counter
=
0
#
for train_input, train_label in tqdm(train_dataloader):
for
train_input
,
train_label
in
train_dataloader
:
for
train_input
,
train_label
in
tqdm
(
train_dataloader
):
#
for train_input, train_label in train_dataloader:
batch_counter
+=
1
train_label
=
train_label
.
to
(
device
)
mask
=
train_input
[
'
attention_mask
'
].
to
(
device
)
...
...
This diff is collapsed.
Click to expand it.
make_adapter.py
+
1
−
1
View file @
23d2149f
...
...
@@ -32,7 +32,7 @@ print('Frozen layers:', args.freeze_layers.replace(';', ', '))
mappings
,
inv_mappings
=
open_mappings
(
args
.
mappings_file
)
# Open sentences
train_sentences
,
dev_dict_sentences
,
_
=
open_sentences_with_lang
(
args
.
data_path
,
mappings
)
train_sentences
,
dev_dict_sentences
,
_
,
framework_labels
=
open_sentences_with_lang
(
args
.
data_path
,
mappings
)
print
(
'
\n
Check encodings:
\n
'
)
...
...
This diff is collapsed.
Click to expand it.
utils.py
+
53
−
39
View file @
23d2149f
...
...
@@ -11,6 +11,20 @@ from sklearn.metrics import accuracy_score
args
=
parse_args
()
def switch_dimensions(vector_list):
    '''
    Swap the two dimensions of a list of vectors (i.e. transpose it).

    Element ``n`` of every inner vector is gathered into the ``n``-th
    output row, so ``result[n][i] == vector_list[i][n]``.

    vector_list: sequence of indexable vectors. The length of the first
        vector decides how many output rows are produced.

    Returns: a list of lists. An empty input yields an empty list.

    Raises: IndexError if a later vector is shorter than the first one.
    '''
    # Guard the empty case explicitly: indexing vector_list[0] below would
    # otherwise raise IndexError. len() is used rather than truthiness so
    # that array-like inputs are handled as well.
    if len(vector_list) == 0:
        return []

    # The first vector fixes the width; longer vectors are truncated to it.
    target_dim_len = len(vector_list[0])
    return [[vector[n] for vector in vector_list]
            for n in range(target_dim_len)]
def
open_mappings
(
mappings_file
):
'''
Open the mappings file into a dictionary.
'''
...
...
@@ -21,19 +35,18 @@ def open_mappings(mappings_file):
for
line
in
f
:
l
=
line
.
strip
().
split
(
'
\t
'
)
mappings
[
l
[
0
]]
=
int
(
l
[
-
1
])
inv_mappings
=
[]
# this cannot be a dictionary! it has to be tuples
# because we have some labels which are replaced, e.g.
# joint-list is replaced
# reject the converted labels
inv_mappings
=
{}
for
k
,
v
in
mappings
.
items
():
inv_mappings
.
append
((
v
,
k
))
if
v
not
in
inv_mappings
:
inv_mappings
[
v
]
=
k
return
mappings
,
inv_mappings
def
encode_label
(
og_label
,
mappings_dict
):
label
=
og_label
.
lower
()
label
=
og_label
.
lower
()
.
strip
()
if
label
in
mappings_dict
:
return
mappings_dict
[
label
]
else
:
...
...
@@ -72,11 +85,11 @@ def open_file(filename, mappings_dict):
# flip them if different direction
if
args
.
normalize_direction
==
'
yes
'
:
if
l
[
9
]
==
'
1>2
'
:
lines
.
append
(
l
+
[
sent_1
+
[
SEP_token
]
+
sent_2
,
encode_label
(
l
[
1
1
],
mappings_dict
)])
lines
.
append
(
l
+
[
sent_1
+
[
SEP_token
]
+
sent_2
,
encode_label
(
l
[
-
1
],
mappings_dict
)])
else
:
lines
.
append
(
l
+
[
sent_2
+
[
SEP_token
]
+
sent_1
,
encode_label
(
l
[
1
1
],
mappings_dict
)])
lines
.
append
(
l
+
[
sent_2
+
[
SEP_token
]
+
sent_1
,
encode_label
(
l
[
-
1
],
mappings_dict
)])
else
:
lines
.
append
(
l
+
[
sent_1
+
[
SEP_token
]
+
sent_2
,
encode_label
(
l
[
1
1
],
mappings_dict
)])
lines
.
append
(
l
+
[
sent_1
+
[
SEP_token
]
+
sent_2
,
encode_label
(
l
[
-
1
],
mappings_dict
)])
return
lines
...
...
@@ -142,20 +155,22 @@ def open_file_with_lang(filename, mappings_dict):
# flip them if different direction
if
args
.
normalize_direction
==
'
yes
'
:
if
l
[
9
]
==
'
1>2
'
:
lines
.
append
(
l
+
[[
lang
,
fullname
]
+
sent_1
+
[
SEP_token
]
+
sent_2
,
encode_label
(
l
[
11
],
mappings_dict
)])
lines
.
append
(
l
+
[[
lang
,
fullname
,
framework
]
+
sent_1
+
[
SEP_token
]
+
sent_2
,
encode_label
(
l
[
11
],
mappings_dict
)])
else
:
lines
.
append
(
l
+
[[
lang
,
fullname
]
+
sent_2
+
[
SEP_token
]
+
sent_1
,
encode_label
(
l
[
11
],
mappings_dict
)])
lines
.
append
(
l
+
[[
lang
,
fullname
,
framework
]
+
sent_2
+
[
SEP_token
]
+
sent_1
,
encode_label
(
l
[
11
],
mappings_dict
)])
else
:
lines
.
append
(
l
+
[[
lang
,
fullname
]
+
sent_1
+
[
SEP_token
]
+
sent_2
,
encode_label
(
l
[
11
],
mappings_dict
)])
lines
.
append
(
l
+
[[
lang
,
fullname
,
framework
]
+
sent_1
+
[
SEP_token
]
+
sent_2
,
encode_label
(
l
[
11
],
mappings_dict
)])
return
lines
def
encode_batch
(
batch
):
"""
Encodes a batch of input data using the model tokenizer.
Works for a pandas DF column, instead of a list.
"""
tokenizer
=
AutoTokenizer
.
from_pretrained
(
'
xlm-roberta-base
'
)
tokenizer
=
AutoTokenizer
.
from_pretrained
(
args
.
transformer_model
)
return
tokenizer
(
batch
[
"
text
"
],
max_length
=
512
,
truncation
=
True
,
...
...
@@ -289,7 +304,7 @@ def open_sentences_with_lang(path_to_corpora, mappings_dict):
for
x
in
os
.
listdir
(
path_to_corpora
+
'
/
'
+
corpus
)
if
'
train
'
in
x
and
'
rels
'
in
x
][
0
]
temp
=
open_file
(
train_file
,
mappings_dict
)
temp
=
open_file
_with_lang
(
train_file
,
mappings_dict
)
train_sentences
+=
temp
all_labels
[
framework
]
+=
[
l
[
-
1
]
for
l
in
temp
]
except
:
# some of them don't have train
...
...
@@ -373,7 +388,7 @@ def get_predictions_huggingface(trainer,
test_set
,
print_results
=
True
):
'''
SPECIFI FUNCTION FOR THE HUGGINGFACE TRAINER.
'''
SPECIFI
C
FUNCTION FOR THE HUGGINGFACE TRAINER.
Function to get the model
'
s predictions for one corpus
'
test set.
Can print accuracy using scikit-learn.
Also works with dev sets -- just don
'
t save the outputs.
...
...
@@ -381,7 +396,6 @@ def get_predictions_huggingface(trainer,
'''
results
=
trainer
.
predict
(
test_set
)
preds
=
np
.
softmax
(
results
.
predictions
,
axis
=
1
)
top_preds
=
np
.
argmax
(
results
.
predictions
,
axis
=
1
)
results
=
results
.
label_ids
test_acc
=
round
(
accuracy_score
(
top_preds
,
results
),
4
)
...
...
@@ -389,7 +403,7 @@ def get_predictions_huggingface(trainer,
if
print_results
:
print
(
corpus
,
'
\t
'
,
test_acc
,
'
\n
'
)
return
preds
return
top_
preds
def
better_predictions_huggingface
(
trainer
,
...
...
@@ -398,33 +412,33 @@ def better_predictions_huggingface(trainer,
corpus_labels
,
print_results
=
True
):
'''
SPECIFI FUNCTION FOR THE HUGGINGFACE TRAINER.
Function to get the model
'
s predictions for one corpus
'
test set.
Can print accuracy using scikit-learn.
Also works with dev sets -- just don
'
t save the outputs.
Returns: list of predictions that match test file
'
s lines.
'''
'''
results
=
trainer
.
predict
(
test_set
)
preds
=
np
.
argmax
(
results
.
predictions
,
axis
=
1
)
o
ri
g_labels
=
results
.
label
_ids
test_acc
=
round
(
accuracy_score
(
top_preds
,
orig_labels
),
4
)
orig_labels
=
results
.
label_ids
.
tolist
(
)
p
ri
nt
(
'
len sentences
'
,
len
(
orig_
label
s
))
print
(
'
shape of preds
'
,
results
.
predictions
.
shape
)
if
print_results
:
print
(
corpus
+
'
\t
'
+
str
(
test_acc
)
+
'
\n
'
,
flush
=
'
True
'
)
results_per_sent
=
results
.
predictions
.
tolist
()
print
(
type
(
results
.
predictions
))
# try to make the better prediction bit
best_labels
=
[]
for
n
,
result
in
enumerate
(
results
.
predictions
.
tolist
()
):
orig_label
=
results
.
label_ids
[
n
]
for
sent
,
sent_
result
s
in
enumerate
(
results
_per_sent
):
best_prob
=
-
1000
best_label
=
-
1
if
orig_label
in
corpus_labels
:
if
result
>
best_prob
:
best_prob
=
result
best_label
=
n
best_labels
.
append
(
n
)
best_label
=
-
1
#assert len(sent_results) == len(orig_labels)
for
n
,
prob
in
enumerate
(
sent_results
):
if
n
in
corpus_labels
:
if
prob
>
best_prob
:
best_prob
=
prob
best_label
=
n
best_labels
.
append
(
best_label
)
test_acc
=
round
(
accuracy_score
(
best_labels
,
orig_labels
),
4
)
print
(
'
better:
\t
'
+
str
(
test_acc
)
+
'
\n
'
,
flush
=
'
True
'
)
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment