MELODI / AnDiAMO / DiscReT-Zero-Shot · Commits

Commit 571902b3, authored 1 year ago by emetheni
Parent: 7cec7e97

    format code to black
Showing 5 changed files with 511 additions and 424 deletions:

  README.md                    +1    −1
  classifier_pytorch.py        +112  −111
  configure.py                 +72   −33
  make_mappings_zero-shot.py   +27   −23
  utils.py                     +299  −256
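
The commit is a pure reformatting pass: every hunk below changes layout and quote style only, in the way the black formatter does (typically by running `black .` at the repository root; the exact invocation is not recorded in the commit). For reference, the same kind of rewrite can be reproduced through black's Python API. The snippet below is an illustrative sketch, not code from this repository:

```python
# Illustrative sketch only (not part of this repository): reproduce the kind
# of rewrite shown in the hunks below via black's Python API (pip install black).
import black

src = "print('Mappings file: ' + args.mappings_file, flush='True')\n"

# format_str applies black's layout and quote normalization to a source string.
print(black.format_str(src, mode=black.Mode()), end="")
# -> print("Mappings file: " + args.mappings_file, flush="True")
```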
README.md  +1 −1  (view file @ 571902b3)

@@ -18,7 +18,7 @@ The full list of datasets with statistics: [here](https://github.com/disrpt/shar
 * transformers
 * scikit-learn
-Install requirements with ```pip install requirements.txt```.
+Install requirements with ```pip install -r requirements.txt```.
 ## Run
classifier_pytorch.py  +112 −111  (view file @ 571902b3)
@@ -3,7 +3,12 @@
 import torch
 import numpy as np
-from transformers import AutoModel, AutoTokenizer, get_linear_schedule_with_warmup, set_seed
+from transformers import (
+    AutoModel,
+    AutoTokenizer,
+    get_linear_schedule_with_warmup,
+    set_seed,
+)
 from torch import nn
 from torch.optim import AdamW
 from torch.utils.data import DataLoader
@@ -23,33 +28,36 @@ now = datetime.now()
 dt_string = now.strftime("%d.%m.%y-%H:%M:%S")
 layers_to_freeze = args.freeze_layers.split(";")

-print('\nTraining with datasets: ' + args.langs_to_use)
-print('Mappings file: ' + args.mappings_file, flush='True')
+print("\nTraining with datasets: " + args.langs_to_use)
+print("Mappings file: " + args.mappings_file, flush="True")

 # ===============
 # Dataset class
 # ===============
 class Dataset(torch.utils.data.Dataset):

     def __init__(self, sentences):
         self.labels = [sent[-1] for sent in sentences]
-        self.texts = [tokenizer(sent[-2],
-                                is_split_into_words=True,
-                                padding='max_length',
-                                max_length=512,
-                                truncation=True,
-                                return_tensors="pt")
-                      for sent in sentences]
+        self.texts = [
+            tokenizer(
+                sent[-2],
+                is_split_into_words=True,
+                padding="max_length",
+                max_length=512,
+                truncation=True,
+                return_tensors="pt",
+            )
+            for sent in sentences
+        ]

     def classes(self):
         return self.labels

     def __len__(self):
         return len(self.labels)

     def get_batch_labels(self, idx):
         # Fetch a batch of labels
         return np.array(self.labels[idx])
@@ -59,12 +67,12 @@ class Dataset(torch.utils.data.Dataset):
         return self.texts[idx]

     def __getitem__(self, idx):
         batch_texts = self.get_batch_texts(idx)
         batch_y = self.get_batch_labels(idx)
         return batch_texts, batch_y

 # ===============
 # Load datasets
 # ===============
@@ -72,24 +80,32 @@ class Dataset(torch.utils.data.Dataset):
 # Open mappings
 mappings, inv_mappings = open_mappings(args.mappings_file)
 batch_size = args.batch_size
 tokenizer = AutoTokenizer.from_pretrained(args.transformer_model)

-train_sentences, dev_dict_sentences, test_dict_sentences, framework_labels = open_sentences_with_lang(args.data_path, mappings)
+(
+    train_sentences,
+    dev_dict_sentences,
+    test_dict_sentences,
+    framework_labels,
+) = open_sentences_with_lang(args.data_path, mappings)

 # Determine linear size (= number of classes in the sets + 1)
 num_labels = len(set(sent[-1] for sent in train_sentences)) + 1

 # make train/dev datasets
 train_dataset = Dataset(train_sentences)
 dev_dataset = {corpus: Dataset(s) for corpus, s in dev_dict_sentences.items()}
 test_dataset = {corpus: Dataset(s) for corpus, s in test_dict_sentences.items()}

 # Make dasets with batches and dataloader
 train_dataloader = DataLoader(train_dataset, batch_size, shuffle=True)
-dev_dict_dataloader = {corpus: DataLoader(dev_data, batch_size)
-                       for corpus, dev_data in dev_dataset.items()}
-test_dict_dataloader = {corpus: DataLoader(test_data, batch_size)
-                        for corpus, test_data in test_dataset.items()}
+dev_dict_dataloader = {
+    corpus: DataLoader(dev_data, batch_size)
+    for corpus, dev_data in dev_dataset.items()
+}
+test_dict_dataloader = {
+    corpus: DataLoader(test_data, batch_size)
+    for corpus, test_data in test_dataset.items()
+}

 print("\nDatasets loaded!\n")
@@ -97,22 +113,20 @@ print("\nDatasets loaded!\n")
 # Model setup
 # ===============
 class TransformerClassifier(nn.Module):

     def __init__(self, dropout=args.dropout):
         super(TransformerClassifier, self).__init__()
         self.tr_model = AutoModel.from_pretrained(args.transformer_model)
         self.dropout = nn.Dropout(dropout)
         self.linear = nn.Linear(768, num_labels)  # bert input x num of classes
         self.relu = nn.ReLU()

     def forward(self, input_id, mask):
-        outputs = self.tr_model(
-            input_ids=input_id,
-            attention_mask=mask, return_dict=True
-            )['last_hidden_state'][:, 0, :]
+        outputs = self.tr_model(
+            input_ids=input_id,
+            attention_mask=mask,
+            return_dict=True
+        )["last_hidden_state"][:, 0, :]
         dropout_output = self.dropout(outputs)
         linear_output = self.linear(dropout_output)
         final_layer = self.relu(linear_output)
@@ -123,133 +137,120 @@ class TransformerClassifier(nn.Module):
 model = TransformerClassifier()

-def train(model,
-          train_dataloader,
-          dev_dict_dataloader,
-          test_dict_sentences,
-          test_dict_dataloader,
-          epochs,
-          #specific_results
-          ):
+def train(
+    model,
+    train_dataloader,
+    dev_dict_dataloader,
+    test_dict_sentences,
+    test_dict_dataloader,
+    epochs,
+    # specific_results
+):

     device = torch.device("cpu")
     criterion = nn.CrossEntropyLoss()
-    optimizer = AdamW(model.parameters(), #Adam
-                      lr=2e-5, #1e-6
-                      eps=1e-8)
+    optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)  # Adam # 1e-6

-    if args.use_cuda == 'yes':
+    if args.use_cuda == "yes":
         device = torch.device("cuda")
         model = model.cuda()
         criterion = criterion.cuda()

     gradient_accumulation_steps = args.gradient_accumulation_steps
     total_steps = len(train_dataloader) * epochs
-    scheduler = get_linear_schedule_with_warmup(optimizer,
-                                                num_warmup_steps=0,
-                                                num_training_steps=total_steps
-                                                )
+    scheduler = get_linear_schedule_with_warmup(
+        optimizer,
+        num_warmup_steps=0,
+        num_training_steps=total_steps
+    )

     seed_val = 42
     set_seed(seed_val)
     torch.manual_seed(seed_val)
     torch.cuda.manual_seed_all(seed_val)

     # Freeze layers if you want
-    if args.freeze_layers != '':
+    if args.freeze_layers != "":
         for name, param in model.named_parameters():
             if any(x in name for x in layers_to_freeze):
                 param.requires_grad = False

     for epoch_num in range(0, epochs):
-        print('\n=== Epoch {:} / {:} ==='.format(epoch_num + 1, epochs))
+        print("\n=== Epoch {:} / {:} ===".format(epoch_num + 1, epochs))

         model.train()

         total_acc_train = 0
         total_loss_train = 0
         batch_counter = 0

         for train_input, train_label in tqdm(train_dataloader):
             batch_counter += 1
             train_label = train_label.to(device)
-            mask = train_input['attention_mask'].to(device)
-            input_id = train_input['input_ids'].squeeze(1).to(device)
+            mask = train_input["attention_mask"].to(device)
+            input_id = train_input["input_ids"].squeeze(1).to(device)
             output = model(input_id, mask)

             # Compute Loss and Perform Back-propagation
             loss = criterion(output, train_label.long())

             # Normalize the Gradients
             loss = loss / gradient_accumulation_steps
             loss.backward()

-            if batch_counter % gradient_accumulation_steps == 0:
+            if (batch_counter % gradient_accumulation_steps == 0):
                 # Update Optimizer
                 optimizer.step()
                 optimizer.zero_grad()
                 model.zero_grad()
                 torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                 scheduler.step()

         # ------ Validation --------
-        print('\nValidation for epoch: ', epoch_num + 1)
+        print("\nValidation for epoch: ", epoch_num + 1)

         # Dev and test results for each corpus. We don't need to save the results.
         for corpus in dev_dict_dataloader:
-            dev_results = get_predictions(
-                            model, corpus, dev_dict_dataloader[corpus])
+            dev_results = get_predictions(
+                model, corpus, dev_dict_dataloader[corpus]
+            )
-            better_dev_results = get_better_predictions(
-                            model,
-                            corpus,
-                            dev_dict_dataloader[corpus],
-                            framework_labels[corpus.split('.')[1]],
-                            inv_mappings,
-                            epoch_num + 1,
-                            save_conf_matrix=False
-                            )
+            better_dev_results = get_better_predictions(
+                model,
+                corpus,
+                dev_dict_dataloader[corpus],
+                framework_labels[corpus.split(".")[1]],
+                inv_mappings,
+                epoch_num + 1,
+                save_conf_matrix=False,
+            )

         # ------ Test --------
-        print('\nTest results for epoch: ', epoch_num + 1)
+        print("\nTest results for epoch: ", epoch_num + 1)

         for corpus in test_dict_dataloader:
-            test_results = get_predictions(
-                            model, corpus, test_dict_dataloader[corpus])
+            test_results = get_predictions(
+                model, corpus, test_dict_dataloader[corpus]
+            )
-            better_test_results = get_better_predictions(
-                            model,
-                            corpus,
-                            test_dict_dataloader[corpus],
-                            framework_labels[corpus.split('.')[1]],
-                            inv_mappings,
-                            epoch_num + 1,
-                            save_conf_matrix=False
-                            )
+            better_test_results = get_better_predictions(
+                model,
+                corpus,
+                test_dict_dataloader[corpus],
+                framework_labels[corpus.split(".")[1]],
+                inv_mappings,
+                epoch_num + 1,
+                save_conf_matrix=False,
+            )

 # ------- Start the training -------
-print('\nModel: ', args.transformer_model)
-print('Batch size: ', args.batch_size * args.gradient_accumulation_steps)
-print('\nStart training...\n')
+print("\nModel: ", args.transformer_model)
+print("Batch size: ", args.batch_size * args.gradient_accumulation_steps)
+print("\nStart training...\n")

-train(model,
-      train_dataloader,
-      dev_dict_dataloader,
-      test_dict_sentences,
-      test_dict_dataloader,
-      args.num_epochs
-      )
+train(
+    model,
+    train_dataloader,
+    dev_dict_dataloader,
+    test_dict_sentences,
+    test_dict_dataloader,
+    args.num_epochs,
+)

-print('\nTraining Done!')
+print("\nTraining Done!")
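
The train() function above scales each batch loss by gradient_accumulation_steps and only steps the optimizer once every gradient_accumulation_steps batches. A minimal, self-contained sketch of that pattern, with a toy model and fake data rather than the repository's classes, looks like this:

```python
# Minimal sketch of the gradient-accumulation pattern used in the training
# loop above; the model and data here are toy placeholders, not repo code.
import torch
from torch import nn
from torch.optim import AdamW

toy_model = nn.Linear(4, 2)
optimizer = AdamW(toy_model.parameters(), lr=2e-5, eps=1e-8)
criterion = nn.CrossEntropyLoss()
gradient_accumulation_steps = 4

for batch_counter in range(1, 17):
    x = torch.randn(8, 4)                      # fake feature batch
    y = torch.randint(0, 2, (8,))              # fake labels
    loss = criterion(toy_model(x), y)
    loss = loss / gradient_accumulation_steps  # normalize, as in the diff
    loss.backward()                            # gradients accumulate across batches
    if batch_counter % gradient_accumulation_steps == 0:
        optimizer.step()                       # update only every N batches
        optimizer.zero_grad()
```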
configure.py  +72 −33  (view file @ 571902b3)
 import argparse
 import sys

 def parse_args():
     """
     Parse input arguments.
     """
     parser = argparse.ArgumentParser()

     # path to data
-    parser.add_argument("--data_path", default="./data", type=str,
-                        help="The path to the shared task data file from Github.")
+    parser.add_argument(
+        "--data_path",
+        default="./data",
+        type=str,
+        help="The path to the shared task data file from Github.",
+    )

     # label mappings to integers
-    parser.add_argument("--mappings_file", default="mappings/mappings_substitutions.tsv", type=str,
-                        help="The mappings file for all relations.")
+    parser.add_argument(
+        "--mappings_file",
+        default="mappings/mappings_substitutions.tsv",
+        type=str,
+        help="The mappings file for all relations.",
+    )

     # transformer model
-    parser.add_argument("--transformer_model", default="bert-base-multilingual-cased", type=str,
-                        help="Model used, default: bert-multilingual-base-cased")
+    parser.add_argument(
+        "--transformer_model",
+        default="bert-base-multilingual-cased",
+        type=str,
+        help="Model used, default: bert-multilingual-base-cased",
+    )

     # Number of training epochs
-    parser.add_argument("--num_epochs", default=10, type=int,
-                        help="Number of training epochs. Default: 10")
+    parser.add_argument(
+        "--num_epochs",
+        default=10,
+        type=int,
+        help="Number of training epochs. Default: 10",
+    )

     # Number of gradient accumulation steps
-    parser.add_argument("--gradient_accumulation_steps", default=16, type=int,
-                        help="Number of gradient accumulation steps. Default: 16")
+    parser.add_argument(
+        "--gradient_accumulation_steps",
+        default=16,
+        type=int,
+        help="Number of gradient accumulation steps. Default: 16",
+    )

     # Dropout
     parser.add_argument("--dropout", default=0.1, type=float,
                         help="Dropout.")

     # Batch size
-    parser.add_argument("--batch_size", default=8, type=int,
-                        help="With CUDA: max. 8, without: max. 16. Default: 8")
+    parser.add_argument(
+        "--batch_size",
+        default=8,
+        type=int,
+        help="With CUDA: max. 8, without: max. 16. Default: 8",
+    )

     # Use CUDA
-    parser.add_argument("--use_cuda", default='yes', type=str,
-                        help="Use CUDA [yes/no]. Careful of batch size!")
+    parser.add_argument(
+        "--use_cuda",
+        default="yes",
+        type=str,
+        help="Use CUDA [yes/no]. Careful of batch size!",
+    )

     # freeze layers
-    parser.add_argument("--freeze_layers", default='', type=str,
-                        help="List of layer(s) to freeze, a str separated by ;. Example: 'layer.1;layer.2'")
+    parser.add_argument(
+        "--freeze_layers",
+        default="",
+        type=str,
+        help="List of layer(s) to freeze, a str separated by ;. Example: 'layer.1;layer.2'",
+    )

     # normalize direction
-    parser.add_argument("--normalize_direction", default='yes', type=str,
-                        help="Change order of sentences when the direction of relations is 1<2 to 2>1.")
+    parser.add_argument(
+        "--normalize_direction",
+        default="yes",
+        type=str,
+        help="Change order of sentences when the direction of relations is 1<2 to 2>1.",
+    )

     # only specific languages/corpora
-    parser.add_argument("--langs_to_use", default='@', type=str,
-                        help="List of languages/corpora to use, a str separated by ;")
+    parser.add_argument(
+        "--langs_to_use",
+        default="@",
+        type=str,
+        help="List of languages/corpora to use, a str separated by ;",
+    )

     args = parser.parse_args()
     return args
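
configure.py only defines parse_args(); the training and mapping scripts import it to read the flags above. A hypothetical check of the defaults, assuming the interpreter is started from the repository root so that configure.py is importable and no extra CLI flags are passed:

```python
# Hypothetical usage sketch (not part of the repository): inspect the defaults
# defined in configure.py. Assumes configure.py is on the import path.
from configure import parse_args

args = parse_args()  # with no CLI flags, every option falls back to its default
print(args.transformer_model)            # bert-base-multilingual-cased
print(args.num_epochs, args.batch_size)  # 10 8
print(args.gradient_accumulation_steps)  # 16
```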
make_mappings_zero-shot.py  +27 −23  (view file @ 571902b3)
@@ -11,13 +11,13 @@ args = parse_args()
 # -----------------------------------
 # open substitutions per file
 mappings = {}
-with open('mappings/mappings_substitutions.tsv', 'r', encoding='utf-8') as f:
+with open("mappings/mappings_substitutions.tsv", "r", encoding="utf-8") as f:
     next(f)
     for line in f:
-        l = line.strip().split('\t')
+        l = line.strip().split("\t")
         mappings[l[0]] = l[1]

 # find the labels that were changed
 inv_mappings = {}
 subs = {}

@@ -26,31 +26,36 @@ for label, num in mappings.items():
         inv_mappings[num] = label
     else:
         subs[label] = inv_mappings[num]

 # -----------------------------------
 # define which language to use with the arguments
-languages = args.langs_to_use.split(';')
+languages = args.langs_to_use.split(";")
-corpora = [folder
-           for folder in os.listdir(args.data_path)
-           if any(l in folder for l in languages)]
+corpora = [
+    folder
+    for folder in os.listdir(args.data_path)
+    if any(l in folder for l in languages)
+]
-files = ['/'.join([args.data_path, corpus, f]) for corpus in corpora
-         for f in os.listdir(args.data_path + '/' + corpus)]
+files = [
+    "/".join([args.data_path, corpus, f])
+    for corpus in corpora
+    for f in os.listdir(args.data_path + "/" + corpus)
+]

 # open the files
 def read_file(file):
-    '''Open the relations file.'''
+    """Open the relations file."""
     relations = []
     sub_rels = []
-    with open(file, 'r', encoding='utf-8') as f:
+    with open(file, "r", encoding="utf-8") as f:
         next(f)
         for line in f:
             try:
-                l = line.strip().split('\t')
+                l = line.strip().split("\t")
                 if not l[11].lower() in subs:
                     relations.append(l[11].lower())
                 else:

@@ -60,8 +65,7 @@ def read_file(file):
     return relations, sub_rels

-rel_files = [f for f in files if any(x in f for x in ['train']
-                                     )]
+rel_files = [f for f in files if any(x in f for x in ["train"])]
 good_rels = []
 sub_rels = []

@@ -71,7 +75,7 @@ for f in rel_files:
     sub_rels += y

 dict_labels = dict(enumerate(list(set(good_rels))))
 corpora_labels = {v: k for k, v in dict_labels.items()}
 leftovers = []

@@ -80,12 +84,12 @@ for sub in sub_rels:
     try:
         corpora_labels[sub] = corpora_labels[subs[sub]]
     except KeyError:
         corpora_labels[subs[sub]] = max(list(corpora_labels.values())) + 1
         corpora_labels[sub] = corpora_labels[subs[sub]]

-corpora_labels['unk'] = max(list(corpora_labels.values())) + 1
+corpora_labels["unk"] = max(list(corpora_labels.values())) + 1

-with open('mappings/' + args.mappings_file, 'w') as f:
+with open("mappings/" + args.mappings_file, "w") as f:
-    f.write('LABEL\tMAPPING\n')
+    f.write("LABEL\tMAPPING\n")
     for k, v in corpora_labels.items():
-        f.write(k + '\t' + str(v) + '\n')
+        f.write(k + "\t" + str(v) + "\n")
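
The script ends by writing the label-to-integer mappings as a two-column TSV with a LABEL/MAPPING header, one relation label per row. A small sketch of writing and reading back a file in that layout, mirroring the loops above; the file name and labels here are placeholders for illustration, not values from the repository's data:

```python
# Illustrative sketch of the TSV layout written above: a LABEL\tMAPPING header
# followed by one "label<TAB>integer" row per relation label. File name and
# labels are made up for this example.
example = "LABEL\tMAPPING\nelaboration\t0\ncontrast\t1\nunk\t2\n"

with open("example_mappings.tsv", "w", encoding="utf-8") as f:
    f.write(example)

mappings = {}
with open("example_mappings.tsv", "r", encoding="utf-8") as f:
    next(f)  # skip the header line, as the scripts above do
    for line in f:
        label, num = line.strip().split("\t")
        mappings[label] = int(num)

print(mappings)  # {'elaboration': 0, 'contrast': 1, 'unk': 2}
```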
utils.py  +299 −256  (view file @ 571902b3)

(diff collapsed in the page view; not shown here)