Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
D
discut22
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
MELODI
AnDiAMO
discourseSegmentation
discut22
Commits
b43945e7
Commit
b43945e7
authored
2 years ago
by
laura.riviere
Browse files
Options
Downloads
Patches
Plain Diff
begin of refacto code
parent
415aed4a
No related branches found
No related tags found
1 merge request
!3
Refacto 1205
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
code/classes_def_2.py
+57
-0
57 additions, 0 deletions
code/classes_def_2.py
code/config_global_1.2.json
+48
-0
48 additions, 0 deletions
code/config_global_1.2.json
code/discut22_2.py
+110
-0
110 additions, 0 deletions
code/discut22_2.py
with
215 additions
and
0 deletions
code/classes_def_2.py
0 → 100644
+
57
−
0
View file @
b43945e7
# Classes for discut22
class Data:
    """Description of the dataset handled during one run.

    Attributes are derived from the ``data_raw`` section of the config file
    and from the run stamp.
    """

    def __init__(self, infos, stamp):
        """Store dataset settings and build the run-specific folder paths.

        Args:
            infos: mapping providing the keys ``name``, ``language``,
                ``exte`` and ``existing_metadata``.
            stamp: run identifier appended to the output folder names.
        """
        dataset_name = infos['name']
        self.name = dataset_name
        self.lang = infos['language']

        # Every dataset lives in its own folder under ../data/.
        root = f"../data/{dataset_name}"
        self.path = root

        self.exte = infos['exte']
        self.stamp = stamp

        # Run-specific folders, both suffixed with the stamp.
        self.conv = f"{root}/data_converted_{stamp}"  # to be integrated
        self.resu = f"{root}/results_{stamp}"

        self.meta = infos['existing_metadata']
class Process:
    """Processing steps requested for one run.

    Attributes mirror the ``steps`` section of the config file.
    """

    def __init__(self, infos):
        """Read the step settings from the ``steps`` config mapping.

        Args:
            infos: mapping providing the keys ``main``, ``pre-processing``,
                ``discourse_segmenter``, ``evaluation`` and
                ``gold_test_data_path``.
        """
        self.main = infos["main"]  # train test annotation

        # --- pre-processing settings ---
        pre = infos['pre-processing']
        self.pre_process_to_do = pre['to_do']
        self.synt_tool = pre['syntactic_tool']
        self.synt_parse = pre['syntactic_parsing']
        self.toke = pre['tokenization']
        self.ssplit = pre['sentence_split']

        meta_cfg = pre['create_metadata']
        self.crea_meta = meta_cfg['to_do']
        self.meta_line = meta_cfg['line']
        self.meta_sent = meta_cfg['sent']

        #self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway

        #if self.main == "train":
            #if self.ner_init == True : # TODO: make this relative!! split thing
            #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
            #    self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
            #else :
            #    self.train_data = infos['discourse_segmenter']['training']['train_data_path']
            #    self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']

        # --- discourse segmenter settings ---
        segmenter = infos['discourse_segmenter']
        training = segmenter['training']
        self.toolkit = training['toolkit']
        self.tr_config = training['config_file']
        self.pretr_lm = training['pre_trained_lm']

        self.model = segmenter['model']  # ezpz for Tony

        #self.post_tab = infos['post-processing']['json_to_tab']

        # --- evaluation settings ---
        self.eval = infos['evaluation']
        self.test_data = infos['gold_test_data_path']
class Output:
    """Output products requested for one run.

    Attributes mirror the ``output.file`` section of the config file.
    """

    def __init__(self, infos):
        """Read the output flags from the ``output`` config mapping.

        Args:
            infos: mapping with a ``file`` entry providing the keys
                ``json_to_tab``, ``tab_to_bracket``, ``conllu`` and
                ``metadata``.
        """
        files = infos['file']
        self.prod_tab = files['json_to_tab']
        self.prod_bracket = files['tab_to_bracket']
        self.prod_conll = files['conllu']
        self.metadata = files['metadata']
\ No newline at end of file
This diff is collapsed.
Click to expand it.
code/config_global_1.2.json
0 → 100644
+
48
−
0
View file @
b43945e7
{
"usecase_description"
:
"Config file for usecase_1 : from a text, get the same text but with EDU bracket."
,
"data_raw"
:
{
"name"
:
"edgar_poe_en"
,
"exte"
:
".txt"
,
"language"
:
"en"
,
"existing_metadata"
:
false
},
"steps"
:{
"main"
:
"annotation"
,
"pre-processing"
:
{
"to_do"
:
true
,
"syntactic_tool"
:
"stanza"
,
"sentence_split"
:
true
,
"tokenization"
:
true
,
"syntactic_parsing"
:
true
,
"create_metadata"
:
{
"to_do"
:
true
,
"line"
:
"paragraph"
,
"sent"
:
"sent"
}
},
"discourse_segmenter"
:
{
"model"
:
"/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz"
,
"training"
:
{
"toolkit"
:
null
,
"pre_trained_lm"
:
null
,
"config_file"
:
null
,
"train_data_path"
:
null
,
"validation_data_path"
:
null
}
},
"evaluation"
:
false
,
"gold_test_data_path"
:
null
},
"output"
:{
"file"
:{
"json_to_tab"
:
true
,
"tab_to_bracket"
:
true
,
"conllu"
:
true
,
"metadata"
:
true
},
"scores"
:
false
}
}
This diff is collapsed.
Click to expand it.
code/discut22_2.py
0 → 100644
+
110
−
0
View file @
b43945e7
######################################
###### DISCOURSE SEGMENTOR 2022 ######
######################################
"""
This is the main script,
and the only one to run,
after completing config.json.
Discut22 uses the AllenNLP toolkit, which requires an intermediary NER format.
"""
import
argparse
from
datetime
import
datetime
import
os
import
re
import
json
from
classes_def_2
import
Data
,
Process
,
Output
import
utils_2.syntactic_parsing
as
synt_pars
import
utils.conv2ner
as
conv_to_ner
def get_stamp():
    """Return a run stamp built from the current local date and time.

    The stamp is ``str(datetime.now())`` with every whitespace character and
    every colon replaced by an underscore.

    Returns:
        str: the stamp.
    """
    now = datetime.now()
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    stamp = re.sub(r'[\s:]', '_', str(now))
    return stamp
def get_config_infos(config, stamp):
    """Load the JSON config file and build the run description objects.

    The parsed config is also stored under the ``"config"`` key of the
    module-level ``my_logs`` dictionary (created in the ``__main__`` block).

    Args:
        config: path to the JSON config file.
        stamp: run identifier passed on to ``Data``.

    Returns:
        tuple: ``(data, steps, prod)`` as ``Data``, ``Process`` and ``Output``
        instances built from the ``data_raw``, ``steps`` and ``output``
        sections of the config.
    """
    with open(config, 'r', encoding='utf-8') as config_file:
        infos = json.load(config_file)

    # The file is only needed for parsing; everything below works on the dict.
    data = Data(infos['data_raw'], stamp)
    steps = Process(infos['steps'])
    prod = Output(infos['output'])

    my_logs["config"] = infos
    return data, steps, prod
def create_folders(li):
    """Create every directory listed in ``li`` that does not exist yet.

    Missing parent directories are created as well, and a directory that
    already exists is left untouched.

    Args:
        li: iterable of directory paths.

    Raises:
        FileExistsError: if one of the paths exists but is not a directory.
    """
    for it in li:
        # makedirs(exist_ok=True) avoids the check-then-create race of
        # isdir() + mkdir() and copes with a missing parent directory.
        os.makedirs(it, exist_ok=True)
def print_logs():
    """Print the run logs collected in the module-level ``my_logs``.

    The path of the log file inside the results folder of the module-level
    ``data`` object is computed, but nothing is written to it here.
    """
    log_path = f"{data.resu}/processes_logs.json"  # NOTE(review): computed but not used yet
    print(my_logs)
def pre_processing(data, steps):
    """Run the configured syntactic pre-processing and return the file to use next.

    When pre-processing is enabled, the raw input file is parsed with the
    configured syntactic tool and written to ``<data.path>/<data.name>.conll``.
    When it is disabled, the raw input file path is returned unchanged.
    The returned path is also recorded in the module-level ``my_logs`` under
    ``'data_preprocessed'``.

    Args:
        data: object exposing ``path``, ``name``, ``exte``, ``lang`` and ``meta``.
        steps: object exposing ``pre_process_to_do``, ``synt_tool``, ``toke``,
            ``synt_parse``, ``crea_meta``, ``meta_line`` and ``meta_sent``.

    Returns:
        str: path of the pre-processed file (or of the raw file when
        pre-processing is disabled).

    Raises:
        SystemExit: if pre-processing is enabled and the syntactic tool is
            not ``"stanza"``.
    """
    # Local import: sys.exit is always available, unlike the exit() helper
    # that is only injected by the site module.
    import sys

    data_in = f"{data.path}/{data.name}{data.exte}"

    # Flags are compared to True explicitly so that a non-boolean truthy
    # value (for instance a non-empty string) does not enable a step.
    if steps.pre_process_to_do == True:
        if steps.synt_tool != "stanza":
            sys.exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")

        data_out = f"{data.path}/{data.name}.conll"

        # Stanza processors to run, in pipeline order.
        processors = []
        if steps.toke == True:
            processors.extend(['tokenize', 'mwt'])
        if steps.synt_parse == True:
            processors.extend(['pos', 'lemma', 'depparse'])
        #if steps.ssplit == True:
        #    processors.append('constituency')

        # Metadata options handed over to the parsing helper.
        metadata = {}
        if steps.crea_meta == True:
            metadata['line'] = steps.meta_line
            metadata['sent'] = steps.meta_sent
        if data.meta == True:
            metadata['meta'] = True

        processors_str = ",".join(processors)
        synt_pars.with_stanza(data.lang, data_in, data_out, processors_str, metadata)
    else:
        # Nothing to do: the raw file is used as is.
        data_out = data_in

    my_logs['data_preprocessed'] = data_out
    return data_out
def data_to_ner_format(data_in):
    """
    This function builds the NER format on which the segmenter works.
    INPUT: Tokenized text with any number of columns.
    OUTPUT: Tokenized text with just 4 columns.

    The converted file is written next to the input, at ``<data_in>.ner``.
    That path is recorded in the module-level ``my_logs`` under
    ``'data_ner'`` and returned.
    """
    ner_path = f"{data_in}.ner"
    conv_to_ner.main(data_in, ner_path, "conll")

    #TODO add same for train/dev/test for config train

    my_logs['data_ner'] = ner_path
    return ner_path
if __name__ == '__main__':
    # Run-wide log dictionary; the helper functions above fill it in.
    my_logs = {}
    stamp = get_stamp()

    parser = argparse.ArgumentParser()
    # --config is mandatory: without it the script would fail later with a
    # TypeError when trying to open None.
    parser.add_argument('--config', required=True, help='Config file in JSON.')
    # The run name defaults to the time stamp computed above.
    parser.add_argument('--name', default=stamp, help='Run name.')
    args = parser.parse_args()
    config = args.config
    stamp = args.name
    my_logs["stamp"] = stamp

    # Pipeline: read config -> create output folders -> pre-process -> NER format.
    data, steps, prod = get_config_infos(config, stamp)
    create_folders([data.conv, data.resu])
    data_preprocessed = pre_processing(data, steps)
    data_ner = data_to_ner_format(data_preprocessed)

    #print_logs()
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment