Commit 4c24c831 authored by laura.riviere

add documentation and new config files

parent 1c404fc4
Merge request !2 (Dev expes)
@@ -23,15 +23,15 @@ Code: https://gitlab.inria.fr/andiamo/tony
## Content description
[TBD: explain directories automatically created during script runs]
- `data/MyProjet/` Contains input data, raw and/or pre-processed format(s).
- `results/` Contains output data, scores and post-processed data. (Also logs of allennlp)
- `data/my.cool.dataset/` Contains input data, raw and/or pre-processed format(s).
- `results.{stamp}/` Contains output data, scores and post-processed data. (Also logs of allennlp)
- `code/` Contains main scripts.
- `discut22_1.py` One python script to run them all.
- `config_XX.json` A file to be completed (or a dir with a choice between simple use_case configs and a template for a custom config).
- `config_XX.json` A file to be completed for your specific project (or a dir with a choice between simple use_case configs and a template for a custom config). See `global_config_file_guideline.md`.
- `utils/` Contains useful scripts to be called.
- `model/` Contains model to be loaded or created.
- `config_training.jsonnet` A file to be completed. (TBD automatically saved with model when done)
- `documentation.md` Contains detailed documentation (TBD?)
- `global_config_file_guideline.md` Contains detailed documentation to build well formed config file.
## Set up environment
- Conda environment for Python 3.7 (TBD?)
@@ -51,8 +51,8 @@ Run this command:
python discut22.py --config config_XX.json
```
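Each run is tagged with a timestamp (`stamp`) that is appended to the output directory names (e.g. `results_{stamp}`). A minimal sketch of how the main script builds it in this commit (the dataset name and resulting path are illustrative):

```python
# Sketch of the per-run stamp used to name output directories
# (mirrors the logic added to the main script in this commit).
import re
from datetime import datetime

stamp = re.sub(r'[\s:]', '_', str(datetime.now()))
print(stamp)  # e.g. "2022-11-25_14_03_52.123456"
print(f"../data/my.cool.dataset/results_{stamp}")  # illustrative results directory
```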
## Support
laura.riviere@irit.fr
## Authors and acknowledgment
Morteza Ezzabady
Laura Rivière
Amir Zeldes
<!---
...
@@ -3,14 +3,15 @@
class Input:
def __init__(self, infos):
def __init__(self, infos, stamp):
self.name = infos['name']
self.lang = infos['language']
self.path = infos['folder_path'] # misused
self.file = infos['file']
self.form = infos['format'] # not used
self.gold = infos['gold'] # not used
self.resu = f"{self.path}/results"
# self.path = infos['folder_path'] # misused
self.path = f"../data/{self.name}"
self.file = infos['file']
self.stamp = stamp
self.conv = f"{self.path}/data_converted_{stamp}" # to be integrated
self.resu = f"{self.path}/results_{stamp}"
class Process:
@@ -18,10 +19,11 @@ class Process:
self.data = data
self.main = infos["main"] # train test annotation
self.toke = infos['pre-processing']['tokenization'] # not used
self.toke = infos['pre-processing']['tokenization']
self.toke_tool = infos['pre-processing']['tokenization_tool']
self.ssplit = infos['pre-processing']['sentence_split']
self.ssplitor = infos['pre-processing']['sentence_split_splitor']
self.ner_init = infos['pre-processing']['NER_format_initialisation']
self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
if self.main == "train":
if self.ner_init == True : # TODO: make this relative !! split stuff
@@ -41,12 +43,5 @@ class Process:
self.eval = infos['evaluation']
self.test_data = infos['gold_test_data_path']
#if self.eval == True :
# if self.ner_init == True :
# self.test_data = f"{self.data.path}/{self.data.name}_test.ner{self.data.file}"
# #self.test_data = infos['gold_test_data_path']
# else :
# self.test_data = infos['gold_test_data_path']
self.post_bracket = infos['post-processing']['tab_to_bracket']
\ No newline at end of file
@@ -15,17 +15,18 @@
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": true,
"tokenization": false,
"sentence_split": false,
"sentence_split_splitor": false,
"syntactic_parsing": false,
"NER_format_initialisation": true
"NER_format_initialisation": false
},
"discourse_segmenter": {
"model": "tony"
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
"tab_to_bracket":false
},
"evaluation": false
}
...
{
"usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU using an existing model.",
"input": {
"name": "my.cool.project",
"file": [".conllu", ".tok"],
"folder_path": "../data/my.cool.project",
"name": "chaperontest",
"file": ".ss",
"folder_path": "../data/chaperontest",
"format": "truc",
"language": "en",
"language": "fr",
"gold": true
},
"output": {
@@ -15,14 +15,14 @@
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": [false, true],
"sentence_split": [false, true],
"tokenization": true,
"sentence_split": false,
"sentence_split_splitor": "stanza",
"syntactic_parsing": [false],
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz",
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
@@ -32,8 +32,8 @@
}
},
"post-processing": {
"json_to_tab": [false, true],
"tab_to_bracket": [false, true]
"json_to_tab": true,
"tab_to_bracket": true
},
"evaluation": false,
"gold_test_data_path": null
...
{
"usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.",
"input": {
"name": "chaperontest",
"file": ".ss",
"language": "fr"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": true,
"tokenization_tool" : "spacy",
"sentence_split": false,
"sentence_split_splitor": null,
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
},
"evaluation": false,
"gold_test_data_path": null
}
}
{
"usecase_description": "Config file for usecase_2",
"input": {
"name": "fra.sdrt.annodis_dev",
"file": ".ttok",
"language": "fr"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : "spacy",
"sentence_split": true,
"sentence_split_splitor": "stanza",
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
},
"evaluation": false,
"gold_test_data_path": null
}
}
{
"usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
"input": {
"name": "eng.rst.rstdt",
"file": ".conllu",
"language": "en"
},
"steps":{
"main": "train",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : "spacy",
"sentence_split": false,
"sentence_split_splitor": "stanza",
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": null,
"training": {
"toolkit": "allennlp",
"pre_trained_lm": "bert",
"config_file": "../model/config_training_bert.jsonnet",
"train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
"validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
}
},
"post-processing": {
"json_to_tab": false,
"tab_to_bracket":false
},
"evaluation": true,
"gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
}
}
@@ -8,6 +8,8 @@
import os
import sys
import argparse
import re
from datetime import datetime
import pandas as pd # for future clean output in df
import json
@@ -22,11 +24,11 @@ import utils.training_allennlp as tr_allen
# function to get config info
def get_config_infos(config_file):
def get_config_infos(stamp, config_file):
with open(config_file) as f:
infos = json.load(f)
data_in = Input(infos['input'])
data_in = Input(infos['input'], stamp)
actions = Process(infos['steps'], data_in)
print(f"data to be process : {data_in.name}")
return actions
@@ -38,13 +40,20 @@ def get_model(model_name):
if name == "tony":
arch = "french_tokens.tar.gz"
if not os.path.isfile(f"../model/{arch}"):
if not os.path.isfile(f"../model/{name}/{arch}"):
dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
os.system(dl)
else:
print("Tony already in place !")
return f"../model/{arch}"
return f"../model/{name}/{arch}"
def text_tokenization(f_in, f_out, lang, tool):
if lang == "fr" :
if tool == "spacy" :
tk.main(f_in, f_out) # .ss -> .tok
@@ -69,9 +78,9 @@ def main(steps):
#### Tokenization of the text # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
data_tok = f"{steps.data.path}/{steps.data.name}.tok"
# sys.exit("check path")
print(f"Starting Tokenization...to {data_tok}")
tk.main(data_in, data_tok) # .ss -> .tok
#tk.main(f_in, f_out) # .ss -> .tok
text_tokenization(data_in, data_tok, steps.data.lang, steps.toke_tool) # .ss -> .tok
else:
data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
@@ -124,9 +133,6 @@ def main(steps):
else:
print(" pb define model")
if steps.post_tab == True :
#### Apply the predictions to the text and output the tokenized text with the predicted tags as a column
# # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
@@ -190,7 +196,14 @@ if __name__ == '__main__':
parser.add_argument('--config', help='Config file in JSON')
args = parser.parse_args()
config = args.config
steps = get_config_infos(config)
now = datetime.now()
stamp = re.sub('[\s:]', '_', str(now))
my_logs = {}
my_logs['stamp'] = stamp
steps = get_config_infos(stamp, config)
print(stamp)
main(steps)
print("Done.")
\ No newline at end of file
# DisCut22 - Global Config File Guideline
## Good practice tips
- You can rename the `config_global.json` file as convenient: a good practice is to use one global config file per experiment. If so, do not forget to type your file name when you run the main command: `python discut22.py --config config_XX.json`
- Field values can only be:
- [boolean] `true`, `false`,
- [string] `"my_string_in_between_quote_marks"`,
- or `null`.
- In this documentation, values of fields in **bold** cannot be changed and are specific to the usecase.
- Keep commas as in the templates to avoid JSON format errors (a quick validity check is sketched below).
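If you want to make sure a config file is well formed before launching a run, a minimal check is the sketch below (the file name is just an example):

```python
# Minimal JSON sanity check: fails with an explicit line/column error if a comma or quote is wrong.
import json

with open("config_XX.json") as f:
    config = json.load(f)  # raises json.JSONDecodeError on malformed JSON
print("Config OK:", list(config.keys()))
```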
## For Usecase 1 : **Discourse Segmentation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. e.g. ```"Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter."```
- `"input":` { These fields are mandatory for every usecase.
- `"name":` [string] The name of your input dataset, without the extension. This is also the name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"annotation"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` [string] Here the name or the path to the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
- `"training":` {
- `"toolkit":` **null**
- `"pre_trained_lm":` **null**
- `"config_file":` **null**
- `"train_data_path":` **null**
- `"validation_data_path":` **null**
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] Set to true if you want also a conll-style output with predictions as last column.
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **false**
- `"gold_test_data_path":` [string] **null**
## For Usecase 2 : **Segmentation Evaluation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. ```"Config file for usecase_2 : Take a EDU gold segmented text au format tok as input, use a loaded model to make predictions. Output scores of model predictions against gold"```
- `input:`{ These fields are mandatory for every Usecases.
- `"name":` [string] The name of your input dataset, without the extension. This is also the same name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"test"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` [string] Here the name or the path to the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
- `"training":` {
- `"toolkit":` **null**
- `"pre_trained_lm":` **null**
- `"config_file":` **null**
- `"train_data_path":` **null**
- `"validation_data_path":` **null**
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] : **true**
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **true**
- `"gold_test_data_path":` [string] The path to your gold dataset to make predictions to, and to evaluate against.
## For Usecase 3 : **Custom Model Creation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. ```"Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores."```
- `input:`{ These fields are mandatory for every Usecases.
- `"name":` [string] The name of your input dataset, without the extension. This is also the same name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"train"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` **null**
- `"training":` {
- `"toolkit":` [string] The toolkit to build your model (to be added : "jiant").
- OPTIONS : ["allennlp"]
- `"pre_trained_lm":` **bert** (to be added : roberta..)
- `"config_file":` [string] The path to the config file for training. e.g. `"../model/config_training.jsonnet"`
- `"train_data_path":` [string] The path to your training dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu"` *conflict with training_config ??*
- `"validation_data_path":` [string] The path to your development dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"` *idem*
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] : **true**
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **true**
- `"gold_test_data_path":` [string] The path to your gold test dataset to make predictions on, and to evaluate against.
{
"dataset_reader": {
"type": "conll2003",
"coding_scheme": "BIOUL",
"tag_label": "ner",
"token_indexers": {
"bert": {
"type": "bert-pretrained",
"do_lowercase": false,
"pretrained_model": "bert-base-multilingual-cased",
"use_starting_offsets": true
},
"token_characters": {
"type": "characters",
"min_padding_length": 3
}
}
},
"iterator": {
"type": "basic",
"batch_size": 2
},
"model": {
"type": "simple_tagger",
"encoder": {
"type": "lstm",
"bidirectional": true,
"dropout": 0.5,
"hidden_size": 100,
"input_size": 896,
"num_layers": 2
},
"text_field_embedder": {
"allow_unmatched_keys": true,
"embedder_to_indexer_map": {
"bert": [
"bert",
"bert-offsets"
],
"token_characters": [
"token_characters"
]
},
"token_embedders": {
"bert": {
"type": "bert-pretrained",
"pretrained_model": "bert-base-multilingual-cased"
},
"token_characters": {
"type": "character_encoding",
"embedding": {
"embedding_dim": 16
},
"encoder": {
"type": "cnn",
"conv_layer_activation": "relu",
"embedding_dim": 16,
"ngram_filter_sizes": [
3
],
"num_filters": 128
}
}
}
}
},
"train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.ner.conllu",
"validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.ner.conllu",
"trainer": {
"cuda_device": 1,
"grad_norm": 5,
"num_epochs": 4,
"num_serialized_models_to_keep": 3,
"optimizer": {
"type": "bert_adam",
"lr": 0.001
}
}
}
\ No newline at end of file