diff --git a/README.md b/README.md
index d0dcb14010b9d10cc3a79741177038b7de0758db..0575190aa91d6135f50f82e01a23345665d6b8f7 100644
--- a/README.md
+++ b/README.md
@@ -23,15 +23,15 @@ Code: https://gitlab.inria.fr/andiamo/tony
 
 ## Content description
 [TBD : xplain directories automatically created during scripts run]
-- `data/MyProjet/` Contains input data, raw and/or pre-processed format(s).
-    - `results/` Contains output data, scores and post-processed data. (Also logs of allennlp)
+- `data/my.cool.dataset/` Contains input data, raw and/or pre-processed format(s).
+    - `results.{stamp}/` Contains output data, scores and post-processed data. (Also the AllenNLP logs.)
 - `code/` Contains main scripts.
     - `discut22_1.py` One python script to run them all.
-    - `config_XX.json` A file to be completed (or a dir with choise between simple use_case configs and a template for a custom config).
+    - `config_XX.json` A file to be completed for your specific project (or a dir with a choice between simple use_case configs and a template for a custom config). See **
     - `utils/` Contains useful scripts to be called.
 - `model/` Contains model to be loaded or created.
     - `config_training.jsonnet` A file to be completed. (TBD automatically saved with model when done)
-- `documentation.md` Contains detailed documentation (TBD?)
+- `global_config_file_guideline.md` Contains detailed documentation on how to build a well-formed config file.
 
 ## Set up environnement
 - Conda stuff pour python 3.7 (TBD ?)
@@ -51,8 +51,13 @@ Run this command:
 python discut22.py --config config_XX.json
 ```
 
+## Support
+laura.riviere@irit.fr
+
+
 ## Authors and acknowledgment
 Morteza Ezzabady
+Laura Rivière
 Amir Zeldes
 
 <!---
diff --git a/code/classes_def.py b/code/classes_def.py
index bcc9b77bb4d3c2a07ba6a231e1b748df8c4f2d85..baf173327ca66311592349889ab1658895466d0c 100644
--- a/code/classes_def.py
+++ b/code/classes_def.py
@@ -3,14 +3,15 @@
 class Input:
-    def __init__(self, infos):
+    def __init__(self, infos, stamp):
         self.name = infos['name']
         self.lang = infos['language']
-        self.path = infos['folder_path'] # misused
-        self.file = infos['file']
-        self.form = infos['format'] # not used
-        self.gold = infos['gold'] # not used
-        self.resu = f"{self.path}/results"
+#       self.path = infos['folder_path'] # misused
+        self.path = f"../data/{self.name}"
+        self.file = infos['file']
+        self.stamp = stamp
+        self.conv = f"{self.path}/data_converted_{stamp}" # à intégrer
+        self.resu = f"{self.path}/results_{stamp}"
 
 
 class Process:
@@ -18,10 +19,11 @@ class Process:
         self.data = data
 
         self.main = infos["main"] # train test annotation
-        self.toke = infos['pre-processing']['tokenization'] # not used
+        self.toke = infos['pre-processing']['tokenization']
+        self.toke_tool = infos['pre-processing']['tokenization_tool']
         self.ssplit = infos['pre-processing']['sentence_split']
         self.ssplitor = infos['pre-processing']['sentence_split_splitor']
-        self.ner_init = infos['pre-processing']['NER_format_initialisation']
+        self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
 
         if self.main == "train":
            if self.ner_init == True : # à faire en relatif !! split truc
@@ -41,12 +43,5 @@ class Process:
         self.eval = infos['evaluation']
         self.test_data = infos['gold_test_data_path']
 
-        #if self.eval == True :
-        #    if self.ner_init == True :
-        #        self.test_data = f"{self.data.path}/{self.data.name}_test.ner{self.data.file}"
-        #        #self.test_data = infos['gold_test_data_path']
-        #    else :
-        #        self.test_data = infos['gold_test_data_path']
-
         self.post_bracket = infos['post-processing']['tab_to_bracket']
\ No newline at end of file
diff --git a/code/config_1.json b/code/config_1.json
index ed6c31f65e83f917c5cd4f24d603274593b807a8..d8a166e03700d76c5ac47d6c43d385d4616968d7 100644
--- a/code/config_1.json
+++ b/code/config_1.json
@@ -15,17 +15,18 @@
     "steps":{
         "main": "annotation",
         "pre-processing": {
-            "tokenization": true,
+            "tokenization": false,
             "sentence_split": false,
+            "sentence_split_splitor": false,
             "syntactic_parsing": false,
-            "NER_format_initialisation": true
+            "NER_format_initialisation": false
         },
         "discourse_segmenter": {
            "model": "tony"
         },
         "post-processing": {
             "json_to_tab": true,
-            "tab_to_bracket":true
+            "tab_to_bracket":false
         },
         "evaluation": false
     }
diff --git a/code/config_1_fanny.json b/code/config_1_fanny.json
index 7a1362011d34690391d85c82fccbb14ab40c5965..eae1fa0c89a1987df331f74a0129da3140b02c57 100644
--- a/code/config_1_fanny.json
+++ b/code/config_1_fanny.json
@@ -1,11 +1,11 @@
 {
     "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU using an existing model.",
     "input": {
-        "name": "my.cool.project",
-        "file": [".conllu", ".tok"],
-        "folder_path": "../data/my.cool.project",
+        "name": "chaperontest",
+        "file": ".ss",
+        "folder_path": "../data/chaperontest",
         "format": "truc",
-        "language": "en",
+        "language": "fr",
         "gold": true
     },
     "output": {
@@ -15,14 +15,14 @@
     "steps":{
         "main": "annotation",
         "pre-processing": {
-            "tokenization": [false, true],
-            "sentence_split": [false, true],
+            "tokenization": true,
+            "sentence_split": false,
             "sentence_split_splitor": "stanza",
-            "syntactic_parsing": [false],
+            "syntactic_parsing": false,
             "NER_format_initialisation": true
         },
         "discourse_segmenter": {
-            "model": "/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz",
+            "model": "tony",
             "training": {
                 "toolkit": null,
                 "pre_trained_lm": null,
@@ -32,8 +32,8 @@
             }
         },
         "post-processing": {
-            "json_to_tab": [false, true],
-            "tab_to_bracket": [false, true]
+            "json_to_tab": true,
+            "tab_to_bracket": true
         },
         "evaluation": false,
         "gold_test_data_path": null
diff --git a/code/config_global_1.json b/code/config_global_1.json
new file mode 100644
index 0000000000000000000000000000000000000000..cd6c01ca6812e45cf87d284f4ee323677800a07b
--- /dev/null
+++ b/code/config_global_1.json
@@ -0,0 +1,37 @@
+{
+    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.",
+    "input": {
+        "name": "chaperontest",
+        "file": ".ss",
+        "language": "fr"
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "tokenization": true,
+            "tokenization_tool" : "spacy",
+            "sentence_split": false,
+            "sentence_split_splitor": null,
+            "syntactic_parsing": false,
+            "NER_format_initialisation": true
+        },
+        "discourse_segmenter": {
+            "model": "tony",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "post-processing": {
+            "json_to_tab": true,
+            "tab_to_bracket":true
+        },
+        "evaluation": false,
+        "gold_test_data_path": null
+    }
+}
+
+
diff --git a/code/config_global_2.json b/code/config_global_2.json
new file mode 100644
index 0000000000000000000000000000000000000000..e46805bab828ba094025a51396c0448c1f6c3064
--- /dev/null
+++ b/code/config_global_2.json
@@ -0,0 +1,37 @@
+{
+    "usecase_description": "Config file for usecase_2",
+    "input": {
+        "name": "fra.sdrt.annodis_dev",
+        "file": ".ttok",
+        "language": "fr"
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "tokenization": false,
+            "tokenization_tool" : "spacy",
+            "sentence_split": true,
+            "sentence_split_splitor": "stanza",
+            "syntactic_parsing": false,
+            "NER_format_initialisation": true
+        },
+        "discourse_segmenter": {
+            "model": "tony",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "post-processing": {
+            "json_to_tab": true,
+            "tab_to_bracket":true
+        },
+        "evaluation": false,
+        "gold_test_data_path": null
+    }
+}
+
+
diff --git a/code/config_global_3.json b/code/config_global_3.json
new file mode 100644
index 0000000000000000000000000000000000000000..9703e8d798bbb98f1cf02f1dd40f7c2c061e3473
--- /dev/null
+++ b/code/config_global_3.json
@@ -0,0 +1,37 @@
+{
+    "usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
+    "input": {
+        "name": "eng.rst.rstdt",
+        "file": ".conllu",
+        "language": "en"
+    },
+    "steps":{
+        "main": "train",
+        "pre-processing": {
+            "tokenization": false,
+            "tokenization_tool" : "spacy",
+            "sentence_split": false,
+            "sentence_split_splitor": "stanza",
+            "syntactic_parsing": false,
+            "NER_format_initialisation": true
+        },
+        "discourse_segmenter": {
+            "model": null,
+            "training": {
+                "toolkit": "allennlp",
+                "pre_trained_lm": "bert",
+                "config_file": "../model/config_training_bert.jsonnet",
+                "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
+                "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
+            }
+        },
+        "post-processing": {
+            "json_to_tab": false,
+            "tab_to_bracket":false
+        },
+        "evaluation": true,
+        "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
+    }
+}
+
+
diff --git a/code/discut22_1.py b/code/discut22_1.py
index ca66db7eac21dc47468013d4f07fdd282b544c69..1a40cfe5c0206ff3826bf173ef805dcb0345c92b 100644
--- a/code/discut22_1.py
+++ b/code/discut22_1.py
@@ -8,6 +8,8 @@
 import os
 import sys
 import argparse
+import re
+from datetime import datetime
 import pandas as pd # for futur clean output in df
 import json
 
@@ -22,11 +24,11 @@ import utils.training_allennlp as tr_allen
 
 
 # fonction to get config stuffs
-def get_config_infos(config_file):
+def get_config_infos(stamp, config_file):
 
     with open(config_file) as f:
         infos = json.load(f)
-    data_in = Input(infos['input'])
+    data_in = Input(infos['input'], stamp)
     actions = Process(infos['steps'], data_in)
     print(f"data to be process : {data_in.name}")
     return actions
@@ -38,13 +40,20 @@
 
     if name == "tony":
         arch = "french_tokens.tar.gz"
-        if not os.path.isfile(f"../model/{arch}"):
+        if not os.path.isfile(f"../model/{name}/{arch}"):
             dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
             os.system(dl)
         else:
             print("Tony already in place !")
-    return f"../model/{arch}"
+    return f"../model/{name}/{arch}"
+
+
+def text_tokenization(f_in, f_out, lang, tool):
+    if lang == "fr" :
+        if tool == "spacy" :
+            tk.main(f_in, f_out) # .ss -> .tok
+
 
 
@@ -69,9 +78,9 @@ def main(steps):
     #### Tokenization du text
     # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
         data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
         data_tok = f"{steps.data.path}/{steps.data.name}.tok"
-        # sys.exit("check path")
         print(f"Starting Tokenization...to {data_tok}")
-        tk.main(data_in, data_tok) # .ss -> .tok
+        #tk.main(f_in, f_out) # .ss -> .tok
+        text_tokenization(data_in, data_tok, steps.data.lang, steps.toke_tool) # .ss -> .tok
     else:
         data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
 
@@ -124,9 +133,6 @@ def main(steps):
     else:
         print(" pb define model")
 
-
-
-
     if steps.post_tab == True :
     #### Appliquer les predictions au texte et sortir le texte tokenisé avec la colone des tags-prédis
     # # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
@@ -190,7 +196,14 @@ if __name__ == '__main__':
     parser.add_argument('--config', help='Config file in JSON')
     args = parser.parse_args()
     config = args.config
-    steps = get_config_infos(config)
+    now = datetime.now()
+    stamp = re.sub('[\s:]', '_', str(now))
+    my_logs = {}
+    my_logs['stamp'] = stamp
+
+    steps = get_config_infos(stamp, config)
+    print(stamp)
 
     main(steps)
+    print("Done.")
\ No newline at end of file
diff --git a/global_config_file_guideline.md b/global_config_file_guideline.md
new file mode 100644
index 0000000000000000000000000000000000000000..580bd7d576dfa97043b48369e38324a56455c724
--- /dev/null
+++ b/global_config_file_guideline.md
@@ -0,0 +1,129 @@
+# DisCut22 - Global Config File Guideline
+
+## Good practice tips
+
+- You can rename the `config_global.json` file as you like: a good practice is one experiment = one global config file. If you do so, do not forget to use that file name when you run the main command `python discut22.py --config **config_XX.json**`.
+- Field values can only be:
+    - [boolean] `true`, `false`,
+    - [string] `"my_string_in_between_quote_marks"`,
+    - or `null`.
+- In this documentation, values of fields shown in **bold** cannot be changed and are specific to the usecase.
+- Keep commas as in the templates to avoid JSON format errors.
+- A complete example of a global config file is given in the **Complete example** section at the end of this guideline.
+
+
+## For Usecase 1 : **Discourse Segmentation**
+
+- `"usecase_description":` [string] This field is not a functional one. You can describe your project or keep the default text. e.g. ```"Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter."```
+
+- `"input":` { These fields are mandatory for every usecase.
+
+    - `"name":` [string] The name of your input dataset, without the extension. This is also the name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
+    - `"file":` [string] The extension of your input dataset, which reflects its format.
+        - OPTIONS : [".conllu", ".tok", ".ttok", ".ss"]
+    - `"language":` [string] Language ID of your dataset, following the ISO 639-1 code. e.g. ```"en"```
+
+- `"steps":` {
+    - `"main":` [string] : **"annotation"**
+
+    - `"pre-processing":` {
+        - `"tokenization":` [false, true] *available for FR*
+        - `"sentence_split":` [false, true] *available for FR*
+        - `"sentence_split_splitor":` [string] The toolkit you want to use for sentence splitting.
+            - OPTIONS : ["stanza"]
+        - `"syntactic_parsing":` [boolean] : **false** *Not yet available*
+        - `"NER_format_initialisation":` [boolean] Set to true if you are working with ToNy. *Set to true anyway ??*
+
+    - `"discourse_segmenter":` {
+        - `"model":` [string] The name of, or the path to, the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
+        - `"training":` {
+            - `"toolkit":` **null**
+            - `"pre_trained_lm":` **null**
+            - `"config_file":` **null**
+            - `"train_data_path":` **null**
+            - `"validation_data_path":` **null**
+
+    - `"post-processing":` { The AllenNLP toolkit outputs a JSON file.
+        - `"json_to_tab":` [boolean] Set to true if you also want a conll-style output with the predictions as the last column.
+        - `"tab_to_bracket":` [boolean] Set to true if you also want an output of the raw text with brackets as EDU delimiters. If so, `"json_to_tab"` must be set to true as well.
+
+    - `"evaluation":` [boolean] : **false**
+    - `"gold_test_data_path":` [string] **null**
+
+
+## For Usecase 2 : **Segmentation Evaluation**
+
+- `"usecase_description":` [string] This field is not a functional one. You can describe your project or keep the default text. ```"Config file for usecase_2 : Take a EDU gold segmented text au format tok as input, use a loaded model to make predictions. Output scores of model predictions against gold"```
+
+- `"input":` { These fields are mandatory for every usecase.
+
+    - `"name":` [string] The name of your input dataset, without the extension. This is also the name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
+    - `"file":` [string] The extension of your input dataset, which reflects its format.
+        - OPTIONS : [".conllu", ".tok", ".ttok", ".ss"]
+    - `"language":` [string] Language ID of your dataset, following the ISO 639-1 code. e.g. ```"en"```
+
+- `"steps":` {
+    - `"main":` [string] : **"test"**
+
+    - `"pre-processing":` {
+        - `"tokenization":` [false, true] *available for FR*
+        - `"sentence_split":` [false, true] *available for FR*
+        - `"sentence_split_splitor":` [string] The toolkit you want to use for sentence splitting.
+            - OPTIONS : ["stanza"]
+        - `"syntactic_parsing":` [boolean] : **false** *Not yet available*
+        - `"NER_format_initialisation":` [boolean] Set to true if you are working with ToNy. *Set to true anyway ??*
+
+    - `"discourse_segmenter":` {
+        - `"model":` [string] The name of, or the path to, the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
+        - `"training":` {
+            - `"toolkit":` **null**
+            - `"pre_trained_lm":` **null**
+            - `"config_file":` **null**
+            - `"train_data_path":` **null**
+            - `"validation_data_path":` **null**
+
+    - `"post-processing":` { The AllenNLP toolkit outputs a JSON file.
+        - `"json_to_tab":` [boolean] : **true**
+        - `"tab_to_bracket":` [boolean] Set to true if you also want an output of the raw text with brackets as EDU delimiters. If so, `"json_to_tab"` must be set to true as well.
+
+    - `"evaluation":` [boolean] : **true**
+    - `"gold_test_data_path":` [string] The path to your gold dataset to make predictions on, and to evaluate against.
+
+
+## For Usecase 3 : **Custom Model Creation**
+
+- `"usecase_description":` [string] This field is not a functional one. You can describe your project or keep the default text. ```"Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores."```
+
+- `"input":` { These fields are mandatory for every usecase.
+
+    - `"name":` [string] The name of your input dataset, without the extension. This is also the name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
+    - `"file":` [string] The extension of your input dataset, which reflects its format.
+        - OPTIONS : [".conllu", ".tok", ".ttok", ".ss"]
+    - `"language":` [string] Language ID of your dataset, following the ISO 639-1 code. e.g. ```"en"```
+
+- `"steps":` {
+    - `"main":` [string] : **"train"**
+
+    - `"pre-processing":` {
+        - `"tokenization":` [false, true] *available for FR*
+        - `"sentence_split":` [false, true] *available for FR*
+        - `"sentence_split_splitor":` [string] The toolkit you want to use for sentence splitting.
+            - OPTIONS : ["stanza"]
+        - `"syntactic_parsing":` [boolean] : **false** *Not yet available*
+        - `"NER_format_initialisation":` [boolean] Set to true if you are working with ToNy. *Set to true anyway ??*
+
+    - `"discourse_segmenter":` {
+        - `"model":` **null**
+        - `"training":` {
+            - `"toolkit":` [string] The toolkit used to build your model (to be added : "jiant").
+                - OPTIONS : ["allennlp"]
+            - `"pre_trained_lm":` **bert** (to be added : roberta..)
+            - `"config_file":` [string] The path to the config file for training. e.g. `"../model/config_training.jsonnet"`
+            - `"train_data_path":` [string] The path to your training dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu"` *conflict with training_config ??*
+            - `"validation_data_path":` [string] The path to your development dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"` *idem*
+
+    - `"post-processing":` { The AllenNLP toolkit outputs a JSON file.
+        - `"json_to_tab":` [boolean] : **true**
+        - `"tab_to_bracket":` [boolean] Set to true if you also want an output of the raw text with brackets as EDU delimiters. If so, `"json_to_tab"` must be set to true as well.
+
+    - `"evaluation":` [boolean] : **true**
+    - `"gold_test_data_path":` [string] The path to your gold test dataset to make predictions on, and to evaluate against.
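+
+
+## Complete example
+
+For reference, below is what a complete global config file for Usecase 1 may look like. It mirrors `code/config_global_1.json` shipped with the repository; adapt at least `"name"`, `"file"`, `"language"` and the pre-processing flags to your own dataset.
+
+```json
+{
+    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.",
+    "input": {
+        "name": "chaperontest",
+        "file": ".ss",
+        "language": "fr"
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "tokenization": true,
+            "tokenization_tool" : "spacy",
+            "sentence_split": false,
+            "sentence_split_splitor": null,
+            "syntactic_parsing": false,
+            "NER_format_initialisation": true
+        },
+        "discourse_segmenter": {
+            "model": "tony",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "post-processing": {
+            "json_to_tab": true,
+            "tab_to_bracket": true
+        },
+        "evaluation": false,
+        "gold_test_data_path": null
+    }
+}
+```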
diff --git a/model/config_training_bert.jsonnet b/model/config_training_bert.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..a656ba4f8bb1104aaea62e5acd0fe6a5af66a46b
--- /dev/null
+++ b/model/config_training_bert.jsonnet
@@ -0,0 +1,79 @@
+{
+    "dataset_reader": {
+        "type": "conll2003",
+        "coding_scheme": "BIOUL",
+        "tag_label": "ner",
+        "token_indexers": {
+            "bert": {
+                "type": "bert-pretrained",
+                "do_lowercase": false,
+                "pretrained_model": "bert-base-multilingual-cased",
+                "use_starting_offsets": true
+            },
+            "token_characters": {
+                "type": "characters",
+                "min_padding_length": 3
+            }
+        }
+    },
+    "iterator": {
+        "type": "basic",
+        "batch_size": 2
+    },
+    "model": {
+        "type": "simple_tagger",
+        "encoder": {
+            "type": "lstm",
+            "bidirectional": true,
+            "dropout": 0.5,
+            "hidden_size": 100,
+            "input_size": 896,
+            "num_layers": 2
+        },
+        "text_field_embedder": {
+            "allow_unmatched_keys": true,
+            "embedder_to_indexer_map": {
+                "bert": [
+                    "bert",
+                    "bert-offsets"
+                ],
+                "token_characters": [
+                    "token_characters"
+                ]
+            },
+            "token_embedders": {
+                "bert": {
+                    "type": "bert-pretrained",
+                    "pretrained_model": "bert-base-multilingual-cased"
+                },
+                "token_characters": {
+                    "type": "character_encoding",
+                    "embedding": {
+                        "embedding_dim": 16
+                    },
+                    "encoder": {
+                        "type": "cnn",
+                        "conv_layer_activation": "relu",
+                        "embedding_dim": 16,
+                        "ngram_filter_sizes": [
+                            3
+                        ],
+                        "num_filters": 128
+                    }
+                }
+            }
+        }
+    },
+    "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.ner.conllu",
+    "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.ner.conllu",
+    "trainer": {
+        "cuda_device": 1,
+        "grad_norm": 5,
+        "num_epochs": 4,
+        "num_serialized_models_to_keep": 3,
+        "optimizer": {
+            "type": "bert_adam",
+            "lr": 0.001
+        }
+    }
+}
\ No newline at end of file