diff --git a/README.md b/README.md index 618ec4315d2db5f2703f572826fecb935a67c89b..d0dcb14010b9d10cc3a79741177038b7de0758db 100644 --- a/README.md +++ b/README.md @@ -14,9 +14,11 @@ Code: https://gitlab.inria.fr/andiamo/tony # Usage ## Usecases -- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation. +- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation. --> config_1 -- **Segmentation Evaluation:** Take an EDU gold segmented text as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. +- **Segmentation Evaluation:** Take an EDU gold segmented text as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. --> config_2 + +- **Custom Model Creation:** Fine-tune (over one or two levels) a pretrained Language Model with a specific dataset or combination of datasets. Then make predictions and evaluate them. --> config_3 ## Content description @@ -40,17 +42,17 @@ pip install -r requirements.txt ## Configuration file: to chose or to complete - `code/config_1.json` Config for usecase_1 : take a sentence splited text, apply ToNy, output same text but with EDU brackets. 
-- [TBD : train models config and all sort of cool options] + ## Run usecase 1 (go to `code` directory) Run this command: ``` -python discut22.py --config config_1.json +python discut22.py --config config_XX.json ``` ## Authors and acknowledgment -Morteza Ezzabady +Morteza Ezzabady Amir Zeldes <!--- diff --git a/code/config_1.json b/code/config_1.json index 712bdd1903198bc16ecfa13fa5e20b4027a60af0..ed6c31f65e83f917c5cd4f24d603274593b807a8 100644 --- a/code/config_1.json +++ b/code/config_1.json @@ -6,8 +6,7 @@ "folder_path": "../data/chaperontest", "format": "raw_sentences", "language": "fr", - "gold": false, - "results_path": "../data/chaperontest/results" + "gold": false }, "output": { "format": "bracket", @@ -27,7 +26,8 @@ "post-processing": { "json_to_tab": true, "tab_to_bracket":true - } + }, + "evaluation": false } } diff --git a/code/config_1_fanny.json b/code/config_1_fanny.json new file mode 100644 index 0000000000000000000000000000000000000000..7a1362011d34690391d85c82fccbb14ab40c5965 --- /dev/null +++ b/code/config_1_fanny.json @@ -0,0 +1,43 @@ +{ + "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU using an existing model.", + "input": { + "name": "my.cool.project", + "file": [".conllu", ".tok"], + "folder_path": "../data/my.cool.project", + "format": "truc", + "language": "en", + "gold": true + }, + "output": { + "format": "ner_tok", + "framework": "rst" + }, + "steps":{ + "main": "annotation", + "pre-processing": { + "tokenization": [false, true], + "sentence_split": [false, true], + "sentence_split_splitor": "stanza", + "syntactic_parsing": [false], + "NER_format_initialisation": true + }, + "discourse_segmenter": { + "model": "/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz", + "training": { + "toolkit": null, + "pre_trained_lm": null, + "config_file": null, + "train_data_path": null, + "validation_data_path": null + } + }, + "post-processing": { + "json_to_tab": [false, 
true], + "tab_to_bracket": [false, true] + }, + "evaluation": false, + "gold_test_data_path": null + } +} + + diff --git a/code/config_2.json b/code/config_2.json index 5876381e9f607355baf1c99350ffbdc27bcc8bed..a8334462ffa8e651457dd6ece221c3b13c135a22 100644 --- a/code/config_2.json +++ b/code/config_2.json @@ -6,8 +6,7 @@ "folder_path": "../data/fra.sdrt.annodis_dev", "format": "truc", "language": "fr", - "gold": true, - "results_path": "../data/fra.sdrt.annodis_dev/results" + "gold": true }, "output": { "format": "ner_tok", diff --git a/code/config_3.json b/code/config_3.json index f28b55a0b3f79ad27888ebaa74b8ca8b70d88e95..67b6262a132371d381d63d65d34dc95d5efe9f36 100644 --- a/code/config_3.json +++ b/code/config_3.json @@ -7,8 +7,7 @@ "folder_path": "../data/fra.sdrt.annodis_dev", "format": "truc", "language": "fr", - "gold": true, - "results_path": "../data/fra.sdrt.annodis_dev/results" + "gold": true }, "output": { "format": "ner_tok", diff --git a/code/discut22_1.py b/code/discut22_1.py index fd531d6ec0bd919ca713a7fe036f07e37c62843c..ca66db7eac21dc47468013d4f07fdd282b544c69 100644 --- a/code/discut22_1.py +++ b/code/discut22_1.py @@ -48,7 +48,6 @@ def get_model(model_name): -# main call def main(steps): #steps = get_config_infos(config) # on obtient la liste des trucs @@ -60,13 +59,12 @@ def main(steps): # FN: soit besoin sent split, soit besoin tokenizer, soit aucun des deux if steps.ssplit == True : # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data #### Split text into sentence : not in usecase1 + if not steps.ssplitor == "stanza" : + print("pls define sentence splitor") # raise error n kill process data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}" data_tok = f"{steps.data.path}/{steps.data.name}.tok" print(f"Starting sentence spliting...to {steps.data.path}/steps.data.name") - # ssent.main(data_in, data_tok, "stanza", steps.data.lang) - - ssent.main(data_in, data_tok, "stanza", steps.data.lang) - + 
ssent.main(data_in, data_tok, steps.ssplitor, steps.data.lang) elif steps.toke == True : #### Tokenization du text # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}" @@ -74,7 +72,6 @@ def main(steps): # sys.exit("check path") print(f"Starting Tokenization...to {data_tok}") tk.main(data_in, data_tok) # .ss -> .tok - else: data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"