Skip to content
Snippets Groups Projects
Commit 2ecf6186 authored by laura.riviere's avatar laura.riviere
Browse files

update Read.me and config files

parent 4c24c831
Branches
No related tags found
1 merge request!2Dev expes
...@@ -41,7 +41,7 @@ pip install -r requirements.txt ...@@ -41,7 +41,7 @@ pip install -r requirements.txt
``` ```
## Configuration file: to choose or to complete ## Configuration file: to choose or to complete
- `code/config_1.json` Config for usecase_1 : take a sentence-split text, apply ToNy, output same text but with EDU brackets. - `code/config_global_X.json` See global_config_file_guideline.md.
## Run usecase 1 ## Run usecase 1
......
{
  "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.",
  "input": {
    "name": "chaperontest",
    "file": ".ss",
    "folder_path": "../data/chaperontest",
    "format": "raw_sentences",
    "language": "fr",
    "gold": false
  },
  "output": {
    "format": "bracket",
    "framework": "sdrt"
  },
  "steps": {
    "main": "annotation",
    "pre-processing": {
      "tokenization": false,
      "sentence_split": false,
      "sentence_split_splitor": false,
      "syntactic_parsing": false,
      "NER_format_initialisation": false
    },
    "discourse_segmenter": {
      "model": "tony"
    },
    "post-processing": {
      "json_to_tab": true,
      "tab_to_bracket": false
    },
    "evaluation": false
  }
}
{
  "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU using an existing model.",
  "input": {
    "name": "chaperontest",
    "file": ".ss",
    "folder_path": "../data/chaperontest",
    "format": "truc",
    "language": "fr",
    "gold": true
  },
  "output": {
    "format": "ner_tok",
    "framework": "rst"
  },
  "steps": {
    "main": "annotation",
    "pre-processing": {
      "tokenization": true,
      "sentence_split": false,
      "sentence_split_splitor": "stanza",
      "syntactic_parsing": false,
      "NER_format_initialisation": true
    },
    "discourse_segmenter": {
      "model": "tony",
      "training": {
        "toolkit": null,
        "pre_trained_lm": null,
        "config_file": null,
        "train_data_path": null,
        "validation_data_path": null
      }
    },
    "post-processing": {
      "json_to_tab": true,
      "tab_to_bracket": true
    },
    "evaluation": false,
    "gold_test_data_path": null
  }
}
{
  "usecase_description": "Config file for usecase_2 : Take a EDU gold segmented text au format tok as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on annodis dev set.",
  "input": {
    "name": "fra.sdrt.annodis_dev",
    "file": ".ttok",
    "folder_path": "../data/fra.sdrt.annodis_dev",
    "format": "truc",
    "language": "fr",
    "gold": true
  },
  "output": {
    "format": "ner_tok",
    "framework": "sdrt"
  },
  "steps": {
    "main": "test",
    "pre-processing": {
      "tokenization": false,
      "sentence_split": true,
      "sentence_split_splitor": "stanza",
      "syntactic_parsing": false,
      "NER_format_initialisation": true
    },
    "discourse_segmenter": {
      "model": "tony"
    },
    "post-processing": {
      "json_to_tab": true,
      "tab_to_bracket": false
    },
    "evaluation": true
  }
}
{
  "usecase_description": "Config file for usecase_2.2 : Take a EDU gold segmented text au format conll as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on annodis dev set.",
  "input": {
    "name": "fra.sdrt.annodis_dev",
    "file": ".conllu",
    "file_options": [".conllu", ".tok"],
    "folder_path": "../data/fra.sdrt.annodis_dev",
    "format": "truc",
    "language": "fr",
    "gold": true
  },
  "output": {
    "format": "ner_tok",
    "framework": "sdrt"
  },
  "steps": {
    "main": "test",
    "pre-processing": {
      "tokenization": false,
      "sentence_split": false,
      "sentence_split_splitor": "stanza",
      "syntactic_parsing": false,
      "NER_format_initialisation": true
    },
    "discourse_segmenter": {
      "model": "tony"
    },
    "post-processing": {
      "json_to_tab": true,
      "tab_to_bracket": false
    },
    "evaluation": true
  }
}
{
  "usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
  "input": {
    "name": "eng.rst.rstdt",
    "file": ".conllu",
    "file_options": [".conllu", ".tok"],
    "folder_path": "../data/eng.rst.rstdt",
    "format": "truc",
    "language": "en",
    "gold": true
  },
  "output": {
    "format": "ner_tok",
    "framework": "rst"
  },
  "steps": {
    "main": "train",
    "pre-processing": {
      "tokenization": false,
      "sentence_split": false,
      "sentence_split_splitor": "stanza",
      "syntactic_parsing": false,
      "NER_format_initialisation": true
    },
    "discourse_segmenter": {
      "model": null,
      "training": {
        "toolkit": "allennlp",
        "pre_trained_lm": "bert",
        "config_file": "../model/config_training.jsonnet",
        "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
        "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
      }
    },
    "post-processing": {
      "json_to_tab": false,
      "tab_to_bracket": false
    },
    "evaluation": true,
    "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
  }
}
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment