Commit 1c404fc4 authored by laura.riviere

add config with path to en model in stout

parent 58b29c78
Merge request: !2 Dev expes
@@ -14,9 +14,11 @@ Code: https://gitlab.inria.fr/andiamo/tony
# Usage
## Usecases
-- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation.
+- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation (illustrated below). --> config_1
-- **Segmentation Evaluation:** Take an EDU gold segmented text as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies.
+- **Segmentation Evaluation:** Take an EDU gold-segmented text as input, use a loaded model to make predictions. Output scores of the model's predictions against the gold standard, along with their discrepancies. --> config_2
+- **Custom Model Creation:** Fine-tune (over one or two levels) a pretrained language model with a specific dataset or a combination of datasets, then make predictions and run evaluation. --> config_3
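As a hypothetical illustration (the bracket placement here is invented; the exact output format depends on the `tab_to_bracket` post-processing step), usecase 1 maps raw text to the same text with EDU brackets:
```
Input : Le chaperon rouge traverse la forêt pour voir sa grand-mère.
Output: [Le chaperon rouge traverse la forêt] [pour voir sa grand-mère.]
```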
## Content description
@@ -40,17 +42,17 @@ pip install -r requirements.txt
## Configuration file: to choose or to complete
- `code/config_1.json` Config for usecase 1: take a sentence-split text, apply ToNy, output the same text but with EDU brackets.
- [TBD: model-training configs and all sorts of cool options]
## Run usecase 1
(go to the `code` directory)
Run this command, where `config_XX.json` is one of the config files described above (e.g. `config_1.json` for usecase 1):
```
-python discut22.py --config config_1.json
+python discut22.py --config config_XX.json
```
## Authors and acknowledgment
Morteza Ezzabady
Amir Zeldes
<!---
@@ -6,8 +6,7 @@
"folder_path": "../data/chaperontest",
"format": "raw_sentences",
"language": "fr",
"gold": false,
"results_path": "../data/chaperontest/results"
"gold": false
},
"output": {
"format": "bracket",
@@ -27,7 +26,8 @@
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
}
},
"evaluation": false
}
}
{
"usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU using an existing model.",
"input": {
"name": "my.cool.project",
"file": [".conllu", ".tok"],
"folder_path": "../data/my.cool.project",
"format": "truc",
"language": "en",
"gold": true
},
"output": {
"format": "ner_tok",
"framework": "rst"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": [false, true],
"sentence_split": [false, true],
"sentence_split_splitor": "stanza",
"syntactic_parsing": [false],
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": [false, true],
"tab_to_bracket": [false, true]
},
"evaluation": false,
"gold_test_data_path": null
}
}
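For orientation, here is a minimal sketch of how a config file like the one above might be loaded into the `steps` object that `discut22.py` reads (`steps.ssplit`, `steps.ssplitor`, `steps.toke`, `steps.data.path`, ...). The real `get_config_infos` is not part of this diff, so the loader below is an assumption about its behavior, not the actual implementation:
```
import json
from types import SimpleNamespace

def load_config(path):
    """Hypothetical stand-in for get_config_infos: maps a config_X.json
    file onto the attributes used in main() below."""
    with open(path, encoding="utf-8") as f:
        cfg = json.load(f)
    pre = cfg["steps"]["pre-processing"]
    return SimpleNamespace(
        data=SimpleNamespace(
            path=cfg["input"]["folder_path"],
            name=cfg["input"]["name"],
            file=cfg["input"]["file"],   # may be a list of extensions, as above
            lang=cfg["input"]["language"],
        ),
        toke=pre["tokenization"],
        ssplit=pre["sentence_split"],
        ssplitor=pre["sentence_split_splitor"],
        model=cfg["steps"]["discourse_segmenter"]["model"],
        evaluation=cfg["steps"]["evaluation"],
    )

# Usage (hypothetical): steps = load_config("config_1.json")
```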
@@ -6,8 +6,7 @@
"folder_path": "../data/fra.sdrt.annodis_dev",
"format": "truc",
"language": "fr",
"gold": true,
"results_path": "../data/fra.sdrt.annodis_dev/results"
"gold": true
},
"output": {
"format": "ner_tok",
......
@@ -7,8 +7,7 @@
"folder_path": "../data/fra.sdrt.annodis_dev",
"format": "truc",
"language": "fr",
"gold": true,
"results_path": "../data/fra.sdrt.annodis_dev/results"
"gold": true
},
"output": {
"format": "ner_tok",
......
@@ -48,7 +48,6 @@ def get_model(model_name):
# main call
def main(steps):
-    #steps = get_config_infos(config) # we get the list of steps
@@ -60,13 +59,12 @@ def main(steps):
    # FN: either we need sentence splitting, or tokenization, or neither of the two
    if steps.ssplit == True : # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
        #### Split text into sentences: not in usecase 1
        if not steps.ssplitor == "stanza" :
            print("please define a sentence splitor") # TODO: raise an error and kill the process instead
        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
        data_tok = f"{steps.data.path}/{steps.data.name}.tok"
        print(f"Starting sentence splitting... output to {steps.data.path}/{steps.data.name}")
-        # ssent.main(data_in, data_tok, "stanza", steps.data.lang)
-        ssent.main(data_in, data_tok, "stanza", steps.data.lang)
+        ssent.main(data_in, data_tok, steps.ssplitor, steps.data.lang)
    elif steps.toke == True :
        #### Tokenize the text # python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
@@ -74,7 +72,6 @@ def main(steps):
-        # sys.exit("check path")
        print(f"Starting tokenization... output to {data_tok}")
        tk.main(data_in, data_tok) # .ss -> .tok
    else:
        data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
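As a companion to the ssplit branch above, here is a minimal sketch of stanza-based sentence splitting of the kind `ssent.main` is called for. This is a hypothetical stand-in: the real `ssent` module is not part of this diff, and the file paths in the usage comment are invented.
```
# Hypothetical stand-in for ssent.main(data_in, data_tok, "stanza", lang):
# reads raw text, splits it into sentences with stanza, and writes one
# sentence per line to the .tok output file.
import stanza

def split_sentences(data_in, data_out, lang):
    stanza.download(lang, verbose=False)  # fetch the model on first run
    nlp = stanza.Pipeline(lang=lang, processors="tokenize")
    with open(data_in, encoding="utf-8") as f:
        doc = nlp(f.read())
    with open(data_out, "w", encoding="utf-8") as out:
        for sent in doc.sentences:
            out.write(sent.text + "\n")

# split_sentences("../data/chaperontest/my.text.txt",
#                 "../data/chaperontest/my.text.tok", "fr")
```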