Commit 4c24c831 authored by laura.riviere

add documentation and new config files

parent 1c404fc4
Merge request !2 (Dev expes)
@@ -23,15 +23,15 @@ Code: https://gitlab.inria.fr/andiamo/tony
## Content description
[TBD: explain directories automatically created during script runs]
- `data/MyProjet/` Contains input data, raw and/or pre-processed format(s).
- `results/` Contains output data, scores and post-processed data. (Also logs of allennlp)
- `data/my.cool.dataset/` Contains input data, raw and/or pre-processed format(s).
- `results.{stamp}/` Contains output data, scores and post-processed data. (Also logs of allennlp)
- `code/` Contains main scripts.
- `discut22_1.py` One python script to run them all.
- `config_XX.json` A file to be completed (or a dir with a choice between simple use_case configs and a template for a custom config).
- `config_XX.json` A file to be completed for your specific project (or a dir with a choice between simple use_case configs and a template for a custom config). See `global_config_file_guideline.md`.
- `utils/` Contains useful scripts to be called.
- `model/` Contains model to be loaded or created.
- `config_training.jsonnet` A file to be completed. (TBD automatically saved with model when done)
- `documentation.md` Contains detailed documentation (TBD?)
- `global_config_file_guideline.md` Contains detailed documentation to build well formed config file.
## Set up environment
- Conda environment for Python 3.7 (TBD?)
@@ -51,8 +51,8 @@ Run this command:
python discut22.py --config config_XX.json
```
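Each run is tagged with a timestamp (`stamp`) that is appended to the output directory names (e.g. `results_{stamp}`). A minimal sketch of how the main script builds it in this commit (the dataset name and resulting path are illustrative):

```python
# Sketch of the per-run stamp used to name output directories
# (mirrors the logic added to the main script in this commit).
import re
from datetime import datetime

stamp = re.sub(r'[\s:]', '_', str(datetime.now()))
print(stamp)  # e.g. "2022-11-25_14_03_52.123456"
print(f"../data/my.cool.dataset/results_{stamp}")  # illustrative results directory
```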
## Support
laura.riviere@irit.fr
## Authors and acknowledgment
Morteza Ezzabady
Laura Rivière
Amir Zeldes
<!---
...
@@ -3,14 +3,15 @@
class Input:
def __init__(self, infos):
def __init__(self, infos, stamp):
self.name = infos['name']
self.lang = infos['language']
self.path = infos['folder_path'] # misused
self.file = infos['file']
self.form = infos['format'] # not used
self.gold = infos['gold'] # not used
self.resu = f"{self.path}/results"
# self.path = infos['folder_path'] # misused
self.path = f"../data/{self.name}"
self.file = infos['file']
self.stamp = stamp
self.conv = f"{self.path}/data_converted_{stamp}" # to be integrated
self.resu = f"{self.path}/results_{stamp}"
class Process:
@@ -18,10 +19,11 @@ class Process:
self.data = data
self.main = infos["main"] # train test annotation
self.toke = infos['pre-processing']['tokenization'] # not used
self.toke = infos['pre-processing']['tokenization']
self.toke_tool = infos['pre-processing']['tokenization_tool']
self.ssplit = infos['pre-processing']['sentence_split']
self.ssplitor = infos['pre-processing']['sentence_split_splitor']
self.ner_init = infos['pre-processing']['NER_format_initialisation']
self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
if self.main == "train":
if self.ner_init == True : # TODO: make this relative !! split stuff
@@ -41,12 +43,5 @@ class Process:
self.eval = infos['evaluation']
self.test_data = infos['gold_test_data_path']
#if self.eval == True :
# if self.ner_init == True :
# self.test_data = f"{self.data.path}/{self.data.name}_test.ner{self.data.file}"
# #self.test_data = infos['gold_test_data_path']
# else :
# self.test_data = infos['gold_test_data_path']
self.post_bracket = infos['post-processing']['tab_to_bracket']
\ No newline at end of file
@@ -15,17 +15,18 @@
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": true,
"tokenization": false,
"sentence_split": false,
"sentence_split_splitor": false,
"syntactic_parsing": false,
"NER_format_initialisation": true
"NER_format_initialisation": false
},
"discourse_segmenter": {
"model": "tony"
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
"tab_to_bracket":false
},
"evaluation": false
}
...
{
"usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU using an existing model.",
"input": {
"name": "my.cool.project",
"file": [".conllu", ".tok"],
"folder_path": "../data/my.cool.project",
"name": "chaperontest",
"file": ".ss",
"folder_path": "../data/chaperontest",
"format": "truc",
"language": "en",
"language": "fr",
"gold": true
},
"output": {
@@ -15,14 +15,14 @@
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": [false, true],
"sentence_split": [false, true],
"tokenization": true,
"sentence_split": false,
"sentence_split_splitor": "stanza",
"syntactic_parsing": [false],
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz",
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
@@ -32,8 +32,8 @@
}
},
"post-processing": {
"json_to_tab": [false, true],
"tab_to_bracket": [false, true]
"json_to_tab": true,
"tab_to_bracket": true
},
"evaluation": false,
"gold_test_data_path": null
...
{
"usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.",
"input": {
"name": "chaperontest",
"file": ".ss",
"language": "fr"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": true,
"tokenization_tool" : "spacy",
"sentence_split": false,
"sentence_split_splitor": null,
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
},
"evaluation": false,
"gold_test_data_path": null
}
}
{
"usecase_description": "Config file for usecase_2",
"input": {
"name": "fra.sdrt.annodis_dev",
"file": ".ttok",
"language": "fr"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : "spacy",
"sentence_split": true,
"sentence_split_splitor": "stanza",
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
},
"evaluation": false,
"gold_test_data_path": null
}
}
{
"usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
"input": {
"name": "eng.rst.rstdt",
"file": ".conllu",
"language": "en"
},
"steps":{
"main": "train",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : "spacy",
"sentence_split": false,
"sentence_split_splitor": "stanza",
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": null,
"training": {
"toolkit": "allennlp",
"pre_trained_lm": "bert",
"config_file": "../model/config_training_bert.jsonnet",
"train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
"validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
}
},
"post-processing": {
"json_to_tab": false,
"tab_to_bracket":false
},
"evaluation": true,
"gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
}
}
@@ -8,6 +8,8 @@
import os
import sys
import argparse
import re
from datetime import datetime
import pandas as pd # for future clean output in df
import json
@@ -22,11 +24,11 @@ import utils.training_allennlp as tr_allen
# function to get config info
def get_config_infos(config_file):
def get_config_infos(stamp, config_file):
with open(config_file) as f:
infos = json.load(f)
data_in = Input(infos['input'])
data_in = Input(infos['input'], stamp)
actions = Process(infos['steps'], data_in)
print(f"data to be process : {data_in.name}")
return actions
@@ -38,13 +40,20 @@ def get_model(model_name):
if name == "tony":
arch = "french_tokens.tar.gz"
if not os.path.isfile(f"../model/{arch}"):
if not os.path.isfile(f"../model/{name}/{arch}"):
dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
os.system(dl)
else:
print("Tony already in place !")
return f"../model/{arch}"
return f"../model/{name}/{arch}"
def text_tokenization(f_in, f_out, lang, tool):
if lang == "fr" :
if tool == "spacy" :
tk.main(f_in, f_out) # .ss -> .tok
@@ -69,9 +78,9 @@ def main(steps):
#### Tokenization of the text # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
data_tok = f"{steps.data.path}/{steps.data.name}.tok"
# sys.exit("check path")
print(f"Starting Tokenization...to {data_tok}")
tk.main(data_in, data_tok) # .ss -> .tok
#tk.main(f_in, f_out) # .ss -> .tok
text_tokenization(data_in, data_tok, steps.data.lang, steps.toke_tool) # .ss -> .tok
else:
data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
@@ -124,9 +133,6 @@ def main(steps):
else:
print(" pb define model")
if steps.post_tab == True :
#### Apply the predictions to the text and output the tokenized text with the predicted tags as a column
# # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
@@ -190,7 +196,14 @@ if __name__ == '__main__':
parser.add_argument('--config', help='Config file in JSON')
args = parser.parse_args()
config = args.config
steps = get_config_infos(config)
now = datetime.now()
stamp = re.sub('[\s:]', '_', str(now))
my_logs = {}
my_logs['stamp'] = stamp
steps = get_config_infos(stamp, config)
print(stamp)
main(steps)
print("Done.")
\ No newline at end of file
# DisCut22 - Global Config File Guideline
## Good practice tips
- You can rename the `config_global.json` file as convenient: a good practice is to use one global config file per experiment. If so, do not forget to type your file name when you run the main command: `python discut22.py --config config_XX.json`
- Field values can only be:
- [boolean] `true`, `false`,
- [string] `"my_string_in_between_quote_marks"`,
- or `null`.
- In this documentation, values of fields in **bold** cannot be changed and are specific to the usecase.
- Keep commas as in the templates to avoid JSON format errors (a quick validity check is sketched below).
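If you want to make sure a config file is well formed before launching a run, a minimal check is the sketch below (the file name is just an example):

```python
# Minimal JSON sanity check: fails with an explicit line/column error if a comma or quote is wrong.
import json

with open("config_XX.json") as f:
    config = json.load(f)  # raises json.JSONDecodeError on malformed JSON
print("Config OK:", list(config.keys()))
```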
## For Usecase 1 : **Discourse Segmentation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. e.g. ```"Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter."```
- `"input":` { These fields are mandatory for every usecase.
- `"name":` [string] The name of your input dataset, without the extension. This is also the name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"annotation"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` [string] Here the name or the path to the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
- `"training":` {
- `"toolkit":` **null**
- `"pre_trained_lm":` **null**
- `"config_file":` **null**
- `"train_data_path":` **null**
- `"validation_data_path":` **null**
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] Set to true if you want also a conll-style output with predictions as last column.
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **false**
- `"gold_test_data_path":` [string] **null**
## For Usecase 2 : **Segmentation Evaluation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. ```"Config file for usecase_2 : Take a EDU gold segmented text au format tok as input, use a loaded model to make predictions. Output scores of model predictions against gold"```
- `input:`{ These fields are mandatory for every Usecases.
- `"name":` [string] The name of your input dataset, without the extension. This is also the same name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"test"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` [string] Here the name or the path to the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
- `"training":` {
- `"toolkit":` **null**
- `"pre_trained_lm":` **null**
- `"config_file":` **null**
- `"train_data_path":` **null**
- `"validation_data_path":` **null**
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] : **true**
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **true**
- `"gold_test_data_path":` [string] The path to your gold dataset to make predictions to, and to evaluate against.
## For Usecase 3 : **Custom Model Creation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. ```"Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores."```
- `input:`{ These fields are mandatory for every Usecases.
- `"name":` [string] The name of your input dataset, without the extension. This is also the same name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"train"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` **null**
- `"training":` {
- `"toolkit":` [string] The toolkit to build your model (to be added : "jiant").
- OPTIONS : ["allennlp"]
- `"pre_trained_lm":` **bert** (to be added : roberta..)
- `"config_file":` [string] The path to the config file for training. e.g. `"../model/config_training.jsonnet"`
- `"train_data_path":` [string] The path to your training dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu"` *conflict with training_config ??*
- `"validation_data_path":` [string] The path to your development dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"` *idem*
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] : **true**
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **true**
- `"gold_test_data_path":` [string] The path to your gold test dataset to make predictions on, and to evaluate against.
{
"dataset_reader": {
"type": "conll2003",
"coding_scheme": "BIOUL",
"tag_label": "ner",
"token_indexers": {
"bert": {
"type": "bert-pretrained",
"do_lowercase": false,
"pretrained_model": "bert-base-multilingual-cased",
"use_starting_offsets": true
},
"token_characters": {
"type": "characters",
"min_padding_length": 3
}
}
},
"iterator": {
"type": "basic",
"batch_size": 2
},
"model": {
"type": "simple_tagger",
"encoder": {
"type": "lstm",
"bidirectional": true,
"dropout": 0.5,
"hidden_size": 100,
"input_size": 896,
"num_layers": 2
},
"text_field_embedder": {
"allow_unmatched_keys": true,
"embedder_to_indexer_map": {
"bert": [
"bert",
"bert-offsets"
],
"token_characters": [
"token_characters"
]
},
"token_embedders": {
"bert": {
"type": "bert-pretrained",
"pretrained_model": "bert-base-multilingual-cased"
},
"token_characters": {
"type": "character_encoding",
"embedding": {
"embedding_dim": 16
},
"encoder": {
"type": "cnn",
"conv_layer_activation": "relu",
"embedding_dim": 16,
"ngram_filter_sizes": [
3
],
"num_filters": 128
}
}
}
}
},
"train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.ner.conllu",
"validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.ner.conllu",
"trainer": {
"cuda_device": 1,
"grad_norm": 5,
"num_epochs": 4,
"num_serialized_models_to_keep": 3,
"optimizer": {
"type": "bert_adam",
"lr": 0.001
}
}
}
\ No newline at end of file