Merge branch 'refacto-1205' into 'main'

Refacto 1205 See merge request !3

Merge branch 'refacto-1205' into 'main'
b369e12e · larivier · 415aed4a · af630340 · 415aed4a · b369e12e
Commit b369e12e authored 2 years ago by larivier
--- a/code/classes_def.py
+++ b/code/classes_def.py
-# Classes for discut22
-
-
-
-class Input:
-    def __init__(self, infos, stamp):
-        self.name = infos['name']
-        self.lang = infos['language'] 
-#        self.path = infos['folder_path'] # misused
-        self.path = f"../data/{self.name}"
-        self.file = infos['file']
-        self.stamp = stamp
-        self.conv = f"{self.path}/data_converted_{stamp}" # à intégrer
-        self.resu = f"{self.path}/results_{stamp}"
-
-
-class Process:
-    def __init__(self, infos, data):
-        self.data = data
-        self.main = infos["main"] # train test annotation
-
-        self.toke = infos['pre-processing']['tokenization']
-        self.toke_tool = infos['pre-processing']['tokenization_tool']
-        self.ssplit = infos['pre-processing']['sentence_split']
-        self.ssplitor = infos['pre-processing']['sentence_split_splitor']
-        self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
-
-        if self.main == "train":
-            if self.ner_init == True : # à faire en relatif !! split truc
-                self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
-                self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
-            else :
-                self.train_data = infos['discourse_segmenter']['training']['train_data_path']
-                self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
-        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
-        self.tr_config = infos['discourse_segmenter']['training']['config_file']
-        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
-
-        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony 
-
-        self.post_tab = infos['post-processing']['json_to_tab']
-
-        self.eval = infos['evaluation']
-        self.test_data = infos['gold_test_data_path']
-
-        self.post_bracket = infos['post-processing']['tab_to_bracket']
-        self.post_conll = infos['post-processing']['metadata_conll']
-        
\ No newline at end of file
--- a/code/config_global_1.2.json
+++ b/code/config_global_1.2.json
+{
+    "usecase_description": "Config file for usecase_1 : from a raw text, get the same text but with EDU bracket.",
+    "data_raw": {
+        "name": "edgar_poe_en",
+        "exte": ".txt",
+        "language": "en",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "to_do": true,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": false,
+            "create_metadata": {
+                "to_do": false,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "gold_test_data_path": null
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
+    }
+}
+
+
+
--- a/code/config_global_1.json
+++ b/code/config_global_1.json
 {
-    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
-    "input": {
-        "name": "eng_annotation",
-        "file": ".conllu",
-        "language": "en"
+    "usecase_description": "Config file for usecase_1 : from a tokenized text, get the same text but with EDU bracket.",
+    "data_raw": {
+        "name": "edgar_poe_short",
+        "exte": ".conll",
+        "language": "en",
+        "existing_metadata": true
    },
    "steps":{
        "main": "annotation",
        "pre-processing": {
-            "tokenization": false,
-            "tokenization_tool" : null,
-            "sentence_split": false,
-            "sentence_split_splitor": null,
-            "syntactic_parsing": false, 
-            "NER_format_initialisation": true
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": true,
+                "line": "paragraph",
+                "sent": "sent"
+            }
        },
        "discourse_segmenter": {
-            "model": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/results_2022-11-21_15_42_42.923648/model.tar.gz",
+            "model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
            "training": {
                "toolkit": null,
                "pre_trained_lm": null,
@@ -25,13 +30,20 @@
                "validation_data_path": null
            }
        },
-        "post-processing": {
-            "json_to_tab": true,
-            "tab_to_bracket":true
-        },
-        "evaluation": false,
        "gold_test_data_path": null
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
    }
 }


+
--- a/code/config_global_2.json
+++ b/code/config_global_2.json
 {
-    "usecase_description": "Config file for usecase_2",
-    "input": {
+    "usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
+    "data_raw": {
        "name": "fra.sdrt.annodis_dev",
-        "file": ".conllu",
-        "language": "fr"
+        "exte": ".conllu",
+        "language": "fr",
+        "existing_metadata": true
    },
    "steps":{
        "main": "test",
        "pre-processing": {
-            "tokenization": false,
-            "tokenization_tool" : "spacy",
-            "sentence_split": false,
-            "sentence_split_splitor": "stanza",
-            "syntactic_parsing": false, 
-            "NER_format_initialisation": true
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": false,
+                "line": "paragraph",
+                "sent": "sent"
+            }
        },
        "discourse_segmenter": {
            "model": "tony",
@@ -25,14 +30,20 @@
                "validation_data_path": null
            }
        },
-        "post-processing": {
-            "json_to_tab": true,
-            "metadata_conll": true,
-            "tab_to_bracket":true
-        },
-        "evaluation": true,
        "gold_test_data_path": null
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
    }
 }


+
--- a/code/config_global_3.json
+++ b/code/config_global_3.json
-{
-    "usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
-    "input": {
-        "name": "eng.sdrt.stac",
-        "file": ".conllu",
-        "language": "en"
-    },
-    "steps":{
-        "main": "train",
-        "pre-processing": {
-            "tokenization": false,
-            "tokenization_tool" : null,
-            "sentence_split": false,
-            "sentence_split_splitor": null,
-            "syntactic_parsing": false, 
-            "NER_format_initialisation": true
-        },
-        "discourse_segmenter": {
-            "model": null,
-            "training": {
-                "toolkit": "allennlp",
-                "pre_trained_lm": "bert",
-                "config_file": "../model/config_training_bert.jsonnet",
-                "train_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
-                "validation_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
-            }
-        },
-        "post-processing": {
-            "json_to_tab": false,
-            "tab_to_bracket":false
-        },
-        "evaluation": true,
-        "gold_test_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_test.conllu"
-    }
-}
-
-
--- a/code/discut22_1.py
+++ b/code/discut22_1.py
-######################################
-###### DISCOURSE SEGMENTOR 2022 ######
-######################################
-""" This the main script
-    And the only one to run,
-    after completion of config.json """
-
-import os
-import sys
-import argparse
-import re
-from datetime import datetime
-import pandas as pd # for futur clean output in df
-import json 
-
-from classes_def import Input, Process
-import utils
-#import utils.fr_tokenize as tk
-import utils.conv2ner as c2n
-import utils.json2conll as j2c
-import utils.conll2bracket as c2bracket
-import utils.sent_split as ssent
-import utils.training_allennlp as tr_allen
-
-
-# fonction to get config stuffs
-def get_config_infos(stamp, config_file):
-
-    with open(config_file) as f:
-        infos = json.load(f)
-    data_in = Input(infos['input'], stamp)
-    actions = Process(infos['steps'], data_in)
-    print(f"data to be process : {data_in.name}")
-    return actions
-
-
-# fonction to load existing model -> only tony for now
-def get_model(model_name):
-    name = model_name
-    output = ""
-
-    if name == "tony": 
-        arch = "french_tokens.tar.gz"
-        if not os.path.isfile(f"../model/{name}/{arch}"):
-            dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
-            os.system(dl)
-            output = f"../model/{name}/{arch}"
-        else:
-            print("Tony already in place !")
-            output = f"../model/{name}/{arch}"
-
-    else:
-        output = model_name
-
-    return output
-
-
-def text_tokenization(f_in, f_out, lang, tool):
-    if lang == "fr" :
-        if  tool == "spacy" :
-            tk.main(f_in, f_out) # .ss -> .tok
-
-
-
-
-def main(steps):
-    
-
-    
-    # FN: soit besoin sent split, soit besoin tokenizer, soit aucun des deux
-    if steps.ssplit == True :       # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
-    #### Split text into sentence : not in usecase1
-        if not steps.ssplitor == "stanza" :
-            print("pls define sentence splitor") # raise error n kill process
-        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-        data_tok = f"{steps.data.path}/{steps.data.name}.tok"
-        print(f"Starting sentence spliting...to {steps.data.path}/steps.data.name")
-        ssent.main(data_in, data_tok, steps.ssplitor, steps.data.lang)
-    elif steps.toke == True :
-    #### Tokenization du text        # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok 
-        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-        data_tok = f"{steps.data.path}/{steps.data.name}.tok"
-        print(f"Starting Tokenization...to {data_tok}")
-        #tk.main(f_in, f_out) # .ss -> .tok
-        text_tokenization(data_in, data_tok, steps.data.lang, steps.toke_tool) # .ss -> .tok
-    else:
-        data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-
-
-    if steps.ner_init == True:
-        if steps.main == "test" or steps.main =="annotation":
-    #### Conversion en NER pb        # #python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok
-            data_ner = f"{steps.data.path}/{steps.data.name}.ner.tok"
-            print(f"Starting conversion to NER format...to {data_ner}")
-            c2n.main(data_tok, data_ner, steps.data.file)
-        elif steps.main == "train":
-            for part in ["train", "dev", "test"]:
-                data_tok = f"{steps.data.path}/{steps.data.name}_{part}{steps.data.file}"
-                data_ner = f"{steps.data.path}/{steps.data.name}_{part}.ner{steps.data.file}"
-                print("Starting conversion to NER format...to {}".format(data_ner))
-                c2n.main(data_tok, data_ner, steps.data.file)
-
-
-    # Create the results directory
-    if not os.path.isdir(steps.data.resu):
-        print(" result directory does not exist yet")
-        os.mkdir(steps.data.resu)
-
-
-    if steps.main == "train":
-        if steps.toolkit == "allennlp":
-            print("toolkit allennlp for training")
-            tr_allen.main(steps)
-            # set the value of model from null to what was just created by training
-            steps.model = f"{steps.data.resu}/model.tar.gz"
-        elif steps.toolkit == "jiant":
-            print("Jiant toolkit not ready")
-        else :
-            print("toolkit unknown")
-        
-        #check config train file
-    elif steps.main == "test" or steps.main =="annotation":
-    #### Appliquer le model choisi, sortir le JSON avec les predictions :score, proba, tags
-    # #allennlp predict --use-dataset-reader --output-file ${RESULT_DIR}/${FILE}.json ${MODEL} ${RAW}.ner.tok
-        print(f"Checking for model...{steps.model}")
-        model_path = get_model(steps.model)
-        print(f"model{model_path}")
-        data_json = f"{steps.data.resu}/{steps.data.name}.json"
-        print(f"datapred: {data_json}\n")
-        print(f"input: {data_ner}\n")
-        cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_ner} &> {steps.data.resu}/logs.txt"
-        print(f"comd{cmd}")
-        print("Starting Prediction...")
-        os.system(cmd)
-    #### ------------------------------- TBD do the same but with python script (or JIANT ??)
-    else:
-        print(" pb define model")
-
-    if steps.post_tab == True :
-    #### Appliquer les predictions au texte et sortir le texte tokenisé avec la colone des tags-prédis     
-    # # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
-        data_conll = f"{steps.data.resu}/{steps.data.name}.split.tok"
-        format = "split.tok" # to retrive from config file !!!
-        print(f"Starting Formating from json to tok format...to {data_conll}")
-        j2c.main(data_json, format, data_conll)
-
-    ####### EVALUATION AGAINST GOLD
-    # python discut/code/utils/seg_eval.py data_gold data_pred (-s)
-    if steps.eval == True : 
-        if steps.main == "train":
-            data_gold = steps.test_data # (())== data NER because of ner_init == true((deleted))
-            if steps.ner_init == True :
-                data_gold_ner = f"{steps.data.path}/{steps.data.name}_test.ner.conllu"
-
-            # make predictions on test_data
-            model_path = steps.model # model just been created
-            # data_json about to be created by predict cmd
-            data_json = f"{steps.data.resu}/{steps.data.name}_test.predictions.json" ## à faire en relatif !! [opt : --silent ??]
-            cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold_ner} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder &> {steps.data.resu}/logs.txt"
-            #cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold}  &> {steps.data.resu} /logs.txt"
-            print("Starting Prediction...")
-            print(f"cmd prediction: {cmd}")
-            os.system(cmd)
-            
-            data_conll = f"{steps.data.resu}/{steps.data.name}_test.predictions.conll" ## à faire en relatif
-            print(f"Starting Formating from json to tok format...to {data_conll}")
-            j2c.main(data_json, "split.tok", data_conll)
-            print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}")
-            data_g = "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
-            data_p = "../data/eng.rst.rstdt/results/eng.rst.rstdt_test.predictions.conll" # == data_conll
-            cmd = f"python utils/seg_eval.py {data_gold} {data_conll} &> {steps.data.resu}/Evaluation.txt"
-            os.system(cmd)
-
-
-        else :
-            data_gold = data_tok # changer les noms des var, c'est pas clair !
-            data_pred = data_conll #
-            cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt"
-            os.system(cmd)
-
-
-    if steps.post_conll == True:
-        f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok"
-        predictions = open(f_pred, 'r')
-        first_line = predictions.readline()
-        columns = first_line.split("\t")
-        predictions.close()
-
-        f_out = f"{steps.data.resu}/{steps.data.name}_full_output.conllu"
-        with open(f_out, "w") as fo:
-            f_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-            with open(f_in, "r") as fi:
-                f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok"
-                with open(f_pred, "r") as fp:
-                    df = pd.read_csv(fp, header=None, sep="\t", usecols=[len(columns)-1])
-                    #df = df.dropna()
-                    print(f"longueur={len(df)}")
-                    print(f"line bug: {df.iloc[3047-148:3060-148,:]}\n")
-                    print(f"type {type(df.iloc[4,:])}")
-                    i = 0
-                    for line in fi:
-                        line = line.strip()
-                        if line.startswith("#"):
-                            fo.write(f"{line}\n")
-                        elif line == "":
-                            fo.write(f"{line}\n")
-                            i +=1
-                        else:
-                            
-                            fo.write(f"{line}")
-
-                            labels = df.iloc[i,:].values.tolist()
-                            for tag in labels:
-                                fo.write(f"\t{tag}")
-
-                            fo.write("\n")
-                            #fo.write(f"{df.iloc[i,:]}\n")
-                            i += 1
-                            #print(f"i::{i}\t")
-                        
-
-
-
-    if steps.post_bracket == True :
-    ####prendre le texte tokénisé+tags-prédits et sortir le texte en plain (format du d'ebut, for now en suite de phrases) avec les brackets    
-    # # #python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok >  ${RESULT_DIR}/${FILE}.split.tok.bracket
-        data_bracket = f"{steps.data.resu}/{steps.data.name}.split.tok.bracket"
-        print(f"Starting formating into bracket text...to {data_bracket}")
-        c2bracket.main(data_conll, data_bracket)
-    
-
-
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config', help='Config file in JSON')
-    args = parser.parse_args()
-    config = args.config
-
-    now = datetime.now()
-    #stamp = re.sub('[\s:]', '_', str(now))
-    stamp = "debug1205"
-    my_logs = {}
-    my_logs['stamp'] = stamp
-
-    steps = get_config_infos(stamp, config)
-    print(stamp)
-    main(steps)
-
-    #print("Done.")
\ No newline at end of file
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
+######################################
+###### DISCOURSE SEGMENTOR 2022 ######
+######################################
+""" This is the main script,
+    and the only one to run,
+    after completing config.json.
+    Discut22 uses the allennlp toolkit, which needs an intermediary NER format.
+"""
+
+import argparse
+from datetime import datetime
+import os
+import re
+import json
+import utils.syntactic_parsing as synt_pars
+import utils.conv2ner as conv_to_ner # TODO clean it
+import utils.json2conll as json_to_connl # TODO clean it
+import utils.training_allennlp as tr_allen
+import utils.conll2bracket as c2bracket
+import utils.seg_eval as seg_eval
+
+
+
+class Data:
+    def __init__(self, infos, stamp):
+        self.name = infos['name']
+        self.lang = infos['language']
+        self.path = f"../data/{self.name}"
+        self.exte = infos['exte']
+        self.raw = f"{self.path}/{self.name}{self.exte}"
+        self.stamp = stamp
+        self.conv = f"{self.path}/data_converted_{stamp}"
+        self.resu = f"{self.path}/results_{stamp}"
+        self.meta = infos['existing_metadata']
+
+    def create_folders(self): # -> can be rtansfor into method of class
+        for it in [self.conv, self.resu]:
+            print(f"----> Checking/creating folder {it}.")
+            if not os.path.isdir(it):
+                os.mkdir(it)
+            my_logs['folders'] = f"{self.conv}, {self.resu}"
+
+    def pre_processing(self, steps):
+        print("----> Preprocessing input data.")
+        file_in = self.raw
+        if steps.pre_process_to_do == True:
+            file_out = f"{self.conv}/{self.name}.conll"
+            if steps.synt_tool == "stanza":
+                processors = []
+                metadata = {}
+                if steps.toke == True:
+                    processors.extend(['tokenize', 'mwt'])
+                if steps.synt_parse == True:
+                    processors.extend(['pos', 'lemma', 'depparse'])
+                #if steps.ssplit == True:
+                #    processors.append('constituency')
+                if steps.crea_meta == True:
+                    metadata['line'] = steps.meta_line
+                    metadata['sent'] = steps.meta_sent
+                if data.meta == True:
+                    metadata['meta'] = True
+                processors_str = ",".join(processors)
+                synt_pars.with_stanza(data.lang, file_in, file_out, processors_str, metadata)
+            else:
+                exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
+        else:
+            file_out = file_in
+        my_logs['data_preprocessed'] = file_out
+        self.preprocessed = file_out 
+    
+    def make_ner_format(self):
+        """
+        This fonction build the NER format upon the Segmentor works.
+        INPUT: Tokenized text with whatever number of columns.
+        OUTPUT: Tokenized text with just 4 columns.
+        """
+        self.ner = f"{self.preprocessed}.ner"
+        self.ner = f"{self.conv}/{self.name}.conll.ner"
+        print(f"----> Making NER format {self.ner}.")
+        conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO faire en relatif#TODO add same for train/dev/test for config train
+        my_logs['data_ner'] = self.ner
+
+    def make_predictions(self, steps):
+        self.pred_json = f"{self.resu}/{self.name}_pred.json"
+        cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {steps.model_path} {self.ner} &> {self.resu}/logs_predictions.txt"
+        print(f"----> Making predictions: {cmd}.")
+        os.system(cmd)
+        my_logs['predictions_cmd'] = cmd
+
+
+    def pred_json_to_conll_w_metadata_w_gold(self): # here and 3 below..sorry..factorsation TBD
+        self.pred_conll_meta_gold = f"{self.resu}/{self.name}_pred_meta_gold.conll"
+        json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed)
+        return self.pred_conll_meta_gold
+
+    def pred_json_to_conll_w_metadata(self):
+        self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
+        json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed) 
+        return self.pred_meta_conll
+
+    def pred_json_to_conll_w_gold(self):
+        self.pred_conll_gold = f"{self.resu}/{self.name}_pred_gold.conll"
+        json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll") 
+        return self.pred_conll_gold
+
+    def pred_json_to_conll(self):
+        self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
+        json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll") 
+        return self.pred_conll
+
+    def brackets_txt(self):
+        self.brack = f"{self.resu}/{self.name}_brac.txt"
+        c2bracket.conll2brackets(self.pred_conll, self.brack)
+
+    def brackets_txt_with_metadata(self):
+        self.brack_meta = f"{self.resu}/{self.name}_brac_meta.txt"
+        c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)
+
+
+    def evaluation(self, prod):
+        self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"
+
+        if self.exte == ".conll" or self.exte == ".conllu": # get gold file
+            gold = self.raw
+        else:
+            gold = self.preprocessed
+
+        if prod.conll_todo == False:    # get pred_file
+            pred = self.pred_json_to_conll()
+        else:                       
+            if prod.conll_meta == True:
+                if prod.conll_w_gold == True:
+                    pred = self.pred_json_to_conll_w_metadata_w_gold()
+                else:
+                    pred = self.pred_json_to_conll_w_metadata()
+            else:
+                if prod.conll_w_gold == True:
+                    pred = self.pred_json_to_conll_w_gold()
+                else:
+                    pred = self.pred_json_to_conll()
+
+        print(f"----> Predictions to file {pred}")
+        print(f"----> Evaluation scores to file {self.basic_metrics}")
+        scores_dict = seg_eval.get_scores(gold, pred)
+        with open(self.basic_metrics, 'w') as fo:
+            json.dump(scores_dict, fo)
+
+
+class Output:
+    def __init__(self, infos):
+        self.conll_todo = infos['conll_file']['to_do']
+        self.conll_meta = infos['conll_file']['metadata']
+        self.conll_w_gold = infos['conll_file']['with_gold_labels']
+        self.txt_todo = infos['txt_file']['to_do']
+        self.txt_meta = infos['txt_file']['metadata']
+
+
+
+class Process:
+    def __init__(self, infos):
+        self.main = infos["main"] # train test annotation
+
+        self.pre_process_to_do = infos['pre-processing']['to_do']
+        self.synt_tool = infos['pre-processing']['syntactic_tool']
+        self.synt_parse = infos['pre-processing']['syntactic_parsing']
+        self.toke = infos['pre-processing']['tokenization']
+        self.ssplit = infos['pre-processing']['sentence_split']
+        self.crea_meta = infos['pre-processing']['create_metadata']['to_do']
+        self.meta_line = infos['pre-processing']['create_metadata']['line']
+        self.meta_sent = infos['pre-processing']['create_metadata']['sent']
+
+        #if self.main == "train":
+            #if self.ner_init == True : # à faire en relatif !! split truc
+            #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
+            #    self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
+            #else :
+            #    self.train_data = infos['discourse_segmenter']['training']['train_data_path']
+            #    self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
+        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
+        self.tr_config = infos['discourse_segmenter']['training']['config_file']
+        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
+        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony 
+
+        self.test_data = infos['gold_test_data_path']
+
+    def get_evaluation_status(self):
+        if self.main == "test":
+            self.eval = True
+        #elif self.main == "train":
+
+    def get_model(self):
+        self.model_path = ""
+        if self.model == "tony": 
+            arch = "french_tokens.tar.gz"
+            if not os.path.isfile(f"../model/tony/{arch}"):
+                dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
+                os.system(dl)
+                self.model_path = f"../model/tony/{arch}"
+            else:
+                print("----> Tony already in place !")
+                self.model_path = f"../model/tony/{arch}"
+        else:
+            self.model_path = self.model
+
+
+def get_stamp():
+    now = datetime.now()
+    stamp = re.sub('[\s:]', '_', str(now))
+    return stamp
+
+def get_config_infos(config, stamp):
+    with open(config, 'r', encoding='utf-8') as f:
+        infos = json.load(f)
+        data = Data(infos['data_raw'], stamp)
+        steps = Process(infos['steps'])
+        prod = Output(infos['output'])
+        my_logs["config"] = infos
+    return data, steps, prod
+
+
+def print_logs(dict_logs):
+    file_logs = f"{data.resu}/processes_logs.json"
+    with open(file_logs, 'w') as fi:
+        json.dump(dict_logs, fi, indent=4)
+    
+
+
+
+if __name__ == '__main__':
+    my_logs = {}
+    stamp = get_stamp()
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--config', help='Config file in JSON.')
+    parser.add_argument('--name',default=stamp , help='Run name.')
+    args = parser.parse_args()
+    config = args.config
+    stamp = args.name
+    my_logs["stamp"] = stamp
+   
+    data, steps, prod = get_config_infos(config, stamp)
+    data.create_folders()
+    data.pre_processing(steps)
+    data.make_ner_format()
+    steps.get_model()
+
+    if steps.main == "annotation" or steps.main == "test":
+        data.make_predictions(steps) # output allennlp JSON
+    #elif steps.main == "train":
+
+    steps.get_evaluation_status()
+    if steps.eval == True:
+        data.evaluation(prod)
+        
+
+    print_logs(my_logs) # <-- attention variable globale !
\ No newline at end of file
--- a/code/utils/conll2bracket.py
+++ b/code/utils/conll2bracket.py
@@ -50,8 +50,26 @@ def conll2brackets(in_f, out_f):
                    start = False
            file_out.write("]\n\n")
            
-def main(f_in, f_out):
-    input = f_in
-    output = f_out
-    
-    conll2brackets(input, output)
\ No newline at end of file
+def conll2brackets_with_meta(in_f, out_f):
+    start = True
+    input = in_f
+
+    with open(out_f, 'w') as file_out:
+        with open(in_f, 'r') as input:
+            for line in input: 
+                if line.startswith("#"):
+                    file_out.write(f"{line}\n")
+                elif line.strip()=="":
+                    file_out.write("]")
+                    file_out.write("\n\n")
+                    start = True
+                else:
+                    n, word, *junk, tag = line.split()
+                    if tag=="BeginSeg=Yes":
+                        if not(start):
+                            file_out.write("] ")
+                        file_out.write(f"[ {word} ")
+                    else:
+                        file_out.write(f"{word} ")
+                    start = False
+            file_out.write("]\n\n")
\ No newline at end of file
--- a/code/utils/conv2ner.py
+++ b/code/utils/conv2ner.py
@@ -110,8 +110,8 @@ def conversion2ner(input, output, params=None):
                        # then, previous token label is set to B-E to signal end of previous segment
                        res[-1][-1] = "B-E"
                    start_doc = False
-                    if label not in maptags:
-                        print("warning, strange label ",label,file=sys.stderr)
+                    #if label not in maptags:
+                        #print("warning, strange label ",label,file=sys.stderr)
                    res.append([w,pos,"O",tag])
                    
            for line in res:

--- a/code/utils/json2conll.py
+++ b/code/utils/json2conll.py
@@ -5,6 +5,7 @@ conll format

 import json
 import sys
+import re

 #filepath = sys.argv[1]
 #config = sys.argv[2]
@@ -26,7 +27,6 @@ def js2conll(filepath, fileoutpath, config):
    data = [] 
    for line in open(filepath, 'r'):
        data.append(json.loads(line))
-
    with open(fileoutpath, 'w') as f_out:
        for doc in data:
            tokens = zip(doc["words"],doc["tags"])
@@ -37,8 +37,62 @@ def js2conll(filepath, fileoutpath, config):
            f_out.write("\n")
            #print()

-def main(f_in, form, f_out):
-    input = f_in
-    output = f_out
-    forma = form
-    js2conll(input, output, forma) 
\ No newline at end of file
+def js2conllNmeta(data_pred_json, data_out, config, data_meta):
+    """Merge predicted tags into a CoNLL file, replacing its last column.
+
+    ``data_pred_json`` holds one JSON object per line, each with a ``tags``
+    list. ``data_meta`` is walked in parallel: the k-th block of token lines
+    (blocks are separated by blank lines) is paired with the k-th JSON object,
+    and the i-th token line of a block with the i-th tag.
+
+    For every token line the last tab-separated column is dropped and the
+    mapped predicted label is appended instead. Lines starting with ``#`` and
+    blank lines are copied through.
+
+    Args:
+        data_pred_json: path of the JSON-lines prediction file.
+        data_out: path of the file to write (UTF-8).
+        config: not used by this function; kept for signature compatibility.
+        data_meta: path of the tab-separated file carrying metadata and tokens.
+
+    Raises:
+        IndexError: if ``data_meta`` has more sentences or tokens than the
+            predictions.
+        KeyError: if a predicted tag is missing from the ``map`` table.
+    """
+    # Context manager so the prediction file is closed deterministically.
+    with open(data_pred_json, 'r', encoding='utf-8') as fp:
+        data = [json.loads(line) for line in fp]
+
+    sent_pred_count = 0  # index of the prediction paired with the current sentence
+    tok = 0              # index of the next token inside that sentence
+
+    # Read the meta file as UTF-8, matching the encoding used for the output.
+    with open(data_out, 'w', encoding='utf-8') as fo, open(data_meta, 'r', encoding='utf-8') as fm:
+        for line in fm:
+            line = line.strip()
+            if line.startswith("#"):
+                fo.write(f"{line}\n")
+            elif line == "":
+                # Only a blank line that really ends a sentence moves on to the
+                # next prediction; leading or repeated blank lines would
+                # otherwise shift every following sentence by one.
+                if tok:
+                    sent_pred_count += 1
+                    tok = 0
+                fo.write("\n")
+            else:
+                tag = data[sent_pred_count]['tags'][tok]
+                tok += 1
+                # Drop the last tab-separated column before appending the label.
+                new_line = re.sub(r'\t[^\t]+$', '', line)
+                # `map` is not defined in this block: it is looked up as a
+                # global at call time, presumably the tag -> label table of
+                # this module (TODO confirm).
+                fo.write(f"{new_line}\t{map[tag]}\n")
+                
+def js2conllNmetaNgold(data_pred_json, data_out, config, gold_n_meta):
+    """Append predicted tags as an extra column to a CoNLL file.
+
+    ``data_pred_json`` holds one JSON object per line, each with a ``tags``
+    list. ``gold_n_meta`` is walked in parallel: the k-th block of token lines
+    (blocks are separated by blank lines) is paired with the k-th JSON object,
+    and the i-th token line of a block with the i-th tag.
+
+    Every token line is kept whole and the mapped predicted label is appended
+    after a tab. Lines starting with ``#`` and blank lines are copied through.
+
+    Args:
+        data_pred_json: path of the JSON-lines prediction file.
+        data_out: path of the file to write (UTF-8).
+        config: not used by this function; kept for signature compatibility.
+        gold_n_meta: path of the tab-separated file carrying metadata and tokens.
+
+    Raises:
+        IndexError: if ``gold_n_meta`` has more sentences or tokens than the
+            predictions.
+        KeyError: if a predicted tag is missing from the ``map`` table.
+    """
+    # Context manager so the prediction file is closed deterministically.
+    with open(data_pred_json, 'r', encoding='utf-8') as fp:
+        data = [json.loads(line) for line in fp]
+
+    sent_pred_count = 0  # index of the prediction paired with the current sentence
+    tok = 0              # index of the next token inside that sentence
+
+    # Read the input as UTF-8, matching the encoding used for the output.
+    with open(data_out, 'w', encoding='utf-8') as fo, open(gold_n_meta, 'r', encoding='utf-8') as fm:
+        for line in fm:
+            line = line.strip()
+            if line.startswith("#"):
+                fo.write(f"{line}\n")
+            elif line == "":
+                # Only a blank line that really ends a sentence moves on to the
+                # next prediction; leading or repeated blank lines would
+                # otherwise shift every following sentence by one.
+                if tok:
+                    sent_pred_count += 1
+                    tok = 0
+                fo.write("\n")
+            else:
+                tag = data[sent_pred_count]['tags'][tok]
+                tok += 1
+                # `map` is not defined in this block: it is looked up as a
+                # global at call time, presumably the tag -> label table of
+                # this module (TODO confirm).
+                fo.write(f"{line}\t{map[tag]}\n")
\ No newline at end of file
--- a/code/utils/seg_eval.py
+++ b/code/utils/seg_eval.py
@@ -68,9 +68,9 @@ Arguments:

 """

-__author__ = "Amir Zeldes"
+#__author__ = "Amir Zeldes"
 __license__ = "Apache 2.0"
-__version__ = "1.0.1"
+#__version__ = "1.0.1"

 def parse_data(infile, string_input=False):
 	if not string_input:
@@ -222,22 +222,3 @@ def get_scores(gold_file, pred_file, string_input=False):

 	return score_dict

-
-if __name__ == "__main__":
-	p = argparse.ArgumentParser()
-
-	p.add_argument("goldfile",help="Shared task gold file in .tok or .conll format")
-	p.add_argument("predfile",help="Corresponding file with system predictions")
-	p.add_argument("-s","--string_input",action="store_true",help="Whether inputs are file names or strings")
-
-	opts = p.parse_args()
-
-	score_dict = get_scores(opts.goldfile,opts.predfile,opts.string_input)
-
-	print("File: " + score_dict["doc_name"])
-	print("o Total tokens: " + str(score_dict["tok_count"]))
-	print("o Gold " +score_dict["seg_type"]+": " + str(score_dict["gold_seg_count"]))
-	print("o Predicted "+score_dict["seg_type"]+": " + str(score_dict["pred_seg_count"]))
-	print("o Precision: " + str(score_dict["prec"]))
-	print("o Recall: " + str(score_dict["rec"]))
-	print("o F-Score: " + str(score_dict["f_score"]))
--- a/code/utils/syntactic_parsing.py
+++ b/code/utils/syntactic_parsing.py
+import stanza
+from stanza.utils.conll import CoNLL
+
+
+
+
+
+
+
+def with_stanza(lang, f_in, f_out, process, meta):
+    """Run a Stanza pipeline over a text file and write one row per token.
+
+    ``f_in`` is read line by line (each line stripped of surrounding
+    whitespace):
+
+    * a line starting with ``#`` is copied only when ``meta['meta']`` is True;
+    * an empty line is written as an empty line;
+    * any other line is parsed; each sentence found is written as one
+      tab-separated row per token, followed by an empty line.
+
+    Each row is built with Stanza's ``CoNLL.convert_token_dict``, whose field
+    order is: ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC
+    (MISC holding ``start_char|end_char``).
+
+    Args:
+        lang: language code given to ``stanza.download`` and ``stanza.Pipeline``.
+        f_in: path of the UTF-8 text file to read.
+        f_out: path of the UTF-8 file to write.
+        process: value forwarded as the pipeline's ``processors`` argument.
+        meta: dict of optional settings. ``'meta'`` (True to keep comment
+            lines), ``'line'`` (prefix of a ``#<prefix>-<line number>`` header
+            written before each parsed line), ``'sent'`` (prefix of a
+            ``#<prefix>-<sentence number>`` header, followed by a ``#text=``
+            line, written before each sentence).
+    """
+    stanza.download(lang)
+    pipeline = stanza.Pipeline(lang, processors=process, use_gpu=True)
+
+    with open(f_in, 'r', encoding='utf-8') as src, open(f_out, 'w', encoding='utf-8') as dst:
+        # Line numbers are 1-based and count every input line, parsed or not.
+        for line_no, raw_line in enumerate(src, start=1):
+            text = raw_line.strip()
+
+            if text.startswith("#"):
+                if "meta" in meta and meta['meta'] == True:
+                    dst.write(f"{text}\n")
+                continue
+
+            if text == "":
+                dst.write("\n")
+                continue
+
+            if "line" in meta:
+                dst.write(f"#{meta['line']}-{line_no}\n")
+
+            parsed = pipeline(text)
+            # Sentence numbering restarts at 1 for every input line.
+            for sent_no, sent in enumerate(parsed.sentences, start=1):
+                if "sent" in meta:
+                    dst.write(f"#{meta['sent']}-{sent_no}\n#text=\"{sent.text}\"\n")
+
+                for token in sent.tokens:
+                    # Only the first dict returned by to_dict() is converted.
+                    fields = CoNLL.convert_token_dict(token.to_dict()[0])
+                    dst.write("\t".join(fields) + "\n")
+
+                dst.write("\n")
+
+            
\ No newline at end of file
--- a/code/utils/training_allennlp.py
+++ b/code/utils/training_allennlp.py
@@ -33,9 +33,18 @@ def main(steps):
    #### train, has_per == False
    # allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder 
    # allennlp train -s Resultts_conllu/results_eng.rst.rstdt_bert ../code/utils/configs/bert.jsonnet ....
+    
+    # Discut - repo morteza
+    #allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/bert.jsonnet
+    cmd2 = f"allennlp train -s {steps.data.resu} {tr_config}"
+    
+    # Discut-gitlab
    cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder" 
-    print(cmd)
-    os.system(cmd)
+    
+    
+    
+    print(cmd2)
+    os.system(cmd2)
    # then...

    # TODO:

--- a/data/edgar_poe_en/edgar_poe_en.txt
+++ b/data/edgar_poe_en/edgar_poe_en.txt
--- a/data/edgar_poe_short/edgar_poe_short.conll
+++ b/data/edgar_poe_short/edgar_poe_short.conll