From 6744dc6b6848374e1a258e98d33302e14fbbe847 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Tue, 3 Jan 2023 18:51:40 +0100
Subject: [PATCH] change to class methods

---
 code/classes_def.py                           |  48 ----
 code/classes_def_2.py                         |  55 ----
 code/config_global_1.2.json                   |   4 +-
 code/discut22_1.py                            | 251 ------------------
 code/discut22_2.py                            | 234 ++++++++++------
 code/utils/conll2bracket.py                   |  28 +-
 code/utils/json2conll.py                      |  11 +-
 .../syntactic_parsing.cpython-37.pyc          | Bin 1509 -> 1517 bytes
 code/utils_2/syntactic_parsing.py             |   6 +-
 9 files changed, 187 insertions(+), 450 deletions(-)
 delete mode 100644 code/classes_def.py
 delete mode 100644 code/classes_def_2.py
 delete mode 100644 code/discut22_1.py

diff --git a/code/classes_def.py b/code/classes_def.py
deleted file mode 100644
index b8ca4bd..0000000
--- a/code/classes_def.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Classes for discut22
-
-
-
-class Input:
-    def __init__(self, infos, stamp):
-        self.name = infos['name']
-        self.lang = infos['language'] 
-#        self.path = infos['folder_path'] # misused
-        self.path = f"../data/{self.name}"
-        self.file = infos['file']
-        self.stamp = stamp
-        self.conv = f"{self.path}/data_converted_{stamp}" # à intégrer
-        self.resu = f"{self.path}/results_{stamp}"
-
-
-class Process:
-    def __init__(self, infos, data):
-        self.data = data
-        self.main = infos["main"] # train test annotation
-
-        self.toke = infos['pre-processing']['tokenization']
-        self.toke_tool = infos['pre-processing']['tokenization_tool']
-        self.ssplit = infos['pre-processing']['sentence_split']
-        self.ssplitor = infos['pre-processing']['sentence_split_splitor']
-        self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
-
-        if self.main == "train":
-            if self.ner_init == True : # à faire en relatif !! split truc
-                self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
-                self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
-            else :
-                self.train_data = infos['discourse_segmenter']['training']['train_data_path']
-                self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
-        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
-        self.tr_config = infos['discourse_segmenter']['training']['config_file']
-        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
-
-        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony 
-
-        self.post_tab = infos['post-processing']['json_to_tab']
-
-        self.eval = infos['evaluation']
-        self.test_data = infos['gold_test_data_path']
-
-        self.post_bracket = infos['post-processing']['tab_to_bracket']
-        self.post_conll = infos['post-processing']['metadata_conll']
-        
\ No newline at end of file
diff --git a/code/classes_def_2.py b/code/classes_def_2.py
deleted file mode 100644
index 9233d21..0000000
--- a/code/classes_def_2.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Classes for discut22
-
-
-
-class Data:
-    def __init__(self, infos, stamp):
-        self.name = infos['name']
-        self.lang = infos['language']
-        self.path = f"../data/{self.name}"
-        self.exte = infos['exte']
-        self.stamp = stamp
-        self.conv = f"{self.path}/data_converted_{stamp}" # à intégrer
-        self.resu = f"{self.path}/results_{stamp}"
-        self.meta = infos['existing_metadata']
-
-
-class Process:
-    def __init__(self, infos):
-        self.main = infos["main"] # train test annotation
-
-        self.pre_process_to_do = infos['pre-processing']['to_do']
-        self.synt_tool = infos['pre-processing']['syntactic_tool']
-        self.synt_parse = infos['pre-processing']['syntactic_parsing']
-        self.toke = infos['pre-processing']['tokenization']
-        self.ssplit = infos['pre-processing']['sentence_split']
-        self.crea_meta = infos['pre-processing']['create_metadata']['to_do']
-        self.meta_line = infos['pre-processing']['create_metadata']['line']
-        self.meta_sent = infos['pre-processing']['create_metadata']['sent']
-
-
-        #if self.main == "train":
-            #if self.ner_init == True : # à faire en relatif !! split truc
-            #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
-            #    self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
-            #else :
-            #    self.train_data = infos['discourse_segmenter']['training']['train_data_path']
-            #    self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
-        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
-        self.tr_config = infos['discourse_segmenter']['training']['config_file']
-        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
-
-        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony 
-
-        #self.post_tab = infos['post-processing']['json_to_tab']
-
-        self.eval = infos['evaluation']
-        self.test_data = infos['gold_test_data_path']
-
-        
-        
-class Output:
-    def __init__(self, infos):
-        self.prod_bracket = infos['file']['tab_to_bracket']
-        self.prod_conll = infos['file']['conllu']
-        self.metadata = infos['file']['metadata']
\ No newline at end of file
diff --git a/code/config_global_1.2.json b/code/config_global_1.2.json
index 0e71493..4b51333 100644
--- a/code/config_global_1.2.json
+++ b/code/config_global_1.2.json
@@ -13,9 +13,9 @@
             "syntactic_tool": "stanza",
             "sentence_split": true,
             "tokenization": true,
-            "syntactic_parsing": true,
+            "syntactic_parsing": false,
             "create_metadata": {
-                "to_do": true,
+                "to_do": false,
                 "line": "paragraph",
                 "sent": "sent"
             }
diff --git a/code/discut22_1.py b/code/discut22_1.py
deleted file mode 100644
index 7a8739b..0000000
--- a/code/discut22_1.py
+++ /dev/null
@@ -1,251 +0,0 @@
-######################################
-###### DISCOURSE SEGMENTOR 2022 ######
-######################################
-""" This the main script
-    And the only one to run,
-    after completion of config.json """
-
-import os
-import sys
-import argparse
-import re
-from datetime import datetime
-import pandas as pd # for futur clean output in df
-import json 
-
-from classes_def import Input, Process
-import utils
-#import utils.fr_tokenize as tk
-import utils.conv2ner as c2n
-import utils.json2conll as j2c
-import utils.conll2bracket as c2bracket
-import utils.sent_split as ssent
-import utils.training_allennlp as tr_allen
-
-
-# fonction to get config stuffs
-def get_config_infos(stamp, config_file):
-
-    with open(config_file) as f:
-        infos = json.load(f)
-    data_in = Input(infos['input'], stamp)
-    actions = Process(infos['steps'], data_in)
-    print(f"data to be process : {data_in.name}")
-    return actions
-
-
-# fonction to load existing model -> only tony for now
-def get_model(model_name):
-    name = model_name
-    output = ""
-
-    if name == "tony": 
-        arch = "french_tokens.tar.gz"
-        if not os.path.isfile(f"../model/{name}/{arch}"):
-            dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
-            os.system(dl)
-            output = f"../model/{name}/{arch}"
-        else:
-            print("Tony already in place !")
-            output = f"../model/{name}/{arch}"
-
-    else:
-        output = model_name
-
-    return output
-
-
-def text_tokenization(f_in, f_out, lang, tool):
-    if lang == "fr" :
-        if  tool == "spacy" :
-            tk.main(f_in, f_out) # .ss -> .tok
-
-
-
-
-def main(steps):
-    
-
-    
-    # FN: soit besoin sent split, soit besoin tokenizer, soit aucun des deux
-    if steps.ssplit == True :       # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
-    #### Split text into sentence : not in usecase1
-        if not steps.ssplitor == "stanza" :
-            print("pls define sentence splitor") # raise error n kill process
-        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-        data_tok = f"{steps.data.path}/{steps.data.name}.tok"
-        print(f"Starting sentence spliting...to {steps.data.path}/steps.data.name")
-        ssent.main(data_in, data_tok, steps.ssplitor, steps.data.lang)
-    elif steps.toke == True :
-    #### Tokenization du text        # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok 
-        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-        data_tok = f"{steps.data.path}/{steps.data.name}.tok"
-        print(f"Starting Tokenization...to {data_tok}")
-        #tk.main(f_in, f_out) # .ss -> .tok
-        text_tokenization(data_in, data_tok, steps.data.lang, steps.toke_tool) # .ss -> .tok
-    else:
-        data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-
-
-    if steps.ner_init == True:
-        if steps.main == "test" or steps.main =="annotation":
-    #### Conversion en NER pb        # #python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok
-            data_ner = f"{steps.data.path}/{steps.data.name}.ner.tok"
-            print(f"Starting conversion to NER format...to {data_ner}")
-            c2n.main(data_tok, data_ner, steps.data.file)
-        elif steps.main == "train":
-            for part in ["train", "dev", "test"]:
-                data_tok = f"{steps.data.path}/{steps.data.name}_{part}{steps.data.file}"
-                data_ner = f"{steps.data.path}/{steps.data.name}_{part}.ner{steps.data.file}"
-                print("Starting conversion to NER format...to {}".format(data_ner))
-                c2n.main(data_tok, data_ner, steps.data.file)
-
-
-    # Create the results directory
-    if not os.path.isdir(steps.data.resu):
-        print(" result directory does not exist yet")
-        os.mkdir(steps.data.resu)
-
-
-    if steps.main == "train":
-        if steps.toolkit == "allennlp":
-            print("toolkit allennlp for training")
-            tr_allen.main(steps)
-            # set the value of model from null to what was just created by training
-            steps.model = f"{steps.data.resu}/model.tar.gz"
-        elif steps.toolkit == "jiant":
-            print("Jiant toolkit not ready")
-        else :
-            print("toolkit unknown")
-        
-        #check config train file
-    elif steps.main == "test" or steps.main =="annotation":
-    #### Appliquer le model choisi, sortir le JSON avec les predictions :score, proba, tags
-    # #allennlp predict --use-dataset-reader --output-file ${RESULT_DIR}/${FILE}.json ${MODEL} ${RAW}.ner.tok
-        print(f"Checking for model...{steps.model}")
-        model_path = get_model(steps.model)
-        print(f"model{model_path}")
-        data_json = f"{steps.data.resu}/{steps.data.name}.json"
-        print(f"datapred: {data_json}\n")
-        print(f"input: {data_ner}\n")
-        cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_ner} &> {steps.data.resu}/logs.txt"
-        print(f"comd{cmd}")
-        print("Starting Prediction...")
-        os.system(cmd)
-    #### ------------------------------- TBD do the same but with python script (or JIANT ??)
-    else:
-        print(" pb define model")
-
-    if steps.post_tab == True :
-    #### Appliquer les predictions au texte et sortir le texte tokenisé avec la colone des tags-prédis     
-    # # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
-        data_conll = f"{steps.data.resu}/{steps.data.name}.split.tok"
-        format = "split.tok" # to retrive from config file !!!
-        print(f"Starting Formating from json to tok format...to {data_conll}")
-        j2c.main(data_json, format, data_conll)
-
-    ####### EVALUATION AGAINST GOLD
-    # python discut/code/utils/seg_eval.py data_gold data_pred (-s)
-    if steps.eval == True : 
-        if steps.main == "train":
-            data_gold = steps.test_data # (())== data NER because of ner_init == true((deleted))
-            if steps.ner_init == True :
-                data_gold_ner = f"{steps.data.path}/{steps.data.name}_test.ner.conllu"
-
-            # make predictions on test_data
-            model_path = steps.model # model just been created
-            # data_json about to be created by predict cmd
-            data_json = f"{steps.data.resu}/{steps.data.name}_test.predictions.json" ## à faire en relatif !! [opt : --silent ??]
-            cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold_ner} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder &> {steps.data.resu}/logs.txt"
-            #cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold}  &> {steps.data.resu} /logs.txt"
-            print("Starting Prediction...")
-            print(f"cmd prediction: {cmd}")
-            os.system(cmd)
-            
-            data_conll = f"{steps.data.resu}/{steps.data.name}_test.predictions.conll" ## à faire en relatif
-            print(f"Starting Formating from json to tok format...to {data_conll}")
-            j2c.main(data_json, "split.tok", data_conll)
-            print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}")
-            data_g = "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
-            data_p = "../data/eng.rst.rstdt/results/eng.rst.rstdt_test.predictions.conll" # == data_conll
-            cmd = f"python utils/seg_eval.py {data_gold} {data_conll} &> {steps.data.resu}/Evaluation.txt"
-            os.system(cmd)
-
-
-        else :
-            data_gold = data_tok # changer les noms des var, c'est pas clair !
-            data_pred = data_conll #
-            cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt"
-            os.system(cmd)
-
-
-    if steps.post_conll == True:
-        f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok"
-        predictions = open(f_pred, 'r')
-        first_line = predictions.readline()
-        columns = first_line.split("\t")
-        predictions.close()
-
-        f_out = f"{steps.data.resu}/{steps.data.name}_full_output.conllu"
-        with open(f_out, "w") as fo:
-            f_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-            with open(f_in, "r") as fi:
-                f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok"
-                with open(f_pred, "r") as fp:
-                    df = pd.read_csv(fp, header=None, sep="\t", usecols=[len(columns)-1])
-                    #df = df.dropna()
-                    print(f"longueur={len(df)}")
-                    print(f"line bug: {df.iloc[3047-148:3060-148,:]}\n")
-                    print(f"type {type(df.iloc[4,:])}")
-                    i = 0
-                    for line in fi:
-                        line = line.strip()
-                        if line.startswith("#"):
-                            fo.write(f"{line}\n")
-                        elif line == "":
-                            fo.write(f"{line}\n")
-                            i +=1
-                        else:
-                            
-                            fo.write(f"{line}")
-
-                            labels = df.iloc[i,:].values.tolist()
-                            for tag in labels:
-                                fo.write(f"\t{tag}")
-
-                            fo.write("\n")
-                            #fo.write(f"{df.iloc[i,:]}\n")
-                            i += 1
-                            #print(f"i::{i}\t")
-                        
-
-
-
-    if steps.post_bracket == True :
-    ####prendre le texte tokénisé+tags-prédits et sortir le texte en plain (format du d'ebut, for now en suite de phrases) avec les brackets    
-    # # #python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok >  ${RESULT_DIR}/${FILE}.split.tok.bracket
-        data_bracket = f"{steps.data.resu}/{steps.data.name}.split.tok.bracket"
-        print(f"Starting formating into bracket text...to {data_bracket}")
-        c2bracket.main(data_conll, data_bracket)
-    
-
-
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config', help='Config file in JSON')
-    args = parser.parse_args()
-    config = args.config
-
-    now = datetime.now()
-    #stamp = re.sub('[\s:]', '_', str(now))
-    stamp = "_debug1214"
-    my_logs = {}
-    my_logs['stamp'] = stamp
-
-    steps = get_config_infos(stamp, config)
-    print(stamp)
-    main(steps)
-
-    #print("Done.")
\ No newline at end of file
diff --git a/code/discut22_2.py b/code/discut22_2.py
index 0bf7b74..543cd4d 100644
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
@@ -5,25 +5,153 @@
     And the only one to run,
     after completion of config.json 
     Discut22 uses allennlp toolkit. For that, it need NER intermediary format.
-    
-    
-    
-    
-    
-    """
+"""
+
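+# Typical invocation, assuming the --config argument from discut22_1 is kept:
+#   python discut22_2.py --config config_global_1.2.json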
 import argparse
 from datetime import datetime
 import os
 import re
 import json
-from classes_def_2 import Data, Process, Output
 import utils_2.syntactic_parsing as synt_pars
 import utils.conv2ner as conv_to_ner # TODO clean it
 import utils.json2conll as json_to_connl # TODO clean it
 import utils.training_allennlp as tr_allen
+import utils.conll2bracket as c2bracket
+
+
+
+class Data:
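+    """One dataset run: input/output paths, converted-data and results folders,
+    and the pre-processing -> NER -> prediction -> formatting steps applied to it."""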
+    def __init__(self, infos, stamp):
+        self.name = infos['name']
+        self.lang = infos['language']
+        self.path = f"../data/{self.name}"
+        self.exte = infos['exte']
+        self.stamp = stamp
+        self.conv = f"{self.path}/data_converted_{stamp}"
+        self.resu = f"{self.path}/results_{stamp}"
+        self.meta = infos['existing_metadata']
+
+    def create_folders(self): # create the converted-data and results folders if they do not exist yet
+        for it in [self.conv, self.resu]:
+            if not os.path.isdir(it):
+                os.mkdir(it)
+
+    def pre_processing(self, steps):
+        file_in = f"{self.path}/{self.name}{self.exte}"
+        if steps.pre_process_to_do == True:
+            file_out = f"{self.conv}/{self.name}.conll"
+            if steps.synt_tool == "stanza":
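+                # assemble the stanza processors from the config flags,
+                # e.g. "tokenize,mwt,pos,lemma,depparse" when tokenization and parsing are both requested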
+                processors = []
+                metadata = {}
+                if steps.toke == True:
+                    processors.extend(['tokenize', 'mwt'])
+                if steps.synt_parse == True:
+                    processors.extend(['pos', 'lemma', 'depparse'])
+                #if steps.ssplit == True:
+                #    processors.append('constituency')
+                if steps.crea_meta == True:
+                    metadata['line'] = steps.meta_line
+                    metadata['sent'] = steps.meta_sent
+                if self.meta == True:
+                    metadata['meta'] = True
+                processors_str = ",".join(processors)
+                synt_pars.with_stanza(self.lang, file_in, file_out, processors_str, metadata)
+            else:
+                exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
+        else:
+            file_out = file_in
+        my_logs['data_preprocessed'] = file_out
+        self.preprocessed = file_out 
+    
+    def make_ner_format(self):
+        """
+        This fonction build the NER format upon the Segmentor works.
+        INPUT: Tokenized text with whatever number of columns.
+        OUTPUT: Tokenized text with just 4 columns.
+        """
+        self.ner = f"{self.preprocessed}.ner"
+        conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO faire en relatif#TODO add same for train/dev/test for config train
+        my_logs['data_ner'] = self.ner
+
+    def make_predictions(self, steps):
+        self.pred_json = f"{self.resu}/{self.name}_pred.json"
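+        # run the allennlp predictor on the NER-formatted file; stdout/stderr go to logs_predictions.txt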
+        cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {steps.model_path} {self.ner} &> {self.resu}/logs_predictions.txt"
+        os.system(cmd)
+
+    def pred_json_to_conll_with_metadata(self):
+        self.pred_meta_conll = f"{self.resu}/{self.name}_pred_n_meta.conll"
+        json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed) 
+
+    def pred_json_to_conll(self):
+        self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
+        json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll") 
+
+    def brackets_txt(self):
+        self.brack = f"{self.resu}/{self.name}_brac.txt"
+        c2bracket.conll2brackets(self.pred_conll, self.brack)
+
+    def brackets_txt_with_metadata(self):
+        self.brack_meta = f"{self.resu}/{self.name}_brac_meta.txt"
+        c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)
+
+
+
+class Process:
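+    """Pipeline options (pre-processing, segmenter model/training, evaluation) read from the config file."""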
+    def __init__(self, infos):
+        self.main = infos["main"] # "train", "test" or "annotation"
+
+        self.pre_process_to_do = infos['pre-processing']['to_do']
+        self.synt_tool = infos['pre-processing']['syntactic_tool']
+        self.synt_parse = infos['pre-processing']['syntactic_parsing']
+        self.toke = infos['pre-processing']['tokenization']
+        self.ssplit = infos['pre-processing']['sentence_split']
+        self.crea_meta = infos['pre-processing']['create_metadata']['to_do']
+        self.meta_line = infos['pre-processing']['create_metadata']['line']
+        self.meta_sent = infos['pre-processing']['create_metadata']['sent']
+
+
+        #if self.main == "train":
+            #if self.ner_init == True : # à faire en relatif !! split truc
+            #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
+            #    self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
+            #else :
+            #    self.train_data = infos['discourse_segmenter']['training']['train_data_path']
+            #    self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
+        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
+        self.tr_config = infos['discourse_segmenter']['training']['config_file']
+        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
+
+        self.model = infos['discourse_segmenter']['model'] # existing model: a known name (e.g. "tony") or a path to a model archive
+
+        #self.post_tab = infos['post-processing']['json_to_tab']
+
+        self.eval = infos['evaluation']
+        self.test_data = infos['gold_test_data_path']
+
+
+    def get_model(self):
+        self.model_path = ""
+        if self.model == "tony": 
+            arch = "french_tokens.tar.gz"
+            if not os.path.isfile(f"../model/tony/{arch}"):
+                dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
+                os.system(dl)
+                self.model_path = f"../model/tony/{arch}"
+            else:
+                print("Tony already in place !")
+                self.model_path = f"../model/tony/{arch}"
+        else:
+            self.model_path = self.model
 
 
 
+        
+class Output:
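+    """Output products requested in the config: bracket text, conllu file, metadata."""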
+    def __init__(self, infos):
+        self.prod_bracket = infos['file']['tab_to_bracket']
+        self.prod_conll = infos['file']['conllu']
+        self.metadata = infos['file']['metadata']
+
 
 def get_stamp():
     now = datetime.now()
@@ -39,75 +167,11 @@ def get_config_infos(config, stamp):
         my_logs["config"] = infos
     return data, steps, prod
 
-def create_folders(li): # -> can be rtansfor into method of class
-    for it in li:
-        if not os.path.isdir(it):
-            os.mkdir(it)
 
 def print_logs():
     file_logs = f"{data.resu}/processes_logs.json"
-    print(my_logs)
-        
-def pre_processing(data, steps):
-    data_in = f"{data.path}/{data.name}{data.exte}"
-    if steps.pre_process_to_do == True:
-        data_out = f"{data.conv}/{data.name}.conll"
-        if steps.synt_tool == "stanza":
-            processors = []
-            metadata = {}
-            if steps.toke == True:
-                processors.extend(['tokenize', 'mwt'])
-            if steps.synt_parse == True:
-                processors.extend(['pos', 'lemma', 'depparse'])
-            #if steps.ssplit == True:
-            #    processors.append('constituency')
-            if steps.crea_meta == True:
-                metadata['line'] = steps.meta_line
-                metadata['sent'] = steps.meta_sent
-            if data.meta == True:
-                metadata['meta'] = True
-            processors_str = ",".join(processors)
-            synt_pars.with_stanza(data.lang, data_in, data_out, processors_str, metadata)
-        else:
-            exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
-    else:
-        data_out = data_in
-    my_logs['data_preprocessed'] = data_out
-    return data_out 
-
-def data_to_ner_format(data_in):
-    """
-    This fonction build the NER format upon the Segmentor works.
-    INPUT: Tokenized text with whatever number of columns.
-    OUTPUT: Tokenized text with just 4 columns.
-    """
-    data_ner = f"{data_in}.ner"
-    conv_to_ner.main(data_in, data_ner, "conll") # <-- TODO faire en relatif
-
-    #TODO add same for train/dev/test for config train
-
-    my_logs['data_ner'] = data_ner
-    return data_ner
-
-def make_predictions(data_in, model_path):
-    model = model_path # add def get_model from v1
-    data_out = f"{data.resu}/{data.name}_pred.json"
-    #cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {steps.data.resu}/logs.txt"
-    cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {data.resu}/logs_predictions.txt"
-    os.system(cmd)
-    return data_out
-
-def pred_json_to_toke(data_in):
-    data_out = f"{data.resu}/{data.name}_pred.conll"
-    json_to_connl.js2conll(data_in, data_out, "conll") # <-- TODO faire en relatif
-    return data_out
-
-def pred_json_to_conll_with_metadata(data_pred_json, data_meta):
-    data_out = f"{data.resu}/{data.name}_pred_n_meta.conll"
-    json_to_connl.js2conllNmeta(data_pred_json, data_out, "conll", data_meta) # <-- TODO faire en relatif
-    return data_out
-
-
+    print(my_logs) # TODO: dump my_logs to file_logs as json
+    
 
 
 
@@ -124,20 +188,28 @@ if __name__ == '__main__':
     my_logs["stamp"] = stamp
    
     data, steps, prod = get_config_infos(config, stamp)
-    create_folders([data.conv, data.resu])
-
-    data_preprocessed = pre_processing(data, steps)
+    data.create_folders()
+    data.pre_processing(steps)
+    data.make_ner_format()
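+    # whatever the use case, the data is pre-processed and NER-formatted before the model is resolved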
     #TEST data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
-    data_ner = data_to_ner_format(data_preprocessed)
+    steps.get_model()
 
     if steps.main == "annotation" or steps.main == "test":
-        data_pred_json = make_predictions(data_ner, steps.model)
+        data.make_predictions(steps)
         #data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
+        
+        data.pred_json_to_conll()
 
         if prod.metadata == True:
-            data_pred_and_meta_conll = pred_json_to_conll_with_metadata(data_pred_json, data_preprocessed)
-        else:
-            data_pred_toke = pred_json_to_toke(data_pred_json)
+            data.pred_json_to_conll_with_metadata()
+        
+        if prod.prod_bracket == True:
+            if data.exte != ".txt":
+                exit(f"Exited. Bracket output needs a raw \".txt\" input, got \"{data.exte}\". Change your config file.")
+            else:
+                data.brackets_txt()
+                if prod.metadata == True:
+                    data.brackets_txt_with_metadata()
+
 
     #elif steps.main == "train":
 
diff --git a/code/utils/conll2bracket.py b/code/utils/conll2bracket.py
index f1dac8b..ec597fd 100644
--- a/code/utils/conll2bracket.py
+++ b/code/utils/conll2bracket.py
@@ -50,8 +50,26 @@ def conll2brackets(in_f, out_f):
                     start = False
             file_out.write("]\n\n")
             
-def main(f_in, f_out):
-    input = f_in
-    output = f_out
-    
-    conll2brackets(input, output)
\ No newline at end of file
+def conll2brackets_with_meta(in_f, out_f):
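+    """Same as conll2brackets() but passes "#" metadata lines through to the bracketed output."""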
+    start = True
+
+    with open(out_f, 'w') as file_out:
+        with open(in_f, 'r') as input:
+            for line in input: 
+                if line.startswith("#"):
+                    file_out.write(line)  # line already ends with "\n"
+                elif line.strip()=="":
+                    file_out.write("]")
+                    file_out.write("\n\n")
+                    start = True
+                else:
+                    n, word, *junk, tag = line.split()
+                    if tag=="BeginSeg=Yes":
+                        if not(start):
+                            file_out.write("] ")
+                        file_out.write(f"[ {word} ")
+                    else:
+                        file_out.write(f"{word} ")
+                    start = False
+            file_out.write("]\n\n")
\ No newline at end of file
diff --git a/code/utils/json2conll.py b/code/utils/json2conll.py
index e102b2e..73e1646 100644
--- a/code/utils/json2conll.py
+++ b/code/utils/json2conll.py
@@ -45,7 +45,7 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
 
     with open(data_out, 'w', encoding='utf-8') as fo, open(data_meta, 'r') as fm:       
         
-        
+        # walk the pre-processed (metadata) file and append a predicted tag to every token line
         for line in fm:
             line = line.strip()
             if line.startswith("#"):
@@ -60,12 +60,11 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
                 tag = data[sent_pred_count]['tags'][tok]
                 tok += 1
                 #print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
-                
-                if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
-                    fo.write(f"{line}\t{tag}\n")
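+                # NOTE: assumes a "map" dict (predicted tag -> output label) is defined earlier in this file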
+                fo.write(f"{line}\t{map[tag]}\n")
+                #if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
+                #    fo.write(f"{line}\t{tag}\n")
                     
-                else:
-                    exit("pb js2conllNmeta")
+
 
                 #print(f"sentpred : {sent_pred}\n")
                 #print(f"word n tag : {word}:::{tag}\n")
diff --git a/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc b/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc
index 606e61ba6140f958cfde6090fe9566581f319f63..14dde82debc911172ec9de77498bcdab8b0d87a4 100644
GIT binary patch
delta 163
zcmaFL{g#{0iI<m)fq{WxssEN_NA`()$&8aG7U`(Q)v}c^*D$59rLd&1_A*ak?02eV
zPhqd+n7~+JQp4uLFo7{vYvSQJo)q31mIZ7J8M4`mZcNr>)Kr{O!;{6~!Vue4%Ui>h
z!eP!(%ay_pQ!|0FNNaLFV=}){I71El0`?T{g^Y|0g)cTgWSqjx$HK!X#>mCQ!N|kN
L!NfPYnKcOjKHMr=

delta 155
zcmaFM{gj)}iI<m)fq{YH@a750&)FvOB{OzSEYeX8t7R);u3<`HOJPZ2?PZ?8*l$(K
zp2A+sF@do{r-sdiVFF{U(!|4Y95pN{ypjw>S0?K+YRXQi;mP7~VTf(3<*ngL;V@^Y
z<*H#x;g@8Xz*wX-xtuYXUn`uUhJ68h3im=rMux&Cn_n_cVdi7zU=(BIV&Y)rVdP-s
Jn%vEr1OO0xC*c49

diff --git a/code/utils_2/syntactic_parsing.py b/code/utils_2/syntactic_parsing.py
index 0d85fd4..8515d7d 100644
--- a/code/utils_2/syntactic_parsing.py
+++ b/code/utils_2/syntactic_parsing.py
@@ -42,14 +42,16 @@ def with_stanza(lang, f_in, f_out, process, meta):
                 fo.write("\n")
             else:
                 
-                if meta['line']:
+                #if meta['line']:
+                if "line" in meta.keys():
                     txt = f"#{meta['line']}-{count_line}\n"
                     fo.write(txt)
 
                 doc = nlp(line)
                 for sent in doc.sentences:
                     count_sent += 1
-                    if meta['sent']:
+                    #if meta['sent']:
+                    if "sent" in meta.keys():
                         txt = f"#{meta['sent']}-{count_sent}\n#text=\"{sent.text}\"\n"
                         fo.write(txt)
 
-- 
GitLab