Add the possibility of producing output with metadata

415aed4a · laura.riviere · 4f783b18 · 415aed4a · 415aed4a · 415aed4a
Commit 415aed4a authored 2 years ago by laura.riviere
--- a/code/classes_def.py
+++ b/code/classes_def.py
@@ -44,4 +44,5 @@ class Process:
        self.test_data = infos['gold_test_data_path']
        self.post_bracket = infos['post-processing']['tab_to_bracket']
+        self.post_conll = infos['post-processing']['metadata_conll']
\ No newline at end of file
--- a/code/config_global_2.json
+++ b/code/config_global_2.json
@@ -2,15 +2,15 @@
    "usecase_description": "Config file for usecase_2",
    "input": {
        "name": "fra.sdrt.annodis_dev",
-        "file": ".ttok",
+        "file": ".conllu",
        "language": "fr"
    },
    "steps":{
-        "main": "annotation",
+        "main": "test",
        "pre-processing": {
            "tokenization": false,
            "tokenization_tool" : "spacy",
-            "sentence_split": true,
+            "sentence_split": false,
            "sentence_split_splitor": "stanza",
            "syntactic_parsing": false, 
            "NER_format_initialisation": true
@@ -27,9 +27,10 @@
        },
        "post-processing": {
            "json_to_tab": true,
+            "metadata_conll": true,
            "tab_to_bracket":true
        },
-        "evaluation": false,
+        "evaluation": true,
        "gold_test_data_path": null
    }
 }

--- a/code/discut22_1.py
+++ b/code/discut22_1.py
@@ -47,6 +47,8 @@ def get_model(model_name):
            output = f"../model/{name}/{arch}"
        else:
            print("Tony already in place !")
+            output = f"../model/{name}/{arch}"
    else:
        output = model_name
@@ -63,10 +65,6 @@ def text_tokenization(f_in, f_out, lang, tool):
 def main(steps):
-    #steps = get_config_infos(config) # on obtient la liste des trucs
-    # à faire, donnée par la classe Process
-    #print([x for x in enumerate(steps)])
-    #suivant la liste ordonnée, faire les trucs (for now simple usecase1):
    # FN: soit besoin sent split, soit besoin tokenizer, soit aucun des deux
@@ -126,8 +124,12 @@ def main(steps):
    # #allennlp predict --use-dataset-reader --output-file ${RESULT_DIR}/${FILE}.json ${MODEL} ${RAW}.ner.tok
        print(f"Checking for model...{steps.model}")
        model_path = get_model(steps.model)
+        print(f"model{model_path}")
        data_json = f"{steps.data.resu}/{steps.data.name}.json"
+        print(f"datapred: {data_json}\n")
+        print(f"input: {data_ner}\n")
        cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_ner} &> {steps.data.resu}/logs.txt"
+        print(f"comd{cmd}")
        print("Starting Prediction...")
        os.system(cmd)
    #### ------------------------------- TBD do the same but with python script (or JIANT ??)
@@ -163,8 +165,6 @@ def main(steps):
            data_conll = f"{steps.data.resu}/{steps.data.name}_test.predictions.conll" ## à faire en relatif
            print(f"Starting Formating from json to tok format...to {data_conll}")
            j2c.main(data_json, "split.tok", data_conll)
-            #data_pred_ner = f"{steps.data.resu}/eng.rst.rstdt_test.predictions.conll.ner"
-            #c2n.main(data_conll, data_pred_ner, steps.data.file)
            print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}")
            data_g = "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
            data_p = "../data/eng.rst.rstdt/results/eng.rst.rstdt_test.predictions.conll" # == data_conll
@@ -179,6 +179,46 @@ def main(steps):
            os.system(cmd)
+    if steps.post_conll == True:
+        f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok"
+        predictions = open(f_pred, 'r')
+        first_line = predictions.readline()
+        columns = first_line.split("\t")
+        predictions.close()
+        f_out = f"{steps.data.resu}/{steps.data.name}_full_output.conllu"
+        with open(f_out, "w") as fo:
+            f_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
+            with open(f_in, "r") as fi:
+                f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok"
+                with open(f_pred, "r") as fp:
+                    df = pd.read_csv(fp, header=None, sep="\t", usecols=[len(columns)-1])
+                    #df = df.dropna()
+                    print(f"longueur={len(df)}")
+                    print(f"line bug: {df.iloc[3047-148:3060-148,:]}\n")
+                    print(f"type {type(df.iloc[4,:])}")
+                    i = 0
+                    for line in fi:
+                        line = line.strip()
+                        if line.startswith("#"):
+                            fo.write(f"{line}\n")
+                        elif line == "":
+                            fo.write(f"{line}\n")
+                            i +=1
+                        else:
+                            fo.write(f"{line}")
+                            labels = df.iloc[i,:].values.tolist()
+                            for tag in labels:
+                                fo.write(f"\t{tag}")
+                            fo.write("\n")
+                            #fo.write(f"{df.iloc[i,:]}\n")
+                            i += 1
+                            #print(f"i::{i}\t")
    if steps.post_bracket == True :
@@ -199,7 +239,8 @@ if __name__ == '__main__':
    config = args.config
    now = datetime.now()
-    stamp = re.sub('[\s:]', '_', str(now))
+    #stamp = re.sub('[\s:]', '_', str(now))
+    stamp = "debug1205"
    my_logs = {}
    my_logs['stamp'] = stamp
@@ -207,4 +248,4 @@ if __name__ == '__main__':
    print(stamp)
    main(steps)
-    print("Done.")
+    #print("Done.")
\ No newline at end of file