Commit af630340 authored by laura.riviere

add refacto usecase2

parent 6744dc6b
Merge request !3: Refacto 1205
@@ -30,16 +30,18 @@
"validation_data_path": null
}
},
"evaluation": false,
"gold_test_data_path": null
},
"output":{
"file":{
"tab_to_bracket": true,
"conllu":true,
"metadata": true
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"scores":false
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
@@ -30,16 +30,18 @@
"validation_data_path": null
}
},
"evaluation": false,
"gold_test_data_path": null
},
"output":{
"file":{
"conllu":true,
"conll_file":{
"to_do": true,
"metadata": true,
"tab_to_bracket": false
"with_gold_labels": true
},
"scores":false
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
{
"usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
"data_raw": {
"name": "fra.sdrt.annodis_dev",
"exte": ".conllu",
"language": "fr",
"existing_metadata": true
},
"steps":{
"main": "test",
"pre-processing": {
"to_do": false,
"syntactic_tool": "stanza",
"sentence_split": true,
"tokenization": true,
"syntactic_parsing": true,
"create_metadata": {
"to_do": false,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"gold_test_data_path": null
},
"output":{
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
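A minimal usage sketch (not part of this commit): loading a config like the one above and reading the keys it defines. The file name is an assumption; the keys come from the JSON shown here and are consumed by get_config_infos() and the Output class further down in this diff.

import json

# Hypothetical config path for usecase_2; adjust to the repo layout.
with open("config_usecase_2.json") as f:
    config = json.load(f)

# Keys as defined in the file above.
print(config["data_raw"]["name"])               # "fra.sdrt.annodis_dev"
print(config["steps"]["main"])                  # "test"
print(config["output"]["conll_file"]["to_do"])  # True
print(config["output"]["txt_file"]["metadata"]) # True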
@@ -12,11 +12,12 @@ from datetime import datetime
import os
import re
import json
import utils_2.syntactic_parsing as synt_pars
import utils.syntactic_parsing as synt_pars
import utils.conv2ner as conv_to_ner # TODO clean it
import utils.json2conll as json_to_connl # TODO clean it
import utils.training_allennlp as tr_allen
import utils.conll2bracket as c2bracket
import utils.seg_eval as seg_eval
@@ -26,6 +27,7 @@ class Data:
self.lang = infos['language']
self.path = f"../data/{self.name}"
self.exte = infos['exte']
self.raw = f"{self.path}/{self.name}{self.exte}"
self.stamp = stamp
self.conv = f"{self.path}/data_converted_{stamp}"
self.resu = f"{self.path}/results_{stamp}"
@@ -33,11 +35,14 @@
def create_folders(self): # -> can be transformed into method of class
for it in [self.conv, self.resu]:
print(f"----> Checking/creating folder {it}.")
if not os.path.isdir(it):
os.mkdir(it)
my_logs['folders'] = f"{self.conv}, {self.resu}"
def pre_processing(self, steps):
file_in = f"{self.path}/{self.name}{self.exte}"
print("----> Preprocessing input data.")
file_in = self.raw
if steps.pre_process_to_do == True:
file_out = f"{self.conv}/{self.name}.conll"
if steps.synt_tool == "stanza":
@@ -70,21 +75,38 @@ class Data:
OUTPUT: Tokenized text with just 4 columns.
"""
self.ner = f"{self.preprocessed}.ner"
self.ner = f"{self.conv}/{self.name}.conll.ner"
print(f"----> Making NER format {self.ner}.")
conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO: make path relative # TODO add same for train/dev/test for config train
my_logs['data_ner'] = self.ner
def make_predictions(self, steps):
self.pred_json = f"{self.resu}/{self.name}_pred.json"
cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {steps.model_path} {self.ner} &> {self.resu}/logs_predictions.txt"
print(f"----> Making predictions: {cmd}.")
os.system(cmd)
my_logs['predictions_cmd'] = cmd
def pred_json_to_conll_with_metadata(self):
self.pred_meta_conll = f"{self.resu}/{self.name}_pred_n_meta.conll"
def pred_json_to_conll_w_metadata_w_gold(self): # here and 3 below: refactoring TBD, sorry
self.pred_conll_meta_gold = f"{self.resu}/{self.name}_pred_meta_gold.conll"
json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed)
return self.pred_conll_meta_gold
def pred_json_to_conll_w_metadata(self):
self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed)
return self.pred_meta_conll
def pred_json_to_conll_w_gold(self):
self.pred_conll_gold = f"{self.resu}/{self.name}_pred_gold.conll"
json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll")
return self.pred_conll_gold
def pred_json_to_conll(self):
self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll")
return self.pred_conll
def brackets_txt(self):
self.brack = f"{self.resu}/{self.name}_brac.txt"
@@ -95,6 +117,44 @@ class Data:
c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)
def evaluation(self, prod):
self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"
if self.exte == ".conll" or self.exte == ".conllu": # get gold file
gold = self.raw
else:
gold = self.preprocessed
if prod.conll_todo == False: # get pred_file
pred = self.pred_json_to_conll()
else:
if prod.conll_meta == True:
if prod.conll_w_gold == True:
pred = self.pred_json_to_conll_w_metadata_w_gold()
else:
pred = self.pred_json_to_conll_w_metadata()
else:
if prod.conll_w_gold == True:
pred = self.pred_json_to_conll_w_gold()
else:
pred = self.pred_json_to_conll()
print(f"----> Predictions to file {pred}")
print(f"----> Evaluation scores to file {self.basic_metrics}")
scores_dict = seg_eval.get_scores(gold, pred)
with open(self.basic_metrics, 'w') as fo:
json.dump(scores_dict, fo)
class Output:
def __init__(self, infos):
self.conll_todo = infos['conll_file']['to_do']
self.conll_meta = infos['conll_file']['metadata']
self.conll_w_gold = infos['conll_file']['with_gold_labels']
self.txt_todo = infos['txt_file']['to_do']
self.txt_meta = infos['txt_file']['metadata']
class Process:
def __init__(self, infos):
@@ -109,7 +169,6 @@ class Process:
self.meta_line = infos['pre-processing']['create_metadata']['line']
self.meta_sent = infos['pre-processing']['create_metadata']['sent']
#if self.main == "train":
#if self.ner_init == True : # to be made relative !! split stuff
# self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
@@ -120,14 +179,14 @@ class Process:
self.toolkit = infos['discourse_segmenter']['training']['toolkit']
self.tr_config = infos['discourse_segmenter']['training']['config_file']
self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
self.model = infos['discourse_segmenter']['model'] # easy case for Tony
#self.post_tab = infos['post-processing']['json_to_tab']
self.eval = infos['evaluation']
self.test_data = infos['gold_test_data_path']
def get_evaluation_status(self):
if self.main == "test":
self.eval = True
#elif self.main == "train":
def get_model(self):
self.model_path = ""
@@ -138,21 +197,12 @@ class Process:
os.system(dl)
self.model_path = f"../model/tony/{arch}"
else:
print("Tony already in place !")
print("----> Tony already in place !")
self.model_path = f"../model/tony/{arch}"
else:
self.model_path = self.model
class Output:
def __init__(self, infos):
self.prod_bracket = infos['file']['tab_to_bracket']
self.prod_conll = infos['file']['conllu']
self.metadata = infos['file']['metadata']
def get_stamp():
now = datetime.now()
stamp = re.sub('[\s:]', '_', str(now))
@@ -168,14 +218,14 @@ def get_config_infos(config, stamp):
return data, steps, prod
def print_logs():
def print_logs(dict_logs):
file_logs = f"{data.resu}/processes_logs.json"
print(my_logs) # <-- ahaha TBD
with open(file_logs, 'w') as fi:
json.dump(dict_logs, fi, indent=4)
if __name__ == '__main__':
my_logs = {}
stamp = get_stamp()
@@ -191,32 +241,15 @@ if __name__ == '__main__':
data.create_folders()
data.pre_processing(steps)
data.make_ner_format()
#TEST data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
steps.get_model()
if steps.main == "annotation" or steps.main == "test":
data.make_predictions(steps)
#data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
data.pred_json_to_conll()
if prod.metadata == True:
data.pred_json_to_conll_with_metadata()
if prod.prod_bracket == True:
if data.exte != ".txt":
exit("pb")
else:
data.brackets_txt()
data.brackets_txt_with_metadata()
data.make_predictions(steps) # output allennlp JSON
#elif steps.main == "train":
steps.get_evaluation_status()
if steps.eval == True:
data.evaluation(prod)
#scores = compare_pred_gold()
#print_logs()
\ No newline at end of file
print_logs(my_logs) # <-- careful: global variable!
\ No newline at end of file
@@ -110,8 +110,8 @@ def conversion2ner(input, output, params=None):
# then, previous token label is set to B-E to signal end of previous segment
res[-1][-1] = "B-E"
start_doc = False
if label not in maptags:
print("warning, strange label ",label,file=sys.stderr)
#if label not in maptags:
#print("warning, strange label ",label,file=sys.stderr)
res.append([w,pos,"O",tag])
for line in res:
@@ -5,6 +5,7 @@ conll format
import json
import sys
import re
#filepath = sys.argv[1]
#config = sys.argv[2]
@@ -60,7 +61,8 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
tag = data[sent_pred_count]['tags'][tok]
tok += 1
#print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
fo.write(f"{line}\t{map[tag]}\n")
new_line = re.sub('\t[^\t]+$', '', line)
fo.write(f"{new_line}\t{map[tag]}\n")
#if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
# fo.write(f"{line}\t{tag}\n")
@@ -68,4 +70,29 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
#print(f"sentpred : {sent_pred}\n")
#print(f"word n tag : {word}:::{tag}\n")
\ No newline at end of file
def js2conllNmetaNgold(data_pred_json, data_out, config, gold_n_meta):
data = []
sent_pred_count = 0
tok = 0
for line in open(data_pred_json, 'r'):
data.append(json.loads(line))
with open(data_out, 'w', encoding='utf-8') as fo, open(gold_n_meta, 'r') as fm:
# id
for line in fm:
line = line.strip()
if line.startswith("#"):
fo.write(f"{line}\n")
elif line == "":
sent_pred_count += 1
tok = 0
fo.write(f"{line}\n")
else:
sent_pred = data[sent_pred_count]
word = data[sent_pred_count]['words'][tok]
tag = data[sent_pred_count]['tags'][tok]
tok += 1
#print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
fo.write(f"{line}\t{map[tag]}\n")
\ No newline at end of file
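For reference, a hedged usage sketch of the new helper, mirroring the call in Data.pred_json_to_conll_w_metadata_w_gold() earlier in this diff; all paths are placeholders, not repo paths.

import utils.json2conll as json_to_connl

# Paths are hypothetical; in the pipeline they come from the Data class.
json_to_connl.js2conllNmetaNgold(
    "results_STAMP/doc_pred.json",            # allennlp predictions, one JSON object per line
    "results_STAMP/doc_pred_meta_gold.conll", # output CoNLL keeping metadata and gold column
    "conll",
    "data_converted_STAMP/doc.conll",         # preprocessed gold file with metadata lines
)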
@@ -68,9 +68,9 @@ Arguments:
"""
__author__ = "Amir Zeldes"
#__author__ = "Amir Zeldes"
__license__ = "Apache 2.0"
__version__ = "1.0.1"
#__version__ = "1.0.1"
def parse_data(infile, string_input=False):
if not string_input:
@@ -222,22 +222,3 @@ def get_scores(gold_file, pred_file, string_input=False):
return score_dict
if __name__ == "__main__":
p = argparse.ArgumentParser()
p.add_argument("goldfile",help="Shared task gold file in .tok or .conll format")
p.add_argument("predfile",help="Corresponding file with system predictions")
p.add_argument("-s","--string_input",action="store_true",help="Whether inputs are file names or strings")
opts = p.parse_args()
score_dict = get_scores(opts.goldfile,opts.predfile,opts.string_input)
print("File: " + score_dict["doc_name"])
print("o Total tokens: " + str(score_dict["tok_count"]))
print("o Gold " +score_dict["seg_type"]+": " + str(score_dict["gold_seg_count"]))
print("o Predicted "+score_dict["seg_type"]+": " + str(score_dict["pred_seg_count"]))
print("o Precision: " + str(score_dict["prec"]))
print("o Recall: " + str(score_dict["rec"]))
print("o F-Score: " + str(score_dict["f_score"]))
File moved
File deleted
File deleted