From af6303402edc42962ab5907ec5ae7594cc3a4598 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Fri, 6 Jan 2023 16:05:26 +0100
Subject: [PATCH] add refacto usecase2

---
 code/config_global_1.2.json                   |  14 +-
 code/config_global_1.21.json                  |  12 +-
 code/config_global_2.2.json                   |  49 +++++++
 code/discut22_2.py                            | 121 +++++++++++-------
 code/utils/conv2ner.py                        |   4 +-
 code/utils/json2conll.py                      |  31 ++++-
 code/utils/seg_eval.py                        |  23 +---
 code/{utils_2 => utils}/syntactic_parsing.py  |   0
 code/utils_2/__init__.py                      |   0
 .../__pycache__/__init__.cpython-37.pyc       | Bin 138 -> 0 bytes
 .../syntactic_parsing.cpython-37.pyc          | Bin 1517 -> 0 bytes
 11 files changed, 174 insertions(+), 80 deletions(-)
 create mode 100644 code/config_global_2.2.json
 rename code/{utils_2 => utils}/syntactic_parsing.py (100%)
 delete mode 100644 code/utils_2/__init__.py
 delete mode 100644 code/utils_2/__pycache__/__init__.cpython-37.pyc
 delete mode 100644 code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc

diff --git a/code/config_global_1.2.json b/code/config_global_1.2.json
index 4b51333..8cfbf46 100644
--- a/code/config_global_1.2.json
+++ b/code/config_global_1.2.json
@@ -30,16 +30,18 @@
                 "validation_data_path": null
             }
         },
-        "evaluation": false,
         "gold_test_data_path": null
     },
     "output":{
-        "file":{
-            "tab_to_bracket": true,
-            "conllu":true,
-            "metadata": true
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
         },
-        "scores":false
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
     }
 }
 
diff --git a/code/config_global_1.21.json b/code/config_global_1.21.json
index 1e8c4a9..21351a8 100644
--- a/code/config_global_1.21.json
+++ b/code/config_global_1.21.json
@@ -30,16 +30,18 @@
                 "validation_data_path": null
             }
         },
-        "evaluation": false,
         "gold_test_data_path": null
     },
     "output":{
-        "file":{
-            "conllu":true,
+        "conll_file":{
+            "to_do": true,
             "metadata": true,
-            "tab_to_bracket": false
+            "with_gold_labels": true
         },
-        "scores":false
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
     }
 }
 
diff --git a/code/config_global_2.2.json b/code/config_global_2.2.json
new file mode 100644
index 0000000..afc2e1f
--- /dev/null
+++ b/code/config_global_2.2.json
@@ -0,0 +1,49 @@
+{
+    "usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
+    "data_raw": {
+        "name": "fra.sdrt.annodis_dev",
+        "exte": ".conllu",
+        "language": "fr",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "test",
+        "pre-processing": {
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": false,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "tony",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "gold_test_data_path": null
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
+    }
+}
+
+
+
diff --git a/code/discut22_2.py b/code/discut22_2.py
index 543cd4d..a10bc31 100644
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
@@ -12,11 +12,12 @@ from datetime import datetime
 import os
 import re
 import json
-import utils_2.syntactic_parsing as synt_pars
+import utils.syntactic_parsing as synt_pars
 import utils.conv2ner as conv_to_ner # TODO clean it
 import utils.json2conll as json_to_connl # TODO clean it
 import utils.training_allennlp as tr_allen
 import utils.conll2bracket as c2bracket
+import utils.seg_eval as seg_eval
 
 
 
@@ -26,6 +27,7 @@ class Data:
         self.lang = infos['language']
         self.path = f"../data/{self.name}"
         self.exte = infos['exte']
+        self.raw = f"{self.path}/{self.name}{self.exte}"
         self.stamp = stamp
         self.conv = f"{self.path}/data_converted_{stamp}"
         self.resu = f"{self.path}/results_{stamp}"
@@ -33,11 +35,14 @@ class Data:
 
     def create_folders(self): # -> can be rtansfor into method of class
         for it in [self.conv, self.resu]:
+            print(f"----> Checking/creating folder {it}.")
             if not os.path.isdir(it):
                 os.mkdir(it)
+            my_logs['folders'] = f"{self.conv}, {self.resu}"
 
     def pre_processing(self, steps):
-        file_in = f"{self.path}/{self.name}{self.exte}"
+        print("----> Preprocessing input data.")
+        file_in = self.raw
         if steps.pre_process_to_do == True:
             file_out = f"{self.conv}/{self.name}.conll"
             if steps.synt_tool == "stanza":
@@ -70,21 +75,38 @@ class Data:
         OUTPUT: Tokenized text with just 4 columns.
         """
         self.ner = f"{self.preprocessed}.ner"
+        self.ner = f"{self.conv}/{self.name}.conll.ner"
+        print(f"----> Making NER format {self.ner}.")
         conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO faire en relatif#TODO add same for train/dev/test for config train
         my_logs['data_ner'] = self.ner
 
     def make_predictions(self, steps):
         self.pred_json = f"{self.resu}/{self.name}_pred.json"
         cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {steps.model_path} {self.ner} &> {self.resu}/logs_predictions.txt"
+        print(f"----> Making predictions: {cmd}.")
         os.system(cmd)
+        my_logs['predictions_cmd'] = cmd
 
-    def pred_json_to_conll_with_metadata(self):
-        self.pred_meta_conll = f"{self.resu}/{self.name}_pred_n_meta.conll"
+
+    def pred_json_to_conll_w_metadata_w_gold(self): # here and the 3 below: factorisation TBD
+        self.pred_conll_meta_gold = f"{self.resu}/{self.name}_pred_meta_gold.conll"
+        json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed)
+        return self.pred_conll_meta_gold
+
+    def pred_json_to_conll_w_metadata(self):
+        self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
         json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed) 
+        return self.pred_meta_conll
+
+    def pred_json_to_conll_w_gold(self):
+        self.pred_conll_gold = f"{self.resu}/{self.name}_pred_gold.conll"
+        json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll") 
+        return self.pred_conll_gold
 
     def pred_json_to_conll(self):
         self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
         json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll") 
+        return self.pred_conll
 
     def brackets_txt(self):
         self.brack = f"{self.resu}/{self.name}_brac.txt"
@@ -95,6 +117,44 @@ class Data:
         c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)
 
 
+    def evaluation(self, prod):
+        self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"
+
+        if self.exte == ".conll" or self.exte == ".conllu": # get gold file
+            gold = self.raw
+        else:
+            gold = self.preprocessed
+
+        if prod.conll_todo == False:    # get pred_file
+            pred = self.pred_json_to_conll()
+        else:                       
+            if prod.conll_meta == True:
+                if prod.conll_w_gold == True:
+                    pred = self.pred_json_to_conll_w_metadata_w_gold()
+                else:
+                    pred = self.pred_json_to_conll_w_metadata()
+            else:
+                if prod.conll_w_gold == True:
+                    pred = self.pred_json_to_conll_w_gold()
+                else:
+                    pred = self.pred_json_to_conll()
+
+        print(f"----> Predictions to file {pred}")
+        print(f"----> Evaluation scores to file {self.basic_metrics}")
+        scores_dict = seg_eval.get_scores(gold, pred)
+        with open(self.basic_metrics, 'w') as fo:
+            json.dump(scores_dict, fo)
+
+
+class Output:
+    def __init__(self, infos):
+        self.conll_todo = infos['conll_file']['to_do']
+        self.conll_meta = infos['conll_file']['metadata']
+        self.conll_w_gold = infos['conll_file']['with_gold_labels']
+        self.txt_todo = infos['txt_file']['to_do']
+        self.txt_meta = infos['txt_file']['metadata']
+
+
 
 class Process:
     def __init__(self, infos):
@@ -109,7 +169,6 @@ class Process:
         self.meta_line = infos['pre-processing']['create_metadata']['line']
         self.meta_sent = infos['pre-processing']['create_metadata']['sent']
 
-
         #if self.main == "train":
             #if self.ner_init == True : # à faire en relatif !! split truc
             #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
@@ -120,14 +179,14 @@ class Process:
         self.toolkit = infos['discourse_segmenter']['training']['toolkit']
         self.tr_config = infos['discourse_segmenter']['training']['config_file']
         self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
-
         self.model = infos['discourse_segmenter']['model'] # ezpz for Tony 
 
-        #self.post_tab = infos['post-processing']['json_to_tab']
-
-        self.eval = infos['evaluation']
         self.test_data = infos['gold_test_data_path']
 
+    def get_evaluation_status(self):
+        if self.main == "test":
+            self.eval = True
+        #elif self.main == "train":
 
     def get_model(self):
         self.model_path = ""
@@ -138,21 +197,12 @@ class Process:
                 os.system(dl)
                 self.model_path = f"../model/tony/{arch}"
             else:
-                print("Tony already in place !")
+                print("----> Tony already in place !")
                 self.model_path = f"../model/tony/{arch}"
         else:
             self.model_path = self.model
 
 
-
-        
-class Output:
-    def __init__(self, infos):
-        self.prod_bracket = infos['file']['tab_to_bracket']
-        self.prod_conll = infos['file']['conllu']
-        self.metadata = infos['file']['metadata']
-
-
 def get_stamp():
     now = datetime.now()
     stamp = re.sub('[\s:]', '_', str(now))
@@ -168,14 +218,14 @@ def get_config_infos(config, stamp):
     return data, steps, prod
 
 
-def print_logs():
+def print_logs(dict_logs):
     file_logs = f"{data.resu}/processes_logs.json"
-    print(my_logs) # <-- ahaha TBD
+    with open(file_logs, 'w') as fi:
+        json.dump(dict_logs, fi, indent=4)
     
 
 
 
-
 if __name__ == '__main__':
     my_logs = {}
     stamp = get_stamp()
@@ -191,32 +241,15 @@ if __name__ == '__main__':
     data.create_folders()
     data.pre_processing(steps)
     data.make_ner_format()
-    #TEST data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
     steps.get_model()
 
     if steps.main == "annotation" or steps.main == "test":
-        data.make_predictions(steps)
-        #data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
-        
-        data.pred_json_to_conll()
-
-        if prod.metadata == True:
-            data.pred_json_to_conll_with_metadata()
-        
-        if prod.prod_bracket == True:
-            if data.exte != ".txt":
-                exit("pb")
-            else:
-                data.brackets_txt()
-                data.brackets_txt_with_metadata()
-
-
+        data.make_predictions(steps) # output allennlp JSON
     #elif steps.main == "train":
 
+    steps.get_evaluation_status()
+    if steps.eval == True:
+        data.evaluation(prod)
         
 
-
-
-    #scores = compare_pred_gold()
-
-    #print_logs()
\ No newline at end of file
+    print_logs(my_logs) # <-- NOTE: relies on the global variable my_logs!
\ No newline at end of file
diff --git a/code/utils/conv2ner.py b/code/utils/conv2ner.py
index 71216df..4e6edbe 100644
--- a/code/utils/conv2ner.py
+++ b/code/utils/conv2ner.py
@@ -110,8 +110,8 @@ def conversion2ner(input, output, params=None):
                         # then, previous token label is set to B-E to signal end of previous segment
                         res[-1][-1] = "B-E"
                     start_doc = False
-                    if label not in maptags:
-                        print("warning, strange label ",label,file=sys.stderr)
+                    #if label not in maptags:
+                        #print("warning, strange label ",label,file=sys.stderr)
                     res.append([w,pos,"O",tag])
                     
             for line in res:
diff --git a/code/utils/json2conll.py b/code/utils/json2conll.py
index 73e1646..37c5a0f 100644
--- a/code/utils/json2conll.py
+++ b/code/utils/json2conll.py
@@ -5,6 +5,7 @@ conll format
 
 import json
 import sys
+import re
 
 #filepath = sys.argv[1]
 #config = sys.argv[2]
@@ -60,7 +61,8 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
                 tag = data[sent_pred_count]['tags'][tok]
                 tok += 1
                 #print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
-                fo.write(f"{line}\t{map[tag]}\n")
+                new_line = re.sub('\t[^\t]+$', '', line)
+                fo.write(f"{new_line}\t{map[tag]}\n")
                 #if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
                 #    fo.write(f"{line}\t{tag}\n")
                     
@@ -68,4 +70,29 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
 
                 #print(f"sentpred : {sent_pred}\n")
                 #print(f"word n tag : {word}:::{tag}\n")
-                
\ No newline at end of file
+                
+def js2conllNmetaNgold(data_pred_json, data_out, config, gold_n_meta):
+    data = []
+    sent_pred_count = 0
+    tok = 0
+    for line in open(data_pred_json, 'r'):
+        data.append(json.loads(line))
+
+    with open(data_out, 'w', encoding='utf-8') as fo, open(gold_n_meta, 'r') as fm:       
+        
+        # id 
+        for line in fm:
+            line = line.strip()
+            if line.startswith("#"):
+                fo.write(f"{line}\n")
+            elif line == "":
+                sent_pred_count += 1
+                tok = 0
+                fo.write(f"{line}\n")
+            else:
+                sent_pred = data[sent_pred_count]
+                word = data[sent_pred_count]['words'][tok]
+                tag = data[sent_pred_count]['tags'][tok]
+                tok += 1
+                #print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
+                fo.write(f"{line}\t{map[tag]}\n")
\ No newline at end of file
diff --git a/code/utils/seg_eval.py b/code/utils/seg_eval.py
index 1808782..d61d2ef 100644
--- a/code/utils/seg_eval.py
+++ b/code/utils/seg_eval.py
@@ -68,9 +68,9 @@ Arguments:
 
 """
 
-__author__ = "Amir Zeldes"
+#__author__ = "Amir Zeldes"
 __license__ = "Apache 2.0"
-__version__ = "1.0.1"
+#__version__ = "1.0.1"
 
 def parse_data(infile, string_input=False):
 	if not string_input:
@@ -222,22 +222,3 @@ def get_scores(gold_file, pred_file, string_input=False):
 
 	return score_dict
 
-
-if __name__ == "__main__":
-	p = argparse.ArgumentParser()
-
-	p.add_argument("goldfile",help="Shared task gold file in .tok or .conll format")
-	p.add_argument("predfile",help="Corresponding file with system predictions")
-	p.add_argument("-s","--string_input",action="store_true",help="Whether inputs are file names or strings")
-
-	opts = p.parse_args()
-
-	score_dict = get_scores(opts.goldfile,opts.predfile,opts.string_input)
-
-	print("File: " + score_dict["doc_name"])
-	print("o Total tokens: " + str(score_dict["tok_count"]))
-	print("o Gold " +score_dict["seg_type"]+": " + str(score_dict["gold_seg_count"]))
-	print("o Predicted "+score_dict["seg_type"]+": " + str(score_dict["pred_seg_count"]))
-	print("o Precision: " + str(score_dict["prec"]))
-	print("o Recall: " + str(score_dict["rec"]))
-	print("o F-Score: " + str(score_dict["f_score"]))
diff --git a/code/utils_2/syntactic_parsing.py b/code/utils/syntactic_parsing.py
similarity index 100%
rename from code/utils_2/syntactic_parsing.py
rename to code/utils/syntactic_parsing.py
diff --git a/code/utils_2/__init__.py b/code/utils_2/__init__.py
deleted file mode 100644
index e69de29..0000000
diff --git a/code/utils_2/__pycache__/__init__.cpython-37.pyc b/code/utils_2/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index cefb68a0f66c85292395518615b5064b13104453..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 138
zcmZ?b<>g{vU|_h?JRuoGKL!yn%*epN;K0DZP|U)>z>vZa%%I8Wx00a<B#a<_8SCff
z7p0~omL%#Y=A~pN=H~0CWELlvmKYi7C+DZ6>X(*e<`l;p>Bq-s=4F<|$LkeT-r}&y
Q%}*)KNwotR_8DX$01kv8Q~&?~

diff --git a/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc b/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc
deleted file mode 100644
index 14dde82debc911172ec9de77498bcdab8b0d87a4..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 1517
zcmZ?b<>g{vU|?A4za`m`oq^#ohy%k+3=9ko3=9m#3JeSkDGVu$ISf%Cnkk1dmnn*g
zks*aSg{6fdiaCWbm_d{EB}kv2CgUwu=X^gOpJY}LpMilvfPsO5n}LDB8D!KXMh1ow
zh6Ri@3=0_<8A=$km}(emm{S;2n0lFOSxcB}7(s@nu=X-fVC;{pWh-H>VM<|3VS$P`
z)v~9s*K$l?tT3rzb77dk7^_vwS;LXSVa`y?k-}cfRl~V}Wg$Z?XAS29))dZ#j72k2
zxN2Ayuq|ZBW-FRj!;-=!$&kXB%~Uk4hJ68h4d-lz6z;i9Ah#5H)o`ZpWHS}50m-Cr
zL&Q>e;ks|6aMrM-@JcewW=P?y;Q;Got>p%(t^upAVXfh8W~}9@m{P-&#o@vb+f~b3
z!v%I5R|-GOoC%CYT40m-p(b(7W=IjJ;mrcMMQ|=VvWXQoHN06|E)20owR|Pq3wUby
z7Vs`)NMV;`Sjbq*53)H$XaQdh|3XHPYr%Z5OC~TD8ig|;_=PXR84A6^8NwM-gaa9Z
z7$T5ZDZ;@Fnj(p_7#SE8xD*r=6oN|<^QscnixrY{5{ru!ob&zse5|;@Ql2gfwhHQ*
zDe7Qeh-*X$h+mRgQ34V1arO0e1PSG&=H@0s#6ko7gF!;21^LAgkqD?r1zg0<)iDID
zC@nRy1ggrz)e&r4MrtC|J{Q-3AXgudSt+RnMX5Ou(_CBwz*<vM3!plEJ%gP=s&X@n
zlhqY;?G)6DOA?Dp;*&EHi)vEyQosy|=HL*=ppbZH562*o281$*tgD|3Tn<&Ao2RRf
zOMHlbyr)Zqf~`Wer;C+>fsO(wM647Hbriq>VWnWCqX6>0m4dO30?5}^3MM)VV9#19
znCd8iylAChrlSDynU#V$m>F!PV4<S`@`aUxWi2GAHJNU46%^$srxq9I7ZpdbmlmhS
zrx%poVl1j+EiFmYwa{d|#gUqqoS%}Jmwt<}yoiy3f#D^H_{FGvizPR;B=Hs_*DZ!y
zEIFBZslOO?Z?P1o=9N@&a4CbL&sM35Nr~$hBd4Dx|1Gv+a0-axNXakH%gIkniQ)*z
zEJy`uyv34VkeYXkwYa1xv)~pNIKYa_GfOgVv1F%K7T;nmFUl-Qjp78Eotl@NT6~KI
zWc)3*lKkw{yy7BIkrE}8oS#>gT2vAb7K~5HOfHFHFUbcpZn0$LXXa@N++xW|%uA1A
zNsG_Si(*ZS&o3>BVuuDm5hnu!LlkpfPQfjvw9H#fY57rH$@!&uCGj997V&^&p#mUx
zL@}3Cl-y!Y$xklg14*%ht&QRaQ}N09c{w>N8H!{X7#M!p=;!7arKTj7B<d&TrDP`N
z=If_q7AKdM7#Zm&=clCVmzHGa6vrFs7gy$$Bqo<+CdU^f78PgarRx<`-r@!YL_8$$
zK&5K28K?vl=3*3JRAFRc<YVMuWMSlB;$oCw6k_CH<YD1q6l3IK;$Y-q<Y3}slw;&#
z;$akG6krr!Dw1blV9;bK0+pRnLJ%YMz?SQQ9Sjagup+R-kq9{k28LT4Ho5sJr8%i~
Qpaf9N!oa}5!N|h_0F1+pd;kCd

-- 
GitLab