From cfc00e33c701811265e8c347dc1f41246932ed95 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Fri, 9 Dec 2022 18:30:47 +0100
Subject: [PATCH] refactor config1

---
 code/classes_def_2.py                         |   2 -
 code/discut22_2.py                            |  42 ++++++++++--
 code/utils/json2conll.py                      |  40 +++++++++--
 code/utils_2/__init__.py                      |   0
 .../__pycache__/__init__.cpython-37.pyc       | Bin 0 -> 138 bytes
 .../syntactic_parsing.cpython-37.pyc          | Bin 0 -> 1509 bytes
 code/utils_2/syntactic_parsing.py             |  63 ++++++++++++++++++
 7 files changed, 134 insertions(+), 13 deletions(-)
 create mode 100644 code/utils_2/__init__.py
 create mode 100644 code/utils_2/__pycache__/__init__.cpython-37.pyc
 create mode 100644 code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc
 create mode 100644 code/utils_2/syntactic_parsing.py

diff --git a/code/classes_def_2.py b/code/classes_def_2.py
index 20aee54..9233d21 100644
--- a/code/classes_def_2.py
+++ b/code/classes_def_2.py
@@ -27,7 +27,6 @@ class Process:
         self.meta_line = infos['pre-processing']['create_metadata']['line']
         self.meta_sent = infos['pre-processing']['create_metadata']['sent']
 
-        #self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
 
         #if self.main == "train":
             #if self.ner_init == True : # à faire en relatif !! split truc
@@ -51,7 +50,6 @@ class Process:
         
 class Output:
     def __init__(self, infos):
-        self.prod_tab = infos['file']['json_to_tab']
         self.prod_bracket = infos['file']['tab_to_bracket']
         self.prod_conll = infos['file']['conllu']
         self.metadata = infos['file']['metadata']
\ No newline at end of file
diff --git a/code/discut22_2.py b/code/discut22_2.py
index e5b10fd..5fe0218 100644
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
@@ -18,8 +18,8 @@ import re
 import json
 from classes_def_2 import Data, Process, Output
 import utils_2.syntactic_parsing as synt_pars
-import utils.conv2ner as conv_to_ner
-
+import utils.conv2ner as conv_to_ner # TODO clean it
+import utils.json2conll as json_to_connl # TODO clean it
 
 
 
@@ -38,7 +38,7 @@ def get_config_infos(config, stamp):
         my_logs["config"] = infos
     return data, steps, prod
 
-def create_folders(li):
+def create_folders(li): # -> could be transformed into a method of a class
     for it in li:
         if not os.path.isdir(it):
             os.mkdir(it)
@@ -50,7 +50,7 @@ def print_logs():
 def pre_processing(data, steps):
     data_in = f"{data.path}/{data.name}{data.exte}"
     if steps.pre_process_to_do == True:
-        data_out = f"{data.path}/{data.name}.conll"
+        data_out = f"{data.conv}/{data.name}.conll"
         if steps.synt_tool == "stanza":
             processors = []
             metadata = {}
@@ -81,13 +81,34 @@ def data_to_ner_format(data_in):
     OUTPUT: Tokenized text with just 4 columns.
     """
     data_ner = f"{data_in}.ner"
-    conv_to_ner.main(data_in, data_ner, "conll")
+    conv_to_ner.main(data_in, data_ner, "conll") # <-- TODO: make this path handling relative
 
     #TODO add same for train/dev/test for config train
 
     my_logs['data_ner'] = data_ner
     return data_ner
 
+def make_predictions(data_in, model_path):
+    model = model_path # add def get_model from v1
+    data_out = f"{data.resu}/{data.name}_pred.json"
+    #cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {steps.data.resu}/logs.txt"
+    cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in}" # &> {steps.data.resu}/logs.txt"
+    os.system(cmd)
+    return data_out
+
+def pred_json_to_toke(data_in):
+    data_out = f"{data.resu}/{data.name}_pred.conll"
+    json_to_connl.js2conll(data_in, data_out, "conll") # <-- TODO: make this path handling relative
+    return data_out
+
+def pred_json_to_conll_with_metadata(data_pred_json, data_meta):
+    data_out = f"{data.resu}/{data.name}_pred_n_meta.conll"
+    json_to_connl.js2conllNmeta(data_pred_json, data_out, "conll", data_meta) # <-- TODO: make this path handling relative
+    return data_out
+
+
+
+
 
 
 if __name__ == '__main__':
@@ -105,6 +126,17 @@ if __name__ == '__main__':
     create_folders([data.conv, data.resu])
 
     data_preprocessed = pre_processing(data, steps)
+    #data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
     data_ner = data_to_ner_format(data_preprocessed)
 
+    if steps.main == "annotation":
+        #data_pred_json = make_predictions(data_ner, steps.model)
+        data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
+
+        if prod.metadata == True:
+            data_pred_and_meta_conll = pred_json_to_conll_with_metadata(data_pred_json, data_preprocessed)
+        else:
+            data_pred_toke = pred_json_to_toke(data_pred_json)
+
+        
     #print_logs()
\ No newline at end of file
diff --git a/code/utils/json2conll.py b/code/utils/json2conll.py
index c657f89..e102b2e 100644
--- a/code/utils/json2conll.py
+++ b/code/utils/json2conll.py
@@ -26,7 +26,6 @@ def js2conll(filepath, fileoutpath, config):
     data = [] 
     for line in open(filepath, 'r'):
         data.append(json.loads(line))
-
     with open(fileoutpath, 'w') as f_out:
         for doc in data:
             tokens = zip(doc["words"],doc["tags"])
@@ -37,8 +36,37 @@ def js2conll(filepath, fileoutpath, config):
             f_out.write("\n")
             #print()
 
-def main(f_in, form, f_out):
-    input = f_in
-    output = f_out
-    forma = form
-    js2conll(input, output, forma) 
\ No newline at end of file
+def js2conllNmeta(data_pred_json, data_out, config, data_meta):
+    data = []
+    sent_pred_count = 0
+    tok = 0
+    for line in open(data_pred_json, 'r'):
+        data.append(json.loads(line))
+
+    with open(data_out, 'w', encoding='utf-8') as fo, open(data_meta, 'r') as fm:       
+        
+        
+        for line in fm:
+            line = line.strip()
+            if line.startswith("#"):
+                fo.write(f"{line}\n")
+            elif line == "":
+                sent_pred_count += 1
+                tok = 0
+                fo.write(f"{line}\n")
+            else:
+                sent_pred = data[sent_pred_count]
+                word = data[sent_pred_count]['words'][tok]
+                tag = data[sent_pred_count]['tags'][tok]
+                tok += 1
+                #print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
+                
+                if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
+                    fo.write(f"{line}\t{tag}\n")
+                    
+                else:
+                    exit("pb js2conllNmeta")
+
+                #print(f"sentpred : {sent_pred}\n")
+                #print(f"word n tag : {word}:::{tag}\n")
+                
\ No newline at end of file
diff --git a/code/utils_2/__init__.py b/code/utils_2/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/code/utils_2/__pycache__/__init__.cpython-37.pyc b/code/utils_2/__pycache__/__init__.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cefb68a0f66c85292395518615b5064b13104453
GIT binary patch
literal 138
zcmZ?b<>g{vU|_h?JRuoGKL!yn%*epN;K0DZP|U)>z>vZa%%I8Wx00a<B#a<_8SCff
z7p0~omL%#Y=A~pN=H~0CWELlvmKYi7C+DZ6>X(*e<`l;p>Bq-s=4F<|$LkeT-r}&y
Q%}*)KNwotR_8DX$01kv8Q~&?~

literal 0
HcmV?d00001

diff --git a/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc b/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..606e61ba6140f958cfde6090fe9566581f319f63
GIT binary patch
literal 1509
zcmZ?b<>g{vU|=}Bc|!7YHU@^rAPx*OF)%PVFfcF_D=;uHq%fo~<}gHoXr>&-T&5@{
zMursT6qXi-DCQK#U<OUrmmqz9nvAzto%8*Ce3Dr~d<F&v0R{#JZUzPhXOK}{j0_AV
z3=0@*7#1=zGL$f6G1V~CFsCr4F!eImvX(H{FoFzCVeMs}z}O#F%T~f%!<537!U7es
zs%1}MujQD)SfNwH=E5+6F;=OTvxXyu!<?a(BZa+|tA=v{%R+`)&Kk}ItSOue8H;A5
zaMiFZU|YzL%~mw6h9!kdk|BjLo2h784f_K28qV1aDcp0JKyE4Ys^LuG$!02A1CmMM
zhKjMI@JcciT}k1r0kdW^r0~^nfOWCfa)VUXfK}G8)^Ij6*78(LsNu=taAAmTtL3fX
z0=tW=h9!kxl3@a4krLPxen|$fDO|G|QUq#vvp{YToXd`ET7^vwZx)veL#$CPUkUdD
zo*KRdybBpp*d-YjGS>2gY)uhbz*ob+kP+lcFdyuS35<nW;S30V;gfKNLa%U!aE27&
zK!zZO2qadDa4>_WNa8F;1_lK#1qB6#;F83=szmi-h2)&X;$j8od_O-QD=x5<r;CEE
zf_i3(I+z#Y8W95Gm!wvdKtz08eSIB4LOH3qxrq?5&;b8nkWgtselbKO0xD7g7jbiS
z3;`=jOHC|+s`7Ak1lyL8nh3Se#Wf(v)dyr&N@_t-Y7WFS7uNu=)|Au&s7_zcU}uo3
z+|1%+bp>5J1@+>R#G;b;<c!3kn$)}$Fax4FIK(k1B;MJ>F$kmqp$sDH>gNKNL)GWz
z>FVPWAL1YH=@Ow}t5EIfVx?f9qW}sKD+NOx1#m!EDH!P}fV^*|V63A6^0k$MiH-u;
zvsMbGItm~!S}B<6C_sE>rC<(b23sju=qP}EVWnVM3khmXrdwPEMfu68#l`tW#Zl~~
z#i{Y>1*Nwbi>g>lOVV^LG#PJkq~;~(r)1`(-(oB;Vq{=ocnKnYF)H6;$xSUuyv4|M
zi{TbaPG(-}FGk&4EXApLB~=_;%An}8RjOiA;`+tN>8Htmi>(-(0-`ul^2_sb@)J{{
zI07;YQb8JTvE&z|=G|f~E-A_^xWxqyu;TK}l8jp{*{PMqw^+-IGD}jUI6-Eo<|U^V
z-(mq7e~YaoKRY$AxCm5~L<uG5=ar=vmBfPu<5Mz|OQP6I^1+N-ELr)Pd71*ZSaK5c
z(xX_?;xqH2SkvP3OG~2Ip#f0D$-ux6#hjN@aEmD|^A=NDeiT=7eraAwJjjVfJRn)9
z0LUFt%q0~ix0qA%lZ*I3QmkNWqqxCTd~$wXPR>e(B3T9ohF>=Nx%ow@DTyVC`iXfd
znTfgi`YD;k$)zPmM*7M5DXIFUC7C(J@kaW^m3bwJ$t9V|@db%R#hH2OdIgoYxIqCC
z4+%U_X<BRsD*1%D7zG$r7+Dzk7&#bO7&(}@7$q2m7&#bum^m257`d1@7<m{u7`Yhb
z7`d2u7=;)G7zLP$<QW(kG+By3WoDER#7I4`<$7QTg98$*2<&hqLXLrf;TDHYZhlH>
TPO2R!0Ti<^Ffed1@^AnE+IWiZ

literal 0
HcmV?d00001

diff --git a/code/utils_2/syntactic_parsing.py b/code/utils_2/syntactic_parsing.py
new file mode 100644
index 0000000..0d85fd4
--- /dev/null
+++ b/code/utils_2/syntactic_parsing.py
@@ -0,0 +1,63 @@
+import stanza
+from stanza.utils.conll import CoNLL
+
+
+
+
+
+
+
+def with_stanza(lang, f_in, f_out, process, meta):
+    """ 
+    Stanza's class CoNNL:
+
+    ID = 'id'
+    TEXT = 'text'
+    LEMMA = 'lemma'
+    UPOS = 'upos'
+    XPOS = 'xpos'
+    FEATS = 'feats'
+    HEAD = 'head'
+    DEPREL = 'deprel'
+    DEPS = 'deps'
+    MISC = 'misc' -> 'start_char|end_char'
+    START_CHAR = 'start_char'
+    END_CHAR = 'end_char'
+    FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4, FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}
+    """
+
+    stanza.download(lang)
+    nlp = stanza.Pipeline(lang, processors=process, use_gpu=True)
+    with open(f_in, 'r', encoding='utf-8') as fi, open(f_out, 'w', encoding='utf-8') as fo:
+        count_line = 0
+        for line in fi:
+            count_line += 1
+            count_sent = 0
+            line = line.strip()
+
+            if line.startswith("#"):
+                if "meta" in meta.keys() and meta['meta'] == True:
+                    fo.write(f"{line}\n")
+            elif line == "":
+                fo.write("\n")
+            else:
+                
+                if meta['line']:
+                    txt = f"#{meta['line']}-{count_line}\n"
+                    fo.write(txt)
+
+                doc = nlp(line)
+                for sent in doc.sentences:
+                    count_sent += 1
+                    if meta['sent']:
+                        txt = f"#{meta['sent']}-{count_sent}\n#text=\"{sent.text}\"\n"
+                        fo.write(txt)
+
+                    for token in sent.tokens:
+                        token_conll = CoNLL.convert_token_dict(token.to_dict()[0])
+                        fo.write("\t".join(token_conll))
+                        fo.write("\n")
+
+                    fo.write("\n")
+
+            
\ No newline at end of file
-- 
GitLab