From cfc00e33c701811265e8c347dc1f41246932ed95 Mon Sep 17 00:00:00 2001 From: "laura.riviere" <laura.riviere@irit.fr> Date: Fri, 9 Dec 2022 18:30:47 +0100 Subject: [PATCH] refact config1 --- code/classes_def_2.py | 2 - code/discut22_2.py | 42 ++++++++++-- code/utils/json2conll.py | 40 +++++++++-- code/utils_2/__init__.py | 0 .../__pycache__/__init__.cpython-37.pyc | Bin 0 -> 138 bytes .../syntactic_parsing.cpython-37.pyc | Bin 0 -> 1509 bytes code/utils_2/syntactic_parsing.py | 63 ++++++++++++++++++ 7 files changed, 134 insertions(+), 13 deletions(-) create mode 100644 code/utils_2/__init__.py create mode 100644 code/utils_2/__pycache__/__init__.cpython-37.pyc create mode 100644 code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc create mode 100644 code/utils_2/syntactic_parsing.py diff --git a/code/classes_def_2.py b/code/classes_def_2.py index 20aee54..9233d21 100644 --- a/code/classes_def_2.py +++ b/code/classes_def_2.py @@ -27,7 +27,6 @@ class Process: self.meta_line = infos['pre-processing']['create_metadata']['line'] self.meta_sent = infos['pre-processing']['create_metadata']['sent'] - #self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway #if self.main == "train": #if self.ner_init == True : # à faire en relatif !! split truc @@ -51,7 +50,6 @@ class Process: class Output: def __init__(self, infos): - self.prod_tab = infos['file']['json_to_tab'] self.prod_bracket = infos['file']['tab_to_bracket'] self.prod_conll = infos['file']['conllu'] self.metadata = infos['file']['metadata'] \ No newline at end of file diff --git a/code/discut22_2.py b/code/discut22_2.py index e5b10fd..5fe0218 100644 --- a/code/discut22_2.py +++ b/code/discut22_2.py @@ -18,8 +18,8 @@ import re import json from classes_def_2 import Data, Process, Output import utils_2.syntactic_parsing as synt_pars -import utils.conv2ner as conv_to_ner - +import utils.conv2ner as conv_to_ner # TODO clean it +import utils.json2conll as json_to_connl # TODO clean it @@ -38,7 +38,7 @@ def get_config_infos(config, stamp): my_logs["config"] = infos return data, steps, prod -def create_folders(li): +def create_folders(li): # -> can be rtansfor into method of class for it in li: if not os.path.isdir(it): os.mkdir(it) @@ -50,7 +50,7 @@ def print_logs(): def pre_processing(data, steps): data_in = f"{data.path}/{data.name}{data.exte}" if steps.pre_process_to_do == True: - data_out = f"{data.path}/{data.name}.conll" + data_out = f"{data.conv}/{data.name}.conll" if steps.synt_tool == "stanza": processors = [] metadata = {} @@ -81,13 +81,34 @@ def data_to_ner_format(data_in): OUTPUT: Tokenized text with just 4 columns. """ data_ner = f"{data_in}.ner" - conv_to_ner.main(data_in, data_ner, "conll") + conv_to_ner.main(data_in, data_ner, "conll") # <-- TODO faire en relatif #TODO add same for train/dev/test for config train my_logs['data_ner'] = data_ner return data_ner +def make_predictions(data_in, model_path): + model = model_path # add def get_model from v1 + data_out = f"{data.resu}/{data.name}_pred.json" + #cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {steps.data.resu}/logs.txt" + cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in}" # &> {steps.data.resu}/logs.txt" + os.system(cmd) + return data_out + +def pred_json_to_toke(data_in): + data_out = f"{data.resu}/{data.name}_pred.conll" + json_to_connl.js2conll(data_in, data_out, "conll") # <-- TODO faire en relatif + return data_out + +def pred_json_to_conll_with_metadata(data_pred_json, data_meta): + data_out = f"{data.resu}/{data.name}_pred_n_meta.conll" + json_to_connl.js2conllNmeta(data_pred_json, data_out, "conll", data_meta) # <-- TODO faire en relatif + return data_out + + + + if __name__ == '__main__': @@ -105,6 +126,17 @@ if __name__ == '__main__': create_folders([data.conv, data.resu]) data_preprocessed = pre_processing(data, steps) + #data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll" data_ner = data_to_ner_format(data_preprocessed) + if steps.main == "annotation": + #data_pred_json = make_predictions(data_ner, steps.model) + data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json" + + if prod.metadata == True: + data_pred_and_meta_conll = pred_json_to_conll_with_metadata(data_pred_json, data_preprocessed) + else: + data_pred_toke = pred_json_to_toke(data_pred_json) + + #print_logs() \ No newline at end of file diff --git a/code/utils/json2conll.py b/code/utils/json2conll.py index c657f89..e102b2e 100644 --- a/code/utils/json2conll.py +++ b/code/utils/json2conll.py @@ -26,7 +26,6 @@ def js2conll(filepath, fileoutpath, config): data = [] for line in open(filepath, 'r'): data.append(json.loads(line)) - with open(fileoutpath, 'w') as f_out: for doc in data: tokens = zip(doc["words"],doc["tags"]) @@ -37,8 +36,37 @@ def js2conll(filepath, fileoutpath, config): f_out.write("\n") #print() -def main(f_in, form, f_out): - input = f_in - output = f_out - forma = form - js2conll(input, output, forma) \ No newline at end of file +def js2conllNmeta(data_pred_json, data_out, config, data_meta): + data = [] + sent_pred_count = 0 + tok = 0 + for line in open(data_pred_json, 'r'): + data.append(json.loads(line)) + + with open(data_out, 'w', encoding='utf-8') as fo, open(data_meta, 'r') as fm: + + + for line in fm: + line = line.strip() + if line.startswith("#"): + fo.write(f"{line}\n") + elif line == "": + sent_pred_count += 1 + tok = 0 + fo.write(f"{line}\n") + else: + sent_pred = data[sent_pred_count] + word = data[sent_pred_count]['words'][tok] + tag = data[sent_pred_count]['tags'][tok] + tok += 1 + #print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}") + + if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word: + fo.write(f"{line}\t{tag}\n") + + else: + exit("pb js2conllNmeta") + + #print(f"sentpred : {sent_pred}\n") + #print(f"word n tag : {word}:::{tag}\n") + \ No newline at end of file diff --git a/code/utils_2/__init__.py b/code/utils_2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/code/utils_2/__pycache__/__init__.cpython-37.pyc b/code/utils_2/__pycache__/__init__.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..cefb68a0f66c85292395518615b5064b13104453 GIT binary patch literal 138 zcmZ?b<>g{vU|_h?JRuoGKL!yn%*epN;K0DZP|U)>z>vZa%%I8Wx00a<B#a<_8SCff z7p0~omL%#Y=A~pN=H~0CWELlvmKYi7C+DZ6>X(*e<`l;p>Bq-s=4F<|$LkeT-r}&y Q%}*)KNwotR_8DX$01kv8Q~&?~ literal 0 HcmV?d00001 diff --git a/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc b/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc new file mode 100644 index 0000000000000000000000000000000000000000..606e61ba6140f958cfde6090fe9566581f319f63 GIT binary patch literal 1509 zcmZ?b<>g{vU|=}Bc|!7YHU@^rAPx*OF)%PVFfcF_D=;uHq%fo~<}gHoXr>&-T&5@{ zMursT6qXi-DCQK#U<OUrmmqz9nvAzto%8*Ce3Dr~d<F&v0R{#JZUzPhXOK}{j0_AV z3=0@*7#1=zGL$f6G1V~CFsCr4F!eImvX(H{FoFzCVeMs}z}O#F%T~f%!<537!U7es zs%1}MujQD)SfNwH=E5+6F;=OTvxXyu!<?a(BZa+|tA=v{%R+`)&Kk}ItSOue8H;A5 zaMiFZU|YzL%~mw6h9!kdk|BjLo2h784f_K28qV1aDcp0JKyE4Ys^LuG$!02A1CmMM zhKjMI@JcciT}k1r0kdW^r0~^nfOWCfa)VUXfK}G8)^Ij6*78(LsNu=taAAmTtL3fX z0=tW=h9!kxl3@a4krLPxen|$fDO|G|QUq#vvp{YToXd`ET7^vwZx)veL#$CPUkUdD zo*KRdybBpp*d-YjGS>2gY)uhbz*ob+kP+lcFdyuS35<nW;S30V;gfKNLa%U!aE27& zK!zZO2qadDa4>_WNa8F;1_lK#1qB6#;F83=szmi-h2)&X;$j8od_O-QD=x5<r;CEE zf_i3(I+z#Y8W95Gm!wvdKtz08eSIB4LOH3qxrq?5&;b8nkWgtselbKO0xD7g7jbiS z3;`=jOHC|+s`7Ak1lyL8nh3Se#Wf(v)dyr&N@_t-Y7WFS7uNu=)|Au&s7_zcU}uo3 z+|1%+bp>5J1@+>R#G;b;<c!3kn$)}$Fax4FIK(k1B;MJ>F$kmqp$sDH>gNKNL)GWz z>FVPWAL1YH=@Ow}t5EIfVx?f9qW}sKD+NOx1#m!EDH!P}fV^*|V63A6^0k$MiH-u; zvsMbGItm~!S}B<6C_sE>rC<(b23sju=qP}EVWnVM3khmXrdwPEMfu68#l`tW#Zl~~ z#i{Y>1*Nwbi>g>lOVV^LG#PJkq~;~(r)1`(-(oB;Vq{=ocnKnYF)H6;$xSUuyv4|M zi{TbaPG(-}FGk&4EXApLB~=_;%An}8RjOiA;`+tN>8Htmi>(-(0-`ul^2_sb@)J{{ zI07;YQb8JTvE&z|=G|f~E-A_^xWxqyu;TK}l8jp{*{PMqw^+-IGD}jUI6-Eo<|U^V z-(mq7e~YaoKRY$AxCm5~L<uG5=ar=vmBfPu<5Mz|OQP6I^1+N-ELr)Pd71*ZSaK5c z(xX_?;xqH2SkvP3OG~2Ip#f0D$-ux6#hjN@aEmD|^A=NDeiT=7eraAwJjjVfJRn)9 z0LUFt%q0~ix0qA%lZ*I3QmkNWqqxCTd~$wXPR>e(B3T9ohF>=Nx%ow@DTyVC`iXfd znTfgi`YD;k$)zPmM*7M5DXIFUC7C(J@kaW^m3bwJ$t9V|@db%R#hH2OdIgoYxIqCC z4+%U_X<BRsD*1%D7zG$r7+Dzk7&#bO7&(}@7$q2m7&#bum^m257`d1@7<m{u7`Yhb z7`d2u7=;)G7zLP$<QW(kG+By3WoDER#7I4`<$7QTg98$*2<&hqLXLrf;TDHYZhlH> TPO2R!0Ti<^Ffed1@^AnE+IWiZ literal 0 HcmV?d00001 diff --git a/code/utils_2/syntactic_parsing.py b/code/utils_2/syntactic_parsing.py new file mode 100644 index 0000000..0d85fd4 --- /dev/null +++ b/code/utils_2/syntactic_parsing.py @@ -0,0 +1,63 @@ +import stanza +from stanza.utils.conll import CoNLL + + + + + + + +def with_stanza(lang, f_in, f_out, process, meta): + """ + Stanza's class CoNNL: + + ID = 'id' + TEXT = 'text' + LEMMA = 'lemma' + UPOS = 'upos' + XPOS = 'xpos' + FEATS = 'feats' + HEAD = 'head' + DEPREL = 'deprel' + DEPS = 'deps' + MISC = 'misc' -> 'start_char|end_char' + START_CHAR = 'start_char' + END_CHAR = 'end_char' + FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4, FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9} + """ + + stanza.download(lang) + nlp = stanza.Pipeline(lang, processors=process, use_gpu=True) + with open(f_in, 'r', encoding='utf-8') as fi, open(f_out, 'w', encoding='utf-8') as fo: + count_line = 0 + for line in fi: + count_line += 1 + count_sent = 0 + line = line.strip() + + if line.startswith("#"): + if "meta" in meta.keys() and meta['meta'] == True: + fo.write(f"{line}\n") + elif line == "": + fo.write("\n") + else: + + if meta['line']: + txt = f"#{meta['line']}-{count_line}\n" + fo.write(txt) + + doc = nlp(line) + for sent in doc.sentences: + count_sent += 1 + if meta['sent']: + txt = f"#{meta['sent']}-{count_sent}\n#text=\"{sent.text}\"\n" + fo.write(txt) + + for token in sent.tokens: + token_conll = CoNLL.convert_token_dict(token.to_dict()[0]) + fo.write("\t".join(token_conll)) + fo.write("\n") + + fo.write("\n") + + \ No newline at end of file -- GitLab