Skip to content
Snippets Groups Projects
Commit cfc00e33 authored by laura.riviere's avatar laura.riviere
Browse files

refact config1

parent b43945e7
No related branches found
No related tags found
1 merge request!3Refacto 1205
...@@ -27,7 +27,6 @@ class Process: ...@@ -27,7 +27,6 @@ class Process:
self.meta_line = infos['pre-processing']['create_metadata']['line'] self.meta_line = infos['pre-processing']['create_metadata']['line']
self.meta_sent = infos['pre-processing']['create_metadata']['sent'] self.meta_sent = infos['pre-processing']['create_metadata']['sent']
#self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
#if self.main == "train": #if self.main == "train":
#if self.ner_init == True : # à faire en relatif !! split truc #if self.ner_init == True : # à faire en relatif !! split truc
...@@ -51,7 +50,6 @@ class Process: ...@@ -51,7 +50,6 @@ class Process:
class Output: class Output:
def __init__(self, infos): def __init__(self, infos):
self.prod_tab = infos['file']['json_to_tab']
self.prod_bracket = infos['file']['tab_to_bracket'] self.prod_bracket = infos['file']['tab_to_bracket']
self.prod_conll = infos['file']['conllu'] self.prod_conll = infos['file']['conllu']
self.metadata = infos['file']['metadata'] self.metadata = infos['file']['metadata']
\ No newline at end of file
...@@ -18,8 +18,8 @@ import re ...@@ -18,8 +18,8 @@ import re
import json import json
from classes_def_2 import Data, Process, Output from classes_def_2 import Data, Process, Output
import utils_2.syntactic_parsing as synt_pars import utils_2.syntactic_parsing as synt_pars
import utils.conv2ner as conv_to_ner import utils.conv2ner as conv_to_ner # TODO clean it
import utils.json2conll as json_to_connl # TODO clean it
...@@ -38,7 +38,7 @@ def get_config_infos(config, stamp): ...@@ -38,7 +38,7 @@ def get_config_infos(config, stamp):
my_logs["config"] = infos my_logs["config"] = infos
return data, steps, prod return data, steps, prod
def create_folders(li): def create_folders(li): # -> can be rtansfor into method of class
for it in li: for it in li:
if not os.path.isdir(it): if not os.path.isdir(it):
os.mkdir(it) os.mkdir(it)
...@@ -50,7 +50,7 @@ def print_logs(): ...@@ -50,7 +50,7 @@ def print_logs():
def pre_processing(data, steps): def pre_processing(data, steps):
data_in = f"{data.path}/{data.name}{data.exte}" data_in = f"{data.path}/{data.name}{data.exte}"
if steps.pre_process_to_do == True: if steps.pre_process_to_do == True:
data_out = f"{data.path}/{data.name}.conll" data_out = f"{data.conv}/{data.name}.conll"
if steps.synt_tool == "stanza": if steps.synt_tool == "stanza":
processors = [] processors = []
metadata = {} metadata = {}
...@@ -81,13 +81,34 @@ def data_to_ner_format(data_in): ...@@ -81,13 +81,34 @@ def data_to_ner_format(data_in):
OUTPUT: Tokenized text with just 4 columns. OUTPUT: Tokenized text with just 4 columns.
""" """
data_ner = f"{data_in}.ner" data_ner = f"{data_in}.ner"
conv_to_ner.main(data_in, data_ner, "conll") conv_to_ner.main(data_in, data_ner, "conll") # <-- TODO faire en relatif
#TODO add same for train/dev/test for config train #TODO add same for train/dev/test for config train
my_logs['data_ner'] = data_ner my_logs['data_ner'] = data_ner
return data_ner return data_ner
def make_predictions(data_in, model_path):
    """Run the allennlp predictor on *data_in* and return the output path.

    :param data_in: path to the NER-formatted input file
    :param model_path: path to the trained allennlp model archive
    :return: path of the JSON-lines prediction file that was produced

    NOTE(review): relies on the module-level ``data`` object (set in
    ``__main__``) for the results directory and file stem — confirm.
    TODO: port ``get_model`` from v1 so the model can be fetched/validated.
    """
    data_out = f"{data.resu}/{data.name}_pred.json"
    # Shell out to allennlp; prediction logs go to stdout/stderr.
    # (Redirect with `&> {data.resu}/logs.txt` if they become noisy.)
    cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in}"
    os.system(cmd)
    return data_out
def pred_json_to_toke(data_in):
    """Convert a JSON-lines prediction file into a tokenized CoNLL file.

    Writes into the results directory of the module-level ``data`` object
    and returns the path of the file produced.
    """
    out_path = f"{data.resu}/{data.name}_pred.conll"
    json_to_connl.js2conll(data_in, out_path, "conll")  # TODO: make paths relative
    return out_path
def pred_json_to_conll_with_metadata(data_pred_json, data_meta):
    """Merge predictions with metadata into one CoNLL file.

    Combines the JSON-lines predictions with the metadata-bearing CoNLL
    file via ``json_to_connl.js2conllNmeta`` and returns the output path
    (inside the results directory of the module-level ``data`` object).
    """
    out_path = f"{data.resu}/{data.name}_pred_n_meta.conll"
    json_to_connl.js2conllNmeta(data_pred_json, out_path, "conll", data_meta)  # TODO: make paths relative
    return out_path
if __name__ == '__main__': if __name__ == '__main__':
...@@ -105,6 +126,17 @@ if __name__ == '__main__': ...@@ -105,6 +126,17 @@ if __name__ == '__main__':
create_folders([data.conv, data.resu]) create_folders([data.conv, data.resu])
data_preprocessed = pre_processing(data, steps) data_preprocessed = pre_processing(data, steps)
#data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
data_ner = data_to_ner_format(data_preprocessed) data_ner = data_to_ner_format(data_preprocessed)
if steps.main == "annotation":
#data_pred_json = make_predictions(data_ner, steps.model)
data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
if prod.metadata == True:
data_pred_and_meta_conll = pred_json_to_conll_with_metadata(data_pred_json, data_preprocessed)
else:
data_pred_toke = pred_json_to_toke(data_pred_json)
#print_logs() #print_logs()
\ No newline at end of file
...@@ -26,7 +26,6 @@ def js2conll(filepath, fileoutpath, config): ...@@ -26,7 +26,6 @@ def js2conll(filepath, fileoutpath, config):
data = [] data = []
for line in open(filepath, 'r'): for line in open(filepath, 'r'):
data.append(json.loads(line)) data.append(json.loads(line))
with open(fileoutpath, 'w') as f_out: with open(fileoutpath, 'w') as f_out:
for doc in data: for doc in data:
tokens = zip(doc["words"],doc["tags"]) tokens = zip(doc["words"],doc["tags"])
...@@ -37,8 +36,37 @@ def js2conll(filepath, fileoutpath, config): ...@@ -37,8 +36,37 @@ def js2conll(filepath, fileoutpath, config):
f_out.write("\n") f_out.write("\n")
#print() #print()
def js2conllNmeta(data_pred_json, data_out, config, data_meta):
    """Merge model predictions with metadata into a CoNLL file.

    Reads one JSON object per line from *data_pred_json* (each object
    carries parallel ``"words"`` and ``"tags"`` lists for one sentence),
    walks the metadata CoNLL file *data_meta* line by line, and writes
    *data_out* where every token line gains the predicted tag as an
    extra tab-separated column. Comment lines (``#...``) and blank
    sentence separators are copied through unchanged.

    :param data_pred_json: path to the JSON-lines prediction file
    :param data_out: path of the merged CoNLL file to write
    :param config: unused; kept for interface symmetry with js2conll
    :param data_meta: path to the CoNLL file carrying the metadata
    :raises SystemExit: if a metadata token does not line up with the
        corresponding predicted token (index or surface-form mismatch)
    """
    # Close the prediction file deterministically (the old bare open() leaked it).
    with open(data_pred_json, 'r') as f_pred:
        data = [json.loads(line) for line in f_pred]
    sent_pred_count = 0  # index of the current predicted sentence
    tok = 0              # 1-based token position within that sentence
    with open(data_out, 'w', encoding='utf-8') as fo, open(data_meta, 'r') as fm:
        for line in fm:
            line = line.strip()
            if line.startswith("#"):
                fo.write(f"{line}\n")
            elif line == "":
                # Blank line = sentence boundary: advance to the next prediction.
                sent_pred_count += 1
                tok = 0
                fo.write(f"{line}\n")
            else:
                word = data[sent_pred_count]['words'][tok]
                tag = data[sent_pred_count]['tags'][tok]
                tok += 1
                # Sanity check: the metadata token must align with the prediction.
                if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
                    fo.write(f"{line}\t{tag}\n")
                else:
                    raise SystemExit(
                        f"js2conllNmeta: prediction/metadata mismatch at "
                        f"sentence {sent_pred_count}, token {tok}: {line!r}"
                    )
\ No newline at end of file
File added
File added
import stanza
from stanza.utils.conll import CoNLL
def with_stanza(lang, f_in, f_out, process, meta):
    """Annotate *f_in* with Stanza and write CoNLL-formatted lines to *f_out*.

    Each token is rendered through ``stanza.utils.conll.CoNLL``; the
    field layout is:
        ID TEXT LEMMA UPOS XPOS FEATS HEAD DEPREL DEPS MISC
    where MISC carries 'start_char|end_char'.

    Metadata markers are driven by the *meta* dict:
      meta['meta'] == True -> copy through '#' comment lines from the input
      meta['line'] truthy  -> emit one '#<label>-<line no>' marker per input line
      meta['sent'] truthy  -> emit '#<label>-<sent no>' and '#text="..."' per sentence
    """
    stanza.download(lang)
    pipeline = stanza.Pipeline(lang, processors=process, use_gpu=True)
    # Invariant across the whole file: hoist the comment-copy decision.
    copy_comments = "meta" in meta.keys() and meta['meta'] == True
    with open(f_in, 'r', encoding='utf-8') as src, open(f_out, 'w', encoding='utf-8') as dst:
        for line_no, raw in enumerate(src, start=1):
            text = raw.strip()
            if text.startswith("#"):
                if copy_comments:
                    dst.write(f"{text}\n")
                continue
            if text == "":
                dst.write("\n")
                continue
            if meta['line']:
                dst.write(f"#{meta['line']}-{line_no}\n")
            parsed = pipeline(text)
            for sent_no, sent in enumerate(parsed.sentences, start=1):
                if meta['sent']:
                    dst.write(f"#{meta['sent']}-{sent_no}\n#text=\"{sent.text}\"\n")
                for token in sent.tokens:
                    fields = CoNLL.convert_token_dict(token.to_dict()[0])
                    dst.write("\t".join(fields))
                    dst.write("\n")
                # Blank line terminates each sentence block.
                dst.write("\n")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment