diff --git a/code/classes_def_2.py b/code/classes_def_2.py index 20aee54530ce61beb3f5c45d90df9f86d81bf654..9233d2176be06990789b4a46dfb61dcb74f6f331 100644 --- a/code/classes_def_2.py +++ b/code/classes_def_2.py @@ -27,7 +27,6 @@ class Process: self.meta_line = infos['pre-processing']['create_metadata']['line'] self.meta_sent = infos['pre-processing']['create_metadata']['sent'] - #self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway #if self.main == "train": #if self.ner_init == True : # à faire en relatif !! split truc @@ -51,7 +50,6 @@ class Process: class Output: def __init__(self, infos): - self.prod_tab = infos['file']['json_to_tab'] self.prod_bracket = infos['file']['tab_to_bracket'] self.prod_conll = infos['file']['conllu'] self.metadata = infos['file']['metadata'] \ No newline at end of file diff --git a/code/discut22_2.py b/code/discut22_2.py index e5b10fda0a6fa11b65eb6e221c9d38218fa516af..5fe0218583bbfef724f3af1c27c5182695e7a951 100644 --- a/code/discut22_2.py +++ b/code/discut22_2.py @@ -18,8 +18,8 @@ import re import json from classes_def_2 import Data, Process, Output import utils_2.syntactic_parsing as synt_pars -import utils.conv2ner as conv_to_ner - +import utils.conv2ner as conv_to_ner # TODO clean it +import utils.json2conll as json_to_connl # TODO clean it @@ -38,7 +38,7 @@ def get_config_infos(config, stamp): my_logs["config"] = infos return data, steps, prod -def create_folders(li): +def create_folders(li): # -> can be rtansfor into method of class for it in li: if not os.path.isdir(it): os.mkdir(it) @@ -50,7 +50,7 @@ def print_logs(): def pre_processing(data, steps): data_in = f"{data.path}/{data.name}{data.exte}" if steps.pre_process_to_do == True: - data_out = f"{data.path}/{data.name}.conll" + data_out = f"{data.conv}/{data.name}.conll" if steps.synt_tool == "stanza": processors = [] metadata = {} @@ -81,13 +81,34 @@ def data_to_ner_format(data_in): OUTPUT: Tokenized text with just 4 columns. 
def make_predictions(data_in, model_path):
    """Run ``allennlp predict`` on *data_in* and return the predictions path.

    NOTE(review): the output location is built from the module-level ``data``
    object (``data.resu`` and ``data.name``), which is not a parameter of this
    function -- presumably it is assigned in the ``__main__`` section before
    this function is called; confirm against the caller.

    Args:
        data_in: Path of the input file handed to allennlp.
        model_path: Path of the model handed to allennlp.
            TODO: add ``get_model`` from v1 to resolve the model.

    Returns:
        Path of the JSON predictions file,
        ``<data.resu>/<data.name>_pred.json``.

    Raises:
        subprocess.CalledProcessError: if allennlp exits with a non-zero
            status (the previous ``os.system`` call ignored failures).
        FileNotFoundError: if the ``allennlp`` executable cannot be found.
    """
    import subprocess

    data_out = f"{data.resu}/{data.name}_pred.json"
    # An argument list (no shell) keeps paths containing spaces or shell
    # metacharacters intact and cannot be abused for command injection.
    # TODO: redirect the allennlp output to a log file under data.resu.
    cmd = [
        "allennlp", "predict",
        "--use-dataset-reader",
        "--output-file", data_out,
        str(model_path),
        str(data_in),
    ]
    subprocess.run(cmd, check=True)
    return data_out


def pred_json_to_toke(data_in):
    """Convert a JSON predictions file to CoNLL and return the output path.

    NOTE(review): relies on the module-level ``data`` object for the output
    folder and base name -- confirm it is set before this is called.

    Args:
        data_in: Path of the JSON predictions file.

    Returns:
        Path of the written file, ``<data.resu>/<data.name>_pred.conll``.
    """
    data_out = f"{data.resu}/{data.name}_pred.conll"
    json_to_connl.js2conll(data_in, data_out, "conll")  # TODO: make relative
    return data_out


def pred_json_to_conll_with_metadata(data_pred_json, data_meta):
    """Merge JSON predictions with a metadata-bearing CoNLL file.

    NOTE(review): relies on the module-level ``data`` object for the output
    folder and base name -- confirm it is set before this is called.

    Args:
        data_pred_json: Path of the JSON predictions file.
        data_meta: Path of the CoNLL file holding the metadata lines.

    Returns:
        Path of the written file,
        ``<data.resu>/<data.name>_pred_n_meta.conll``.
    """
    data_out = f"{data.resu}/{data.name}_pred_n_meta.conll"
    # TODO: make relative
    json_to_connl.js2conllNmeta(data_pred_json, data_out, "conll", data_meta)
    return data_out
def _conll_token_matches(columns, position, word):
    """Return True when a CoNLL row has ID *position* and FORM *word*."""
    if len(columns) < 2:
        return False
    try:
        row_id = int(columns[0])
    except ValueError:
        # Non-integer IDs (e.g. ranges such as "1-2") can never match.
        return False
    return row_id == position and columns[1] == word


def js2conllNmeta(data_pred_json, data_out, config, data_meta):
    """Append predicted tags to a CoNLL file while keeping its metadata lines.

    The metadata file is read line by line: comment lines (starting with
    ``#``) and blank lines are copied unchanged, and every token line gets the
    predicted tag appended as an extra tab-separated column. Token lines are
    aligned with the predictions by position; the first column must equal the
    1-based token position and the second column must equal the predicted
    word.

    Args:
        data_pred_json: Path of a JSON-lines file; each line is an object
            with parallel ``"words"`` and ``"tags"`` lists for one sentence.
        data_out: Path of the file to write (UTF-8).
        config: Unused; kept for signature compatibility with ``js2conll``.
        data_meta: Path of the CoNLL file with metadata comments (UTF-8).

    Raises:
        SystemExit: if a token line cannot be aligned with the predictions
            (wrong ID or word, or no prediction left for the token).
    """
    import sys

    # Context manager so the predictions file handle is always closed.
    with open(data_pred_json, 'r', encoding='utf-8') as f_pred:
        predictions = [json.loads(row) for row in f_pred if row.strip()]

    sent_idx = 0  # prediction aligned with the sentence being read
    tok_idx = 0   # tokens already consumed in that sentence

    with open(data_out, 'w', encoding='utf-8') as fo, \
            open(data_meta, 'r', encoding='utf-8') as fm:
        for line_no, raw in enumerate(fm, start=1):
            line = raw.strip()

            if line.startswith("#"):
                fo.write(f"{line}\n")
                continue

            if line == "":
                # Move to the next prediction only when this blank line closes
                # a sentence (or skips an empty prediction); repeated blank
                # lines must not shift the alignment.
                empty_prediction = (
                    sent_idx < len(predictions)
                    and not predictions[sent_idx]['words']
                )
                if tok_idx > 0 or empty_prediction:
                    sent_idx += 1
                tok_idx = 0
                fo.write("\n")
                continue

            if sent_idx >= len(predictions):
                sys.exit(
                    f"pb js2conllNmeta: {data_meta}:{line_no}: no prediction "
                    f"left for sentence {sent_idx + 1}"
                )

            words = predictions[sent_idx]['words']
            tags = predictions[sent_idx]['tags']
            if tok_idx >= min(len(words), len(tags)):
                sys.exit(
                    f"pb js2conllNmeta: {data_meta}:{line_no}: sentence "
                    f"{sent_idx + 1} has more tokens than its prediction"
                )

            word, tag = words[tok_idx], tags[tok_idx]
            tok_idx += 1

            if not _conll_token_matches(line.split("\t"), tok_idx, word):
                sys.exit(
                    f"pb js2conllNmeta: {data_meta}:{line_no}: expected token "
                    f"{tok_idx} {word!r}, found {line!r}"
                )
            fo.write(f"{line}\t{tag}\n")
import stanza
from stanza.utils.conll import CoNLL


def with_stanza(lang, f_in, f_out, process, meta):
    """Parse a text file with Stanza and write the tokens in CoNLL format.

    Each non-empty, non-comment input line is run through a Stanza pipeline;
    every resulting sentence is written as one tab-separated row per token,
    followed by a blank line. Input comment lines (starting with ``#``) are
    copied only when ``meta['meta']`` is ``True``; blank input lines are
    copied as blank lines.

    Stanza's class CoNLL uses these fields:

        ID = 'id'
        TEXT = 'text'
        LEMMA = 'lemma'
        UPOS = 'upos'
        XPOS = 'xpos'
        FEATS = 'feats'
        HEAD = 'head'
        DEPREL = 'deprel'
        DEPS = 'deps'
        MISC = 'misc' -> 'start_char|end_char'
        START_CHAR = 'start_char'
        END_CHAR = 'end_char'
        FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4,
                        FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}

    Args:
        lang: Language code passed to ``stanza.download`` and
            ``stanza.Pipeline``.
        f_in: Path of the input text file (read as UTF-8).
        f_out: Path of the output file (written as UTF-8).
        process: Value forwarded as ``processors`` to ``stanza.Pipeline``.
        meta: Dict of metadata options. ``'meta'`` enables copying of input
            comment lines; ``'line'`` and ``'sent'``, when truthy, are the
            labels used for ``#<label>-<n>`` headers written before each
            input line and each sentence. Missing keys disable the option.
    """
    stanza.download(lang)
    nlp = stanza.Pipeline(lang, processors=process, use_gpu=True)

    # Read the options once; .get() treats every absent key the same way
    # instead of raising KeyError for 'line'/'sent' only.
    # Explicit comparison kept from the original: only True (or 1) enables it.
    keep_comments = meta.get('meta') == True  # noqa: E712
    line_label = meta.get('line')
    sent_label = meta.get('sent')

    with open(f_in, 'r', encoding='utf-8') as fi, \
            open(f_out, 'w', encoding='utf-8') as fo:
        # line_no counts every input line, including comments and blanks.
        for line_no, raw in enumerate(fi, start=1):
            line = raw.strip()

            if line.startswith("#"):
                if keep_comments:
                    fo.write(f"{line}\n")
                continue

            if line == "":
                fo.write("\n")
                continue

            if line_label:
                fo.write(f"#{line_label}-{line_no}\n")

            doc = nlp(line)
            # Sentence numbering restarts at 1 for every input line.
            for sent_no, sent in enumerate(doc.sentences, start=1):
                if sent_label:
                    fo.write(
                        f"#{sent_label}-{sent_no}\n#text=\"{sent.text}\"\n"
                    )

                for token in sent.tokens:
                    # NOTE(review): only the first entry of to_dict() is
                    # written -- presumably one entry per word of the token;
                    # confirm the handling of multi-word tokens.
                    token_conll = CoNLL.convert_token_dict(token.to_dict()[0])
                    fo.write("\t".join(token_conll))
                    fo.write("\n")

                fo.write("\n")