From 6744dc6b6848374e1a258e98d33302e14fbbe847 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Tue, 3 Jan 2023 18:51:40 +0100
Subject: [PATCH] change to class methods

---
 code/classes_def.py                           |  48 ----
 code/classes_def_2.py                         |  55 ----
 code/config_global_1.2.json                   |   4 +-
 code/discut22_1.py                            | 251 ------------------
 code/discut22_2.py                            | 234 ++++++++++------
 code/utils/conll2bracket.py                   |  28 +-
 code/utils/json2conll.py                      |  11 +-
 .../syntactic_parsing.cpython-37.pyc          | Bin 1509 -> 1517 bytes
 code/utils_2/syntactic_parsing.py             |   6 +-
 9 files changed, 187 insertions(+), 450 deletions(-)
 delete mode 100644 code/classes_def.py
 delete mode 100644 code/classes_def_2.py
 delete mode 100644 code/discut22_1.py

diff --git a/code/classes_def.py b/code/classes_def.py
deleted file mode 100644
index b8ca4bd..0000000
--- a/code/classes_def.py
+++ /dev/null
@@ -1,48 +0,0 @@
-# Classes for discut22
-
-
-
-class Input:
-    def __init__(self, infos, stamp):
-        self.name = infos['name']
-        self.lang = infos['language']
-#        self.path = infos['folder_path'] # misused
-        self.path = f"../data/{self.name}"
-        self.file = infos['file']
-        self.stamp = stamp
-        self.conv = f"{self.path}/data_converted_{stamp}" # à intégrer
-        self.resu = f"{self.path}/results_{stamp}"
-
-
-class Process:
-    def __init__(self, infos, data):
-        self.data = data
-        self.main = infos["main"] # train test annotation
-
-        self.toke = infos['pre-processing']['tokenization']
-        self.toke_tool = infos['pre-processing']['tokenization_tool']
-        self.ssplit = infos['pre-processing']['sentence_split']
-        self.ssplitor = infos['pre-processing']['sentence_split_splitor']
-        self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
-
-        if self.main == "train":
-            if self.ner_init == True : # à faire en relatif !! split truc
-                self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
-                self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
-            else :
-                self.train_data = infos['discourse_segmenter']['training']['train_data_path']
-                self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
-            self.toolkit = infos['discourse_segmenter']['training']['toolkit']
-            self.tr_config = infos['discourse_segmenter']['training']['config_file']
-            self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
-
-        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
-
-        self.post_tab = infos['post-processing']['json_to_tab']
-
-        self.eval = infos['evaluation']
-        self.test_data = infos['gold_test_data_path']
-
-        self.post_bracket = infos['post-processing']['tab_to_bracket']
-        self.post_conll = infos['post-processing']['metadata_conll']
-
\ No newline at end of file
diff --git a/code/classes_def_2.py b/code/classes_def_2.py
deleted file mode 100644
index 9233d21..0000000
--- a/code/classes_def_2.py
+++ /dev/null
@@ -1,55 +0,0 @@
-# Classes for discut22
-
-
-
-class Data:
-    def __init__(self, infos, stamp):
-        self.name = infos['name']
-        self.lang = infos['language']
-        self.path = f"../data/{self.name}"
-        self.exte = infos['exte']
-        self.stamp = stamp
-        self.conv = f"{self.path}/data_converted_{stamp}" # à intégrer
-        self.resu = f"{self.path}/results_{stamp}"
-        self.meta = infos['existing_metadata']
-
-
-class Process:
-    def __init__(self, infos):
-        self.main = infos["main"] # train test annotation
-
-        self.pre_process_to_do = infos['pre-processing']['to_do']
-        self.synt_tool = infos['pre-processing']['syntactic_tool']
-        self.synt_parse = infos['pre-processing']['syntactic_parsing']
-        self.toke = infos['pre-processing']['tokenization']
-        self.ssplit = infos['pre-processing']['sentence_split']
-        self.crea_meta = infos['pre-processing']['create_metadata']['to_do']
-        self.meta_line = infos['pre-processing']['create_metadata']['line']
-        self.meta_sent = infos['pre-processing']['create_metadata']['sent']
-
-
-        #if self.main == "train":
-            #if self.ner_init == True : # à faire en relatif !! split truc
-            #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
-            #    self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
-            #else :
-            #    self.train_data = infos['discourse_segmenter']['training']['train_data_path']
-            #    self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
-        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
-        self.tr_config = infos['discourse_segmenter']['training']['config_file']
-        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
-
-        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
-
-        #self.post_tab = infos['post-processing']['json_to_tab']
-
-        self.eval = infos['evaluation']
-        self.test_data = infos['gold_test_data_path']
-
-
-
-class Output:
-    def __init__(self, infos):
-        self.prod_bracket = infos['file']['tab_to_bracket']
-        self.prod_conll = infos['file']['conllu']
-        self.metadata = infos['file']['metadata']
\ No newline at end of file
diff --git a/code/config_global_1.2.json b/code/config_global_1.2.json
index 0e71493..4b51333 100644
--- a/code/config_global_1.2.json
+++ b/code/config_global_1.2.json
@@ -13,9 +13,9 @@
             "syntactic_tool": "stanza",
             "sentence_split": true,
             "tokenization": true,
-            "syntactic_parsing": true,
+            "syntactic_parsing": false,
             "create_metadata": {
-                "to_do": true,
+                "to_do": false,
                 "line": "paragraph",
                 "sent": "sent"
             }
diff --git a/code/discut22_1.py b/code/discut22_1.py
deleted file mode 100644
index 7a8739b..0000000
--- a/code/discut22_1.py
+++ /dev/null
@@ -1,251 +0,0 @@
-######################################
-###### DISCOURSE SEGMENTOR 2022 ######
-######################################
-""" This the main script
-    And the only one to run,
-    after completion of config.json """
-
-import os
-import sys
-import argparse
-import re
-from datetime import datetime
-import pandas as pd # for futur clean output in df
-import json
-
-from classes_def import Input, Process
-import utils
-#import utils.fr_tokenize as tk
-import utils.conv2ner as c2n
-import utils.json2conll as j2c
-import utils.conll2bracket as c2bracket
-import utils.sent_split as ssent
-import utils.training_allennlp as tr_allen
-
-
-# fonction to get config stuffs
-def get_config_infos(stamp, config_file):
-
-    with open(config_file) as f:
-        infos = json.load(f)
-    data_in = Input(infos['input'], stamp)
-    actions = Process(infos['steps'], data_in)
-    print(f"data to be process : {data_in.name}")
-    return actions
-
-
-# fonction to load existing model -> only tony for now
-def get_model(model_name):
-    name = model_name
-    output = ""
-
-    if name == "tony":
-        arch = "french_tokens.tar.gz"
-        if not os.path.isfile(f"../model/{name}/{arch}"):
-            dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
-            os.system(dl)
-            output = f"../model/{name}/{arch}"
-        else:
-            print("Tony already in place !")
-            output = f"../model/{name}/{arch}"
-
-    else:
-        output = model_name
-
-    return output
-
-
-def text_tokenization(f_in, f_out, lang, tool):
-    if lang == "fr" :
-        if tool == "spacy" :
-            tk.main(f_in, f_out) # .ss -> .tok
-
-
-
-
-def main(steps):
-
-
-
-    # FN: soit besoin sent split, soit besoin tokenizer, soit aucun des deux
-    if steps.ssplit == True : # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
-        #### Split text into sentence : not in usecase1
-        if not steps.ssplitor == "stanza" :
-            print("pls define sentence splitor") # raise error n kill process
f"{steps.data.path}/{steps.data.name}{steps.data.file}" - data_tok = f"{steps.data.path}/{steps.data.name}.tok" - print(f"Starting sentence spliting...to {steps.data.path}/steps.data.name") - ssent.main(data_in, data_tok, steps.ssplitor, steps.data.lang) - elif steps.toke == True : - #### Tokenization du text # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok - data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}" - data_tok = f"{steps.data.path}/{steps.data.name}.tok" - print(f"Starting Tokenization...to {data_tok}") - #tk.main(f_in, f_out) # .ss -> .tok - text_tokenization(data_in, data_tok, steps.data.lang, steps.toke_tool) # .ss -> .tok - else: - data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}" - - - if steps.ner_init == True: - if steps.main == "test" or steps.main =="annotation": - #### Conversion en NER pb # #python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok - data_ner = f"{steps.data.path}/{steps.data.name}.ner.tok" - print(f"Starting conversion to NER format...to {data_ner}") - c2n.main(data_tok, data_ner, steps.data.file) - elif steps.main == "train": - for part in ["train", "dev", "test"]: - data_tok = f"{steps.data.path}/{steps.data.name}_{part}{steps.data.file}" - data_ner = f"{steps.data.path}/{steps.data.name}_{part}.ner{steps.data.file}" - print("Starting conversion to NER format...to {}".format(data_ner)) - c2n.main(data_tok, data_ner, steps.data.file) - - - # Create the results directory - if not os.path.isdir(steps.data.resu): - print(" result directory does not exist yet") - os.mkdir(steps.data.resu) - - - if steps.main == "train": - if steps.toolkit == "allennlp": - print("toolkit allennlp for training") - tr_allen.main(steps) - # set the value of model from null to what was just created by training - steps.model = f"{steps.data.resu}/model.tar.gz" - elif steps.toolkit == "jiant": - print("Jiant toolkit not ready") - else : - print("toolkit unknown") - - #check config train file - elif steps.main == "test" or steps.main =="annotation": - #### Appliquer le model choisi, sortir le JSON avec les predictions :score, proba, tags - # #allennlp predict --use-dataset-reader --output-file ${RESULT_DIR}/${FILE}.json ${MODEL} ${RAW}.ner.tok - print(f"Checking for model...{steps.model}") - model_path = get_model(steps.model) - print(f"model{model_path}") - data_json = f"{steps.data.resu}/{steps.data.name}.json" - print(f"datapred: {data_json}\n") - print(f"input: {data_ner}\n") - cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_ner} &> {steps.data.resu}/logs.txt" - print(f"comd{cmd}") - print("Starting Prediction...") - os.system(cmd) - #### ------------------------------- TBD do the same but with python script (or JIANT ??) - else: - print(" pb define model") - - if steps.post_tab == True : - #### Appliquer les predictions au texte et sortir le texte tokenisé avec la colone des tags-prédis - # # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok - data_conll = f"{steps.data.resu}/{steps.data.name}.split.tok" - format = "split.tok" # to retrive from config file !!! 
- print(f"Starting Formating from json to tok format...to {data_conll}") - j2c.main(data_json, format, data_conll) - - ####### EVALUATION AGAINST GOLD - # python discut/code/utils/seg_eval.py data_gold data_pred (-s) - if steps.eval == True : - if steps.main == "train": - data_gold = steps.test_data # (())== data NER because of ner_init == true((deleted)) - if steps.ner_init == True : - data_gold_ner = f"{steps.data.path}/{steps.data.name}_test.ner.conllu" - - # make predictions on test_data - model_path = steps.model # model just been created - # data_json about to be created by predict cmd - data_json = f"{steps.data.resu}/{steps.data.name}_test.predictions.json" ## à faire en relatif !! [opt : --silent ??] - cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold_ner} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder &> {steps.data.resu}/logs.txt" - #cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold} &> {steps.data.resu} /logs.txt" - print("Starting Prediction...") - print(f"cmd prediction: {cmd}") - os.system(cmd) - - data_conll = f"{steps.data.resu}/{steps.data.name}_test.predictions.conll" ## à faire en relatif - print(f"Starting Formating from json to tok format...to {data_conll}") - j2c.main(data_json, "split.tok", data_conll) - print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}") - data_g = "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu" - data_p = "../data/eng.rst.rstdt/results/eng.rst.rstdt_test.predictions.conll" # == data_conll - cmd = f"python utils/seg_eval.py {data_gold} {data_conll} &> {steps.data.resu}/Evaluation.txt" - os.system(cmd) - - - else : - data_gold = data_tok # changer les noms des var, c'est pas clair ! 
-            data_pred = data_conll #
-            cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt"
-            os.system(cmd)
-
-
-    if steps.post_conll == True:
-        f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok"
-        predictions = open(f_pred, 'r')
-        first_line = predictions.readline()
-        columns = first_line.split("\t")
-        predictions.close()
-
-        f_out = f"{steps.data.resu}/{steps.data.name}_full_output.conllu"
-        with open(f_out, "w") as fo:
-            f_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
-            with open(f_in, "r") as fi:
-                f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok"
-                with open(f_pred, "r") as fp:
-                    df = pd.read_csv(fp, header=None, sep="\t", usecols=[len(columns)-1])
-                    #df = df.dropna()
-                    print(f"longueur={len(df)}")
-                    print(f"line bug: {df.iloc[3047-148:3060-148,:]}\n")
-                    print(f"type {type(df.iloc[4,:])}")
-                    i = 0
-                    for line in fi:
-                        line = line.strip()
-                        if line.startswith("#"):
-                            fo.write(f"{line}\n")
-                        elif line == "":
-                            fo.write(f"{line}\n")
-                            i +=1
-                        else:
-
-                            fo.write(f"{line}")
-
-                            labels = df.iloc[i,:].values.tolist()
-                            for tag in labels:
-                                fo.write(f"\t{tag}")
-
-                            fo.write("\n")
-                            #fo.write(f"{df.iloc[i,:]}\n")
-                            i += 1
-                            #print(f"i::{i}\t")
-
-
-
-
-    if steps.post_bracket == True :
-        ####prendre le texte tokénisé+tags-prédits et sortir le texte en plain (format du d'ebut, for now en suite de phrases) avec les brackets
-        # # #python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket
-        data_bracket = f"{steps.data.resu}/{steps.data.name}.split.tok.bracket"
-        print(f"Starting formating into bracket text...to {data_bracket}")
-        c2bracket.main(data_conll, data_bracket)
-
-
-
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument('--config', help='Config file in JSON')
-    args = parser.parse_args()
-    config = args.config
-
-    now = datetime.now()
-    #stamp = re.sub('[\s:]', '_', str(now))
-    stamp = "_debug1214"
-    my_logs = {}
-    my_logs['stamp'] = stamp
-
-    steps = get_config_infos(stamp, config)
-    print(stamp)
-    main(steps)
-
-    #print("Done.")
\ No newline at end of file
diff --git a/code/discut22_2.py b/code/discut22_2.py
index 0bf7b74..543cd4d 100644
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
@@ -5,25 +5,153 @@
     And the only one to run,
     after completion of config.json
 Discut22 uses allennlp toolkit. For that, it need NER intermediary format.
-
-
-
-
-
- """
+"""
+
 import argparse
 from datetime import datetime
 import os
 import re
 import json
-from classes_def_2 import Data, Process, Output
 import utils_2.syntactic_parsing as synt_pars
 import utils.conv2ner as conv_to_ner # TODO clean it
 import utils.json2conll as json_to_connl # TODO clean it
 import utils.training_allennlp as tr_allen
+import utils.conll2bracket as c2bracket
+
+
+
+class Data:
+    def __init__(self, infos, stamp):
+        self.name = infos['name']
+        self.lang = infos['language']
+        self.path = f"../data/{self.name}"
+        self.exte = infos['exte']
+        self.stamp = stamp
+        self.conv = f"{self.path}/data_converted_{stamp}"
+        self.resu = f"{self.path}/results_{stamp}"
+        self.meta = infos['existing_metadata']
+
+    def create_folders(self): # moved from a module-level function to a Data method
+        for it in [self.conv, self.resu]:
+            if not os.path.isdir(it):
+                os.mkdir(it)
+
+    def pre_processing(self, steps):
+        file_in = f"{self.path}/{self.name}{self.exte}"
+        if steps.pre_process_to_do == True:
+            file_out = f"{self.conv}/{self.name}.conll"
+            if steps.synt_tool == "stanza":
+                processors = []
+                metadata = {}
+                if steps.toke == True:
+                    processors.extend(['tokenize', 'mwt'])
+                if steps.synt_parse == True:
+                    processors.extend(['pos', 'lemma', 'depparse'])
+                #if steps.ssplit == True:
+                #    processors.append('constituency')
+                if steps.crea_meta == True:
+                    metadata['line'] = steps.meta_line
+                    metadata['sent'] = steps.meta_sent
+                if self.meta == True:
+                    metadata['meta'] = True
+                processors_str = ",".join(processors)
+                synt_pars.with_stanza(self.lang, file_in, file_out, processors_str, metadata)
+            else:
+                exit(f"Exited. Not a valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
+        else:
+            file_out = file_in
+        my_logs['data_preprocessed'] = file_out
+        self.preprocessed = file_out
+
+    def make_ner_format(self):
+        """
+        This function builds the NER-style format the segmenter works on.
+        INPUT: Tokenized text with whatever number of columns.
+        OUTPUT: Tokenized text with just 4 columns.
+ """ + self.ner = f"{self.preprocessed}.ner" + conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO faire en relatif#TODO add same for train/dev/test for config train + my_logs['data_ner'] = self.ner + + def make_predictions(self, steps): + self.pred_json = f"{self.resu}/{self.name}_pred.json" + cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {steps.model_path} {self.ner} &> {self.resu}/logs_predictions.txt" + os.system(cmd) + + def pred_json_to_conll_with_metadata(self): + self.pred_meta_conll = f"{self.resu}/{self.name}_pred_n_meta.conll" + json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed) + + def pred_json_to_conll(self): + self.pred_conll = f"{self.resu}/{self.name}_pred.conll" + json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll") + + def brackets_txt(self): + self.brack = f"{self.resu}/{self.name}_brac.txt" + c2bracket.conll2brackets(self.pred_conll, self.brack) + + def brackets_txt_with_metadata(self): + self.brack_meta = f"{self.resu}/{self.name}_brac_meta.txt" + c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta) + + + +class Process: + def __init__(self, infos): + self.main = infos["main"] # train test annotation + + self.pre_process_to_do = infos['pre-processing']['to_do'] + self.synt_tool = infos['pre-processing']['syntactic_tool'] + self.synt_parse = infos['pre-processing']['syntactic_parsing'] + self.toke = infos['pre-processing']['tokenization'] + self.ssplit = infos['pre-processing']['sentence_split'] + self.crea_meta = infos['pre-processing']['create_metadata']['to_do'] + self.meta_line = infos['pre-processing']['create_metadata']['line'] + self.meta_sent = infos['pre-processing']['create_metadata']['sent'] + + + #if self.main == "train": + #if self.ner_init == True : # à faire en relatif !! 
+            #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
+            #    self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
+            #else :
+            #    self.train_data = infos['discourse_segmenter']['training']['train_data_path']
+            #    self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
+        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
+        self.tr_config = infos['discourse_segmenter']['training']['config_file']
+        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
+
+        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
+
+        #self.post_tab = infos['post-processing']['json_to_tab']
+
+        self.eval = infos['evaluation']
+        self.test_data = infos['gold_test_data_path']
+
+
+    def get_model(self):
+        self.model_path = ""
+        if self.model == "tony":
+            arch = "french_tokens.tar.gz"
+            if not os.path.isfile(f"../model/tony/{arch}"):
+                dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
+                os.system(dl)
+                self.model_path = f"../model/tony/{arch}"
+            else:
+                print("Tony already in place !")
+                self.model_path = f"../model/tony/{arch}"
+        else:
+            self.model_path = self.model
+
+class Output:
+    def __init__(self, infos):
+        self.prod_bracket = infos['file']['tab_to_bracket']
+        self.prod_conll = infos['file']['conllu']
+        self.metadata = infos['file']['metadata']
+

 def get_stamp():
     now = datetime.now()
@@ -39,75 +167,11 @@ def get_config_infos(config, stamp):
     my_logs["config"] = infos
     return data, steps, prod

-def create_folders(li): # -> can be rtansfor into method of class
-    for it in li:
-        if not os.path.isdir(it):
-            os.mkdir(it)

 def print_logs():
     file_logs = f"{data.resu}/processes_logs.json"
-    print(my_logs)
-
-def pre_processing(data, steps):
-    data_in = f"{data.path}/{data.name}{data.exte}"
-    if steps.pre_process_to_do == True:
-        data_out = f"{data.conv}/{data.name}.conll"
-        if steps.synt_tool == "stanza":
-            processors = []
-            metadata = {}
-            if steps.toke == True:
-                processors.extend(['tokenize', 'mwt'])
-            if steps.synt_parse == True:
-                processors.extend(['pos', 'lemma', 'depparse'])
-            #if steps.ssplit == True:
-            #    processors.append('constituency')
-            if steps.crea_meta == True:
-                metadata['line'] = steps.meta_line
-                metadata['sent'] = steps.meta_sent
-            if data.meta == True:
-                metadata['meta'] = True
-            processors_str = ",".join(processors)
-            synt_pars.with_stanza(data.lang, data_in, data_out, processors_str, metadata)
-        else:
-            exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
-    else:
-        data_out = data_in
-    my_logs['data_preprocessed'] = data_out
-    return data_out
-
-def data_to_ner_format(data_in):
-    """
-    This fonction build the NER format upon the Segmentor works.
-    INPUT: Tokenized text with whatever number of columns.
-    OUTPUT: Tokenized text with just 4 columns.
- """ - data_ner = f"{data_in}.ner" - conv_to_ner.main(data_in, data_ner, "conll") # <-- TODO faire en relatif - - #TODO add same for train/dev/test for config train - - my_logs['data_ner'] = data_ner - return data_ner - -def make_predictions(data_in, model_path): - model = model_path # add def get_model from v1 - data_out = f"{data.resu}/{data.name}_pred.json" - #cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {steps.data.resu}/logs.txt" - cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {data.resu}/logs_predictions.txt" - os.system(cmd) - return data_out - -def pred_json_to_toke(data_in): - data_out = f"{data.resu}/{data.name}_pred.conll" - json_to_connl.js2conll(data_in, data_out, "conll") # <-- TODO faire en relatif - return data_out - -def pred_json_to_conll_with_metadata(data_pred_json, data_meta): - data_out = f"{data.resu}/{data.name}_pred_n_meta.conll" - json_to_connl.js2conllNmeta(data_pred_json, data_out, "conll", data_meta) # <-- TODO faire en relatif - return data_out - - + print(my_logs) # <-- ahaha TBD + @@ -124,20 +188,28 @@ if __name__ == '__main__': my_logs["stamp"] = stamp data, steps, prod = get_config_infos(config, stamp) - create_folders([data.conv, data.resu]) - - data_preprocessed = pre_processing(data, steps) + data.create_folders() + data.pre_processing(steps) + data.make_ner_format() #TEST data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll" - data_ner = data_to_ner_format(data_preprocessed) + steps.get_model() if steps.main == "annotation" or steps.main == "test": - data_pred_json = make_predictions(data_ner, steps.model) + data.make_predictions(steps) #data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json" + + data.pred_json_to_conll() if prod.metadata == True: - data_pred_and_meta_conll = pred_json_to_conll_with_metadata(data_pred_json, data_preprocessed) - else: - data_pred_toke = pred_json_to_toke(data_pred_json) + data.pred_json_to_conll_with_metadata() + + if prod.prod_bracket == True: + if data.exte != ".txt": + exit("pb") + else: + data.brackets_txt() + data.brackets_txt_with_metadata() + #elif steps.main == "train": diff --git a/code/utils/conll2bracket.py b/code/utils/conll2bracket.py index f1dac8b..ec597fd 100644 --- a/code/utils/conll2bracket.py +++ b/code/utils/conll2bracket.py @@ -50,8 +50,26 @@ def conll2brackets(in_f, out_f): start = False file_out.write("]\n\n") -def main(f_in, f_out): - input = f_in - output = f_out - - conll2brackets(input, output) \ No newline at end of file +def conll2brackets_with_meta(in_f, out_f): + start = True + input = in_f + + with open(out_f, 'w') as file_out: + with open(in_f, 'r') as input: + for line in input: + if line.startswith("#"): + file_out.write(f"{line}\n") + elif line.strip()=="": + file_out.write("]") + file_out.write("\n\n") + start = True + else: + n, word, *junk, tag = line.split() + if tag=="BeginSeg=Yes": + if not(start): + file_out.write("] ") + file_out.write(f"[ {word} ") + else: + file_out.write(f"{word} ") + start = False + file_out.write("]\n\n") \ No newline at end of file diff --git a/code/utils/json2conll.py b/code/utils/json2conll.py index e102b2e..73e1646 100644 --- a/code/utils/json2conll.py +++ b/code/utils/json2conll.py @@ -45,7 +45,7 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta): with open(data_out, 'w', encoding='utf-8') as fo, 
-
+    # id
     for line in fm:
         line = line.strip()
         if line.startswith("#"):
@@ -60,12 +60,11 @@
                     tag = data[sent_pred_count]['tags'][tok]
                     tok += 1
                     #print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
-
-                    if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
-                        fo.write(f"{line}\t{tag}\n")
+                    fo.write(f"{line}\t{map[tag]}\n")
+                    #if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
+                    #    fo.write(f"{line}\t{tag}\n")

-                    else:
-                        exit("pb js2conllNmeta")
+
                     #print(f"sentpred : {sent_pred}\n")
                     #print(f"word n tag : {word}:::{tag}\n")
diff --git a/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc b/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc
index 606e61ba6140f958cfde6090fe9566581f319f63..14dde82debc911172ec9de77498bcdab8b0d87a4 100644
GIT binary patch
delta 163
zcmaFL{g#{0iI<m)fq{WxssEN_NA`()$&8aG7U`(Q)v}c^*D$59rLd&1_A*ak?02eV
zPhqd+n7~+JQp4uLFo7{vYvSQJo)q31mIZ7J8M4`mZcNr>)Kr{O!;{6~!Vue4%Ui>h
z!eP!(%ay_pQ!|0FNNaLFV=}){I71El0`?T{g^Y|0g)cTgWSqjx$HK!X#>mCQ!N|kN
L!NfPYnKcOjKHMr=

delta 155
zcmaFM{gj)}iI<m)fq{YH@a750&)FvOB{OzSEYeX8t7R);u3<`HOJPZ2?PZ?8*l$(K
zp2A+sF@do{r-sdiVFF{U(!|4Y95pN{ypjw>S0?K+YRXQi;mP7~VTf(3<*ngL;V@^Y
z<*H#x;g@8Xz*wX-xtuYXUn`uUhJ68h3im=rMux&Cn_n_cVdi7zU=(BIV&Y)rVdP-s
Jn%vEr1OO0xC*c49

diff --git a/code/utils_2/syntactic_parsing.py b/code/utils_2/syntactic_parsing.py
index 0d85fd4..8515d7d 100644
--- a/code/utils_2/syntactic_parsing.py
+++ b/code/utils_2/syntactic_parsing.py
@@ -42,14 +42,16 @@ def with_stanza(lang, f_in, f_out, process, meta):
                 fo.write("\n")

             else:
-                if meta['line']:
+                #if meta['line']:
+                if "line" in meta.keys():
                     txt = f"#{meta['line']}-{count_line}\n"
                     fo.write(txt)
                 doc = nlp(line)
                 for sent in doc.sentences:
                     count_sent += 1
-                    if meta['sent']:
+                    #if meta['sent']:
+                    if "sent" in meta.keys():
                         txt = f"#{meta['sent']}-{count_sent}\n#text=\"{sent.text}\"\n"
                         fo.write(txt)
--
GitLab
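
Review note (not part of the patch): a minimal sketch of how the refactored pipeline is expected to be driven after this change, run from the code/ directory, mirroring the rewritten __main__ block of discut22_2.py. The top-level config keys 'input', 'steps' and 'output', and the "demo_run" stamp, are illustrative assumptions — get_config_infos() in discut22_2.py remains the authoritative config-to-class mapping — so adapt them to the actual layout of config_global_1.2.json.

# Sketch only, assuming the config keys named above; requires the repo's
# stanza/allennlp environment, just like the script itself.
import json

import discut22_2 as d22

d22.my_logs = {}  # the Data methods log into this module-level dict when run as a script

with open("config_global_1.2.json") as f:
    infos = json.load(f)

data = d22.Data(infos["input"], "demo_run")   # assumed key: 'input'
steps = d22.Process(infos["steps"])           # assumed key: 'steps'
prod = d22.Output(infos["output"])            # assumed key: 'output'

data.create_folders()
data.pre_processing(steps)   # runs stanza only if pre-processing "to_do" is true
data.make_ner_format()       # 4-column NER-style file the segmenter expects
steps.get_model()            # downloads "tony" on first use, else keeps the configured path

if steps.main in ("annotation", "test"):
    data.make_predictions(steps)          # shells out to `allennlp predict`
    data.pred_json_to_conll()
    if prod.metadata:
        data.pred_json_to_conll_with_metadata()
    if prod.prod_bracket:
        data.brackets_txt()
        if prod.metadata:
            data.brackets_txt_with_metadata()

The script itself is still meant to be invoked with a config file, as in v1 (something like `python discut22_2.py --config config_global_1.2.json`); the sketch only spells out the new Data/Process/Output call chain for review.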