diff --git a/code/classes_def.py b/code/classes_def.py index baf173327ca66311592349889ab1658895466d0c..b8ca4bd850923d5605379814e55dca28f360c789 100644 --- a/code/classes_def.py +++ b/code/classes_def.py @@ -44,4 +44,5 @@ class Process: self.test_data = infos['gold_test_data_path'] self.post_bracket = infos['post-processing']['tab_to_bracket'] + self.post_conll = infos['post-processing']['metadata_conll'] \ No newline at end of file diff --git a/code/config_global_2.json b/code/config_global_2.json index e46805bab828ba094025a51396c0448c1f6c3064..c720ae7208ed46c27e96c4ee61c9a6097573dcd2 100644 --- a/code/config_global_2.json +++ b/code/config_global_2.json @@ -2,15 +2,15 @@ "usecase_description": "Config file for usecase_2", "input": { "name": "fra.sdrt.annodis_dev", - "file": ".ttok", + "file": ".conllu", "language": "fr" }, "steps":{ - "main": "annotation", + "main": "test", "pre-processing": { "tokenization": false, "tokenization_tool" : "spacy", - "sentence_split": true, + "sentence_split": false, "sentence_split_splitor": "stanza", "syntactic_parsing": false, "NER_format_initialisation": true @@ -27,9 +27,10 @@ }, "post-processing": { "json_to_tab": true, + "metadata_conll": true, "tab_to_bracket":true }, - "evaluation": false, + "evaluation": true, "gold_test_data_path": null } } diff --git a/code/discut22_1.py b/code/discut22_1.py index 60c543c91db77ee70a92113a220e9f8a9fc0c670..a65dddd19b03a3326668cf8e20c9ecc3fd9ec551 100644 --- a/code/discut22_1.py +++ b/code/discut22_1.py @@ -47,6 +47,8 @@ def get_model(model_name): output = f"../model/{name}/{arch}" else: print("Tony already in place !") + output = f"../model/{name}/{arch}" + else: output = model_name @@ -63,10 +65,6 @@ def text_tokenization(f_in, f_out, lang, tool): def main(steps): - #steps = get_config_infos(config) # on obtient la liste des trucs - # à faire, donnée par la classe Process - #print([x for x in enumerate(steps)]) - #suivant la liste ordonnée, faire les trucs (for now simple usecase1): # FN: soit besoin sent split, soit besoin tokenizer, soit aucun des deux @@ -126,8 +124,12 @@ def main(steps): # #allennlp predict --use-dataset-reader --output-file ${RESULT_DIR}/${FILE}.json ${MODEL} ${RAW}.ner.tok print(f"Checking for model...{steps.model}") model_path = get_model(steps.model) + print(f"model{model_path}") data_json = f"{steps.data.resu}/{steps.data.name}.json" + print(f"datapred: {data_json}\n") + print(f"input: {data_ner}\n") cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_ner} &> {steps.data.resu}/logs.txt" + print(f"comd{cmd}") print("Starting Prediction...") os.system(cmd) #### ------------------------------- TBD do the same but with python script (or JIANT ??) @@ -163,8 +165,6 @@ def main(steps): data_conll = f"{steps.data.resu}/{steps.data.name}_test.predictions.conll" ## à faire en relatif print(f"Starting Formating from json to tok format...to {data_conll}") j2c.main(data_json, "split.tok", data_conll) - #data_pred_ner = f"{steps.data.resu}/eng.rst.rstdt_test.predictions.conll.ner" - #c2n.main(data_conll, data_pred_ner, steps.data.file) print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}") data_g = "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu" data_p = "../data/eng.rst.rstdt/results/eng.rst.rstdt_test.predictions.conll" # == data_conll @@ -179,6 +179,46 @@ def main(steps): os.system(cmd) + if steps.post_conll == True: + f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok" + predictions = open(f_pred, 'r') + first_line = predictions.readline() + columns = first_line.split("\t") + predictions.close() + + f_out = f"{steps.data.resu}/{steps.data.name}_full_output.conllu" + with open(f_out, "w") as fo: + f_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}" + with open(f_in, "r") as fi: + f_pred = f"{steps.data.resu}/{steps.data.name}.split.tok" + with open(f_pred, "r") as fp: + df = pd.read_csv(fp, header=None, sep="\t", usecols=[len(columns)-1]) + #df = df.dropna() + print(f"longueur={len(df)}") + print(f"line bug: {df.iloc[3047-148:3060-148,:]}\n") + print(f"type {type(df.iloc[4,:])}") + i = 0 + for line in fi: + line = line.strip() + if line.startswith("#"): + fo.write(f"{line}\n") + elif line == "": + fo.write(f"{line}\n") + i +=1 + else: + + fo.write(f"{line}") + + labels = df.iloc[i,:].values.tolist() + for tag in labels: + fo.write(f"\t{tag}") + + fo.write("\n") + #fo.write(f"{df.iloc[i,:]}\n") + i += 1 + #print(f"i::{i}\t") + + if steps.post_bracket == True : @@ -199,7 +239,8 @@ if __name__ == '__main__': config = args.config now = datetime.now() - stamp = re.sub('[\s:]', '_', str(now)) + #stamp = re.sub('[\s:]', '_', str(now)) + stamp = "debug1205" my_logs = {} my_logs['stamp'] = stamp @@ -207,4 +248,4 @@ if __name__ == '__main__': print(stamp) main(steps) - print("Done.") \ No newline at end of file + #print("Done.") \ No newline at end of file