diff --git a/code/config_global_1.2.json b/code/config_global_1.2.json
index 4b5133336367dcfe435400b7b67faacb913bc81d..8cfbf46448bcb18611089df54b86b308fc63b7d9 100644
--- a/code/config_global_1.2.json
+++ b/code/config_global_1.2.json
@@ -30,16 +30,18 @@
                 "validation_data_path": null
             }
         },
-        "evaluation": false,
         "gold_test_data_path": null
     },
     "output":{
-        "file":{
-            "tab_to_bracket": true,
-            "conllu":true,
-            "metadata": true
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
         },
-        "scores":false
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
     }
 }
 
diff --git a/code/config_global_1.21.json b/code/config_global_1.21.json
index 1e8c4a92ea30cea09445fafac64523cf0939d0fa..21351a824b3bbaf6558f3f1b307911bd3af3c570 100644
--- a/code/config_global_1.21.json
+++ b/code/config_global_1.21.json
@@ -30,16 +30,18 @@
                 "validation_data_path": null
             }
         },
-        "evaluation": false,
         "gold_test_data_path": null
     },
     "output":{
-        "file":{
-            "conllu":true,
+        "conll_file":{
+            "to_do": true,
             "metadata": true,
-            "tab_to_bracket": false
+            "with_gold_labels": true
         },
-        "scores":false
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
     }
 }
 
diff --git a/code/config_global_2.2.json b/code/config_global_2.2.json
new file mode 100644
index 0000000000000000000000000000000000000000..afc2e1f77059016fd1ee08f4b7ee825eeb3883a6
--- /dev/null
+++ b/code/config_global_2.2.json
@@ -0,0 +1,49 @@
+{
+    "usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
+    "data_raw": {
+        "name": "fra.sdrt.annodis_dev",
+        "exte": ".conllu",
+        "language": "fr",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "test",
+        "pre-processing": {
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": false,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "tony",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "gold_test_data_path": null
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
+    }
+}
+
+
+
diff --git a/code/discut22_2.py b/code/discut22_2.py
index 543cd4d7f38beb0fddd0054490da25234f8f7492..a10bc31571a037f5e5ce46ece487ad6ec0be35b5 100644
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
@@ -12,11 +12,12 @@ from datetime import datetime
 import os
 import re
 import json
-import utils_2.syntactic_parsing as synt_pars
+import utils.syntactic_parsing as synt_pars
 import utils.conv2ner as conv_to_ner # TODO clean it
 import utils.json2conll as json_to_connl # TODO clean it
 import utils.training_allennlp as tr_allen
 import utils.conll2bracket as c2bracket
+import utils.seg_eval as seg_eval
 
 
 
@@ -26,6 +27,7 @@ class Data:
         self.lang = infos['language']
         self.path = f"../data/{self.name}"
         self.exte = infos['exte']
+        self.raw = f"{self.path}/{self.name}{self.exte}"
         self.stamp = stamp
         self.conv = f"{self.path}/data_converted_{stamp}"
         self.resu = f"{self.path}/results_{stamp}"
@@ -33,11 +35,14 @@ class Data:
 
     def create_folders(self): # -> can be rtansfor into method of class
         for it in [self.conv, self.resu]:
+            print(f"----> Checking/creating folder {it}.")
             if not os.path.isdir(it):
                 os.mkdir(it)
+        my_logs['folders'] = f"{self.conv}, {self.resu}"
 
     def pre_processing(self, steps):
-        file_in = f"{self.path}/{self.name}{self.exte}"
+        print("----> Preprocessing input data.")
+        file_in = self.raw
         if steps.pre_process_to_do == True:
             file_out = f"{self.conv}/{self.name}.conll"
             if steps.synt_tool == "stanza":
@@ -70,21 +75,38 @@ class Data:
         OUTPUT: Tokenized text with just 4 columns.
         """
         self.ner = f"{self.preprocessed}.ner"
+        self.ner = f"{self.conv}/{self.name}.conll.ner"
+        print(f"----> Making NER format {self.ner}.")
         conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO faire en relatif#TODO add same for train/dev/test for config train
         my_logs['data_ner'] = self.ner
 
     def make_predictions(self, steps):
         self.pred_json = f"{self.resu}/{self.name}_pred.json"
         cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {steps.model_path} {self.ner} &> {self.resu}/logs_predictions.txt"
+        print(f"----> Making predictions: {cmd}.")
         os.system(cmd)
+        my_logs['predictions_cmd'] = cmd
 
-    def pred_json_to_conll_with_metadata(self):
-        self.pred_meta_conll = f"{self.resu}/{self.name}_pred_n_meta.conll"
+
+    def pred_json_to_conll_w_metadata_w_gold(self): # here and 3 below..sorry..factorsation TBD
+        self.pred_conll_meta_gold = f"{self.resu}/{self.name}_pred_meta_gold.conll"
+        json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed)
+        return self.pred_conll_meta_gold
+
+    def pred_json_to_conll_w_metadata(self):
+        self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
         json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed)
+        return self.pred_meta_conll
+
+    def pred_json_to_conll_w_gold(self):
+        self.pred_conll_gold = f"{self.resu}/{self.name}_pred_gold.conll"
+        json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll")
+        return self.pred_conll_gold
 
     def pred_json_to_conll(self):
         self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
         json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll")
+        return self.pred_conll
 
     def brackets_txt(self):
         self.brack = f"{self.resu}/{self.name}_brac.txt"
@@ -95,6 +117,44 @@ class Data:
             c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)
 
 
+    def evaluation(self, prod):
+        self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"
+
+        if self.exte == ".conll" or self.exte == ".conllu": # get gold file
+            gold = self.raw
+        else:
+            gold = self.preprocessed
+
+        if prod.conll_todo == False: # get pred_file
+            pred = self.pred_json_to_conll()
+        else:
+            if prod.conll_meta == True:
+                if prod.conll_w_gold == True:
+                    pred = self.pred_json_to_conll_w_metadata_w_gold()
+                else:
+                    pred = self.pred_json_to_conll_w_metadata()
+            else:
+                if prod.conll_w_gold == True:
+                    pred = self.pred_json_to_conll_w_gold()
+                else:
+                    pred = self.pred_json_to_conll()
+
+        print(f"----> Predictions to file {pred}")
+        print(f"----> Evaluation scores to file {self.basic_metrics}")
+        scores_dict = seg_eval.get_scores(gold, pred)
+        with open(self.basic_metrics, 'w') as fo:
+            json.dump(scores_dict, fo)
+
+
+class Output:
+    def __init__(self, infos):
+        self.conll_todo = infos['conll_file']['to_do']
+        self.conll_meta = infos['conll_file']['metadata']
+        self.conll_w_gold = infos['conll_file']['with_gold_labels']
+        self.txt_todo = infos['txt_file']['to_do']
+        self.txt_meta = infos['txt_file']['metadata']
+
+
 class Process:
     def __init__(self, infos):
@@ -109,7 +169,6 @@ class Process:
             self.meta_line = infos['pre-processing']['create_metadata']['line']
             self.meta_sent = infos['pre-processing']['create_metadata']['sent']
 
-        #if self.main == "train":
             #if self.ner_init == True : # à faire en relatif !! split truc
             #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
 
@@ -120,14 +179,14 @@ class Process:
             self.toolkit = infos['discourse_segmenter']['training']['toolkit']
             self.tr_config = infos['discourse_segmenter']['training']['config_file']
             self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
-
         self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
-        #self.post_tab = infos['post-processing']['json_to_tab']
-
-        self.eval = infos['evaluation']
         self.test_data = infos['gold_test_data_path']
 
+    def get_evaluation_status(self):
+        if self.main == "test":
+            self.eval = True
+        #elif self.main == "train":
 
     def get_model(self):
         self.model_path = ""
@@ -138,21 +197,12 @@ class Process:
                 os.system(dl)
                 self.model_path = f"../model/tony/{arch}"
             else:
-                print("Tony already in place !")
+                print("----> Tony already in place !")
                 self.model_path = f"../model/tony/{arch}"
         else:
             self.model_path = self.model
 
 
-
-
-class Output:
-    def __init__(self, infos):
-        self.prod_bracket = infos['file']['tab_to_bracket']
-        self.prod_conll = infos['file']['conllu']
-        self.metadata = infos['file']['metadata']
-
-
 def get_stamp():
     now = datetime.now()
     stamp = re.sub('[\s:]', '_', str(now))
@@ -168,14 +218,14 @@ def get_config_infos(config, stamp):
     return data, steps, prod
 
 
-def print_logs():
+def print_logs(dict_logs):
     file_logs = f"{data.resu}/processes_logs.json"
-    print(my_logs) # <-- ahaha TBD
+    with open(file_logs, 'w') as fi:
+        json.dump(dict_logs, fi, indent=4)
 
 
 
-
 if __name__ == '__main__':
     my_logs = {}
     stamp = get_stamp()
@@ -191,32 +241,15 @@ if __name__ == '__main__':
     data.create_folders()
     data.pre_processing(steps)
     data.make_ner_format()
-    #TEST data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
     steps.get_model()
 
     if steps.main == "annotation" or steps.main == "test":
-        data.make_predictions(steps)
-        #data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
-
-        data.pred_json_to_conll()
-
-        if prod.metadata == True:
-            data.pred_json_to_conll_with_metadata()
-
-        if prod.prod_bracket == True:
-            if data.exte != ".txt":
-                exit("pb")
-            else:
-                data.brackets_txt()
-                data.brackets_txt_with_metadata()
-
-
+        data.make_predictions(steps) # output allennlp JSON
 
     #elif steps.main == "train":
 
+    steps.get_evaluation_status()
+    if steps.eval == True:
+        data.evaluation(prod)
-
-
-
-    #scores = compare_pred_gold()
-
-    #print_logs()
\ No newline at end of file
+
+    print_logs(my_logs) # <-- attention variable globale !
\ No newline at end of file
diff --git a/code/utils/conv2ner.py b/code/utils/conv2ner.py
index 71216dfba9ba10c25eeaf65fcbfcb414bb2471f5..4e6edbe72cf1c10f45b9206262887747e78efdb7 100644
--- a/code/utils/conv2ner.py
+++ b/code/utils/conv2ner.py
@@ -110,8 +110,8 @@ def conversion2ner(input, output, params=None):
             # then, previous token label is set to B-E to signal end of previous segment
             res[-1][-1] = "B-E"
             start_doc = False
-        if label not in maptags:
-            print("warning, strange label ",label,file=sys.stderr)
+        #if label not in maptags:
+            #print("warning, strange label ",label,file=sys.stderr)
         res.append([w,pos,"O",tag])
 
     for line in res:
diff --git a/code/utils/json2conll.py b/code/utils/json2conll.py
index 73e16469176a74801d98641684c3bfd8671b417f..37c5a0fb1894348d7ebf1ed22332b75961172e91 100644
--- a/code/utils/json2conll.py
+++ b/code/utils/json2conll.py
@@ -5,6 +5,7 @@ conll format
 
 import json
 import sys
+import re
 
 #filepath = sys.argv[1]
 #config = sys.argv[2]
@@ -60,7 +61,8 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
                 tag = data[sent_pred_count]['tags'][tok]
                 tok += 1
                 #print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
-                fo.write(f"{line}\t{map[tag]}\n")
+                new_line = re.sub('\t[^\t]+$', '', line)
+                fo.write(f"{new_line}\t{map[tag]}\n")
 
             #if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
             #    fo.write(f"{line}\t{tag}\n")
@@ -68,4 +70,29 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
         #print(f"sentpred : {sent_pred}\n")
         #print(f"word n tag : {word}:::{tag}\n")
 
-    
\ No newline at end of file
+
+def js2conllNmetaNgold(data_pred_json, data_out, config, gold_n_meta):
+    data = []
+    sent_pred_count = 0
+    tok = 0
+    for line in open(data_pred_json, 'r'):
+        data.append(json.loads(line))
+
+    with open(data_out, 'w', encoding='utf-8') as fo, open(gold_n_meta, 'r') as fm:
+
+        # id
+        for line in fm:
+            line = line.strip()
+            if line.startswith("#"):
+                fo.write(f"{line}\n")
+            elif line == "":
+                sent_pred_count += 1
+                tok = 0
+                fo.write(f"{line}\n")
+            else:
+                sent_pred = data[sent_pred_count]
+                word = data[sent_pred_count]['words'][tok]
+                tag = data[sent_pred_count]['tags'][tok]
+                tok += 1
+                #print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
+                fo.write(f"{line}\t{map[tag]}\n")
\ No newline at end of file
diff --git a/code/utils/seg_eval.py b/code/utils/seg_eval.py
index 1808782d74c9c5c9e8961409fa558af9a07bce1f..d61d2efbcae5898e586d3ecfdfb2e3126a3d9ecf 100644
--- a/code/utils/seg_eval.py
+++ b/code/utils/seg_eval.py
@@ -68,9 +68,9 @@ Arguments:
 """
 
-__author__ = "Amir Zeldes"
+#__author__ = "Amir Zeldes"
 __license__ = "Apache 2.0"
-__version__ = "1.0.1"
+#__version__ = "1.0.1"
 
 def parse_data(infile, string_input=False):
     if not string_input:
@@ -222,22 +222,3 @@ def get_scores(gold_file, pred_file, string_input=False):
 
     return score_dict
 
-
-if __name__ == "__main__":
-    p = argparse.ArgumentParser()
-
-    p.add_argument("goldfile",help="Shared task gold file in .tok or .conll format")
-    p.add_argument("predfile",help="Corresponding file with system predictions")
-    p.add_argument("-s","--string_input",action="store_true",help="Whether inputs are file names or strings")
-
-    opts = p.parse_args()
-
-    score_dict = get_scores(opts.goldfile,opts.predfile,opts.string_input)
-
-    print("File: " + score_dict["doc_name"])
-    print("o Total tokens: " + str(score_dict["tok_count"]))
-    print("o Gold " +score_dict["seg_type"]+": " + str(score_dict["gold_seg_count"]))
-    print("o Predicted "+score_dict["seg_type"]+": " + str(score_dict["pred_seg_count"]))
-    print("o Precision: " + str(score_dict["prec"]))
-    print("o Recall: " + str(score_dict["rec"]))
-    print("o F-Score: " + str(score_dict["f_score"]))
diff --git a/code/utils_2/syntactic_parsing.py b/code/utils/syntactic_parsing.py
similarity index 100%
rename from code/utils_2/syntactic_parsing.py
rename to code/utils/syntactic_parsing.py
diff --git a/code/utils_2/__init__.py b/code/utils_2/__init__.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/code/utils_2/__pycache__/__init__.cpython-37.pyc b/code/utils_2/__pycache__/__init__.cpython-37.pyc
deleted file mode 100644
index cefb68a0f66c85292395518615b5064b13104453..0000000000000000000000000000000000000000
Binary files a/code/utils_2/__pycache__/__init__.cpython-37.pyc and /dev/null differ
diff --git a/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc b/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc
deleted file mode 100644
index 14dde82debc911172ec9de77498bcdab8b0d87a4..0000000000000000000000000000000000000000
Binary files a/code/utils_2/__pycache__/syntactic_parsing.cpython-37.pyc and /dev/null differ