diff --git a/code/config_global_1.2.json b/code/config_global_1.2.json
index 8cfbf46448bcb18611089df54b86b308fc63b7d9..a6c9f8ccf5aba587ac066181ccaee169ccc38717 100644
--- a/code/config_global_1.2.json
+++ b/code/config_global_1.2.json
@@ -15,7 +15,7 @@
             "tokenization": true,
             "syntactic_parsing": false,
             "create_metadata": {
-                "to_do": false,
+                "to_do": true,
                 "line": "paragraph",
                 "sent": "sent"
             }
@@ -36,10 +36,10 @@
         "conll_file":{
             "to_do": true,
             "metadata": true,
-            "with_gold_labels": true
+            "with_gold_labels": false
         },
         "txt_file":{
-            "to_do": true,
+            "to_do": false,
             "metadata": true
         }
     }
diff --git a/code/config_global_2.2.json b/code/config_global_2.2.json
index afc2e1f77059016fd1ee08f4b7ee825eeb3883a6..395e75891f2c6a204c2e194eab4c9ffb08ec3572 100644
--- a/code/config_global_2.2.json
+++ b/code/config_global_2.2.json
@@ -1,9 +1,9 @@
 {
     "usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
     "data_raw": {
-        "name": "fra.sdrt.annodis_dev",
+        "name": "eng.pdtb.pdtb_dev",
         "exte": ".conllu",
-        "language": "fr",
+        "language": "en",
         "existing_metadata": true
     },
     "steps":{
@@ -15,13 +15,13 @@
             "tokenization": true,
             "syntactic_parsing": true,
             "create_metadata": {
-                "to_do": false,
+                "to_do": true,
                 "line": "paragraph",
                 "sent": "sent"
             }
         },
         "discourse_segmenter": {
-            "model": "tony",
+            "model": "/home/lriviere/andiamo/morteza/discut/Results_conllu/results_eng.pdtb.pdtb_bert/model.tar.gz",
             "training": {
                 "toolkit": null,
                 "pre_trained_lm": null,
diff --git a/code/config_global_3.json b/code/config_global_3.json
index 91737b9dac15c5e8afd51dceac3827d8103d904d..ad453a45e30169b6ce0d95ca9e63fcd0986c8a4c 100644
--- a/code/config_global_3.json
+++ b/code/config_global_3.json
@@ -30,7 +30,7 @@
                 "validation_data_path": "eng.rst.rstdt_dev"
             }
         },
-        "gold_test_data_path": "eng.rst.rstdt_test"
+        "gold_test_data_path": "eng.rst.rstdt_dev"
     },
     "output":{
         "conll_file":{
@@ -39,7 +39,7 @@
             "with_gold_labels": true
         },
         "txt_file":{
-            "to_do": true,
+            "to_do": false,
             "metadata": true
         }
     }
diff --git a/code/config_global_4.json b/code/config_global_4.json
index 63b0c330f66c2b49e98e470a60fa44e2c59e79ce..9e1a95cd1e65a47fc99e3de9fa454d2a70393386 100644
--- a/code/config_global_4.json
+++ b/code/config_global_4.json
@@ -30,7 +30,7 @@
                 "validation_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
             }
         },
-        "gold_test_data_path": "eng.rst.rstdt_test"
+        "gold_test_data_path": "eng.rst.rstdt_dev"
     },
     "output":{
         "conll_file":{
@@ -39,7 +39,7 @@
             "with_gold_labels": true
         },
         "txt_file":{
-            "to_do": true,
+            "to_do": false,
             "metadata": true
         }
     }
diff --git a/code/discut22_2.py b/code/discut22_2.py
index 102555832ffb12d6b59e68e0fc135a734440b42c..abdc9e58cbb7b5d5d08265553f0bf14eac3539cd 100644
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
@@ -23,27 +23,51 @@ import utils.seg_eval as seg_eval


 class Data:
-    def __init__(self, infos, stamp):
+    def __init__(self, infos, stamp, stamp_time, overwrite):
         self.name = infos['name']
         self.lang = infos['language']
         self.path = f"../data/{self.name}"
         self.exte = infos['exte']
         self.raw = f"{self.path}/{self.name}{self.exte}"
         self.stamp = stamp
-        self.conv = f"{self.path}/data_converted_{stamp}"
-        self.resu = f"{self.path}/results_{stamp}"
+        self.stamp_time = stamp_time
+        self.proj = "../projects"
+        self.run = f"{self.proj}/{stamp}"
+        self.conv = f"{self.run}/data_converted"
+        self.resu = f"{self.run}/results"
+        self.over = overwrite
         self.meta = infos['existing_metadata']
+

-    def create_folders(self, ft=None): # -> can be rtansfor into method of class
-        folders_list=[self.conv, self.resu]
-        #folders_list=[self.conv]
-        if ft != None:
-            self.fine = f"{self.resu}/fine_tune_{ft}"
-            #folders_list.append(self.fine) # made automatically by allennlp
-        for it in folders_list:
-            print(f"----> Checking/creating folder {it}.")
-            if not os.path.isdir(it):
-                os.mkdir(it)
+    def create_folders(self):
+        print(f"----> Checking/creating folders.")
+        if not os.path.isdir(self.proj):
+            os.mkdir(self.proj)
+        if not os.path.isdir(self.run):
+            os.mkdir(self.run)
+
+        if not os.path.isdir(self.conv):
+            os.mkdir(self.conv)
+        elif self.over == False:
+            self.conv = f"{self.conv}_{stamp_time}"
+            os.mkdir(self.conv)
+
+        if not os.path.isdir(self.resu):
+            os.mkdir(self.resu)
+        elif self.over == False:
+            self.resu = f"{self.resu}_{stamp_time}"
+            os.mkdir(self.resu)
+
+        self.resu_fine = f"{self.resu}/fine_tune"
+        if os.path.isdir(self.resu_fine) and self.over == False:
+            self.resu_fine = f"{self.resu_fine}_{stamp_time}"
+        elif os.path.isdir(self.resu_fine) and self.over == True:
+            os.rmdir(self.resu_fine)
+
+        self.resu_train = f"{self.resu}/train"
+        if os.path.isdir(self.resu_train) and self.over == False:
+            self.resu_train = f"{self.resu_train}_{stamp_time}"
+

     def pre_processing(self, steps, file_in=None):
         file_in = self.raw if file_in == None else file_in
@@ -178,6 +202,30 @@ class Data:
         print(f"----> Predictions to file {pred_txt}.")
         logs.add_infos('output_txt_file', pred_txt)


+    def make_output(self, prod):
+        if prod.conll_todo == True:
+            if prod.conll_meta == True:
+                pred = self.pred_json_to_conll_w_metadata()
+            else:
+                pred = self.pred_json_to_conll()
+            print(f"----> Predictions to file {pred}.")
+            logs.add_infos('output_conll_file', pred)
+        if prod.txt_todo == True:
+            if prod.txt_meta == True:
+                pred = self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
+                if not os.path.isfile(pred):
+                    self.pred_json_to_conll_w_metadata()
+                pred_txt = self.brackets_txt_with_metadata()
+                # os.system(f"rm {pred})
+            else:
+                pred = self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
+                if not os.path.isfile(pred):
+                    self.pred_json_to_conll()
+                pred_txt = self.brackets_txt()
+                # os.system(f"rm {pred})
+            print(f"----> Predictions to file {pred_txt}.")
+            logs.add_infos('output_txt_file', pred_txt)
+

 class Output:
     def __init__(self, infos):
@@ -216,6 +264,8 @@ class Process:
     def get_evaluation_status(self):
         if self.main == "test" or self.main == "train" or self.main == "fine_tune":
             self.eval = True
+        else:
+            self.eval = False # "annotation"

     def get_model(self):
         self.model_path = ""
@@ -242,12 +292,11 @@
         spec: testset is the same that data_raw_name / trainset & devset are elsewhere
         and config fill with path not just name
         """
-        self.ft_stamp = re.sub('^.*/', '', self.set_train)
         self.train_raw = self.set_train
         self.dev_raw = self.set_dev
         self.test_raw = f"{data.path}/{self.set_test}{data.exte}"
         # reset names to go ez pz for ner formatage
-        self.set_train = re.sub('\.[^\.]+$', '', self.ft_stamp)
+        self.set_train = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.set_train))
         self.set_dev = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.dev_raw))

     def make_sets_ner_format(self, data): #[steps.set_train, steps.set_dev, steps.set_test]
@@ -262,7 +311,7 @@
         conv_to_ner.main(self.test_raw, self.test_ner, "conll")

     def update_training_config(self):
-        logs.add_infos('training_config', self.tr_config)
+        logs.add_json('training_config', self.tr_config)
         self.tr_config_updated = re.sub('.jsonnet$', '_up.jsonnet', self.tr_config)
         with open(self.tr_config, 'r') as js:
             tr_conf = json.load(js)
@@ -270,23 +319,25 @@
             tr_conf['validation_data_path'] = self.dev_ner
         with open(self.tr_config_updated, 'w') as js:
             json.dump(tr_conf, js)
-        logs.add_infos('training_config_updated', self.tr_config_updated)
+        logs.add_json('training_config_updated', self.tr_config_updated)

     def training(self, data):
-        #cmd = f"allennlp train -s {data.resu} -f {self.tr_config_updated} &> {data.resu}/logs_training.txt"
-        cmd = f"allennlp train -s {data.resu} {self.tr_config_updated}" # &> {data.resu}/logs_training.txt"
+        cmd = f"allennlp train -s {data.resu_train} {self.tr_config_updated} &> {data.resu}/logs_training_{data.stamp_time}.txt"
+        cmd = cmd if data.over == False else re.sub('&>', '-f &>', cmd)
         print(f"----> Training : {cmd}")
         os.system(cmd)
-        steps.model_path = f"{data.resu}/model.tar.gz"
-        logs.add_infos('model_to make predictions', self.model)
+        steps.model_path = f"{data.resu_train}/model.tar.gz"
+        logs.add_infos('model_to make predictions', self.model_path)
+        logs.add_infos('logs_trainning_file', f"{data.resu}/logs_training_{data.stamp_time}.txt" )

     def fine_tuning(self, data):
         logs.add_infos('model_to be fine-tuned', self.model)
-        cmd = f"allennlp fine-tune -m {self.model_path} -c {self.tr_config_updated} -s {data.fine}" # &> {data.resu}/logs_fine-tuning.txt"
+        cmd = f"allennlp fine-tune -m {self.model_path} -c {self.tr_config_updated} -s {data.resu_fine} &> {data.resu}/logs_fine-tuning_{data.stamp_time}.txt"
         print(f"----> Fine-tuning : {cmd}")
         os.system(cmd)
-        self.model_ft_path = f"{data.fine}/model.tar.gz"
+        self.model_ft_path = f"{data.resu_fine}/model.tar.gz"
         logs.add_infos('model_to make predictions', self.model_ft_path)
+        logs.add_infos('logs_fine-tuning_file', f"{data.resu}/logs_fine-tuning_{data.stamp_time}.txt")


 def get_stamp():
@@ -294,43 +345,52 @@
     stamp = re.sub('[\s:]', '_', str(now))
     return stamp

-def get_config_infos(config, stamp):
+def get_config_infos(config, stamp, stamp_time, logs, overwrite):
     with open(config, 'r', encoding='utf-8') as f:
         infos = json.load(f)
-    data = Data(infos['data_raw'], stamp)
+    data = Data(infos['data_raw'], stamp, stamp_time, overwrite)
     steps = Process(infos['steps'])
     prod = Output(infos['output'])
+    logs.add_infos('config', infos)
     return data, steps, prod


 class Logs:
     def __init__(self):
-        self.file_path = f"{data.resu}/logs_processes.json"
         self.dict = {}

     def add_infos(self, key, value):
         self.dict[key] = value

-    def print(self):
+    def add_json(self, key, jsonf):
+        with open(jsonf, 'r', encoding='utf-8') as f:
+            infos = json.load(f)
+        self.dict[key] = infos
+
+    def print(self, stamp_time):
+        self.file_path = f"{data.run}/logs_global_{stamp_time}.json"
         with open(self.file_path, 'w', encoding='utf-8') as fl:
             json.dump(self.dict, fl, indent=4)


 if __name__ == '__main__':
-    stamp = get_stamp()
+    stamp = stamp_time = get_stamp()
     parser = argparse.ArgumentParser()
     parser.add_argument('--config', help='Config file in JSON.')
     parser.add_argument('--name',default=stamp , help='Run name.')
+    parser.add_argument('-o', '--overwrite', action='store_true', help='Overwite output.')
     args = parser.parse_args()
     config = args.config
     stamp = args.name
+    overwrite = args.overwrite

-
-    data, steps, prod = get_config_infos(config, stamp)
-    data.create_folders(ft=None)
     logs = Logs()
+    data, steps, prod = get_config_infos(config, stamp, stamp_time, logs, overwrite)
+    data.create_folders()
+
     logs.add_infos("stamp", stamp)
-    logs.add_infos("infos", config)
+    logs.add_infos("stamp_time", stamp_time)
+    logs.add_infos("overwrite", overwrite)

     if steps.main == "annotation" or steps.main == "test":
         data.pre_processing(steps)
@@ -340,6 +400,8 @@ if __name__ == '__main__':
         steps.get_evaluation_status()
         if steps.eval == True:
             data.evaluation(steps, prod)
+        else:
+            data.make_output(prod)

     elif steps.main == "train":
         steps.get_data_for_train(data) #[steps.set_train, steps.set_dev, steps.set_test]
@@ -353,8 +415,7 @@
         data.evaluation(steps, prod, name=steps.test_data)

     elif steps.main == "fine_tune":
-        steps.get_data_for_fine_tune(data)
-        data.create_folders(steps.ft_stamp)
+        steps.get_data_for_fine_tune(data)
         data.pre_processing(steps, file_in=steps.test_raw)
         steps.make_sets_ner_format(data)
         steps.get_model() # model to be fine-tune
@@ -365,4 +426,5 @@
         if steps.eval == True:
             data.evaluation(steps, prod, name=steps.test_data, model=steps.model_ft_path)

-    logs.print()
\ No newline at end of file
+    logs.print(stamp_time)
+    print(f"----> All logs saved in {logs.file_path}")
\ No newline at end of file