From 068f5571ca5b558ae7aae67a25729961c22df546 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Wed, 11 Jan 2023 10:03:03 +0100
Subject: [PATCH] add fine-tuning

---
 code/config_global_4.json       |  49 +++++++
 code/discut22_2.py              | 219 +++++++++++++++++++++-----------
 code/utils/training_allennlp.py |   1 +
 3 files changed, 195 insertions(+), 74 deletions(-)
 create mode 100644 code/config_global_4.json

diff --git a/code/config_global_4.json b/code/config_global_4.json
new file mode 100644
index 0000000..63b0c33
--- /dev/null
+++ b/code/config_global_4.json
@@ -0,0 +1,49 @@
+{
+    "usecase_description": "Config file for usecase_4: from a dataset split into train/dev/test, fine-tune a model (i.e. a fine-tuned LM) and test it on the test set.",
+    "data_raw": {
+        "name": "eng.rst.rstdt",
+        "exte": ".conllu",
+        "language": "en",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "fine_tune",
+        "pre-processing": {
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": false,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/discut22/data/eng.rst.rstdt/results_lundi9/model.tar.gz",
+            "training": {
+                "toolkit": "allennlp",
+                "pre_trained_lm": "bert",
+                "config_file": "/home/lriviere/andiamo/discut22/model/config_training_bert_m.jsonnet",
+                "train_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
+                "validation_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
+            }
+        },
+        "gold_test_data_path": "eng.rst.rstdt_test"
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
+    }
+}
+
+
+
diff --git a/code/discut22_2.py b/code/discut22_2.py
index be1fe03..1025558 100644
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
@@ -20,6 +20,7 @@
 import utils.conll2bracket as c2bracket
 import utils.seg_eval as seg_eval
 
+
 class Data:
 
     def __init__(self, infos, stamp):
@@ -33,17 +34,21 @@ class Data:
         self.resu = f"{self.path}/results_{stamp}"
         self.meta = infos['existing_metadata']
 
-    def create_folders(self): # -> can be rtansfor into method of class
-        for it in [self.conv, self.resu]:
+    def create_folders(self, ft=None): # -> can be transformed into a method of the class
+        folders_list = [self.conv, self.resu]
+        #folders_list = [self.conv]
+        if ft is not None:
+            self.fine = f"{self.resu}/fine_tune_{ft}"
+            #folders_list.append(self.fine) # made automatically by allennlp
+        for it in folders_list:
             print(f"----> Checking/creating folder {it}.")
             if not os.path.isdir(it):
                 os.mkdir(it)
-        my_logs['folders'] = f"{self.conv}, {self.resu}"
 
-    def pre_processing(self, steps):
-        print("----> Preprocessing input data.")
-        file_in = self.raw
+    def pre_processing(self, steps, file_in=None):
+        file_in = self.raw if file_in is None else file_in
         if steps.pre_process_to_do == True:
+            print(f"----> Preprocessing {self.raw}.")
             file_out = f"{self.conv}/{self.name}.conll"
             if steps.synt_tool == "stanza":
                 processors = []
@@ -65,7 +70,7 @@ class Data:
                 exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
Change your config file.") else: file_out = file_in - my_logs['data_preprocessed'] = file_out + logs.add_infos('data_preprocessed', file_out) self.preprocessed = file_out def make_ner_format(self): @@ -78,72 +83,100 @@ class Data: self.ner = f"{self.conv}/{self.name}.ner" print(f"----> Making NER format {self.ner}.") conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO faire en relatif#TODO add same for train/dev/test for config train - my_logs['data_ner'] = self.ner - - def make_predictions(self, steps): - self.pred_json = f"{self.resu}/{self.name}_pred.json" - cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {steps.model_path} {self.ner} &> {self.resu}/logs_predictions.txt" + logs.add_infos('data_ner', self.ner) + + def make_predictions(self, steps, js_name=None, fi_ner=None, model=None): + js_name = self.name if js_name == None else js_name + fi_ner = self.ner if fi_ner == None else fi_ner + model = steps.model_path if model == None else model + self.pred_json = f"{self.resu}/{js_name}_pred.json" + cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {model} {fi_ner} &> {self.resu}/logs_predictions.txt" print(f"----> Making predictions: {cmd}.") os.system(cmd) - my_logs['predictions_cmd'] = cmd + logs.add_infos('predictions_cmd', cmd) - - def pred_json_to_conll_w_metadata_w_gold(self): # here and 3 below..sorry..factorsation TBD - self.pred_conll_meta_gold = f"{self.resu}/{self.name}_pred_meta_gold.conll" + def pred_json_to_conll_w_metadata_w_gold(self, name=None): # here and 3 below..sorry..factorsation TBD + name = self.name if name == None else name + self.pred_conll_meta_gold = f"{self.resu}/{name}_pred_meta_gold.conll" json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed) return self.pred_conll_meta_gold - def pred_json_to_conll_w_metadata(self): - self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll" + def pred_json_to_conll_w_metadata(self, name=None): + name = self.name if name == None else name + self.pred_meta_conll = f"{self.resu}/{name}_pred_meta.conll" json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed) return self.pred_meta_conll - def pred_json_to_conll_w_gold(self): - self.pred_conll_gold = f"{self.resu}/{self.name}_pred_gold.conll" + def pred_json_to_conll_w_gold(self, name=None): + name = self.name if name == None else name + self.pred_conll_gold = f"{self.resu}/{name}_pred_gold.conll" json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll") return self.pred_conll_gold - def pred_json_to_conll(self): - self.pred_conll = f"{self.resu}/{self.name}_pred.conll" + def pred_json_to_conll(self, name=None): + name = self.name if name == None else name + self.pred_conll = f"{self.resu}/{name}_pred.conll" json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll") return self.pred_conll - def brackets_txt(self): - self.brack = f"{self.resu}/{self.name}_brac.txt" + def brackets_txt(self, name=None): + name = self.name if name == None else name + self.brack = f"{self.resu}/{name}_brac.txt" c2bracket.conll2brackets(self.pred_conll, self.brack) + return self.brack - def brackets_txt_with_metadata(self): - self.brack_meta = f"{self.resu}/{self.name}_brac_meta.txt" + def brackets_txt_with_metadata(self, name=None): + name = self.name if name == None else name + self.brack_meta = f"{self.resu}/{name}_brac_meta.txt" c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta) + return 
 
-
-    def evaluation(self, prod):
+    def evaluation(self, steps, prod, gold=None, name=None, model=None):
         self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"
-        if self.exte == ".conll" or self.exte == ".conllu": # get gold file
-            gold = self.raw
-        else:
-            gold = self.preprocessed
+        gold = self.preprocessed if gold is None else gold
+        name = self.name if name is None else name
+        model = steps.model_path if model is None else model
 
-        if prod.conll_todo == False: # get pred_file
-            pred = self.pred_json_to_conll()
+        if prod.conll_todo == False: # get pred_file to compute metrics with seg_eval
+            pred = self.pred_json_to_conll(name)
         else:
             if prod.conll_meta == True:
                 if prod.conll_w_gold == True:
-                    pred = self.pred_json_to_conll_w_metadata_w_gold()
+                    pred = self.pred_json_to_conll_w_metadata_w_gold(name)
                 else:
-                    pred = self.pred_json_to_conll_w_metadata()
+                    pred = self.pred_json_to_conll_w_metadata(name)
             else:
                 if prod.conll_w_gold == True:
-                    pred = self.pred_json_to_conll_w_gold()
+                    pred = self.pred_json_to_conll_w_gold(name)
                 else:
-                    pred = self.pred_json_to_conll()
+                    pred = self.pred_json_to_conll(name)
 
-        print(f"----> Predictions to file {pred}")
-        print(f"----> Evaluation scores to file {self.basic_metrics}")
+        print(f"----> Predictions to file {pred}.")
+        print(f"----> Evaluation scores to file {self.basic_metrics}.")
         scores_dict = seg_eval.get_scores(gold, pred)
+        scores_dict['model'] = model
+        logs.add_infos('basic_metrics', scores_dict)
+        logs.add_infos('output_conll_file', pred)
+
         with open(self.basic_metrics, 'w') as fo:
-            json.dump(scores_dict, fo)
+            json.dump(scores_dict, fo, indent=4)
+
+        if prod.txt_todo == True:
+            if prod.txt_meta == True:
+                pred = f"{self.resu}/{name}_pred_meta.conll"
+                if not os.path.isfile(pred):
+                    self.pred_json_to_conll_w_metadata(name)
+                pred_txt = self.brackets_txt_with_metadata(name)
+                # os.system(f"rm {pred}")
+            else:
+                pred = f"{self.resu}/{name}_pred.conll"
+                if not os.path.isfile(pred):
+                    self.pred_json_to_conll(name)
+                pred_txt = self.brackets_txt(name)
+                # os.system(f"rm {pred}")
+            print(f"----> Predictions to file {pred_txt}.")
+            logs.add_infos('output_txt_file', pred_txt)
 
 
 class Output:
@@ -155,7 +188,6 @@ class Output:
         self.txt_meta = infos['txt_file']['metadata']
 
 
-
 class Process:
     def __init__(self, infos):
         self.main = infos["main"] # train test annotation
@@ -169,7 +201,7 @@ class Process:
             self.meta_line = infos['pre-processing']['create_metadata']['line']
             self.meta_sent = infos['pre-processing']['create_metadata']['sent']
 
-        if self.main == "train":
+        if self.main == "train" or self.main == "fine_tune":
             self.set_train = infos['discourse_segmenter']['training']['train_data_path']
             self.set_dev = infos['discourse_segmenter']['training']['validation_data_path']
             self.set_test = infos['gold_test_data_path']
@@ -182,9 +214,8 @@ class Process:
         self.test_data = infos['gold_test_data_path']
 
     def get_evaluation_status(self):
-        if self.main == "test":
+        if self.main == "test" or self.main == "train" or self.main == "fine_tune":
             self.eval = True
-        #elif self.main == "train":
 
     def get_model(self):
         self.model_path = ""
@@ -200,27 +231,38 @@ class Process:
         else:
             self.model_path = self.model
 
-    def get_data_sets(self, data):
+    def get_data_for_train(self, data):
+        # from the set names, build the paths to the input files
         self.train_raw = f"{data.path}/{self.set_train}{data.exte}"
         self.dev_raw = f"{data.path}/{self.set_dev}{data.exte}"
         self.test_raw = f"{data.path}/{self.set_test}{data.exte}"
 
+    def get_data_for_fine_tune(self, data):
+        """
+        spec: the test set has the same name as data_raw, while the train and dev sets
+        live elsewhere, so the config gives their full paths rather than just their names
+        """
+        self.ft_stamp = re.sub('^.*/', '', self.set_train)
+        self.train_raw = self.set_train
+        self.dev_raw = self.set_dev
+        self.test_raw = f"{data.path}/{self.set_test}{data.exte}"
+        # reset the set names to simplify the NER formatting step
+        self.set_train = re.sub(r'\.[^\.]+$', '', self.ft_stamp)
+        self.set_dev = re.sub(r'\.[^\.]+$', '', re.sub('^.*/', '', self.dev_raw))
+
     def make_sets_ner_format(self, data): #[steps.set_train, steps.set_dev, steps.set_test]
         self.train_ner = f"{data.conv}/{self.set_train}{data.exte}.ner"
         self.dev_ner = f"{data.conv}/{self.set_dev}{data.exte}.ner"
-        self.test_ner = f"{data.conv}/{self.set_test}{data.exte}.ner"
-
+        self.test_ner = f"{data.conv}/{self.set_test}{data.exte}.ner"
         print(f"----> Making NER format {self.train_ner}.")
         conv_to_ner.main(self.train_raw, self.train_ner, "conll")
         print(f"----> Making NER format {self.dev_ner}.")
         conv_to_ner.main(self.dev_raw, self.dev_ner, "conll")
         print(f"----> Making NER format {self.test_ner}.")
         conv_to_ner.main(self.test_raw, self.test_ner, "conll")
-        #self.ner = f"{self.preprocessed}.ner"
-        #self.ner = f"{self.conv}/{self.name}.ner"
-        #my_logs['data_ner'] = self.ner
 
     def update_training_config(self):
+        logs.add_infos('training_config', self.tr_config)
         self.tr_config_updated = re.sub('.jsonnet$', '_up.jsonnet', self.tr_config)
         with open(self.tr_config, 'r') as js:
             tr_conf = json.load(js)
@@ -228,10 +270,23 @@ class Process:
         tr_conf['validation_data_path'] = self.dev_ner
         with open(self.tr_config_updated, 'w') as js:
             json.dump(tr_conf, js)
+        logs.add_infos('training_config_updated', self.tr_config_updated)
 
     def training(self, data):
-        cmd = f"allennlp train -s {data.resu} {self.tr_config_updated} &> {data.resu}/logs_training.txt"
+        #cmd = f"allennlp train -s {data.resu} -f {self.tr_config_updated} &> {data.resu}/logs_training.txt"
+        cmd = f"allennlp train -s {data.resu} {self.tr_config_updated}" # &> {data.resu}/logs_training.txt"
+        print(f"----> Training: {cmd}")
+        os.system(cmd)
+        self.model_path = f"{data.resu}/model.tar.gz"
+        logs.add_infos('model_to_make_predictions', self.model_path)
+
+    def fine_tuning(self, data):
+        logs.add_infos('model_to_be_fine_tuned', self.model_path)
+        cmd = f"allennlp fine-tune -m {self.model_path} -c {self.tr_config_updated} -s {data.fine}" # &> {data.resu}/logs_fine-tuning.txt"
+        print(f"----> Fine-tuning: {cmd}")
         os.system(cmd)
+        self.model_ft_path = f"{data.fine}/model.tar.gz"
+        logs.add_infos('model_to_make_predictions', self.model_ft_path)
 
 
 def get_stamp():
@@ -245,20 +300,23 @@ def get_config_infos(config, stamp):
     data = Data(infos['data_raw'], stamp)
     steps = Process(infos['steps'])
     prod = Output(infos['output'])
-    my_logs["config"] = infos
     return data, steps, prod
 
 
-def print_logs(dict_logs):
-    file_logs = f"{data.resu}/logs_processes.json"
-    with open(file_logs, 'w') as fi:
-        json.dump(dict_logs, fi, indent=4)
-
+class Logs:
+    def __init__(self):
+        self.file_path = f"{data.resu}/logs_processes.json"
+        self.dict = {}
+    def add_infos(self, key, value):
+        self.dict[key] = value
+
+    def print(self):
+        with open(self.file_path, 'w', encoding='utf-8') as fl:
+            json.dump(self.dict, fl, indent=4)
 
 if __name__ == '__main__':
-    my_logs = {}
     stamp = get_stamp()
     parser = argparse.ArgumentParser()
     parser.add_argument('--config', help='Config file in JSON.')
@@ -266,32 +324,45 @@ if __name__ == '__main__':
     args = parser.parse_args()
     config = args.config
     stamp = args.name
-    my_logs["stamp"] = stamp
+
     data, steps, prod = get_config_infos(config, stamp)
-    data.create_folders()
+    data.create_folders(ft=None)
+    logs = Logs()
+    logs.add_infos("stamp", stamp)
+    logs.add_infos("config", config)
 
     if steps.main == "annotation" or steps.main == "test":
         data.pre_processing(steps)
         data.make_ner_format()
         steps.get_model()
         data.make_predictions(steps) # output allennlp JSON
+        steps.get_evaluation_status()
+        if steps.eval == True:
+            data.evaluation(steps, prod)
 
     elif steps.main == "train":
-        steps.get_data_sets(data) #[steps.set_train, steps.set_dev, steps.set_test]
-        # data preprocessing
+        steps.get_data_for_train(data) #[steps.set_train, steps.set_dev, steps.set_test]
+        data.pre_processing(steps, file_in=steps.test_raw)
         steps.make_sets_ner_format(data)
         steps.update_training_config()
         steps.training(data)
-
-
-
-
-
-        #steps.get_evaluation_status()
-        #if steps.eval == True:
-            #data.evaluation(prod)
-
+        data.make_predictions(steps, js_name=steps.set_test, fi_ner=steps.test_ner)
+        steps.get_evaluation_status()
+        if steps.eval == True:
+            data.evaluation(steps, prod, name=steps.test_data)
+
+    elif steps.main == "fine_tune":
+        steps.get_data_for_fine_tune(data)
+        data.create_folders(steps.ft_stamp)
+        data.pre_processing(steps, file_in=steps.test_raw)
+        steps.make_sets_ner_format(data)
+        steps.get_model() # model to be fine-tuned
+        steps.update_training_config()
+        steps.fine_tuning(data)
+        data.make_predictions(steps, js_name=steps.set_test, fi_ner=steps.test_ner, model=steps.model_ft_path)
+        steps.get_evaluation_status()
+        if steps.eval == True:
+            data.evaluation(steps, prod, name=steps.test_data, model=steps.model_ft_path)
 
-    print_logs(my_logs) # <-- attention variable globale !
\ No newline at end of file
+    logs.print()
\ No newline at end of file
diff --git a/code/utils/training_allennlp.py b/code/utils/training_allennlp.py
index 04c3957..0aa40d4 100644
--- a/code/utils/training_allennlp.py
+++ b/code/utils/training_allennlp.py
@@ -50,6 +50,7 @@ def main(steps):
     # TODO:
     #### train, has_par == true, en fait on fine_tune...
     #allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
+    # allennlp fine-tune -m MODEL_ARCHIVE -c CONFIG_FILE -s SERIALIZATION_DIR -o overrides
    # TODO ### ensuite prediction sur valset ou "parent test" ou "finetune test"... ??
-- 
GitLab
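
Usage note: with this patch applied, the new fine_tune usecase runs end to end through discut22_2.py driven by the config added above. A minimal sketch of the invocation, assuming the command is launched from the repository root; the run name ft_demo is only an illustrative placeholder, while --config and --name are the options the script reads (the value of --name becomes the stamp used for the results_<stamp> output folder):

    python code/discut22_2.py --config code/config_global_4.json --name ft_demo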