Commit 068f5571 authored by laura.riviere

add fine-tuning

parent 237a227a
{
"usecase_description": "Config file for usecase_4 : from a dataset, splited in train/dev/test, fine-tune a model (= made of fine-tune of a LM) and test on testset.",
"data_raw": {
"name": "eng.rst.rstdt",
"exte": ".conllu",
"language": "en",
"existing_metadata": true
},
"steps":{
"main": "fine_tune",
"pre-processing": {
"to_do": false,
"syntactic_tool": "stanza",
"sentence_split": true,
"tokenization": true,
"syntactic_parsing": true,
"create_metadata": {
"to_do": false,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": "/home/lriviere/andiamo/discut22/data/eng.rst.rstdt/results_lundi9/model.tar.gz",
"training": {
"toolkit": "allennlp",
"pre_trained_lm": "bert",
"config_file": "/home/lriviere/andiamo/discut22/model/config_training_bert_m.jsonnet",
"train_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
"validation_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
}
},
"gold_test_data_path": "eng.rst.rstdt_test"
},
"output":{
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
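For reference, this config is consumed through the --config and --name flags defined in the script's argparse setup further down; a run might look like the line below. This is a minimal sketch: only the two flags are confirmed by the code in this commit, and the script and config file names are assumptions.

    # hypothetical entry-point and config names; --config and --name come from the argparse setup below
    python discut22.py --config config_usecase_4.json --name my_stamp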
@@ -20,6 +20,7 @@ import utils.conll2bracket as c2bracket
import utils.seg_eval as seg_eval

class Data:
    def __init__(self, infos, stamp):
@@ -33,17 +34,21 @@ class Data:
        self.resu = f"{self.path}/results_{stamp}"
        self.meta = infos['existing_metadata']
    def create_folders(self, ft=None):  # -> could be turned into a method of the class
        folders_list = [self.conv, self.resu]
        if ft is not None:
            self.fine = f"{self.resu}/fine_tune_{ft}"
            # folders_list.append(self.fine)  # created automatically by allennlp
        for it in folders_list:
            print(f"----> Checking/creating folder {it}.")
            if not os.path.isdir(it):
                os.mkdir(it)
    def pre_processing(self, steps, file_in=None):
        file_in = self.raw if file_in is None else file_in
        if steps.pre_process_to_do == True:
            print(f"----> Preprocessing {self.raw}.")
            file_out = f"{self.conv}/{self.name}.conll"
            if steps.synt_tool == "stanza":
                processors = []
@@ -65,7 +70,7 @@ class Data:
                exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
        else:
            file_out = file_in
        logs.add_infos('data_preprocessed', file_out)
        self.preprocessed = file_out
    def make_ner_format(self):
@@ -78,72 +83,100 @@ class Data:
        self.ner = f"{self.conv}/{self.name}.ner"
        print(f"----> Making NER format {self.ner}.")
        conv_to_ner.main(self.preprocessed, self.ner, "conll")  # <-- TODO: use relative paths; TODO: add the same for train/dev/test in the train config
        logs.add_infos('data_ner', self.ner)
    def make_predictions(self, steps, js_name=None, fi_ner=None, model=None):
        js_name = self.name if js_name is None else js_name
        fi_ner = self.ner if fi_ner is None else fi_ner
        model = steps.model_path if model is None else model
        self.pred_json = f"{self.resu}/{js_name}_pred.json"
        cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {model} {fi_ner} &> {self.resu}/logs_predictions.txt"
        print(f"----> Making predictions: {cmd}.")
        os.system(cmd)
        logs.add_infos('predictions_cmd', cmd)
    def pred_json_to_conll_w_metadata_w_gold(self, name=None):  # here and the 3 methods below: factorisation TBD
        name = self.name if name is None else name
        self.pred_conll_meta_gold = f"{self.resu}/{name}_pred_meta_gold.conll"
        json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed)
        return self.pred_conll_meta_gold

    def pred_json_to_conll_w_metadata(self, name=None):
        name = self.name if name is None else name
        self.pred_meta_conll = f"{self.resu}/{name}_pred_meta.conll"
        json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed)
        return self.pred_meta_conll

    def pred_json_to_conll_w_gold(self, name=None):
        name = self.name if name is None else name
        self.pred_conll_gold = f"{self.resu}/{name}_pred_gold.conll"
        json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll")
        return self.pred_conll_gold

    def pred_json_to_conll(self, name=None):
        name = self.name if name is None else name
        self.pred_conll = f"{self.resu}/{name}_pred.conll"
        json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll")
        return self.pred_conll

    def brackets_txt(self, name=None):
        name = self.name if name is None else name
        self.brack = f"{self.resu}/{name}_brac.txt"
        c2bracket.conll2brackets(self.pred_conll, self.brack)
        return self.brack

    def brackets_txt_with_metadata(self, name=None):
        name = self.name if name is None else name
        self.brack_meta = f"{self.resu}/{name}_brac_meta.txt"
        c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)
        return self.brack_meta
    def evaluation(self, steps, prod, gold=None, name=None, model=None):
        self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"
        gold = self.preprocessed if gold is None else gold
        name = self.name if name is None else name
        model = steps.model_path if model is None else model
        if prod.conll_todo == False:  # get the pred file to compute metrics with seg_eval
            pred = self.pred_json_to_conll(name)
        else:
            if prod.conll_meta == True:
                if prod.conll_w_gold == True:
                    pred = self.pred_json_to_conll_w_metadata_w_gold(name)
                else:
                    pred = self.pred_json_to_conll_w_metadata(name)
            else:
                if prod.conll_w_gold == True:
                    pred = self.pred_json_to_conll_w_gold(name)
                else:
                    pred = self.pred_json_to_conll(name)
        print(f"----> Predictions to file {pred}.")
        print(f"----> Evaluation scores to file {self.basic_metrics}.")
        scores_dict = seg_eval.get_scores(gold, pred)
        scores_dict['model'] = model
        logs.add_infos('basic_metrics', scores_dict)
        logs.add_infos('output_conll_file', pred)
        with open(self.basic_metrics, 'w') as fo:
            json.dump(scores_dict, fo, indent=4)
        if prod.txt_todo == True:
            if prod.txt_meta == True:
                pred = f"{self.resu}/{name}_pred_meta.conll"
                if not os.path.isfile(pred):
                    self.pred_json_to_conll_w_metadata(name)
                pred_txt = self.brackets_txt_with_metadata(name)
                # os.system(f"rm {pred}")
            else:
                pred = f"{self.resu}/{name}_pred.conll"
                if not os.path.isfile(pred):
                    self.pred_json_to_conll(name)
                pred_txt = self.brackets_txt(name)
                # os.system(f"rm {pred}")
            print(f"----> Predictions to file {pred_txt}.")
            logs.add_infos('output_txt_file', pred_txt)
class Output:
@@ -155,7 +188,6 @@ class Output:
        self.txt_meta = infos['txt_file']['metadata']

class Process:
    def __init__(self, infos):
        self.main = infos["main"]  # train / test / annotation / fine_tune
@@ -169,7 +201,7 @@ class Process:
            self.meta_line = infos['pre-processing']['create_metadata']['line']
            self.meta_sent = infos['pre-processing']['create_metadata']['sent']
        if self.main == "train" or self.main == "fine_tune":
            self.set_train = infos['discourse_segmenter']['training']['train_data_path']
            self.set_dev = infos['discourse_segmenter']['training']['validation_data_path']
            self.set_test = infos['gold_test_data_path']
@@ -182,9 +214,8 @@ class Process:
            self.test_data = infos['gold_test_data_path']

    def get_evaluation_status(self):
        if self.main == "test" or self.main == "train" or self.main == "fine_tune":
            self.eval = True

    def get_model(self):
        self.model_path = ""
@@ -200,27 +231,38 @@ class Process:
        else:
            self.model_path = self.model
    def get_data_for_train(self, data):
        # from the names in the config, build the paths to the input files
        self.train_raw = f"{data.path}/{self.set_train}{data.exte}"
        self.dev_raw = f"{data.path}/{self.set_dev}{data.exte}"
        self.test_raw = f"{data.path}/{self.set_test}{data.exte}"

    def get_data_for_fine_tune(self, data):
        """
        Spec: the test set is the same as the data_raw name;
        the train and dev sets live elsewhere, so the config holds full paths, not just names.
        """
        self.ft_stamp = re.sub('^.*/', '', self.set_train)
        self.train_raw = self.set_train
        self.dev_raw = self.set_dev
        self.test_raw = f"{data.path}/{self.set_test}{data.exte}"
        # reset names to bare basenames so the NER formatting step works unchanged
        self.set_train = re.sub(r'\.[^\.]+$', '', self.ft_stamp)
        self.set_dev = re.sub(r'\.[^\.]+$', '', re.sub('^.*/', '', self.dev_raw))
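    # Illustrative walk-through (assumption, based on the sample config above): with
    # set_train = ".../data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
    #   ft_stamp  -> "eng.sdrt.stac_train.conllu"  (basename, via re.sub('^.*/', '', ...))
    #   set_train -> "eng.sdrt.stac_train"         (extension stripped, via re.sub(r'\.[^\.]+$', '', ...))
    # so create_folders(ft_stamp) puts fine-tune outputs in results_.../fine_tune_eng.sdrt.stac_train.conllu.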
    def make_sets_ner_format(self, data):  # [steps.set_train, steps.set_dev, steps.set_test]
        self.train_ner = f"{data.conv}/{self.set_train}{data.exte}.ner"
        self.dev_ner = f"{data.conv}/{self.set_dev}{data.exte}.ner"
        self.test_ner = f"{data.conv}/{self.set_test}{data.exte}.ner"
        print(f"----> Making NER format {self.train_ner}.")
        conv_to_ner.main(self.train_raw, self.train_ner, "conll")
        print(f"----> Making NER format {self.dev_ner}.")
        conv_to_ner.main(self.dev_raw, self.dev_ner, "conll")
        print(f"----> Making NER format {self.test_ner}.")
        conv_to_ner.main(self.test_raw, self.test_ner, "conll")
    def update_training_config(self):
        logs.add_infos('training_config', self.tr_config)
        self.tr_config_updated = re.sub(r'\.jsonnet$', '_up.jsonnet', self.tr_config)
        with open(self.tr_config, 'r') as js:
            tr_conf = json.load(js)
@@ -228,10 +270,23 @@ class Process:
        tr_conf['validation_data_path'] = self.dev_ner
        with open(self.tr_config_updated, 'w') as js:
            json.dump(tr_conf, js)
        logs.add_infos('training_config_updated', self.tr_config_updated)
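    # Sketch of the rewrite above (paths are illustrative, not from this commit): the
    # *_up.jsonnet copy simply points allennlp at the freshly produced .ner files, e.g.
    #   tr_conf['train_data_path']      = ".../conversion/eng.sdrt.stac_train.conllu.ner"
    #   tr_conf['validation_data_path'] = ".../conversion/eng.sdrt.stac_dev.conllu.ner"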
    def training(self, data):
        # cmd = f"allennlp train -s {data.resu} -f {self.tr_config_updated} &> {data.resu}/logs_training.txt"
        cmd = f"allennlp train -s {data.resu} {self.tr_config_updated}"  # &> {data.resu}/logs_training.txt"
        print(f"----> Training: {cmd}")
        os.system(cmd)
        self.model_path = f"{data.resu}/model.tar.gz"
        logs.add_infos('model_to make predictions', self.model_path)

    def fine_tuning(self, data):
        logs.add_infos('model_to be fine-tuned', self.model)
        cmd = f"allennlp fine-tune -m {self.model_path} -c {self.tr_config_updated} -s {data.fine}"  # &> {data.resu}/logs_fine-tuning.txt"
        print(f"----> Fine-tuning: {cmd}")
        os.system(cmd)
        self.model_ft_path = f"{data.fine}/model.tar.gz"
        logs.add_infos('model_to make predictions', self.model_ft_path)
def get_stamp():
@@ -245,20 +300,23 @@ def get_config_infos(config, stamp):
    data = Data(infos['data_raw'], stamp)
    steps = Process(infos['steps'])
    prod = Output(infos['output'])
    return data, steps, prod

class Logs:
    def __init__(self):
        self.file_path = f"{data.resu}/logs_processes.json"
        self.dict = {}

    def add_infos(self, key, value):
        self.dict[key] = value

    def print(self):
        with open(self.file_path, 'w', encoding='utf-8') as fl:
            json.dump(self.dict, fl, indent=4)
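# Usage sketch for Logs (illustration only; mirrors the calls in __main__ below).
# Note that Logs.__init__ reads the global `data`, so it must be instantiated
# after get_config_infos() has run:
#   logs = Logs()                   # will write to {data.resu}/logs_processes.json
#   logs.add_infos("stamp", stamp)  # accumulate key/value pairs in memory
#   logs.print()                    # dump the dict as indented JSON at the end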
if __name__ == '__main__':
    stamp = get_stamp()
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', help='Config file in JSON.')
@@ -266,32 +324,45 @@ if __name__ == '__main__':
    args = parser.parse_args()
    config = args.config
    stamp = args.name
    data, steps, prod = get_config_infos(config, stamp)
    data.create_folders(ft=None)
    logs = Logs()
    logs.add_infos("stamp", stamp)
    logs.add_infos("infos", config)
if steps.main == "annotation" or steps.main == "test": if steps.main == "annotation" or steps.main == "test":
data.pre_processing(steps) data.pre_processing(steps)
data.make_ner_format() data.make_ner_format()
steps.get_model() steps.get_model()
data.make_predictions(steps) # output allennlp JSON data.make_predictions(steps) # output allennlp JSON
steps.get_evaluation_status()
if steps.eval == True:
data.evaluation(steps, prod)
elif steps.main == "train": elif steps.main == "train":
steps.get_data_sets(data) #[steps.set_train, steps.set_dev, steps.set_test] steps.get_data_for_train(data) #[steps.set_train, steps.set_dev, steps.set_test]
# data preprocessing data.pre_processing(steps, file_in=steps.test_raw)
steps.make_sets_ner_format(data) steps.make_sets_ner_format(data)
steps.update_training_config() steps.update_training_config()
steps.training(data) steps.training(data)
data.make_predictions(steps, js_name=steps.set_test, fi_ner=steps.test_ner)
steps.get_evaluation_status()
if steps.eval == True:
data.evaluation(steps, prod, name=steps.test_data)
elif steps.main == "fine_tune":
#steps.get_evaluation_status() steps.get_data_for_fine_tune(data)
#if steps.eval == True: data.create_folders(steps.ft_stamp)
#data.evaluation(prod) data.pre_processing(steps, file_in=steps.test_raw)
steps.make_sets_ner_format(data)
steps.get_model() # model to be fine-tune
steps.update_training_config()
steps.fine_tuning(data)
data.make_predictions(steps, js_name=steps.set_test, fi_ner=steps.test_ner, model=steps.model_ft_path)
steps.get_evaluation_status()
if steps.eval == True:
data.evaluation(steps, prod, name=steps.test_data, model=steps.model_ft_path)
print_logs(my_logs) # <-- attention variable globale ! logs.print()
\ No newline at end of file \ No newline at end of file
@@ -50,6 +50,7 @@ def main(steps):
    # TODO:
    #### train, has_par == true: in that case we actually fine-tune...
    # allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
    # allennlp fine-tune -m MODEL_ARCHIVE -c CONFIG_FILE -s SERIALIZATION_DIR -o overrides
    # TODO:
    ### then run prediction on the dev set, the "parent test" set, or the "fine-tune test" set... ??