Skip to content
Snippets Groups Projects
Commit 237a227a authored by laura.riviere's avatar laura.riviere
Browse files

add train config

parent b369e12e
Branches
No related tags found
No related merge requests found
{
"usecase_description": "Config file for usecase_3 : from a dataset, splited in train/dev/test, train a model (= fine-tune a LM) and test on testset.",
"data_raw": {
"name": "eng.rst.rstdt",
"exte": ".conllu",
"language": "en",
"existing_metadata": true
},
"steps":{
"main": "train",
"pre-processing": {
"to_do": false,
"syntactic_tool": "stanza",
"sentence_split": true,
"tokenization": true,
"syntactic_parsing": true,
"create_metadata": {
"to_do": false,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": null,
"training": {
"toolkit": "allennlp",
"pre_trained_lm": "bert",
"config_file": "/home/lriviere/andiamo/discut22/model/config_training_bert_m.jsonnet",
"train_data_path": "eng.rst.rstdt_train",
"validation_data_path": "eng.rst.rstdt_dev"
}
},
"gold_test_data_path": "eng.rst.rstdt_test"
},
"output":{
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
......@@ -75,7 +75,7 @@ class Data:
OUTPUT: Tokenized text with just 4 columns.
"""
self.ner = f"{self.preprocessed}.ner"
self.ner = f"{self.conv}/{self.name}.conll.ner"
self.ner = f"{self.conv}/{self.name}.ner"
print(f"----> Making NER format {self.ner}.")
conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO faire en relatif#TODO add same for train/dev/test for config train
my_logs['data_ner'] = self.ner
......@@ -169,18 +169,16 @@ class Process:
self.meta_line = infos['pre-processing']['create_metadata']['line']
self.meta_sent = infos['pre-processing']['create_metadata']['sent']
#if self.main == "train":
#if self.ner_init == True : # à faire en relatif !! split truc
# self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
# self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
#else :
# self.train_data = infos['discourse_segmenter']['training']['train_data_path']
# self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
if self.main == "train":
self.set_train = infos['discourse_segmenter']['training']['train_data_path']
self.set_dev = infos['discourse_segmenter']['training']['validation_data_path']
self.set_test = infos['gold_test_data_path']
self.toolkit = infos['discourse_segmenter']['training']['toolkit']
self.tr_config = infos['discourse_segmenter']['training']['config_file']
self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
self.test_data = infos['gold_test_data_path']
def get_evaluation_status(self):
......@@ -202,6 +200,39 @@ class Process:
else:
self.model_path = self.model
def get_data_sets(self, data):
    """Resolve the raw train/dev/test split files inside the data folder.

    Sets ``self.train_raw`` / ``self.dev_raw`` / ``self.test_raw`` from the
    split basenames read from the config plus the corpus file extension.
    """
    join = "{}/{}{}".format
    self.train_raw = join(data.path, self.set_train, data.exte)
    self.dev_raw = join(data.path, self.set_dev, data.exte)
    self.test_raw = join(data.path, self.set_test, data.exte)
def make_sets_ner_format(self, data):
    """Convert the raw train/dev/test splits to NER (CoNLL) format.

    Target paths are derived under the conversion folder and stored on
    ``self.train_ner`` / ``self.dev_ner`` / ``self.test_ner``; each raw split
    is then converted in turn (train, dev, test).
    """
    self.train_ner = f"{data.conv}/{self.set_train}{data.exte}.ner"
    self.dev_ner = f"{data.conv}/{self.set_dev}{data.exte}.ner"
    self.test_ner = f"{data.conv}/{self.set_test}{data.exte}.ner"
    pairs = (
        (self.train_raw, self.train_ner),
        (self.dev_raw, self.dev_ner),
        (self.test_raw, self.test_ner),
    )
    for raw_path, ner_path in pairs:
        print(f"----> Making NER format {ner_path}.")
        conv_to_ner.main(raw_path, ner_path, "conll")
def update_training_config(self):
    """Write a patched copy of the training config pointing at the NER splits.

    Reads ``self.tr_config`` (a JSON/jsonnet file), replaces its
    ``train_data_path`` / ``validation_data_path`` entries with the converted
    NER files, and saves the result next to it as ``*_up.jsonnet``
    (path stored in ``self.tr_config_updated``).
    """
    # Bug fix: the dot must be escaped — a bare '.' in the pattern matches
    # any character (e.g. "model_xjsonnet" would also be rewritten).
    self.tr_config_updated = re.sub(r'\.jsonnet$', '_up.jsonnet', self.tr_config)
    # NOTE(review): json.load only works if the .jsonnet file is plain JSON —
    # real jsonnet syntax would fail here. TODO confirm the config format.
    with open(self.tr_config, 'r') as js:
        tr_conf = json.load(js)
    tr_conf['train_data_path'] = self.train_ner
    tr_conf['validation_data_path'] = self.dev_ner
    with open(self.tr_config_updated, 'w') as js:
        json.dump(tr_conf, js)
def training(self, data):
    """Fine-tune the LM by shelling out to `allennlp train`.

    Serialization directory is ``data.resu``; stdout and stderr are both
    redirected to ``logs_training.txt`` in that directory.
    """
    # Bug fix: '&>' is a bash-ism. os.system runs /bin/sh, where 'cmd &> f'
    # parses as 'cmd &' (background!) followed by '> f', silently dropping
    # stderr. Use the POSIX-portable '> f 2>&1' form instead.
    cmd = f"allennlp train -s {data.resu} {self.tr_config_updated} > {data.resu}/logs_training.txt 2>&1"
    os.system(cmd)
def get_stamp():
now = datetime.now()
......@@ -219,7 +250,7 @@ def get_config_infos(config, stamp):
def print_logs(dict_logs):
    """Dump the accumulated process logs as pretty-printed JSON.

    Writes to ``<data.resu>/logs_processes.json``.
    NOTE(review): depends on the module-level global ``data`` — passing it in
    explicitly would be cleaner. TODO confirm call sites before refactoring.
    """
    # Removed a stale duplicate assignment to file_logs (dead leftover from a
    # previous revision); only the final path was ever used.
    file_logs = f"{data.resu}/logs_processes.json"
    with open(file_logs, 'w') as fi:
        json.dump(dict_logs, fi, indent=4)
......@@ -239,17 +270,28 @@ if __name__ == '__main__':
# Entry point: dispatch on the configured main step.
data, steps, prod = get_config_infos(config, stamp)
data.create_folders()
if steps.main == "annotation" or steps.main == "test":
    # Annotation/test: preprocess the corpus, convert it to NER format,
    # load the segmentation model and predict.
    data.pre_processing(steps)
    data.make_ner_format()
    steps.get_model()
    data.make_predictions(steps)  # output allennlp JSON
elif steps.main == "train":
    # Train: resolve the train/dev/test splits, convert each to NER format,
    # patch the training config with the converted paths, then fine-tune.
    steps.get_data_sets(data)
    steps.make_sets_ner_format(data)
    steps.update_training_config()
    steps.training(data)
# NOTE(review): duplicated pre-commit statements (unconditional preprocessing
# and a commented-out evaluation block) were removed. The scraped diff does not
# show the final indentation of the evaluation steps — kept at top level here
# so both branches are evaluated; TODO confirm against the repository.
steps.get_evaluation_status()
if steps.eval:
    data.evaluation(prod)
print_logs(my_logs)  # <-- attention variable globale !
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment