Skip to content
Snippets Groups Projects
Commit 9544fd95 authored by laura.riviere
Browse files

new directories archi and more logs

parent 068f5571
Branches
No related tags found
1 merge request: !4 new directories archi and more logs
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
"tokenization": true, "tokenization": true,
"syntactic_parsing": false, "syntactic_parsing": false,
"create_metadata": { "create_metadata": {
"to_do": false, "to_do": true,
"line": "paragraph", "line": "paragraph",
"sent": "sent" "sent": "sent"
} }
...@@ -36,10 +36,10 @@ ...@@ -36,10 +36,10 @@
"conll_file":{ "conll_file":{
"to_do": true, "to_do": true,
"metadata": true, "metadata": true,
"with_gold_labels": true "with_gold_labels": false
}, },
"txt_file":{ "txt_file":{
"to_do": true, "to_do": false,
"metadata": true "metadata": true
} }
} }
......
{ {
"usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.", "usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
"data_raw": { "data_raw": {
"name": "fra.sdrt.annodis_dev", "name": "eng.pdtb.pdtb_dev",
"exte": ".conllu", "exte": ".conllu",
"language": "fr", "language": "en",
"existing_metadata": true "existing_metadata": true
}, },
"steps":{ "steps":{
...@@ -15,13 +15,13 @@ ...@@ -15,13 +15,13 @@
"tokenization": true, "tokenization": true,
"syntactic_parsing": true, "syntactic_parsing": true,
"create_metadata": { "create_metadata": {
"to_do": false, "to_do": true,
"line": "paragraph", "line": "paragraph",
"sent": "sent" "sent": "sent"
} }
}, },
"discourse_segmenter": { "discourse_segmenter": {
"model": "tony", "model": "/home/lriviere/andiamo/morteza/discut/Results_conllu/results_eng.pdtb.pdtb_bert/model.tar.gz",
"training": { "training": {
"toolkit": null, "toolkit": null,
"pre_trained_lm": null, "pre_trained_lm": null,
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
"validation_data_path": "eng.rst.rstdt_dev" "validation_data_path": "eng.rst.rstdt_dev"
} }
}, },
"gold_test_data_path": "eng.rst.rstdt_test" "gold_test_data_path": "eng.rst.rstdt_dev"
}, },
"output":{ "output":{
"conll_file":{ "conll_file":{
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
"with_gold_labels": true "with_gold_labels": true
}, },
"txt_file":{ "txt_file":{
"to_do": true, "to_do": false,
"metadata": true "metadata": true
} }
} }
......
...@@ -30,7 +30,7 @@ ...@@ -30,7 +30,7 @@
"validation_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu" "validation_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
} }
}, },
"gold_test_data_path": "eng.rst.rstdt_test" "gold_test_data_path": "eng.rst.rstdt_dev"
}, },
"output":{ "output":{
"conll_file":{ "conll_file":{
...@@ -39,7 +39,7 @@ ...@@ -39,7 +39,7 @@
"with_gold_labels": true "with_gold_labels": true
}, },
"txt_file":{ "txt_file":{
"to_do": true, "to_do": false,
"metadata": true "metadata": true
} }
} }
......
...@@ -23,27 +23,51 @@ import utils.seg_eval as seg_eval ...@@ -23,27 +23,51 @@ import utils.seg_eval as seg_eval
class Data: class Data:
def __init__(self, infos, stamp): def __init__(self, infos, stamp, stamp_time, overwrite):
self.name = infos['name'] self.name = infos['name']
self.lang = infos['language'] self.lang = infos['language']
self.path = f"../data/{self.name}" self.path = f"../data/{self.name}"
self.exte = infos['exte'] self.exte = infos['exte']
self.raw = f"{self.path}/{self.name}{self.exte}" self.raw = f"{self.path}/{self.name}{self.exte}"
self.stamp = stamp self.stamp = stamp
self.conv = f"{self.path}/data_converted_{stamp}" self.stamp_time = stamp_time
self.resu = f"{self.path}/results_{stamp}" self.proj = "../projects"
self.run = f"{self.proj}/{stamp}"
self.conv = f"{self.run}/data_converted"
self.resu = f"{self.run}/results"
self.over = overwrite
self.meta = infos['existing_metadata'] self.meta = infos['existing_metadata']
def create_folders(self, ft=None): # -> can be rtansfor into method of class def create_folders(self):
folders_list=[self.conv, self.resu] print(f"----> Checking/creating folders.")
#folders_list=[self.conv] if not os.path.isdir(self.proj):
if ft != None: os.mkdir(self.proj)
self.fine = f"{self.resu}/fine_tune_{ft}" if not os.path.isdir(self.run):
#folders_list.append(self.fine) # made automatically by allennlp os.mkdir(self.run)
for it in folders_list:
print(f"----> Checking/creating folder {it}.") if not os.path.isdir(self.conv):
if not os.path.isdir(it): os.mkdir(self.conv)
os.mkdir(it) elif self.over == False:
self.conv = f"{self.conv}_{stamp_time}"
os.mkdir(self.conv)
if not os.path.isdir(self.resu):
os.mkdir(self.resu)
elif self.over == False:
self.resu = f"{self.resu}_{stamp_time}"
os.mkdir(self.resu)
self.resu_fine = f"{self.resu}/fine_tune"
if os.path.isdir(self.resu_fine) and self.over == False:
self.resu_fine = f"{self.resu_fine}_{stamp_time}"
elif os.path.isdir(self.resu_fine) and self.over == True:
os.rmdir(self.resu_fine)
self.resu_train = f"{self.resu}/train"
if os.path.isdir(self.resu_train) and self.over == False:
self.resu_train = f"{self.resu_train}_{stamp_time}"
def pre_processing(self, steps, file_in=None): def pre_processing(self, steps, file_in=None):
file_in = self.raw if file_in == None else file_in file_in = self.raw if file_in == None else file_in
...@@ -178,6 +202,30 @@ class Data: ...@@ -178,6 +202,30 @@ class Data:
print(f"----> Predictions to file {pred_txt}.") print(f"----> Predictions to file {pred_txt}.")
logs.add_infos('output_txt_file', pred_txt) logs.add_infos('output_txt_file', pred_txt)
def make_output(self, prod):
if prod.conll_todo == True:
if prod.conll_meta == True:
pred = self.pred_json_to_conll_w_metadata()
else:
pred = self.pred_json_to_conll()
print(f"----> Predictions to file {pred}.")
logs.add_infos('output_conll_file', pred)
if prod.txt_todo == True:
if prod.txt_meta == True:
pred = self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
if not os.path.isfile(pred):
self.pred_json_to_conll_w_metadata()
pred_txt = self.brackets_txt_with_metadata()
# os.system(f"rm {pred})
else:
pred = self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
if not os.path.isfile(pred):
self.pred_json_to_conll()
pred_txt = self.brackets_txt()
# os.system(f"rm {pred})
print(f"----> Predictions to file {pred_txt}.")
logs.add_infos('output_txt_file', pred_txt)
class Output: class Output:
def __init__(self, infos): def __init__(self, infos):
...@@ -216,6 +264,8 @@ class Process: ...@@ -216,6 +264,8 @@ class Process:
def get_evaluation_status(self): def get_evaluation_status(self):
if self.main == "test" or self.main == "train" or self.main == "fine_tune": if self.main == "test" or self.main == "train" or self.main == "fine_tune":
self.eval = True self.eval = True
else:
self.eval = False # "annotation"
def get_model(self): def get_model(self):
self.model_path = "" self.model_path = ""
...@@ -242,12 +292,11 @@ class Process: ...@@ -242,12 +292,11 @@ class Process:
spec: testset is the same that data_raw_name / spec: testset is the same that data_raw_name /
trainset & devset are elsewhere and config fill with path not just name trainset & devset are elsewhere and config fill with path not just name
""" """
self.ft_stamp = re.sub('^.*/', '', self.set_train)
self.train_raw = self.set_train self.train_raw = self.set_train
self.dev_raw = self.set_dev self.dev_raw = self.set_dev
self.test_raw = f"{data.path}/{self.set_test}{data.exte}" self.test_raw = f"{data.path}/{self.set_test}{data.exte}"
# reset names to go ez pz for ner formatage # reset names to go ez pz for ner formatage
self.set_train = re.sub('\.[^\.]+$', '', self.ft_stamp) self.set_train = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.set_train))
self.set_dev = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.dev_raw)) self.set_dev = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.dev_raw))
def make_sets_ner_format(self, data): #[steps.set_train, steps.set_dev, steps.set_test] def make_sets_ner_format(self, data): #[steps.set_train, steps.set_dev, steps.set_test]
...@@ -262,7 +311,7 @@ class Process: ...@@ -262,7 +311,7 @@ class Process:
conv_to_ner.main(self.test_raw, self.test_ner, "conll") conv_to_ner.main(self.test_raw, self.test_ner, "conll")
def update_training_config(self): def update_training_config(self):
logs.add_infos('training_config', self.tr_config) logs.add_json('training_config', self.tr_config)
self.tr_config_updated = re.sub('.jsonnet$', '_up.jsonnet', self.tr_config) self.tr_config_updated = re.sub('.jsonnet$', '_up.jsonnet', self.tr_config)
with open(self.tr_config, 'r') as js: with open(self.tr_config, 'r') as js:
tr_conf = json.load(js) tr_conf = json.load(js)
...@@ -270,23 +319,25 @@ class Process: ...@@ -270,23 +319,25 @@ class Process:
tr_conf['validation_data_path'] = self.dev_ner tr_conf['validation_data_path'] = self.dev_ner
with open(self.tr_config_updated, 'w') as js: with open(self.tr_config_updated, 'w') as js:
json.dump(tr_conf, js) json.dump(tr_conf, js)
logs.add_infos('training_config_updated', self.tr_config_updated) logs.add_json('training_config_updated', self.tr_config_updated)
def training(self, data): def training(self, data):
#cmd = f"allennlp train -s {data.resu} -f {self.tr_config_updated} &> {data.resu}/logs_training.txt" cmd = f"allennlp train -s {data.resu_train} {self.tr_config_updated} &> {data.resu}/logs_training_{data.stamp_time}.txt"
cmd = f"allennlp train -s {data.resu} {self.tr_config_updated}" # &> {data.resu}/logs_training.txt" cmd = cmd if data.over == False else re.sub('&>', '-f &>', cmd)
print(f"----> Training : {cmd}") print(f"----> Training : {cmd}")
os.system(cmd) os.system(cmd)
steps.model_path = f"{data.resu}/model.tar.gz" steps.model_path = f"{data.resu_train}/model.tar.gz"
logs.add_infos('model_to make predictions', self.model) logs.add_infos('model_to make predictions', self.model_path)
logs.add_infos('logs_trainning_file', f"{data.resu}/logs_training_{data.stamp_time}.txt" )
def fine_tuning(self, data): def fine_tuning(self, data):
logs.add_infos('model_to be fine-tuned', self.model) logs.add_infos('model_to be fine-tuned', self.model)
cmd = f"allennlp fine-tune -m {self.model_path} -c {self.tr_config_updated} -s {data.fine}" # &> {data.resu}/logs_fine-tuning.txt" cmd = f"allennlp fine-tune -m {self.model_path} -c {self.tr_config_updated} -s {data.resu_fine} &> {data.resu}/logs_fine-tuning_{data.stamp_time}.txt"
print(f"----> Fine-tuning : {cmd}") print(f"----> Fine-tuning : {cmd}")
os.system(cmd) os.system(cmd)
self.model_ft_path = f"{data.fine}/model.tar.gz" self.model_ft_path = f"{data.resu_fine}/model.tar.gz"
logs.add_infos('model_to make predictions', self.model_ft_path) logs.add_infos('model_to make predictions', self.model_ft_path)
logs.add_infos('logs_fine-tuning_file', f"{data.resu}/logs_fine-tuning_{data.stamp_time}.txt")
def get_stamp(): def get_stamp():
...@@ -294,43 +345,52 @@ def get_stamp(): ...@@ -294,43 +345,52 @@ def get_stamp():
stamp = re.sub('[\s:]', '_', str(now)) stamp = re.sub('[\s:]', '_', str(now))
return stamp return stamp
def get_config_infos(config, stamp): def get_config_infos(config, stamp, stamp_time, logs, overwrite):
with open(config, 'r', encoding='utf-8') as f: with open(config, 'r', encoding='utf-8') as f:
infos = json.load(f) infos = json.load(f)
data = Data(infos['data_raw'], stamp) data = Data(infos['data_raw'], stamp, stamp_time, overwrite)
steps = Process(infos['steps']) steps = Process(infos['steps'])
prod = Output(infos['output']) prod = Output(infos['output'])
logs.add_infos('config', infos)
return data, steps, prod return data, steps, prod
class Logs: class Logs:
def __init__(self): def __init__(self):
self.file_path = f"{data.resu}/logs_processes.json"
self.dict = {} self.dict = {}
def add_infos(self, key, value): def add_infos(self, key, value):
self.dict[key] = value self.dict[key] = value
def print(self): def add_json(self, key, jsonf):
with open(jsonf, 'r', encoding='utf-8') as f:
infos = json.load(f)
self.dict[key] = infos
def print(self, stamp_time):
self.file_path = f"{data.run}/logs_global_{stamp_time}.json"
with open(self.file_path, 'w', encoding='utf-8') as fl: with open(self.file_path, 'w', encoding='utf-8') as fl:
json.dump(self.dict, fl, indent=4) json.dump(self.dict, fl, indent=4)
if __name__ == '__main__': if __name__ == '__main__':
stamp = get_stamp() stamp = stamp_time = get_stamp()
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--config', help='Config file in JSON.') parser.add_argument('--config', help='Config file in JSON.')
parser.add_argument('--name',default=stamp , help='Run name.') parser.add_argument('--name',default=stamp , help='Run name.')
parser.add_argument('-o', '--overwrite', action='store_true', help='Overwite output.')
args = parser.parse_args() args = parser.parse_args()
config = args.config config = args.config
stamp = args.name stamp = args.name
overwrite = args.overwrite
data, steps, prod = get_config_infos(config, stamp)
data.create_folders(ft=None)
logs = Logs() logs = Logs()
data, steps, prod = get_config_infos(config, stamp, stamp_time, logs, overwrite)
data.create_folders()
logs.add_infos("stamp", stamp) logs.add_infos("stamp", stamp)
logs.add_infos("infos", config) logs.add_infos("stamp_time", stamp_time)
logs.add_infos("overwrite", overwrite)
if steps.main == "annotation" or steps.main == "test": if steps.main == "annotation" or steps.main == "test":
data.pre_processing(steps) data.pre_processing(steps)
...@@ -340,6 +400,8 @@ if __name__ == '__main__': ...@@ -340,6 +400,8 @@ if __name__ == '__main__':
steps.get_evaluation_status() steps.get_evaluation_status()
if steps.eval == True: if steps.eval == True:
data.evaluation(steps, prod) data.evaluation(steps, prod)
else:
data.make_output(prod)
elif steps.main == "train": elif steps.main == "train":
steps.get_data_for_train(data) #[steps.set_train, steps.set_dev, steps.set_test] steps.get_data_for_train(data) #[steps.set_train, steps.set_dev, steps.set_test]
...@@ -353,8 +415,7 @@ if __name__ == '__main__': ...@@ -353,8 +415,7 @@ if __name__ == '__main__':
data.evaluation(steps, prod, name=steps.test_data) data.evaluation(steps, prod, name=steps.test_data)
elif steps.main == "fine_tune": elif steps.main == "fine_tune":
steps.get_data_for_fine_tune(data) steps.get_data_for_fine_tune(data)
data.create_folders(steps.ft_stamp)
data.pre_processing(steps, file_in=steps.test_raw) data.pre_processing(steps, file_in=steps.test_raw)
steps.make_sets_ner_format(data) steps.make_sets_ner_format(data)
steps.get_model() # model to be fine-tune steps.get_model() # model to be fine-tune
...@@ -365,4 +426,5 @@ if __name__ == '__main__': ...@@ -365,4 +426,5 @@ if __name__ == '__main__':
if steps.eval == True: if steps.eval == True:
data.evaluation(steps, prod, name=steps.test_data, model=steps.model_ft_path) data.evaluation(steps, prod, name=steps.test_data, model=steps.model_ft_path)
logs.print() logs.print(stamp_time)
\ No newline at end of file print(f"----> All logs saved in {logs.file_path}")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment