Skip to content
Snippets Groups Projects
Commit 391c66ab authored by larivier's avatar larivier
Browse files

Merge branch 'archi-doc' into 'main'

new directories archi and more logs

See merge request !4
parents c20a29ea 9544fd95
Branches
No related tags found
1 merge request!4new directories archi and more logs
......@@ -15,7 +15,7 @@
"tokenization": true,
"syntactic_parsing": false,
"create_metadata": {
"to_do": false,
"to_do": true,
"line": "paragraph",
"sent": "sent"
}
......@@ -36,10 +36,10 @@
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
"with_gold_labels": false
},
"txt_file":{
"to_do": true,
"to_do": false,
"metadata": true
}
}
......
{
"usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
"data_raw": {
"name": "fra.sdrt.annodis_dev",
"name": "eng.pdtb.pdtb_dev",
"exte": ".conllu",
"language": "fr",
"language": "en",
"existing_metadata": true
},
"steps":{
......@@ -15,13 +15,13 @@
"tokenization": true,
"syntactic_parsing": true,
"create_metadata": {
"to_do": false,
"to_do": true,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": "tony",
"model": "/home/lriviere/andiamo/morteza/discut/Results_conllu/results_eng.pdtb.pdtb_bert/model.tar.gz",
"training": {
"toolkit": null,
"pre_trained_lm": null,
......
......@@ -30,7 +30,7 @@
"validation_data_path": "eng.rst.rstdt_dev"
}
},
"gold_test_data_path": "eng.rst.rstdt_test"
"gold_test_data_path": "eng.rst.rstdt_dev"
},
"output":{
"conll_file":{
......@@ -39,7 +39,7 @@
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"to_do": false,
"metadata": true
}
}
......
......@@ -30,7 +30,7 @@
"validation_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
}
},
"gold_test_data_path": "eng.rst.rstdt_test"
"gold_test_data_path": "eng.rst.rstdt_dev"
},
"output":{
"conll_file":{
......@@ -39,7 +39,7 @@
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"to_do": false,
"metadata": true
}
}
......
......@@ -23,27 +23,51 @@ import utils.seg_eval as seg_eval
class Data:
def __init__(self, infos, stamp, stamp_time, overwrite):
    """Hold paths and settings for one dataset within a run.

    infos      -- the 'data_raw' section of the JSON config
    stamp      -- run name; becomes the project folder name under ../projects
    stamp_time -- timestamp used to de-duplicate outputs when not overwriting
    overwrite  -- whether existing run outputs may be reused/overwritten
    """
    # NOTE(review): scrape kept both pre- and post-merge lines of this diff;
    # this is the post-merge version (per-run folders under ../projects).
    self.name = infos['name']
    self.lang = infos['language']
    self.path = f"../data/{self.name}"
    self.exte = infos['exte']
    self.raw = f"{self.path}/{self.name}{self.exte}"
    self.stamp = stamp
    self.stamp_time = stamp_time
    self.proj = "../projects"
    self.run = f"{self.proj}/{stamp}"
    self.conv = f"{self.run}/data_converted"
    self.resu = f"{self.run}/results"
    self.over = overwrite
    self.meta = infos['existing_metadata']
def create_folders(self):
    """Create the per-run folder tree (projects/<stamp>/data_converted, /results).

    When self.over (overwrite) is False, existing folders are kept and fresh
    time-stamped variants are created instead, so earlier results survive.
    Also computes self.resu_fine / self.resu_train for later allennlp runs.
    """
    print(f"----> Checking/creating folders.")
    # Shared top-level dirs: always reused if present.
    if not os.path.isdir(self.proj):
        os.mkdir(self.proj)
    if not os.path.isdir(self.run):
        os.mkdir(self.run)
    # Converted-data dir: fork a time-stamped copy instead of overwriting.
    if not os.path.isdir(self.conv):
        os.mkdir(self.conv)
    elif self.over == False:
        # BUG FIX: original referenced bare `stamp_time` (NameError in method
        # scope); the value is stored on the instance in __init__.
        self.conv = f"{self.conv}_{self.stamp_time}"
        os.mkdir(self.conv)
    # Results dir: same fork-on-no-overwrite policy.
    if not os.path.isdir(self.resu):
        os.mkdir(self.resu)
    elif self.over == False:
        self.resu = f"{self.resu}_{self.stamp_time}"
        os.mkdir(self.resu)
    # Fine-tune / train sub-dirs are created by allennlp itself; we only pick
    # non-clashing names (or clear an empty leftover dir when overwriting).
    self.resu_fine = f"{self.resu}/fine_tune"
    if os.path.isdir(self.resu_fine) and self.over == False:
        self.resu_fine = f"{self.resu_fine}_{self.stamp_time}"
    elif os.path.isdir(self.resu_fine) and self.over == True:
        os.rmdir(self.resu_fine)
    self.resu_train = f"{self.resu}/train"
    if os.path.isdir(self.resu_train) and self.over == False:
        self.resu_train = f"{self.resu_train}_{self.stamp_time}"
def pre_processing(self, steps, file_in=None):
file_in = self.raw if file_in == None else file_in
......@@ -178,6 +202,30 @@ class Data:
print(f"----> Predictions to file {pred_txt}.")
logs.add_infos('output_txt_file', pred_txt)
def make_output(self, prod):
    # Write predictions in the formats requested by the Output config `prod`:
    # a CoNLL file and/or a bracketed text file, each with or without metadata.
    # NOTE(review): relies on pred_json_to_conll* / brackets_txt* helpers and a
    # module-level `logs` object defined elsewhere in this file.
    if prod.conll_todo == True:
        if prod.conll_meta == True:
            pred = self.pred_json_to_conll_w_metadata()
        else:
            pred = self.pred_json_to_conll()
        print(f"----> Predictions to file {pred}.")
        logs.add_infos('output_conll_file', pred)
    if prod.txt_todo == True:
        if prod.txt_meta == True:
            # The bracketed text is derived from the metadata CoNLL file;
            # (re)create that intermediate file first if it is missing.
            pred = self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
            if not os.path.isfile(pred):
                self.pred_json_to_conll_w_metadata()
            pred_txt = self.brackets_txt_with_metadata()
            # os.system(f"rm {pred})
        else:
            pred = self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
            if not os.path.isfile(pred):
                self.pred_json_to_conll()
            pred_txt = self.brackets_txt()
            # os.system(f"rm {pred})
        print(f"----> Predictions to file {pred_txt}.")
        logs.add_infos('output_txt_file', pred_txt)
class Output:
def __init__(self, infos):
......@@ -216,6 +264,8 @@ class Process:
def get_evaluation_status(self):
    """Set self.eval: evaluation runs for the "test", "train" and
    "fine_tune" use cases, but not for "annotation"."""
    self.eval = self.main in ("test", "train", "fine_tune")
def get_model(self):
self.model_path = ""
......@@ -242,12 +292,11 @@ class Process:
spec: testset is the same that data_raw_name /
trainset & devset are elsewhere and config fill with path not just name
"""
self.ft_stamp = re.sub('^.*/', '', self.set_train)
self.train_raw = self.set_train
self.dev_raw = self.set_dev
self.test_raw = f"{data.path}/{self.set_test}{data.exte}"
# reset names to go ez pz for ner formatage
self.set_train = re.sub('\.[^\.]+$', '', self.ft_stamp)
self.set_train = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.set_train))
self.set_dev = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.dev_raw))
def make_sets_ner_format(self, data): #[steps.set_train, steps.set_dev, steps.set_test]
......@@ -262,7 +311,7 @@ class Process:
conv_to_ner.main(self.test_raw, self.test_ner, "conll")
def update_training_config(self):
logs.add_infos('training_config', self.tr_config)
logs.add_json('training_config', self.tr_config)
self.tr_config_updated = re.sub('.jsonnet$', '_up.jsonnet', self.tr_config)
with open(self.tr_config, 'r') as js:
tr_conf = json.load(js)
......@@ -270,23 +319,25 @@ class Process:
tr_conf['validation_data_path'] = self.dev_ner
with open(self.tr_config_updated, 'w') as js:
json.dump(tr_conf, js)
logs.add_infos('training_config_updated', self.tr_config_updated)
logs.add_json('training_config_updated', self.tr_config_updated)
def training(self, data):
    """Train a segmentation model with allennlp, logging to a time-stamped file.

    Side effects: shells out via os.system, sets the module-level
    `steps.model_path`, and records paths in the module-level `logs`.
    """
    # NOTE(review): scrape kept superseded pre-merge lines of this diff
    # (old `cmd`, `data.resu` target, and a reference to nonexistent
    # `self.model`); this is the post-merge version.
    cmd = f"allennlp train -s {data.resu_train} {self.tr_config_updated} &> {data.resu}/logs_training_{data.stamp_time}.txt"
    # -f forces allennlp to reuse/overwrite the serialization dir on demand.
    cmd = cmd if data.over == False else re.sub('&>', '-f &>', cmd)
    print(f"----> Training : {cmd}")
    os.system(cmd)
    steps.model_path = f"{data.resu_train}/model.tar.gz"
    logs.add_infos('model_to make predictions', self.model_path)
    logs.add_infos('logs_trainning_file', f"{data.resu}/logs_training_{data.stamp_time}.txt" )
def fine_tuning(self, data):
    """Fine-tune an existing model with allennlp, logging to a time-stamped file.

    Side effects: shells out via os.system, sets self.model_ft_path, and
    records paths in the module-level `logs`.
    """
    logs.add_infos('model_to be fine-tuned', self.model)
    # NOTE(review): scrape kept superseded pre-merge lines of this diff
    # (old `data.fine` target); this is the post-merge version.
    cmd = f"allennlp fine-tune -m {self.model_path} -c {self.tr_config_updated} -s {data.resu_fine} &> {data.resu}/logs_fine-tuning_{data.stamp_time}.txt"
    print(f"----> Fine-tuning : {cmd}")
    os.system(cmd)
    self.model_ft_path = f"{data.resu_fine}/model.tar.gz"
    logs.add_infos('model_to make predictions', self.model_ft_path)
    logs.add_infos('logs_fine-tuning_file', f"{data.resu}/logs_fine-tuning_{data.stamp_time}.txt")
def get_stamp():
......@@ -294,43 +345,52 @@ def get_stamp():
stamp = re.sub('[\s:]', '_', str(now))
return stamp
def get_config_infos(config, stamp, stamp_time, logs, overwrite):
    """Parse the JSON config file and build the run's main objects.

    config     -- path to the JSON config file
    stamp      -- run name
    stamp_time -- timestamp for de-duplicating outputs
    logs       -- Logs instance; the full parsed config is recorded into it
    overwrite  -- whether existing outputs may be overwritten
    Returns (data, steps, prod): Data, Process and Output instances.
    """
    # NOTE(review): scrape kept the superseded pre-merge signature and
    # Data(...) call of this diff; this is the post-merge version.
    with open(config, 'r', encoding='utf-8') as f:
        infos = json.load(f)
    data = Data(infos['data_raw'], stamp, stamp_time, overwrite)
    steps = Process(infos['steps'])
    prod = Output(infos['output'])
    logs.add_infos('config', infos)
    return data, steps, prod
class Logs:
    """Accumulates run metadata and dumps it as one JSON file per run."""

    def __init__(self):
        # Mapping of log keys to values; serialized by print().
        self.dict = {}

    def add_infos(self, key, value):
        """Record a single key/value pair."""
        self.dict[key] = value

    def add_json(self, key, jsonf):
        """Load the JSON file at path `jsonf` and store its parsed content under `key`."""
        with open(jsonf, 'r', encoding='utf-8') as f:
            infos = json.load(f)
        self.dict[key] = infos

    def print(self, stamp_time):
        """Dump all collected logs to <data.run>/logs_global_<stamp_time>.json.

        NOTE(review): depends on the module-level `data` object existing at
        call time — confirm print() is only called after get_config_infos().
        """
        self.file_path = f"{data.run}/logs_global_{stamp_time}.json"
        with open(self.file_path, 'w', encoding='utf-8') as fl:
            json.dump(self.dict, fl, indent=4)
if __name__ == '__main__':
stamp = get_stamp()
stamp = stamp_time = get_stamp()
parser = argparse.ArgumentParser()
parser.add_argument('--config', help='Config file in JSON.')
parser.add_argument('--name',default=stamp , help='Run name.')
parser.add_argument('-o', '--overwrite', action='store_true', help='Overwite output.')
args = parser.parse_args()
config = args.config
stamp = args.name
overwrite = args.overwrite
data, steps, prod = get_config_infos(config, stamp)
data.create_folders(ft=None)
logs = Logs()
data, steps, prod = get_config_infos(config, stamp, stamp_time, logs, overwrite)
data.create_folders()
logs.add_infos("stamp", stamp)
logs.add_infos("infos", config)
logs.add_infos("stamp_time", stamp_time)
logs.add_infos("overwrite", overwrite)
if steps.main == "annotation" or steps.main == "test":
data.pre_processing(steps)
......@@ -340,6 +400,8 @@ if __name__ == '__main__':
steps.get_evaluation_status()
if steps.eval == True:
data.evaluation(steps, prod)
else:
data.make_output(prod)
elif steps.main == "train":
steps.get_data_for_train(data) #[steps.set_train, steps.set_dev, steps.set_test]
......@@ -353,8 +415,7 @@ if __name__ == '__main__':
data.evaluation(steps, prod, name=steps.test_data)
elif steps.main == "fine_tune":
steps.get_data_for_fine_tune(data)
data.create_folders(steps.ft_stamp)
steps.get_data_for_fine_tune(data)
data.pre_processing(steps, file_in=steps.test_raw)
steps.make_sets_ner_format(data)
steps.get_model() # model to be fine-tune
......@@ -365,4 +426,5 @@ if __name__ == '__main__':
if steps.eval == True:
data.evaluation(steps, prod, name=steps.test_data, model=steps.model_ft_path)
logs.print()
\ No newline at end of file
logs.print(stamp_time)
print(f"----> All logs saved in {logs.file_path}")
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment