Commit af630340 authored by laura.riviere

add refacto usecase2

parent 6744dc6b
Merge request !3: Refacto 1205
@@ -30,16 +30,18 @@
"validation_data_path": null
}
},
"evaluation": false,
"gold_test_data_path": null
},
"output":{
"file":{
"tab_to_bracket": true,
"conllu":true,
"metadata": true
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"scores":false
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
@@ -30,16 +30,18 @@
"validation_data_path": null
}
},
"evaluation": false,
"gold_test_data_path": null
},
"output":{
"file":{
"conllu":true,
"conll_file":{
"to_do": true,
"metadata": true,
"tab_to_bracket": false
"with_gold_labels": true
},
"scores":false
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
{
"usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
"data_raw": {
"name": "fra.sdrt.annodis_dev",
"exte": ".conllu",
"language": "fr",
"existing_metadata": true
},
"steps":{
"main": "test",
"pre-processing": {
"to_do": false,
"syntactic_tool": "stanza",
"sentence_split": true,
"tokenization": true,
"syntactic_parsing": true,
"create_metadata": {
"to_do": false,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"gold_test_data_path": null
},
"output":{
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
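A minimal usage sketch (not part of this commit): loading a config like the one above and reading the keys it defines. The file name is an assumption; the keys come from the JSON shown here and are consumed by get_config_infos() and the Output class further down in this diff.

import json

# Hypothetical config path for usecase_2; adjust to the repo layout.
with open("config_usecase_2.json") as f:
    config = json.load(f)

# Keys as defined in the file above.
print(config["data_raw"]["name"])               # "fra.sdrt.annodis_dev"
print(config["steps"]["main"])                  # "test"
print(config["output"]["conll_file"]["to_do"])  # True
print(config["output"]["txt_file"]["metadata"]) # True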
@@ -12,11 +12,12 @@ from datetime import datetime
import os
import re
import json
import utils_2.syntactic_parsing as synt_pars
import utils.syntactic_parsing as synt_pars
import utils.conv2ner as conv_to_ner # TODO clean it
import utils.json2conll as json_to_connl # TODO clean it
import utils.training_allennlp as tr_allen
import utils.conll2bracket as c2bracket
import utils.seg_eval as seg_eval
@@ -26,6 +27,7 @@ class Data:
self.lang = infos['language']
self.path = f"../data/{self.name}"
self.exte = infos['exte']
self.raw = f"{self.path}/{self.name}{self.exte}"
self.stamp = stamp
self.conv = f"{self.path}/data_converted_{stamp}"
self.resu = f"{self.path}/results_{stamp}"
@@ -33,11 +35,14 @@
def create_folders(self): # -> can be transformed into method of class
for it in [self.conv, self.resu]:
print(f"----> Checking/creating folder {it}.")
if not os.path.isdir(it):
os.mkdir(it)
my_logs['folders'] = f"{self.conv}, {self.resu}"
def pre_processing(self, steps):
file_in = f"{self.path}/{self.name}{self.exte}"
print("----> Preprocessing input data.")
file_in = self.raw
if steps.pre_process_to_do == True:
file_out = f"{self.conv}/{self.name}.conll"
if steps.synt_tool == "stanza":
@@ -70,21 +75,38 @@ class Data:
OUTPUT: Tokenized text with just 4 columns.
"""
self.ner = f"{self.preprocessed}.ner"
self.ner = f"{self.conv}/{self.name}.conll.ner"
print(f"----> Making NER format {self.ner}.")
conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO: make path relative # TODO add same for train/dev/test for config train
my_logs['data_ner'] = self.ner
def make_predictions(self, steps):
self.pred_json = f"{self.resu}/{self.name}_pred.json"
cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {steps.model_path} {self.ner} &> {self.resu}/logs_predictions.txt"
print(f"----> Making predictions: {cmd}.")
os.system(cmd)
my_logs['predictions_cmd'] = cmd
def pred_json_to_conll_with_metadata(self):
self.pred_meta_conll = f"{self.resu}/{self.name}_pred_n_meta.conll"
def pred_json_to_conll_w_metadata_w_gold(self): # here and 3 below: refactoring TBD, sorry
self.pred_conll_meta_gold = f"{self.resu}/{self.name}_pred_meta_gold.conll"
json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed)
return self.pred_conll_meta_gold
def pred_json_to_conll_w_metadata(self):
self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed)
return self.pred_meta_conll
def pred_json_to_conll_w_gold(self):
self.pred_conll_gold = f"{self.resu}/{self.name}_pred_gold.conll"
json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll")
return self.pred_conll_gold
def pred_json_to_conll(self):
self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll")
return self.pred_conll
def brackets_txt(self):
self.brack = f"{self.resu}/{self.name}_brac.txt"
@@ -95,6 +117,44 @@ class Data:
c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)
def evaluation(self, prod):
self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"
if self.exte == ".conll" or self.exte == ".conllu": # get gold file
gold = self.raw
else:
gold = self.preprocessed
if prod.conll_todo == False: # get pred_file
pred = self.pred_json_to_conll()
else:
if prod.conll_meta == True:
if prod.conll_w_gold == True:
pred = self.pred_json_to_conll_w_metadata_w_gold()
else:
pred = self.pred_json_to_conll_w_metadata()
else:
if prod.conll_w_gold == True:
pred = self.pred_json_to_conll_w_gold()
else:
pred = self.pred_json_to_conll()
print(f"----> Predictions to file {pred}")
print(f"----> Evaluation scores to file {self.basic_metrics}")
scores_dict = seg_eval.get_scores(gold, pred)
with open(self.basic_metrics, 'w') as fo:
json.dump(scores_dict, fo)
class Output:
def __init__(self, infos):
self.conll_todo = infos['conll_file']['to_do']
self.conll_meta = infos['conll_file']['metadata']
self.conll_w_gold = infos['conll_file']['with_gold_labels']
self.txt_todo = infos['txt_file']['to_do']
self.txt_meta = infos['txt_file']['metadata']
class Process:
def __init__(self, infos):
@@ -109,7 +169,6 @@ class Process:
self.meta_line = infos['pre-processing']['create_metadata']['line']
self.meta_sent = infos['pre-processing']['create_metadata']['sent']
#if self.main == "train":
#if self.ner_init == True : # to be made relative !! split stuff
# self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
@@ -120,14 +179,14 @@ class Process:
self.toolkit = infos['discourse_segmenter']['training']['toolkit']
self.tr_config = infos['discourse_segmenter']['training']['config_file']
self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
self.model = infos['discourse_segmenter']['model'] # easy case for Tony
#self.post_tab = infos['post-processing']['json_to_tab']
self.eval = infos['evaluation']
self.test_data = infos['gold_test_data_path']
def get_evaluation_status(self):
if self.main == "test":
self.eval = True
#elif self.main == "train":
def get_model(self):
self.model_path = ""
@@ -138,21 +197,12 @@ class Process:
os.system(dl)
self.model_path = f"../model/tony/{arch}"
else:
print("Tony already in place !")
print("----> Tony already in place !")
self.model_path = f"../model/tony/{arch}"
else:
self.model_path = self.model
class Output:
def __init__(self, infos):
self.prod_bracket = infos['file']['tab_to_bracket']
self.prod_conll = infos['file']['conllu']
self.metadata = infos['file']['metadata']
def get_stamp():
now = datetime.now()
stamp = re.sub('[\s:]', '_', str(now))
@@ -168,14 +218,14 @@ def get_config_infos(config, stamp):
return data, steps, prod
def print_logs():
def print_logs(dict_logs):
file_logs = f"{data.resu}/processes_logs.json"
print(my_logs) # <-- ahaha TBD
with open(file_logs, 'w') as fi:
json.dump(dict_logs, fi, indent=4)
if __name__ == '__main__':
my_logs = {}
stamp = get_stamp()
@@ -191,32 +241,15 @@ if __name__ == '__main__':
data.create_folders()
data.pre_processing(steps)
data.make_ner_format()
#TEST data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
steps.get_model()
if steps.main == "annotation" or steps.main == "test":
data.make_predictions(steps)
#data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
data.pred_json_to_conll()
if prod.metadata == True:
data.pred_json_to_conll_with_metadata()
if prod.prod_bracket == True:
if data.exte != ".txt":
exit("pb")
else:
data.brackets_txt()
data.brackets_txt_with_metadata()
data.make_predictions(steps) # output allennlp JSON
#elif steps.main == "train":
steps.get_evaluation_status()
if steps.eval == True:
data.evaluation(prod)
#scores = compare_pred_gold()
#print_logs()
\ No newline at end of file
print_logs(my_logs) # <-- careful: global variable!
\ No newline at end of file
@@ -110,8 +110,8 @@ def conversion2ner(input, output, params=None):
# then, previous token label is set to B-E to signal end of previous segment
res[-1][-1] = "B-E"
start_doc = False
if label not in maptags:
print("warning, strange label ",label,file=sys.stderr)
#if label not in maptags:
#print("warning, strange label ",label,file=sys.stderr)
res.append([w,pos,"O",tag])
for line in res:
@@ -5,6 +5,7 @@ conll format
import json
import sys
import re
#filepath = sys.argv[1]
#config = sys.argv[2]
@@ -60,7 +61,8 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
tag = data[sent_pred_count]['tags'][tok]
tok += 1
#print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
fo.write(f"{line}\t{map[tag]}\n")
new_line = re.sub('\t[^\t]+$', '', line)
fo.write(f"{new_line}\t{map[tag]}\n")
#if int(line.split("\t")[0]) == tok and line.split("\t")[1] == word:
# fo.write(f"{line}\t{tag}\n")
@@ -68,4 +70,29 @@ def js2conllNmeta(data_pred_json, data_out, config, data_meta):
#print(f"sentpred : {sent_pred}\n")
#print(f"word n tag : {word}:::{tag}\n")
\ No newline at end of file
def js2conllNmetaNgold(data_pred_json, data_out, config, gold_n_meta):
data = []
sent_pred_count = 0
tok = 0
for line in open(data_pred_json, 'r'):
data.append(json.loads(line))
with open(data_out, 'w', encoding='utf-8') as fo, open(gold_n_meta, 'r') as fm:
# id
for line in fm:
line = line.strip()
if line.startswith("#"):
fo.write(f"{line}\n")
elif line == "":
sent_pred_count += 1
tok = 0
fo.write(f"{line}\n")
else:
sent_pred = data[sent_pred_count]
word = data[sent_pred_count]['words'][tok]
tag = data[sent_pred_count]['tags'][tok]
tok += 1
#print(f"tok: {tok}, tag: {tag}, word: {word}, line : {line}")
fo.write(f"{line}\t{map[tag]}\n")
\ No newline at end of file
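For reference, a hedged usage sketch of the new helper, mirroring the call in Data.pred_json_to_conll_w_metadata_w_gold() earlier in this diff; all paths are placeholders, not repo paths.

import utils.json2conll as json_to_connl

# Paths are hypothetical; in the pipeline they come from the Data class.
json_to_connl.js2conllNmetaNgold(
    "results_STAMP/doc_pred.json",            # allennlp predictions, one JSON object per line
    "results_STAMP/doc_pred_meta_gold.conll", # output CoNLL keeping metadata and gold column
    "conll",
    "data_converted_STAMP/doc.conll",         # preprocessed gold file with metadata lines
)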
@@ -68,9 +68,9 @@ Arguments:
"""
__author__ = "Amir Zeldes"
#__author__ = "Amir Zeldes"
__license__ = "Apache 2.0"
__version__ = "1.0.1"
#__version__ = "1.0.1"
def parse_data(infile, string_input=False):
if not string_input:
@@ -222,22 +222,3 @@ def get_scores(gold_file, pred_file, string_input=False):
return score_dict
if __name__ == "__main__":
p = argparse.ArgumentParser()
p.add_argument("goldfile",help="Shared task gold file in .tok or .conll format")
p.add_argument("predfile",help="Corresponding file with system predictions")
p.add_argument("-s","--string_input",action="store_true",help="Whether inputs are file names or strings")
opts = p.parse_args()
score_dict = get_scores(opts.goldfile,opts.predfile,opts.string_input)
print("File: " + score_dict["doc_name"])
print("o Total tokens: " + str(score_dict["tok_count"]))
print("o Gold " +score_dict["seg_type"]+": " + str(score_dict["gold_seg_count"]))
print("o Predicted "+score_dict["seg_type"]+": " + str(score_dict["pred_seg_count"]))
print("o Precision: " + str(score_dict["prec"]))
print("o Recall: " + str(score_dict["rec"]))
print("o F-Score: " + str(score_dict["f_score"]))
File moved
File deleted
File deleted