Skip to content
Snippets Groups Projects
Commit b369e12e authored by larivier's avatar larivier
Browse files

Merge branch 'refacto-1205' into 'main'

Refacto 1205

See merge request !3
parents 415aed4a af630340
Branches
Tags
1 merge request!3Refacto 1205
# Classes for discut22
class Input:
    """Input corpus description plus the folders derived from its name."""

    def __init__(self, infos, stamp):
        """Read the 'input' section of the config.

        infos: mapping providing 'name', 'language' and 'file'
               ('file' is used as the file extension, e.g. ".conllu").
        stamp: run identifier appended to the output folder names.
        """
        corpus_name = infos['name']
        self.name = corpus_name
        self.lang = infos['language']
        # The folder is always derived from the corpus name;
        # infos['folder_path'] is intentionally not used (it was misused).
        corpus_dir = f"../data/{corpus_name}"
        self.path = corpus_dir
        self.file = infos['file']
        self.stamp = stamp
        # Folder for converted data (still to be integrated in the pipeline).
        self.conv = f"{corpus_dir}/data_converted_{stamp}"
        # Folder receiving the results of this run.
        self.resu = f"{corpus_dir}/results_{stamp}"
class Process:
    """Pipeline settings read from the 'steps' section of the config (legacy layout)."""

    def __init__(self, infos, data):
        """Copy the step settings onto the instance.

        infos: the 'steps' mapping of the JSON config.
        data:  the input description; its `path`, `name` and `file`
               attributes are used to build the NER training file names.

        Raises KeyError when a mandatory config key is missing.
        """
        pre = infos['pre-processing']
        segmenter = infos['discourse_segmenter']
        training = segmenter['training']
        post = infos['post-processing']

        self.data = data
        self.main = infos["main"]  # "train", "test" or "annotation"
        self.toke = pre['tokenization']
        self.toke_tool = pre['tokenization_tool']
        self.ssplit = pre['sentence_split']
        self.ssplitor = pre['sentence_split_splitor']
        self.ner_init = pre['NER_format_initialisation']  # useless because done anyway

        # train_data / dev_data only exist for a training run.
        if self.main == "train":
            if self.ner_init:
                # TODO build these names relatively (split name / extension).
                self.train_data = f"{data.path}/{data.name}_train.ner{data.file}"
                self.dev_data = f"{data.path}/{data.name}_dev.ner{data.file}"
            else:
                self.train_data = training['train_data_path']
                self.dev_data = training['validation_data_path']

        self.toolkit = training['toolkit']
        self.tr_config = training['config_file']
        self.pretr_lm = training['pre_trained_lm']
        self.model = segmenter['model']  # "tony" is a shortcut handled by get_model
        self.post_tab = post['json_to_tab']
        self.eval = infos['evaluation']
        self.test_data = infos['gold_test_data_path']
        self.post_bracket = post['tab_to_bracket']
        # 'metadata_conll' is optional: several shipped config files do not
        # define it, so a missing key means "step disabled" instead of KeyError.
        self.post_conll = post.get('metadata_conll', False)
\ No newline at end of file
{
"usecase_description": "Config file for usecase_1 : from a raw text, get the same text but with EDU bracket.",
"data_raw": {
"name": "edgar_poe_en",
"exte": ".txt",
"language": "en",
"existing_metadata": true
},
"steps":{
"main": "annotation",
"pre-processing": {
"to_do": true,
"syntactic_tool": "stanza",
"sentence_split": true,
"tokenization": true,
"syntactic_parsing": false,
"create_metadata": {
"to_do": false,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"gold_test_data_path": null
},
"output":{
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
{
"usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
"input": {
"name": "eng_annotation",
"file": ".conllu",
"language": "en"
"usecase_description": "Config file for usecase_1 : from a tokenized text, get the same text but with EDU bracket.",
"data_raw": {
"name": "edgar_poe_short",
"exte": ".conll",
"language": "en",
"existing_metadata": true
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : null,
"sentence_split": false,
"sentence_split_splitor": null,
"syntactic_parsing": false,
"NER_format_initialisation": true
"to_do": false,
"syntactic_tool": "stanza",
"sentence_split": true,
"tokenization": true,
"syntactic_parsing": true,
"create_metadata": {
"to_do": true,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/results_2022-11-21_15_42_42.923648/model.tar.gz",
"model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
"training": {
"toolkit": null,
"pre_trained_lm": null,
......@@ -25,13 +30,20 @@
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
},
"evaluation": false,
"gold_test_data_path": null
},
"output":{
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
{
"usecase_description": "Config file for usecase_2",
"input": {
"usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
"data_raw": {
"name": "fra.sdrt.annodis_dev",
"file": ".conllu",
"language": "fr"
"exte": ".conllu",
"language": "fr",
"existing_metadata": true
},
"steps":{
"main": "test",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : "spacy",
"sentence_split": false,
"sentence_split_splitor": "stanza",
"syntactic_parsing": false,
"NER_format_initialisation": true
"to_do": false,
"syntactic_tool": "stanza",
"sentence_split": true,
"tokenization": true,
"syntactic_parsing": true,
"create_metadata": {
"to_do": false,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": "tony",
......@@ -25,14 +30,20 @@
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": true,
"metadata_conll": true,
"tab_to_bracket":true
},
"evaluation": true,
"gold_test_data_path": null
},
"output":{
"conll_file":{
"to_do": true,
"metadata": true,
"with_gold_labels": true
},
"txt_file":{
"to_do": true,
"metadata": true
}
}
}
{
"usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
"input": {
"name": "eng.sdrt.stac",
"file": ".conllu",
"language": "en"
},
"steps":{
"main": "train",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : null,
"sentence_split": false,
"sentence_split_splitor": null,
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": null,
"training": {
"toolkit": "allennlp",
"pre_trained_lm": "bert",
"config_file": "../model/config_training_bert.jsonnet",
"train_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
"validation_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
}
},
"post-processing": {
"json_to_tab": false,
"tab_to_bracket":false
},
"evaluation": true,
"gold_test_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_test.conllu"
}
}
######################################
###### DISCOURSE SEGMENTOR 2022 ######
######################################
""" This is the main script,
and the only one to run,
after completing config.json. """
import os
import sys
import argparse
import re
from datetime import datetime
import pandas as pd # for futur clean output in df
import json
from classes_def import Input, Process
import utils
#import utils.fr_tokenize as tk
import utils.conv2ner as c2n
import utils.json2conll as j2c
import utils.conll2bracket as c2bracket
import utils.sent_split as ssent
import utils.training_allennlp as tr_allen
# function to read the config file
def get_config_infos(stamp, config_file):
    """Load the JSON config and return the pipeline settings.

    stamp:       run identifier forwarded to Input.
    config_file: path of the JSON configuration file.

    Returns the Process object built from the 'steps' section; the Input
    object built from the 'input' section is attached to it by Process.
    """
    with open(config_file) as handle:
        config = json.load(handle)
    corpus = Input(config['input'], stamp)
    pipeline = Process(config['steps'], corpus)
    print(f"data to be process : {corpus.name}")
    return pipeline
# function to load an existing model -> only tony for now
def get_model(model_name):
    """Return the path of the model archive to use.

    model_name: either the shortcut "tony" or anything else, which is
                returned unchanged (expected to already be a model path).

    For "tony" the archive is looked up in ../model/tony and downloaded
    with wget when it is not there yet.
    """
    if model_name != "tony":
        return model_name

    arch = "french_tokens.tar.gz"
    model_dir = f"../model/{model_name}"
    archive = f"{model_dir}/{arch}"
    if os.path.isfile(archive):
        print("Tony already in place !")
    else:
        # Download into the very folder that is tested above and returned
        # below. The previous prefix (../model) put the archive one level too
        # high, so the returned path never existed and every run downloaded
        # the file again.
        dl = f"wget https://zenodo.org/record/4235850/files/{arch} -P {model_dir} --progress=bar"
        os.system(dl)
    return archive
def text_tokenization(f_in, f_out, lang, tool):
    """Tokenize the text file f_in into f_out.

    Only French ("fr") with the "spacy" tool is handled; any other
    combination leaves f_out untouched and reports it on stdout.
    """
    if lang == "fr" and tool == "spacy":
        # The module-level import of the tokenizer is commented out in this
        # file, so `tk` was an undefined name here. Import it locally.
        # NOTE(review): assumes utils/fr_tokenize.py still exists — confirm.
        import utils.fr_tokenize as tk
        tk.main(f_in, f_out)  # .ss -> .tok
    else:
        print(f"No tokenizer for language \"{lang}\" with tool \"{tool}\": {f_out} was not created.")
def main(steps):
    """Run the legacy pipeline described by `steps`.

    steps: Process object (settings) carrying the Input object in `steps.data`.

    Stages, each driven by a config flag:
      1. sentence splitting OR tokenization OR nothing;
      2. conversion to the NER format expected by the segmenter;
      3. training, or prediction with an existing model;
      4. conversion of the JSON predictions to a token/tag table;
      5. evaluation against gold data;
      6. merge of the predicted tags into the original file;
      7. bracketed plain-text output.

    Later stages reuse file names set by earlier ones (data_ner, data_json,
    data_conll): a config that disables a producer but enables its consumer
    fails with NameError.
    """
    corpus = steps.data
    raw_file = f"{corpus.path}/{corpus.name}{corpus.file}"

    # ---- 1. sentence split, tokenization, or neither
    if steps.ssplit:
        # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
        if steps.ssplitor != "stanza":
            print("pls define sentence splitor")  # TODO raise an error and stop the process
        data_tok = f"{corpus.path}/{corpus.name}.tok"
        # Fixed: the corpus name was printed literally (missing braces).
        print(f"Starting sentence spliting...to {corpus.path}/{corpus.name}")
        ssent.main(raw_file, data_tok, steps.ssplitor, corpus.lang)
    elif steps.toke:
        # python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
        data_tok = f"{corpus.path}/{corpus.name}.tok"
        print(f"Starting Tokenization...to {data_tok}")
        text_tokenization(raw_file, data_tok, corpus.lang, steps.toke_tool)  # .ss -> .tok
    else:
        data_tok = raw_file

    # ---- 2. conversion to NER format
    if steps.ner_init:
        if steps.main in ("test", "annotation"):
            # python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok
            data_ner = f"{corpus.path}/{corpus.name}.ner.tok"
            print(f"Starting conversion to NER format...to {data_ner}")
            c2n.main(data_tok, data_ner, corpus.file)
        elif steps.main == "train":
            for part in ("train", "dev", "test"):
                data_tok = f"{corpus.path}/{corpus.name}_{part}{corpus.file}"
                data_ner = f"{corpus.path}/{corpus.name}_{part}.ner{corpus.file}"
                print("Starting conversion to NER format...to {}".format(data_ner))
                c2n.main(data_tok, data_ner, corpus.file)

    # ---- results directory
    if not os.path.isdir(corpus.resu):
        print(" result directory does not exist yet")
        os.mkdir(corpus.resu)

    # ---- 3. training or prediction
    if steps.main == "train":
        if steps.toolkit == "allennlp":
            print("toolkit allennlp for training")
            tr_allen.main(steps)
            # The model is no longer null: it is the archive just produced.
            steps.model = f"{corpus.resu}/model.tar.gz"
        elif steps.toolkit == "jiant":
            print("Jiant toolkit not ready")
        else:
            print("toolkit unknown")
            # TODO check the training config file
    elif steps.main in ("test", "annotation"):
        # Apply the chosen model; the JSON output holds scores, probabilities, tags.
        print(f"Checking for model...{steps.model}")
        model_path = get_model(steps.model)
        print(f"model{model_path}")
        data_json = f"{corpus.resu}/{corpus.name}.json"
        print(f"datapred: {data_json}\n")
        print(f"input: {data_ner}\n")
        # os.system runs the command through /bin/sh: "&>" is a bash extension
        # that other shells read as "run in background, then truncate the
        # file". The portable "> file 2>&1" form is used instead.
        cmd = (
            f"allennlp predict --use-dataset-reader --output-file {data_json} "
            f"{model_path} {data_ner} > {corpus.resu}/logs.txt 2>&1"
        )
        print(f"comd{cmd}")
        print("Starting Prediction...")
        os.system(cmd)
        # TBD do the same with a python script (or JIANT ??)
    else:
        print(" pb define model")

    # ---- 4. JSON predictions -> tokenized text with a predicted-tag column
    if steps.post_tab:
        # python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
        data_conll = f"{corpus.resu}/{corpus.name}.split.tok"
        out_format = "split.tok"  # TODO retrieve from the config file
        print(f"Starting Formating from json to tok format...to {data_conll}")
        j2c.main(data_json, out_format, data_conll)

    # ---- 5. evaluation against gold
    if steps.eval:
        if steps.main == "train":
            data_gold = steps.test_data
            if steps.ner_init:
                # Same naming rule as the NER conversion above (the extension
                # used to be hard-coded to ".conllu" here).
                data_gold_ner = f"{corpus.path}/{corpus.name}_test.ner{corpus.file}"
            # NOTE(review): data_gold_ner only exists when NER initialisation
            # is enabled — confirm training without it is not a supported case.

            # Predict on the test data with the model that was just trained.
            model_path = steps.model
            data_json = f"{corpus.resu}/{corpus.name}_test.predictions.json"  # TODO relative naming
            cmd = (
                f"allennlp predict --use-dataset-reader --output-file {data_json} "
                f"{model_path} {data_gold_ner} "
                "--include-package allen_custom.custom_conll_reader "
                "--include-package allen_custom.custom_simple_tagger "
                "--include-package allen_custom.custom_disrpt_reader "
                "--predictor sentence-tagger "
                "--include-package allen_custom.custom_bert_token_embedder "
                f"> {corpus.resu}/logs.txt 2>&1"
            )
            print("Starting Prediction...")
            print(f"cmd prediction: {cmd}")
            os.system(cmd)

            data_conll = f"{corpus.resu}/{corpus.name}_test.predictions.conll"  # TODO relative naming
            print(f"Starting Formating from json to tok format...to {data_conll}")
            j2c.main(data_json, "split.tok", data_conll)

            print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}")
            cmd = f"python utils/seg_eval.py {data_gold} {data_conll} > {corpus.resu}/Evaluation.txt 2>&1"
            os.system(cmd)
        else:
            data_gold = data_tok  # TODO clearer variable names
            data_pred = data_conll
            cmd = f"python utils/seg_eval.py {data_gold} {data_pred} > {corpus.resu}/Evaluation.txt 2>&1"
            os.system(cmd)

    # ---- 6. append the predicted tag to every token line of the input file
    if steps.post_conll:
        f_pred = f"{corpus.resu}/{corpus.name}.split.tok"
        # The first line tells how many columns the prediction table has.
        with open(f_pred, 'r') as predictions:
            n_columns = len(predictions.readline().split("\t"))
        f_out = f"{corpus.resu}/{corpus.name}_full_output.conllu"
        with open(f_out, "w") as fo, open(raw_file, "r") as fi, open(f_pred, "r") as fp:
            # Only the last column (the predicted tag) is loaded.
            df = pd.read_csv(fp, header=None, sep="\t", usecols=[n_columns - 1])
            print(f"longueur={len(df)}")
            # The debug prints of fixed row numbers were removed: they raised
            # IndexError on files shorter than five rows.
            # NOTE(review): `i` also advances on blank input lines, so this
            # assumes the prediction table keeps one row per blank line —
            # confirm against how read_csv handles blank lines here.
            i = 0
            for line in fi:
                line = line.strip()
                if line.startswith("#"):
                    fo.write(f"{line}\n")
                elif line == "":
                    fo.write(f"{line}\n")
                    i += 1
                else:
                    fo.write(f"{line}")
                    for tag in df.iloc[i, :].values.tolist():
                        fo.write(f"\t{tag}")
                    fo.write("\n")
                    i += 1

    # ---- 7. tokenized text + predicted tags -> plain text with brackets
    if steps.post_bracket:
        # python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket
        data_bracket = f"{corpus.resu}/{corpus.name}.split.tok.bracket"
        print(f"Starting formating into bracket text...to {data_bracket}")
        c2bracket.main(data_conll, data_bracket)
if __name__ == '__main__':
    # Script entry point: read the config given on the command line and run
    # the whole pipeline.
    parser = argparse.ArgumentParser()
    # --config is mandatory: without it the script used to fail later with a
    # TypeError when trying to open None.
    parser.add_argument('--config', required=True, help='Config file in JSON')
    args = parser.parse_args()
    config = args.config
    now = datetime.now()
    #stamp = re.sub(r'[\s:]', '_', str(now))
    # NOTE(review): the run stamp is hard-coded, so every run writes to the
    # same results folder — confirm this debug value is still wanted.
    stamp = "debug1205"
    my_logs = {}
    my_logs['stamp'] = stamp
    steps = get_config_infos(stamp, config)
    print(stamp)
    main(steps)
\ No newline at end of file
######################################
###### DISCOURSE SEGMENTOR 2022 ######
######################################
""" This is the main script,
and the only one to run,
after completing config.json.
Discut22 uses the allennlp toolkit. For that, it needs an intermediary NER format.
"""
import argparse
from datetime import datetime
import os
import re
import json
import utils.syntactic_parsing as synt_pars
import utils.conv2ner as conv_to_ner # TODO clean it
import utils.json2conll as json_to_connl # TODO clean it
import utils.training_allennlp as tr_allen
import utils.conll2bracket as c2bracket
import utils.seg_eval as seg_eval
class Data:
    """Input corpus of one run and the files produced from it step by step.

    The methods are meant to be called in pipeline order: each one stores the
    path of the file it produces on the instance, and later methods read it
    (`preprocessed` -> `ner` -> `pred_json` -> `pred_*conll` -> brackets).
    Several methods also record what they did in the module-level `my_logs`
    dictionary created by the script entry point.
    """

    def __init__(self, infos, stamp):
        """Read the 'data_raw' section of the config.

        infos: mapping with 'name', 'language', 'exte' (file extension) and
               'existing_metadata'.
        stamp: run name, appended to the output folder names.
        """
        self.name = infos['name']
        self.lang = infos['language']
        self.path = f"../data/{self.name}"
        self.exte = infos['exte']
        self.raw = f"{self.path}/{self.name}{self.exte}"
        self.stamp = stamp
        self.conv = f"{self.path}/data_converted_{stamp}"
        self.resu = f"{self.path}/results_{stamp}"
        self.meta = infos['existing_metadata']

    def create_folders(self):
        """Create the conversion and result folders of this run if missing."""
        for folder in (self.conv, self.resu):
            print(f"----> Checking/creating folder {folder}.")
            if not os.path.isdir(folder):
                os.mkdir(folder)
        my_logs['folders'] = f"{self.conv}, {self.resu}"

    def pre_processing(self, steps):
        """Run the syntactic pre-processing and set `self.preprocessed`.

        steps: Process object. When `steps.pre_process_to_do` is false the
               raw file is used as is; otherwise stanza writes a .conll file
               in the conversion folder.

        Raises SystemExit when the configured syntactic tool is not "stanza".
        """
        print("----> Preprocessing input data.")
        file_in = self.raw
        if steps.pre_process_to_do:
            file_out = f"{self.conv}/{self.name}.conll"
            if steps.synt_tool != "stanza":
                # Same effect as the former exit(...) call, without relying on
                # the interactive `exit` helper.
                raise SystemExit(
                    f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". "
                    "Options: \"stanza\". Change your config file."
                )
            processors = []
            metadata = {}
            if steps.toke:
                processors.extend(['tokenize', 'mwt'])
            if steps.synt_parse:
                processors.extend(['pos', 'lemma', 'depparse'])
            if steps.crea_meta:
                metadata['line'] = steps.meta_line
                metadata['sent'] = steps.meta_sent
            # Fixed: these two values were read from a global `data` object
            # instead of the instance the method is called on.
            if self.meta:
                metadata['meta'] = True
            processors_str = ",".join(processors)
            synt_pars.with_stanza(self.lang, file_in, file_out, processors_str, metadata)
        else:
            file_out = file_in
        my_logs['data_preprocessed'] = file_out
        self.preprocessed = file_out

    def make_ner_format(self):
        """
        Build the NER format the segmenter works on.
        INPUT: Tokenized text with whatever number of columns.
        OUTPUT: Tokenized text with just 4 columns.
        """
        # The NER file always goes to the conversion folder (an earlier
        # assignment based on `self.preprocessed` was immediately overwritten).
        self.ner = f"{self.conv}/{self.name}.conll.ner"
        print(f"----> Making NER format {self.ner}.")
        # TODO make the format relative; TODO same for train/dev/test in a training config
        conv_to_ner.main(self.preprocessed, self.ner, "conll")
        my_logs['data_ner'] = self.ner

    def make_predictions(self, steps):
        """Run `allennlp predict` on the NER file; output goes to `self.pred_json`.

        steps: Process object whose `model_path` has been set.
        """
        self.pred_json = f"{self.resu}/{self.name}_pred.json"
        # os.system runs the command through /bin/sh, where the bash-only "&>"
        # is not a redirection; "> file 2>&1" is the portable spelling.
        cmd = (
            f"allennlp predict --use-dataset-reader --output-file {self.pred_json} "
            f"{steps.model_path} {self.ner} > {self.resu}/logs_predictions.txt 2>&1"
        )
        print(f"----> Making predictions: {cmd}.")
        os.system(cmd)
        my_logs['predictions_cmd'] = cmd

    # The four converters below differ only in the output name and in the
    # json2conll function called. TODO factorise.
    def pred_json_to_conll_w_metadata_w_gold(self):
        """Write predictions next to metadata and gold columns; return the path."""
        self.pred_conll_meta_gold = f"{self.resu}/{self.name}_pred_meta_gold.conll"
        json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed)
        return self.pred_conll_meta_gold

    def pred_json_to_conll_w_metadata(self):
        """Write predictions with the metadata lines; return the path."""
        self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
        json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed)
        return self.pred_meta_conll

    def pred_json_to_conll_w_gold(self):
        """Write predictions to the '_pred_gold' file; return the path.

        NOTE(review): this calls the same converter as pred_json_to_conll —
        confirm whether gold labels are expected in this output.
        """
        self.pred_conll_gold = f"{self.resu}/{self.name}_pred_gold.conll"
        json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll")
        return self.pred_conll_gold

    def pred_json_to_conll(self):
        """Write the plain prediction table; return the path."""
        self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
        json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll")
        return self.pred_conll

    def brackets_txt(self):
        """Write the bracketed text built from `self.pred_conll`."""
        self.brack = f"{self.resu}/{self.name}_brac.txt"
        c2bracket.conll2brackets(self.pred_conll, self.brack)

    def brackets_txt_with_metadata(self):
        """Write the bracketed text with metadata built from `self.pred_meta_conll`."""
        self.brack_meta = f"{self.resu}/{self.name}_brac_meta.txt"
        c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)

    def evaluation(self, prod):
        """Score the predictions against gold and dump the scores as JSON.

        prod: Output object; its conll flags select which prediction file is
              produced and scored.
        """
        self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"

        # Gold file: the raw input when it is already CoNLL, else the
        # pre-processed file.
        gold = self.raw if self.exte in (".conll", ".conllu") else self.preprocessed

        # Prediction file. `== False` is kept on purpose: a null flag must
        # still go through the flag-based selection below.
        if prod.conll_todo == False:  # noqa: E712
            pred = self.pred_json_to_conll()
        elif prod.conll_meta:
            if prod.conll_w_gold:
                pred = self.pred_json_to_conll_w_metadata_w_gold()
            else:
                pred = self.pred_json_to_conll_w_metadata()
        elif prod.conll_w_gold:
            pred = self.pred_json_to_conll_w_gold()
        else:
            pred = self.pred_json_to_conll()

        print(f"----> Predictions to file {pred}")
        print(f"----> Evaluation scores to file {self.basic_metrics}")
        scores_dict = seg_eval.get_scores(gold, pred)
        with open(self.basic_metrics, 'w') as fo:
            json.dump(scores_dict, fo)
class Output:
    """Which output files are requested in the 'output' section of the config."""

    def __init__(self, infos):
        """infos: mapping with a 'conll_file' and a 'txt_file' sub-mapping."""
        conll_opts = infos['conll_file']
        self.conll_todo = conll_opts['to_do']
        self.conll_meta = conll_opts['metadata']
        self.conll_w_gold = conll_opts['with_gold_labels']
        txt_opts = infos['txt_file']
        self.txt_todo = txt_opts['to_do']
        self.txt_meta = txt_opts['metadata']
class Process:
    """Pipeline settings read from the 'steps' section of the config."""

    def __init__(self, infos):
        """infos: the 'steps' mapping of the JSON config.

        Raises KeyError when a config key is missing.
        """
        pre = infos['pre-processing']
        meta = pre['create_metadata']
        segmenter = infos['discourse_segmenter']
        training = segmenter['training']

        self.main = infos["main"]  # "train", "test" or "annotation"
        self.pre_process_to_do = pre['to_do']
        self.synt_tool = pre['syntactic_tool']
        self.synt_parse = pre['syntactic_parsing']
        self.toke = pre['tokenization']
        self.ssplit = pre['sentence_split']
        self.crea_meta = meta['to_do']
        self.meta_line = meta['line']
        self.meta_sent = meta['sent']
        # TODO training: train/dev data paths are not read yet.
        self.toolkit = training['toolkit']
        self.tr_config = training['config_file']
        self.pretr_lm = training['pre_trained_lm']
        self.model = segmenter['model']  # "tony" is a shortcut, see get_model
        self.test_data = infos['gold_test_data_path']
        # Always defined, so reading `eval` never raises AttributeError, even
        # for a run that is not a test.
        self.eval = False

    def get_evaluation_status(self):
        """Set `self.eval`: evaluation is done only for a "test" run."""
        # TODO decide what to do for "train".
        self.eval = self.main == "test"

    def get_model(self):
        """Set `self.model_path` to the model archive to use.

        The shortcut "tony" is resolved to ../model/tony/french_tokens.tar.gz,
        downloaded with wget when missing; any other value is used unchanged.
        """
        if self.model != "tony":
            self.model_path = self.model
            return

        arch = "french_tokens.tar.gz"
        model_dir = "../model/tony"
        self.model_path = f"{model_dir}/{arch}"
        if os.path.isfile(self.model_path):
            print("----> Tony already in place !")
        else:
            # Download into the folder that is checked above: the former
            # prefix (../model) left the archive where it was never found.
            dl = f"wget https://zenodo.org/record/4235850/files/{arch} -P {model_dir} --progress=bar"
            os.system(dl)
def get_stamp():
    """Return the current date and time as a string usable in a folder name.

    Whitespace and colons of str(datetime.now()) are replaced by "_",
    e.g. "2022-11-21_15_42_42.923648".
    """
    now = datetime.now()
    # Raw string: "\s" in a plain string literal is an invalid escape sequence.
    return re.sub(r'[\s:]', '_', str(now))
def get_config_infos(config, stamp):
    """Load the JSON config and build the three objects driving a run.

    config: path of the JSON configuration file (read as UTF-8).
    stamp:  run name forwarded to Data.

    Returns the tuple (Data, Process, Output) built from the 'data_raw',
    'steps' and 'output' sections. The whole config is also stored in the
    module-level `my_logs` under "config".
    """
    with open(config, 'r', encoding='utf-8') as handle:
        settings = json.load(handle)
    corpus = Data(settings['data_raw'], stamp)
    pipeline = Process(settings['steps'])
    outputs = Output(settings['output'])
    my_logs["config"] = settings
    return corpus, pipeline, outputs
def print_logs(dict_logs):
    """Dump `dict_logs` as indented JSON in the results folder of the run.

    The target folder comes from the module-level `data` object created by
    the script entry point.
    """
    target = f"{data.resu}/processes_logs.json"
    with open(target, 'w') as log_file:
        json.dump(dict_logs, log_file, indent=4)
if __name__ == '__main__':
    # Script entry point. `my_logs` and `data` are module-level on purpose:
    # the Data methods and print_logs read them as globals.
    my_logs = {}
    stamp = get_stamp()

    parser = argparse.ArgumentParser()
    # --config is mandatory: without it the script failed later with a
    # TypeError when trying to open None.
    parser.add_argument('--config', required=True, help='Config file in JSON.')
    parser.add_argument('--name', default=stamp, help='Run name.')
    args = parser.parse_args()
    config = args.config
    stamp = args.name
    my_logs["stamp"] = stamp

    data, steps, prod = get_config_infos(config, stamp)
    data.create_folders()
    data.pre_processing(steps)
    data.make_ner_format()
    steps.get_model()

    if steps.main in ("annotation", "test"):
        data.make_predictions(steps)  # output: allennlp JSON
    # TODO elif steps.main == "train":

    steps.get_evaluation_status()
    # `eval` may not have been set for a run that is not a test (e.g.
    # "annotation"); a missing value means "no evaluation".
    if getattr(steps, "eval", False):
        data.evaluation(prod)

    print_logs(my_logs)  # <-- careful: global variable
\ No newline at end of file
......@@ -50,8 +50,26 @@ def conll2brackets(in_f, out_f):
start = False
file_out.write("]\n\n")
def main(f_in, f_out):
    """Convert the token/tag file f_in into the bracketed text file f_out."""
    conll2brackets(f_in, f_out)
\ No newline at end of file
def conll2brackets_with_meta(in_f, out_f):
    """Write the text of a token/tag file with brackets around each segment.

    in_f:  path of a whitespace-separated token file: second column is the
           word, last column the tag; "#" lines are metadata and a blank line
           ends a sentence.
    out_f: path of the text file to write.

    A token tagged "BeginSeg=Yes" opens a new bracket (closing the previous
    one inside the same sentence); the end of a sentence closes the bracket.
    Metadata lines are copied followed by a newline.

    Raises ValueError on a token line with fewer than three columns.
    """
    # True while no token of the current sentence has been written yet.
    sentence_start = True
    # UTF-8 on both sides, like the functions that produce the input file.
    with open(out_f, 'w', encoding='utf-8') as file_out, \
            open(in_f, 'r', encoding='utf-8') as file_in:
        for line in file_in:
            if line.startswith("#"):
                file_out.write(f"{line}\n")
            elif line.strip() == "":
                # Close the sentence only if it wrote something: a file that
                # ends with a blank line (or has several in a row) used to get
                # extra, unbalanced "]".
                if not sentence_start:
                    file_out.write("]\n\n")
                sentence_start = True
            else:
                _, word, *_, tag = line.split()
                if tag == "BeginSeg=Yes":
                    if not sentence_start:
                        file_out.write("] ")
                    file_out.write(f"[ {word} ")
                else:
                    file_out.write(f"{word} ")
                sentence_start = False
        # Last sentence when the file does not end with a blank line.
        if not sentence_start:
            file_out.write("]\n\n")
\ No newline at end of file
......@@ -110,8 +110,8 @@ def conversion2ner(input, output, params=None):
# then, previous token label is set to B-E to signal end of previous segment
res[-1][-1] = "B-E"
start_doc = False
if label not in maptags:
print("warning, strange label ",label,file=sys.stderr)
#if label not in maptags:
#print("warning, strange label ",label,file=sys.stderr)
res.append([w,pos,"O",tag])
for line in res:
......
......@@ -5,6 +5,7 @@ conll format
import json
import sys
import re
#filepath = sys.argv[1]
#config = sys.argv[2]
......@@ -26,7 +27,6 @@ def js2conll(filepath, fileoutpath, config):
data = []
for line in open(filepath, 'r'):
data.append(json.loads(line))
with open(fileoutpath, 'w') as f_out:
for doc in data:
tokens = zip(doc["words"],doc["tags"])
......@@ -37,8 +37,62 @@ def js2conll(filepath, fileoutpath, config):
f_out.write("\n")
#print()
def main(f_in, form, f_out):
    """Convert the JSON predictions f_in to f_out, `form` being the format option.

    Note the argument order: js2conll takes (input, output, format).
    """
    js2conll(f_in, f_out, form)
\ No newline at end of file
def js2conllNmeta(data_pred_json, data_out, config, data_meta):
    """Merge predicted tags into a token file, keeping its metadata lines.

    data_pred_json: file with one JSON object per line, each providing a
                    'tags' list (one object per sentence).
    data_out:       file to write (UTF-8).
    config:         format option, not used here.
    data_meta:      token file to follow line by line: "#" lines are copied,
                    a blank line moves to the next predicted sentence, and on
                    any other line the last tab-separated column is replaced
                    by the mapped predicted tag.

    Raises IndexError when the token file has more sentences or tokens than
    the predictions.
    """
    # Read all predictions first; the file is closed as soon as it is read.
    with open(data_pred_json, 'r') as f_pred:
        data = [json.loads(line) for line in f_pred]

    sent_pred_count = 0
    tok = 0
    # The token file is written as UTF-8 by the pre-processing step, so it is
    # read back the same way.
    with open(data_out, 'w', encoding='utf-8') as fo, \
            open(data_meta, 'r', encoding='utf-8') as fm:
        for line in fm:
            line = line.strip()
            if line.startswith("#"):
                fo.write(f"{line}\n")
            elif line == "":
                sent_pred_count += 1
                tok = 0
                fo.write(f"{line}\n")
            else:
                tag = data[sent_pred_count]['tags'][tok]
                tok += 1
                # Drop the last column, then append the predicted one.
                new_line = re.sub(r'\t[^\t]+$', '', line)
                # NOTE(review): `map` is expected to be this module's
                # tag-conversion table (defined outside this function) —
                # confirm it is not the builtin.
                fo.write(f"{new_line}\t{map[tag]}\n")
def js2conllNmetaNgold(data_pred_json, data_out, config, gold_n_meta):
    """Append predicted tags to a token file, keeping metadata and all columns.

    data_pred_json: file with one JSON object per line, each providing a
                    'tags' list (one object per sentence).
    data_out:       file to write (UTF-8).
    config:         format option, not used here.
    gold_n_meta:    token file to follow line by line: "#" lines are copied,
                    a blank line moves to the next predicted sentence, and
                    any other line is copied with the mapped predicted tag
                    added as a new last column.

    Raises IndexError when the token file has more sentences or tokens than
    the predictions.
    """
    # Read all predictions first; the file is closed as soon as it is read.
    with open(data_pred_json, 'r') as f_pred:
        data = [json.loads(line) for line in f_pred]

    sent_pred_count = 0
    tok = 0
    # The token file is written as UTF-8 by the pre-processing step, so it is
    # read back the same way.
    with open(data_out, 'w', encoding='utf-8') as fo, \
            open(gold_n_meta, 'r', encoding='utf-8') as fm:
        for line in fm:
            line = line.strip()
            if line.startswith("#"):
                fo.write(f"{line}\n")
            elif line == "":
                sent_pred_count += 1
                tok = 0
                fo.write(f"{line}\n")
            else:
                tag = data[sent_pred_count]['tags'][tok]
                tok += 1
                # NOTE(review): `map` is expected to be this module's
                # tag-conversion table (defined outside this function) —
                # confirm it is not the builtin.
                fo.write(f"{line}\t{map[tag]}\n")
\ No newline at end of file
......@@ -68,9 +68,9 @@ Arguments:
"""
__author__ = "Amir Zeldes"
#__author__ = "Amir Zeldes"
__license__ = "Apache 2.0"
__version__ = "1.0.1"
#__version__ = "1.0.1"
def parse_data(infile, string_input=False):
if not string_input:
......@@ -222,22 +222,3 @@ def get_scores(gold_file, pred_file, string_input=False):
return score_dict
if __name__ == "__main__":
    # Command-line use: score a prediction file against a gold file and print
    # the counts and the precision / recall / F-score.
    cli = argparse.ArgumentParser()
    cli.add_argument("goldfile", help="Shared task gold file in .tok or .conll format")
    cli.add_argument("predfile", help="Corresponding file with system predictions")
    cli.add_argument("-s", "--string_input", action="store_true", help="Whether inputs are file names or strings")
    opts = cli.parse_args()

    score_dict = get_scores(opts.goldfile, opts.predfile, opts.string_input)
    seg_type = score_dict["seg_type"]

    print("File: " + score_dict["doc_name"])
    print(f"o Total tokens: {score_dict['tok_count']}")
    print("o Gold " + seg_type + f": {score_dict['gold_seg_count']}")
    print("o Predicted " + seg_type + f": {score_dict['pred_seg_count']}")
    print(f"o Precision: {score_dict['prec']}")
    print(f"o Recall: {score_dict['rec']}")
    print(f"o F-Score: {score_dict['f_score']}")
import stanza
from stanza.utils.conll import CoNLL
def with_stanza(lang, f_in, f_out, process, meta):
    """Parse a text file with stanza, line by line, and write CoNLL lines.

    lang:    language code given to stanza.download and stanza.Pipeline.
    f_in:    input text file (UTF-8).
    f_out:   output file (UTF-8).
    process: comma-separated stanza processors.
    meta:    dict of metadata options:
               'meta' == True -> input lines starting with "#" are copied;
               'line' present -> "#<value>-<input line number>" is written
                                 before each parsed line;
               'sent' present -> "#<value>-<sentence number>" and a #text
                                 line are written before each sentence.

    Each token is written through CoNLL.convert_token_dict, whose fields are:
        ID, TEXT, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS,
        MISC (-> 'start_char|end_char')
        FIELD_TO_IDX = {ID: 0, TEXT: 1, LEMMA: 2, UPOS: 3, XPOS: 4,
                        FEATS: 5, HEAD: 6, DEPREL: 7, DEPS: 8, MISC: 9}
    """
    stanza.download(lang)
    pipeline = stanza.Pipeline(lang, processors=process, use_gpu=True)
    keep_comments = meta.get('meta') == True

    with open(f_in, 'r', encoding='utf-8') as fi, open(f_out, 'w', encoding='utf-8') as fo:
        for line_no, raw_line in enumerate(fi, start=1):
            text = raw_line.strip()

            # Existing metadata line: copied only on request.
            if text.startswith("#"):
                if keep_comments:
                    fo.write(f"{text}\n")
                continue

            # Empty line: kept as a separator.
            if text == "":
                fo.write("\n")
                continue

            if "line" in meta:
                fo.write(f"#{meta['line']}-{line_no}\n")

            # Sentences are numbered again from 1 for every input line.
            for sent_no, sent in enumerate(pipeline(text).sentences, start=1):
                if "sent" in meta:
                    fo.write(f"#{meta['sent']}-{sent_no}\n#text=\"{sent.text}\"\n")
                for token in sent.tokens:
                    fields = CoNLL.convert_token_dict(token.to_dict()[0])
                    fo.write("\t".join(fields))
                    fo.write("\n")
                fo.write("\n")
\ No newline at end of file
......@@ -33,9 +33,18 @@ def main(steps):
#### train, has_per == False
# allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
# allennlp train -s Resultts_conllu/results_eng.rst.rstdt_bert ../code/utils/configs/bert.jsonnet ....
# Dicut- repo morteza
#allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/bert.jsonnet
cmd2 = f"allennlp train -s {steps.data.resu} {tr_config}"
# Discut-gitlab
cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder"
print(cmd)
os.system(cmd)
print(cmd2)
os.system(cmd2)
# then...
# TODO:
......
This diff is collapsed.
This diff is collapsed.
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment