Skip to content
Snippets Groups Projects
Commit b43945e7 authored by laura.riviere's avatar laura.riviere
Browse files

begin of refacto code

parent 415aed4a
No related branches found
No related tags found
1 merge request!3Refacto 1205
# Classes for discut22
class Data:
    """Description of the input data set and of the folders derived from it."""

    def __init__(self, infos, stamp):
        """Build the data description.

        Args:
            infos: mapping read with the keys ``name``, ``language``,
                ``exte`` and ``existing_metadata``.
            stamp: run identifier embedded in the output folder names.
        """
        name = infos['name']
        root = f"../data/{name}"

        self.name = name
        self.lang = infos['language']
        self.path = root
        self.exte = infos['exte']
        self.stamp = stamp
        # Folder for converted data -- still to be integrated.
        self.conv = f"{root}/data_converted_{stamp}"
        # Folder for the results of this run.
        self.resu = f"{root}/results_{stamp}"
        self.meta = infos['existing_metadata']
class Process:
    """Processing steps requested for a run, as read from the ``steps`` config section."""

    def __init__(self, infos):
        """Copy the step settings from the nested ``infos`` mapping onto attributes.

        Args:
            infos: mapping with the keys ``main``, ``pre-processing``,
                ``discourse_segmenter``, ``evaluation`` and
                ``gold_test_data_path``.
        """
        self.main = infos["main"]  # train test annotation

        # Pre-processing settings.
        pre = infos['pre-processing']
        self.pre_process_to_do = pre['to_do']
        self.synt_tool = pre['syntactic_tool']
        self.synt_parse = pre['syntactic_parsing']
        self.toke = pre['tokenization']
        self.ssplit = pre['sentence_split']

        # Metadata creation settings.
        meta = pre['create_metadata']
        self.crea_meta = meta['to_do']
        self.meta_line = meta['line']
        self.meta_sent = meta['sent']

        #self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
        #if self.main == "train":
        #if self.ner_init == True : # to be done with relative paths !! split thing
        #    self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
        #    self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
        #else :
        #    self.train_data = infos['discourse_segmenter']['training']['train_data_path']
        #    self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']

        # Segmenter settings: training options first, then the model.
        segmenter = infos['discourse_segmenter']
        training = segmenter['training']
        self.toolkit = training['toolkit']
        self.tr_config = training['config_file']
        self.pretr_lm = training['pre_trained_lm']
        self.model = segmenter['model']  # ezpz for Tony

        #self.post_tab = infos['post-processing']['json_to_tab']
        self.eval = infos['evaluation']
        self.test_data = infos['gold_test_data_path']
class Output:
    """Output files requested for a run, as read from the ``output`` config section."""

    def __init__(self, infos):
        """Copy the flags found under ``infos['file']`` onto attributes.

        Args:
            infos: mapping whose ``file`` entry holds the keys
                ``json_to_tab``, ``tab_to_bracket``, ``conllu`` and
                ``metadata``.
        """
        files = infos['file']
        self.prod_tab = files['json_to_tab']
        self.prod_bracket = files['tab_to_bracket']
        self.prod_conll = files['conllu']
        self.metadata = files['metadata']
\ No newline at end of file
{
"usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
"data_raw": {
"name": "edgar_poe_en",
"exte": ".txt",
"language": "en",
"existing_metadata": false
},
"steps":{
"main": "annotation",
"pre-processing": {
"to_do": true,
"syntactic_tool": "stanza",
"sentence_split": true,
"tokenization": true,
"syntactic_parsing": true,
"create_metadata": {
"to_do": true,
"line": "paragraph",
"sent": "sent"
}
},
"discourse_segmenter": {
"model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"evaluation": false,
"gold_test_data_path": null
},
"output":{
"file":{
"json_to_tab": true,
"tab_to_bracket": true,
"conllu":true,
"metadata": true
},
"scores":false
}
}
######################################
###### DISCOURSE SEGMENTOR 2022 ######
######################################
""" This is the main script,
and the only one to run,
after completion of config.json.
Discut22 uses the allennlp toolkit. For that, it needs the NER intermediary format.
"""
import argparse
from datetime import datetime
import os
import re
import json
from classes_def_2 import Data, Process, Output
import utils_2.syntactic_parsing as synt_pars
import utils.conv2ner as conv_to_ner
def get_stamp():
    """Return the current local date and time as a string without spaces or colons.

    Every whitespace character and every colon of ``str(datetime.now())`` is
    replaced by an underscore, e.g. ``2022-05-12_10_11_12.123456``.

    Returns:
        str: the time stamp.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence.
    return re.sub(r'[\s:]', '_', str(datetime.now()))
def get_config_infos(config, stamp):
    """Read the JSON config file and wrap its sections in config objects.

    The parsed configuration is also stored under ``my_logs["config"]``
    (``my_logs`` is a module-level dictionary).

    Args:
        config: path of the JSON configuration file.
        stamp: run identifier forwarded to ``Data``.

    Returns:
        tuple: ``(Data, Process, Output)`` built from the ``data_raw``,
        ``steps`` and ``output`` sections respectively.
    """
    with open(config, 'r', encoding='utf-8') as handle:
        settings = json.load(handle)

    data_cfg = Data(settings['data_raw'], stamp)
    steps_cfg = Process(settings['steps'])
    output_cfg = Output(settings['output'])

    my_logs["config"] = settings
    return data_cfg, steps_cfg, output_cfg
def create_folders(li):
    """Create every directory listed in ``li`` that does not exist yet.

    Missing parent directories are created as well, and a directory that
    already exists is left untouched.

    Args:
        li: iterable of directory paths.

    Raises:
        FileExistsError: if one of the paths exists but is not a directory.
    """
    for folder in li:
        # makedirs(exist_ok=True) replaces the check-then-mkdir sequence and
        # also copes with missing intermediate folders.
        os.makedirs(folder, exist_ok=True)
def print_logs():
    """Print the run log dictionary to standard output.

    Reads the module-level ``data`` and ``my_logs`` objects.
    """
    # NOTE(review): this path is built but never used in this function --
    # presumably the logs are meant to be written there later; confirm.
    log_path = "{}/processes_logs.json".format(data.resu)
    print(my_logs)
def pre_processing(data, steps):
    """Run the optional syntactic pre-processing and return the resulting file path.

    When pre-processing is disabled, the raw input path is returned unchanged.
    Otherwise the input is parsed with stanza into a ``.conll`` file located
    next to it. The returned path is also recorded under
    ``my_logs['data_preprocessed']`` (``my_logs`` is a module-level dict).

    Args:
        data: object exposing ``path``, ``name``, ``exte``, ``lang`` and
            ``meta``.
        steps: object exposing ``pre_process_to_do``, ``synt_tool``, ``toke``,
            ``synt_parse``, ``crea_meta``, ``meta_line`` and ``meta_sent``.
            The ``to_do``-style flags are treated as booleans.

    Returns:
        str: path of the file to use for the next step.

    Raises:
        SystemExit: if pre-processing is requested with a syntactic tool
            other than "stanza".
    """
    # Local import: sys.exit replaces the interactive-only ``exit`` helper,
    # which is injected by the ``site`` module and not guaranteed to exist.
    import sys

    data_in = f"{data.path}/{data.name}{data.exte}"

    if not steps.pre_process_to_do:
        data_out = data_in
    else:
        if steps.synt_tool != "stanza":
            sys.exit(f'Exited. Not valid syntactic tool: "{steps.synt_tool}". Options: "stanza". Change your config file.')

        data_out = f"{data.path}/{data.name}.conll"

        # Build the stanza processor list from the requested steps.
        processors = []
        if steps.toke:
            processors.extend(['tokenize', 'mwt'])
        if steps.synt_parse:
            processors.extend(['pos', 'lemma', 'depparse'])
        #if steps.ssplit == True:
        #    processors.append('constituency')

        # Options describing the metadata to create or already present.
        metadata = {}
        if steps.crea_meta:
            metadata['line'] = steps.meta_line
            metadata['sent'] = steps.meta_sent
        if data.meta:
            metadata['meta'] = True

        synt_pars.with_stanza(data.lang, data_in, data_out, ",".join(processors), metadata)

    my_logs['data_preprocessed'] = data_out
    return data_out
def data_to_ner_format(data_in):
    """Build the NER-format file that the segmenter works on.

    INPUT: tokenized text with any number of columns.
    OUTPUT: tokenized text with just 4 columns.

    The converted file is written next to the input with a ``.ner`` suffix,
    and its path is recorded under ``my_logs['data_ner']``.

    Args:
        data_in: path of the tokenized input file.

    Returns:
        The path of the NER-format file.
    """
    ner_path = "{}.ner".format(data_in)
    conv_to_ner.main(data_in, ner_path, "conll")
    #TODO add same for train/dev/test for config train
    my_logs['data_ner'] = ner_path
    return ner_path
if __name__ == '__main__':
    # Run log shared with the helper functions, which read and write this
    # module-level dictionary.
    my_logs = {}
    stamp = get_stamp()

    parser = argparse.ArgumentParser()
    # --config is mandatory: without it the script would try to open ``None``
    # and fail with an unhelpful TypeError.
    parser.add_argument('--config', required=True, help='Config file in JSON.')
    # The run name defaults to the time stamp computed above.
    parser.add_argument('--name', default=stamp, help='Run name.')
    args = parser.parse_args()
    config = args.config
    stamp = args.name
    my_logs["stamp"] = stamp

    # ``data`` stays a module-level name because print_logs() reads it.
    data, steps, prod = get_config_infos(config, stamp)
    create_folders([data.conv, data.resu])

    data_preprocessed = pre_processing(data, steps)
    data_ner = data_to_ner_format(data_preprocessed)
    #print_logs()
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment