diff --git a/code/classes_def_2.py b/code/classes_def_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..20aee54530ce61beb3f5c45d90df9f86d81bf654
--- /dev/null
+++ b/code/classes_def_2.py
@@ -0,0 +1,57 @@
+# Classes for discut22
+
+
+
+class Data:
+    def __init__(self, infos, stamp):
+        self.name = infos['name']
+        self.lang = infos['language']
+        self.path = f"../data/{self.name}"
+        self.exte = infos['exte']
+        self.stamp = stamp
+        self.conv = f"{self.path}/data_converted_{stamp}" # à intégrer
+        self.resu = f"{self.path}/results_{stamp}"
+        self.meta = infos['existing_metadata']
+
+
+class Process:
+    def __init__(self, infos):
+        self.main = infos["main"] # train test annotation
+
+        self.pre_process_to_do = infos['pre-processing']['to_do']
+        self.synt_tool = infos['pre-processing']['syntactic_tool']
+        self.synt_parse = infos['pre-processing']['syntactic_parsing']
+        self.toke = infos['pre-processing']['tokenization']
+        self.ssplit = infos['pre-processing']['sentence_split']
+        self.crea_meta = infos['pre-processing']['create_metadata']['to_do']
+        self.meta_line = infos['pre-processing']['create_metadata']['line']
+        self.meta_sent = infos['pre-processing']['create_metadata']['sent']
+
+        #self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
+
+        #if self.main == "train":
+        #    if self.ner_init == True : # à faire en relatif !! split truc
+        #        self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
+        #        self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
+        #    else :
+        #        self.train_data = infos['discourse_segmenter']['training']['train_data_path']
+        #        self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
+        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
+        self.tr_config = infos['discourse_segmenter']['training']['config_file']
+        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
+
+        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
+
+        #self.post_tab = infos['post-processing']['json_to_tab']
+
+        self.eval = infos['evaluation']
+        self.test_data = infos['gold_test_data_path']
+
+
+
+class Output:
+    def __init__(self, infos):
+        self.prod_tab = infos['file']['json_to_tab']
+        self.prod_bracket = infos['file']['tab_to_bracket']
+        self.prod_conll = infos['file']['conllu']
+        self.metadata = infos['file']['metadata']
\ No newline at end of file
diff --git a/code/config_global_1.2.json b/code/config_global_1.2.json
new file mode 100644
index 0000000000000000000000000000000000000000..3e93ff20c17c3479d55b7e8cccc84685fe6cd115
--- /dev/null
+++ b/code/config_global_1.2.json
@@ -0,0 +1,48 @@
+{
+    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
+    "data_raw": {
+        "name": "edgar_poe_en",
+        "exte": ".txt",
+        "language": "en",
+        "existing_metadata": false
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "to_do": true,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": true,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "evaluation": false,
+        "gold_test_data_path": null
+    },
+    "output":{
+        "file":{
+            "json_to_tab": true,
+            "tab_to_bracket": true,
+            "conllu":true,
+            "metadata": true
+        },
+        "scores":false
+    }
+}
+
+
+
diff --git a/code/discut22_2.py b/code/discut22_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5b10fda0a6fa11b65eb6e221c9d38218fa516af
--- /dev/null
+++ b/code/discut22_2.py
@@ -0,0 +1,110 @@
+######################################
+###### DISCOURSE SEGMENTOR 2022 ######
+######################################
+""" This the main script
+    And the only one to run,
+    after completion of config.json
+    Discut22 uses allennlp toolkit. For that, it need NER intermediary format.
+
+
+
+
+
+    """
+import argparse
+from datetime import datetime
+import os
+import re
+import json
+from classes_def_2 import Data, Process, Output
+import utils_2.syntactic_parsing as synt_pars
+import utils.conv2ner as conv_to_ner
+
+
+
+
+
+def get_stamp():
+    now = datetime.now()
+    stamp = re.sub('[\s:]', '_', str(now))
+    return stamp
+
+def get_config_infos(config, stamp):
+    with open(config, 'r', encoding='utf-8') as f:
+        infos = json.load(f)
+    data = Data(infos['data_raw'], stamp)
+    steps = Process(infos['steps'])
+    prod = Output(infos['output'])
+    my_logs["config"] = infos
+    return data, steps, prod
+
+def create_folders(li):
+    for it in li:
+        if not os.path.isdir(it):
+            os.mkdir(it)
+
+def print_logs():
+    file_logs = f"{data.resu}/processes_logs.json"
+    print(my_logs)
+
+def pre_processing(data, steps):
+    data_in = f"{data.path}/{data.name}{data.exte}"
+    if steps.pre_process_to_do == True:
+        data_out = f"{data.path}/{data.name}.conll"
+        if steps.synt_tool == "stanza":
+            processors = []
+            metadata = {}
+            if steps.toke == True:
+                processors.extend(['tokenize', 'mwt'])
+            if steps.synt_parse == True:
+                processors.extend(['pos', 'lemma', 'depparse'])
+            #if steps.ssplit == True:
+            #    processors.append('constituency')
+            if steps.crea_meta == True:
+                metadata['line'] = steps.meta_line
+                metadata['sent'] = steps.meta_sent
+            if data.meta == True:
+                metadata['meta'] = True
+            processors_str = ",".join(processors)
+            synt_pars.with_stanza(data.lang, data_in, data_out, processors_str, metadata)
+        else:
+            exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
+    else:
+        data_out = data_in
+    my_logs['data_preprocessed'] = data_out
+    return data_out
+
+def data_to_ner_format(data_in):
+    """
+    This fonction build the NER format upon the Segmentor works.
+    INPUT: Tokenized text with whatever number of columns.
+    OUTPUT: Tokenized text with just 4 columns.
+    """
+    data_ner = f"{data_in}.ner"
+    conv_to_ner.main(data_in, data_ner, "conll")
+
+    #TODO add same for train/dev/test for config train
+
+    my_logs['data_ner'] = data_ner
+    return data_ner
+
+
+
+if __name__ == '__main__':
+    my_logs = {}
+    stamp = get_stamp()
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--config', help='Config file in JSON.')
+    parser.add_argument('--name',default=stamp , help='Run name.')
+    args = parser.parse_args()
+    config = args.config
+    stamp = args.name
+    my_logs["stamp"] = stamp
+
+    data, steps, prod = get_config_infos(config, stamp)
+    create_folders([data.conv, data.resu])
+
+    data_preprocessed = pre_processing(data, steps)
+    data_ner = data_to_ner_format(data_preprocessed)
+
+    #print_logs()
\ No newline at end of file