begin of refacto code

b43945e7 · laura.riviere · 415aed4a · b43945e7 · b43945e7 · b43945e7
Commit b43945e7 authored 2 years ago by laura.riviere
--- a/code/classes_def_2.py
+++ b/code/classes_def_2.py
+# Classes for discut22
+class Data:
+    """Describe the raw dataset of a run and the folders derived from it."""
+
+    def __init__(self, infos, stamp):
+        """Store the dataset settings and build the run-specific folder names.
+
+        Args:
+            infos: mapping with the keys 'name', 'language', 'exte' and
+                'existing_metadata'.
+            stamp: run identifier appended to the output folder names.
+        """
+        dataset_name = infos['name']
+        root = f"../data/{dataset_name}"
+
+        self.name = dataset_name
+        self.lang = infos['language']
+        self.path = root
+        self.exte = infos['exte']
+        self.stamp = stamp
+        # Folder for converted data (still to be integrated).
+        self.conv = f"{root}/data_converted_{stamp}"
+        self.resu = f"{root}/results_{stamp}"
+        self.meta = infos['existing_metadata']
+class Process:
+    """Hold the pipeline settings copied out of a nested config mapping."""
+
+    def __init__(self, infos):
+        """Copy every setting of ``infos`` onto a flat attribute.
+
+        Args:
+            infos: mapping with the keys 'main', 'pre-processing',
+                'discourse_segmenter', 'evaluation' and
+                'gold_test_data_path'.
+        """
+        self.main = infos["main"]  # train test annotation
+
+        pre = infos['pre-processing']
+        self.pre_process_to_do = pre['to_do']
+        self.synt_tool = pre['syntactic_tool']
+        self.synt_parse = pre['syntactic_parsing']
+        self.toke = pre['tokenization']
+        self.ssplit = pre['sentence_split']
+
+        meta_cfg = pre['create_metadata']
+        self.crea_meta = meta_cfg['to_do']
+        self.meta_line = meta_cfg['line']
+        self.meta_sent = meta_cfg['sent']
+
+        # Disabled: pre['NER_format_initialisation'] is not read because the
+        # NER conversion is done anyway.
+        # TODO: when main == "train", set self.train_data / self.dev_data,
+        # either as "<name>_train.ner" / "<name>_dev.ner" files relative to
+        # the dataset path, or from the 'train_data_path' and
+        # 'validation_data_path' entries of the training settings.
+
+        segmenter = infos['discourse_segmenter']
+        training = segmenter['training']
+        self.toolkit = training['toolkit']
+        self.tr_config = training['config_file']
+        self.pretr_lm = training['pre_trained_lm']
+        self.model = segmenter['model']  # ezpz for Tony
+
+        # Disabled: self.post_tab = infos['post-processing']['json_to_tab']
+        self.eval = infos['evaluation']
+        self.test_data = infos['gold_test_data_path']
+class Output:
+    """Hold the flags telling which output files a run should produce."""
+
+    def __init__(self, infos):
+        """Copy the output-file flags onto attributes.
+
+        Args:
+            infos: mapping whose 'file' entry has the keys 'json_to_tab',
+                'tab_to_bracket', 'conllu' and 'metadata'.
+        """
+        files = infos['file']
+        self.prod_tab = files['json_to_tab']
+        self.prod_bracket = files['tab_to_bracket']
+        self.prod_conll = files['conllu']
+        self.metadata = files['metadata']
\ No newline at end of file
--- a/code/config_global_1.2.json
+++ b/code/config_global_1.2.json
+{
+    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
+    "data_raw": {
+        "name": "edgar_poe_en",
+        "exte": ".txt",
+        "language": "en",
+        "existing_metadata": false
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "to_do": true,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": true,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "evaluation": false,
+        "gold_test_data_path": null
+    },
+    "output":{
+        "file":{
+            "json_to_tab": true,
+            "tab_to_bracket": true,
+            "conllu":true,
+            "metadata": true
+        },
+        "scores":false
+    }
+}
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
+######################################
+###### DISCOURSE SEGMENTOR 2022 ######
+######################################
+""" This is the main script,
+    and the only one to run,
+    after completing config.json.
+    Discut22 uses the allennlp toolkit, which needs an intermediary NER format.
+    """
+import argparse
+from datetime import datetime
+import os
+import re
+import json
+from classes_def_2 import Data, Process, Output
+import utils_2.syntactic_parsing as synt_pars
+import utils.conv2ner as conv_to_ner
+def get_stamp():
+    """Return the current date and time as a string usable in file names.
+
+    Returns:
+        ``str(datetime.now())`` with every whitespace character and colon
+        replaced by an underscore.
+    """
+    now = datetime.now()
+    # Raw string: in a plain literal '\s' is an invalid escape sequence.
+    return re.sub(r'[\s:]', '_', str(now))
+def get_config_infos(config, stamp):
+    """Load the JSON config file and wrap its sections in helper objects.
+
+    The parsed config is also stored under the "config" key of the
+    module-level ``my_logs`` dict.
+
+    Args:
+        config: path of the JSON config file.
+        stamp: run identifier handed to ``Data``.
+
+    Returns:
+        A ``(Data, Process, Output)`` tuple built from the 'data_raw',
+        'steps' and 'output' sections.
+    """
+    with open(config, 'r', encoding='utf-8') as handle:
+        settings = json.load(handle)
+
+    dataset = Data(settings['data_raw'], stamp)
+    process = Process(settings['steps'])
+    output = Output(settings['output'])
+    my_logs["config"] = settings
+    return dataset, process, output
+def create_folders(li):
+    """Create every folder of ``li`` that does not exist yet.
+
+    Missing parent folders are created as well, and folders that already
+    exist are left untouched.
+
+    Args:
+        li: iterable of folder paths.
+    """
+    for folder in li:
+        # makedirs + exist_ok replaces the check-then-create pair, which
+        # failed on a missing parent folder.
+        os.makedirs(folder, exist_ok=True)
+def print_logs():
+    """Print the module-level ``my_logs`` dict to standard output."""
+    # TODO: also write the logs to f"{data.resu}/processes_logs.json";
+    # that path was computed here before but never used.
+    print(my_logs)
+def pre_processing(data, steps):
+    """Run the optional syntactic pre-processing and return the file to use next.
+
+    The returned path is also stored under the 'data_preprocessed' key of
+    the module-level ``my_logs`` dict.
+
+    Args:
+        data: object providing ``path``, ``name``, ``exte``, ``lang`` and
+            ``meta``.
+        steps: object providing ``pre_process_to_do``, ``synt_tool``,
+            ``toke``, ``synt_parse``, ``crea_meta``, ``meta_line`` and
+            ``meta_sent``.
+
+    Returns:
+        The path of the ".conll" file handed to the parser when
+        pre-processing is enabled, otherwise the path of the raw input file.
+
+    Raises:
+        SystemExit: if pre-processing is enabled with a syntactic tool
+            other than "stanza".
+    """
+    data_in = f"{data.path}/{data.name}{data.exte}"
+    # Flags are compared to True explicitly, so a truthy non-boolean value
+    # such as a non-empty string does not enable a step.
+    if steps.pre_process_to_do == True:
+        if steps.synt_tool != "stanza":
+            # Raise SystemExit directly instead of calling exit(), which is
+            # only injected by the `site` module and may be unavailable.
+            raise SystemExit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
+        data_out = f"{data.path}/{data.name}.conll"
+        processors = []
+        metadata = {}
+        if steps.toke == True:
+            processors.extend(['tokenize', 'mwt'])
+        if steps.synt_parse == True:
+            processors.extend(['pos', 'lemma', 'depparse'])
+        # Disabled: append 'constituency' when steps.ssplit is set.
+        if steps.crea_meta == True:
+            metadata['line'] = steps.meta_line
+            metadata['sent'] = steps.meta_sent
+        if data.meta == True:
+            metadata['meta'] = True
+        processors_str = ",".join(processors)
+        synt_pars.with_stanza(data.lang, data_in, data_out, processors_str, metadata)
+    else:
+        data_out = data_in
+    my_logs['data_preprocessed'] = data_out
+    return data_out
+def data_to_ner_format(data_in):
+    """
+    Build the NER-format file on which the segmenter works.
+    INPUT: Tokenized text with any number of columns.
+    OUTPUT: Tokenized text with just 4 columns.
+
+    The output path is ``data_in`` with ".ner" appended; it is returned and
+    also stored under the 'data_ner' key of the module-level ``my_logs``.
+    """
+    ner_path = "{}.ner".format(data_in)
+    conv_to_ner.main(data_in, ner_path, "conll")
+    # TODO: do the same for train/dev/test when the config asks for training
+    my_logs['data_ner'] = ner_path
+    return ner_path
+if __name__ == '__main__':
+    # Module-level dict that the functions above fill in to trace the run.
+    my_logs = {}
+    stamp = get_stamp()
+
+    parser = argparse.ArgumentParser()
+    # --config is mandatory: without it the run failed later in open(None).
+    parser.add_argument('--config', required=True, help='Config file in JSON.')
+    # The run name defaults to the time stamp computed above.
+    parser.add_argument('--name', default=stamp, help='Run name.')
+    args = parser.parse_args()
+    config = args.config
+    stamp = args.name
+    my_logs["stamp"] = stamp
+
+    data, steps, prod = get_config_infos(config, stamp)
+    create_folders([data.conv, data.resu])
+    data_preprocessed = pre_processing(data, steps)
+    data_ner = data_to_ner_format(data_preprocessed)
+    #print_logs()
\ No newline at end of file