diff --git a/code/config_global_1.2.json b/code/config_global_1.2.json index 3e93ff20c17c3479d55b7e8cccc84685fe6cd115..0e71493d60e98ecf2192f2a5ef4e43f5966b18d7 100644 --- a/code/config_global_1.2.json +++ b/code/config_global_1.2.json @@ -1,10 +1,10 @@ { - "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.", + "usecase_description": "Config file for usecase_1 : from a raw text, get the same text but with EDU bracket.", "data_raw": { "name": "edgar_poe_en", "exte": ".txt", "language": "en", - "existing_metadata": false + "existing_metadata": true }, "steps":{ "main": "annotation", @@ -35,7 +35,6 @@ }, "output":{ "file":{ - "json_to_tab": true, "tab_to_bracket": true, "conllu":true, "metadata": true diff --git a/code/config_global_1.21.json b/code/config_global_1.21.json new file mode 100644 index 0000000000000000000000000000000000000000..1e8c4a92ea30cea09445fafac64523cf0939d0fa --- /dev/null +++ b/code/config_global_1.21.json @@ -0,0 +1,47 @@ +{ + "usecase_description": "Config file for usecase_1 : from a tokenized text, get the same text but with EDU bracket.", + "data_raw": { + "name": "edgar_poe_short", + "exte": ".conll", + "language": "en", + "existing_metadata": true + }, + "steps":{ + "main": "annotation", + "pre-processing": { + "to_do": false, + "syntactic_tool": "stanza", + "sentence_split": true, + "tokenization": true, + "syntactic_parsing": true, + "create_metadata": { + "to_do": true, + "line": "paragraph", + "sent": "sent" + } + }, + "discourse_segmenter": { + "model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz", + "training": { + "toolkit": null, + "pre_trained_lm": null, + "config_file": null, + "train_data_path": null, + "validation_data_path": null + } + }, + "evaluation": false, + "gold_test_data_path": null + }, + "output":{ + "file":{ + "conllu":true, + "metadata": true, + "tab_to_bracket": false + }, + "scores":false + } +} + + + diff --git 
a/code/config_global_3.json b/code/config_global_3.json index c1efee85638fee908c6c15a36ed4b4766e70d0a9..67d7e8d53299ed707c952c6c0473b3332a95588e 100644 --- a/code/config_global_3.json +++ b/code/config_global_3.json @@ -27,6 +27,7 @@ }, "post-processing": { "json_to_tab": false, + "metadata_conll": false, "tab_to_bracket":false }, "evaluation": true, diff --git a/code/discut22_1.py b/code/discut22_1.py index a65dddd19b03a3326668cf8e20c9ecc3fd9ec551..7a8739bab0a41424bbb12e5d1fe5e3ef5659b622 100644 --- a/code/discut22_1.py +++ b/code/discut22_1.py @@ -240,7 +240,7 @@ if __name__ == '__main__': now = datetime.now() #stamp = re.sub('[\s:]', '_', str(now)) - stamp = "debug1205" + stamp = "_debug1214" my_logs = {} my_logs['stamp'] = stamp diff --git a/code/discut22_2.py b/code/discut22_2.py index 5fe0218583bbfef724f3af1c27c5182695e7a951..d017ee77c356a7df923b9c399a88b415c9c8fb38 100644 --- a/code/discut22_2.py +++ b/code/discut22_2.py @@ -20,6 +20,7 @@ from classes_def_2 import Data, Process, Output import utils_2.syntactic_parsing as synt_pars import utils.conv2ner as conv_to_ner # TODO clean it import utils.json2conll as json_to_connl # TODO clean it +import utils.training_allennlp as tr_allen @@ -92,7 +93,7 @@ def make_predictions(data_in, model_path): model = model_path # add def get_model from v1 data_out = f"{data.resu}/{data.name}_pred.json" #cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {steps.data.resu}/logs.txt" - cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in}" # &> {steps.data.resu}/logs.txt" + cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {steps.data.resu}/logs_predictions.txt" os.system(cmd) return data_out @@ -126,17 +127,24 @@ if __name__ == '__main__': create_folders([data.conv, data.resu]) data_preprocessed = pre_processing(data, steps) - #data_preprocessed = 
"/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll" + #TEST data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll" data_ner = data_to_ner_format(data_preprocessed) - if steps.main == "annotation": - #data_pred_json = make_predictions(data_ner, steps.model) - data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json" + if steps.main == "annotation" or steps.main == "test": + data_pred_json = make_predictions(data_ner, steps.model) + #data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json" if prod.metadata == True: data_pred_and_meta_conll = pred_json_to_conll_with_metadata(data_pred_json, data_preprocessed) else: data_pred_toke = pred_json_to_toke(data_pred_json) + #elif steps.main == "train": + + + + + #scores = compare_pred_gold() + #print_logs() \ No newline at end of file diff --git a/code/utils/training_allennlp.py b/code/utils/training_allennlp.py index 65d4dfdb42e4a371c22858534ebefc6ce94038f0..04c3957bae6839a567394251487db7d25ed5f1d4 100644 --- a/code/utils/training_allennlp.py +++ b/code/utils/training_allennlp.py @@ -33,9 +33,18 @@ def main(steps): #### train, has_per == False # allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder # allennlp train -s Resultts_conllu/results_eng.rst.rstdt_bert ../code/utils/configs/bert.jsonnet .... 
+ + # Discut - repo morteza + #allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/bert.jsonnet + cmd2 = f"allennlp train -s {steps.data.resu} {tr_config}" + + # Discut-gitlab cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder" - print(cmd) - os.system(cmd) + + + + print(cmd2) + os.system(cmd2) # then... # TODO: