From af2160d7935decf2205518d14b1c63d69d9b08a1 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Tue, 3 Jan 2023 11:53:46 +0100
Subject: [PATCH] clean new config

---
 code/config_global_1.2.json     |  5 ++--
 code/config_global_1.21.json    | 47 +++++++++++++++++++++++++++++++++
 code/config_global_3.json       |  1 +
 code/discut22_1.py              |  2 +-
 code/discut22_2.py              | 18 +++++++++----
 code/utils/training_allennlp.py | 13 +++++++--
 6 files changed, 75 insertions(+), 11 deletions(-)
 create mode 100644 code/config_global_1.21.json

diff --git a/code/config_global_1.2.json b/code/config_global_1.2.json
index 3e93ff2..0e71493 100644
--- a/code/config_global_1.2.json
+++ b/code/config_global_1.2.json
@@ -1,10 +1,10 @@
 {
-    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
+    "usecase_description": "Config file for usecase_1 : from a raw text, get the same text but with EDU bracket.",
     "data_raw": {
         "name": "edgar_poe_en",
         "exte": ".txt",
         "language": "en",
-        "existing_metadata": false
+        "existing_metadata": true
     },
     "steps":{
         "main": "annotation",
@@ -35,7 +35,6 @@
     },
     "output":{
         "file":{
-            "json_to_tab": true,
             "tab_to_bracket": true,
             "conllu":true,
             "metadata": true
diff --git a/code/config_global_1.21.json b/code/config_global_1.21.json
new file mode 100644
index 0000000..1e8c4a9
--- /dev/null
+++ b/code/config_global_1.21.json
@@ -0,0 +1,47 @@
+{
+    "usecase_description": "Config file for usecase_1 : from a tokenized text, get the same text but with EDU bracket.",
+    "data_raw": {
+        "name": "edgar_poe_short",
+        "exte": ".conll",
+        "language": "en",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": true,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "evaluation": false,
+        "gold_test_data_path": null
+    },
+    "output":{
+        "file":{
+            "conllu":true,
+            "metadata": true,
+            "tab_to_bracket": false
+        },
+        "scores":false
+    }
+}
+
+
+
diff --git a/code/config_global_3.json b/code/config_global_3.json
index c1efee8..67d7e8d 100644
--- a/code/config_global_3.json
+++ b/code/config_global_3.json
@@ -27,6 +27,7 @@
         },
         "post-processing": {
             "json_to_tab": false,
+            "metadata_conll": false,
             "tab_to_bracket":false
         },
         "evaluation": true,
diff --git a/code/discut22_1.py b/code/discut22_1.py
index a65dddd..7a8739b 100644
--- a/code/discut22_1.py
+++ b/code/discut22_1.py
@@ -240,7 +240,7 @@ if __name__ == '__main__':
 
     now = datetime.now()
     #stamp = re.sub('[\s:]', '_', str(now))
-    stamp = "debug1205"
+    stamp = "_debug1214"
     my_logs = {}
     my_logs['stamp'] = stamp
 
diff --git a/code/discut22_2.py b/code/discut22_2.py
index 5fe0218..d017ee7 100644
--- a/code/discut22_2.py
+++ b/code/discut22_2.py
@@ -20,6 +20,7 @@ from classes_def_2 import Data, Process, Output
 import utils_2.syntactic_parsing as synt_pars
 import utils.conv2ner as conv_to_ner # TODO clean it
 import utils.json2conll as json_to_connl # TODO clean it
+import utils.training_allennlp as tr_allen
 
 
 
@@ -92,7 +93,7 @@ def make_predictions(data_in, model_path):
     model = model_path # add def get_model from v1
     data_out = f"{data.resu}/{data.name}_pred.json"
     #cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {steps.data.resu}/logs.txt"
-    cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in}" # &> {steps.data.resu}/logs.txt"
+    cmd = f"allennlp predict --use-dataset-reader --output-file {data_out} {model_path} {data_in} &> {steps.data.resu}/logs_predictions.txt"
     os.system(cmd)
     return data_out
 
@@ -126,17 +127,24 @@ if __name__ == '__main__':
     create_folders([data.conv, data.resu])
 
     data_preprocessed = pre_processing(data, steps)
-    #data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
+    #TEST data_preprocessed = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/data_converted_vendredi/edgar_poe_short.conll"
     data_ner = data_to_ner_format(data_preprocessed)
 
-    if steps.main == "annotation":
-        #data_pred_json = make_predictions(data_ner, steps.model)
-        data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
+    if steps.main == "annotation" or steps.main == "test":
+        data_pred_json = make_predictions(data_ner, steps.model)
+        #data_pred_json = "/home/lriviere/andiamo/discut22/data/edgar_poe_short/results_vendredi/edgar_poe_short_pred.json"
 
         if prod.metadata == True:
             data_pred_and_meta_conll = pred_json_to_conll_with_metadata(data_pred_json, data_preprocessed)
         else:
             data_pred_toke = pred_json_to_toke(data_pred_json)
 
+    #elif steps.main == "train":
+
         
+
+
+
+    #scores = compare_pred_gold()
+
     #print_logs()
\ No newline at end of file
diff --git a/code/utils/training_allennlp.py b/code/utils/training_allennlp.py
index 65d4dfd..04c3957 100644
--- a/code/utils/training_allennlp.py
+++ b/code/utils/training_allennlp.py
@@ -33,9 +33,18 @@ def main(steps):
     #### train, has_per == False
     # allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder 
     # allennlp train -s Resultts_conllu/results_eng.rst.rstdt_bert ../code/utils/configs/bert.jsonnet ....
+    
+    # Discut - repo morteza
+    #allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/bert.jsonnet
+    cmd2 = f"allennlp train -s {steps.data.resu} {tr_config}"
+    
+    # Discut-gitlab
     cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder" 
-    print(cmd)
-    os.system(cmd)
+    
+    
+    
+    print(cmd2)
+    os.system(cmd2)
     # then...
 
     # TODO:
-- 
GitLab