From ccb38671d2aadadebf908f21ac3e26dff8285f69 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Mon, 21 Nov 2022 16:21:36 +0100
Subject: [PATCH] fix requirements.txt

---
 .../custom_bert_token_embedder.cpython-37.pyc | Bin 9434 -> 9434 bytes
 .../custom_conll_reader.cpython-37.pyc        | Bin 7306 -> 7306 bytes
 .../custom_disrpt_reader.cpython-37.pyc       | Bin 7627 -> 7627 bytes
 .../custom_simple_tagger.cpython-37.pyc       | Bin 8475 -> 8475 bytes
 code/config_global_1.json                     | 14 +++++++-------
 code/config_global_3.json                     | 12 ++++++------
 code/discut22_1.py                            | 13 +++++++------
 7 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc b/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc
index 7ba9802b113f8fe5be38ab83deb4daf2695ec9ff..165899baf4e577c29f30b67b7ffc7e0f9ddeb9db 100644
GIT binary patch
delta 20
bcmccRdCQaAiI<m)fq{X+JF<Ku_jwfnJT?V_

delta 20
bcmccRdCQaAiI<m)fq{X+p)YJB_jwfnJc0#_

diff --git a/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc
index 61f75bd353d9d94355e7f854fbb639bdca3a0c6f..e1cd9b9e6975f5ec5f52e93593be0b21fec841a6 100644
GIT binary patch
delta 20
bcmeCO?6Tx`;^pOHU|?YIjx68ET`vOwEp7yA

delta 20
bcmeCO?6Tx`;^pOHU|?Wy=nLD(T`vOwExH7A

diff --git a/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc
index b5fed7c4fd030664682f83ecb79a6980d3d2cf6b..5effd2460e7a8c5f97dcecd3a560c3dba932e235 100644
GIT binary patch
delta 20
bcmX?YecGDaiI<m)fq{X+JF<Ku_W@Y|H{S&^

delta 20
bcmX?YecGDaiI<m)fq{X+p)YJB_W@Y|I4cD^

diff --git a/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc b/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc
index c1647c89fbf6170d62b81ac688416a44a176c2c6..0eb4cd9a434b2331b7a46a62b763bccb2235e80c 100644
GIT binary patch
delta 20
bcmbR3G~0>WiI<m)fq{X+JF<Kuw}2u5Fkb{T

delta 20
bcmbR3G~0>WiI<m)fq{X+p)YJBw}2u5FslST

diff --git a/code/config_global_1.json b/code/config_global_1.json
index cd6c01c..e767ec2 100644
--- a/code/config_global_1.json
+++ b/code/config_global_1.json
@@ -1,22 +1,22 @@
 {
-    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.",
+    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
     "input": {
-        "name": "chaperontest",
-        "file": ".ss",
-        "language": "fr"
+        "name": "eng_annotation",
+        "file": ".conllu",
+        "language": "en"
     },
     "steps":{
         "main": "annotation",
         "pre-processing": {
-            "tokenization": true,
-            "tokenization_tool" : "spacy",
+            "tokenization": false,
+            "tokenization_tool" : null,
             "sentence_split": false,
             "sentence_split_splitor": null,
             "syntactic_parsing": false,
             "NER_format_initialisation": true
         },
         "discourse_segmenter": {
-            "model": "tony",
+            "model": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/results_2022-11-21_15_42_42.923648/model.tar.gz",
             "training": {
                 "toolkit": null,
                 "pre_trained_lm": null,
diff --git a/code/config_global_3.json b/code/config_global_3.json
index 9703e8d..c1efee8 100644
--- a/code/config_global_3.json
+++ b/code/config_global_3.json
@@ -1,7 +1,7 @@
 {
     "usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
     "input": {
-        "name": "eng.rst.rstdt",
+        "name": "eng.sdrt.stac",
         "file": ".conllu",
         "language": "en"
     },
@@ -9,9 +9,9 @@
         "main": "train",
         "pre-processing": {
             "tokenization": false,
-            "tokenization_tool" : "spacy",
+            "tokenization_tool" : null,
             "sentence_split": false,
-            "sentence_split_splitor": "stanza",
+            "sentence_split_splitor": null,
             "syntactic_parsing": false,
             "NER_format_initialisation": true
         },
@@ -21,8 +21,8 @@
             "toolkit": "allennlp",
             "pre_trained_lm": "bert",
             "config_file": "../model/config_training_bert.jsonnet",
-            "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
-            "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
+            "train_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
+            "validation_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
         }
     },
     "post-processing": {
@@ -30,7 +30,7 @@
         "tab_to_bracket":false
     },
     "evaluation": true,
-    "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
+    "gold_test_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_test.conllu"
     }
 }
 
diff --git a/code/discut22_1.py b/code/discut22_1.py
index 1a40cfe..60c543c 100644
--- a/code/discut22_1.py
+++ b/code/discut22_1.py
@@ -15,7 +15,7 @@ import json
 
 from classes_def import Input, Process
 import utils
-import utils.fr_tokenize as tk
+#import utils.fr_tokenize as tk
 import utils.conv2ner as c2n
 import utils.json2conll as j2c
 import utils.conll2bracket as c2bracket
@@ -37,16 +37,20 @@ def get_config_infos(stamp, config_file):
 # fonction to load existing model -> only tony for now
 def get_model(model_name):
     name = model_name
+    output = ""
 
     if name == "tony":
         arch = "french_tokens.tar.gz"
         if not os.path.isfile(f"../model/{name}/{arch}"):
             dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
             os.system(dl)
+            output = f"../model/{name}/{arch}"
         else:
             print("Tony already in place !")
+    else:
+        output = model_name
 
-    return f"../model/{name}/{arch}"
+    return output
 
 
 def text_tokenization(f_in, f_out, lang, tool):
@@ -106,12 +110,9 @@ def main(steps):
 
 
     if steps.main == "train":
-        #model_config = steps.model_config
-        #cmd = "bash utils/expes.sh eng.rst.rstdt model/config_training.jsonnet bert train"
-        #os.system(cmd)
         if steps.toolkit == "allennlp":
             print("toolkit allennlp for training")
-            # tr_allen.main(steps)
+            tr_allen.main(steps)
             # set the value of model from null to what was just created by training
             steps.model = f"{steps.data.resu}/model.tar.gz"
         elif steps.toolkit == "jiant":
-- 
GitLab