From ccb38671d2aadadebf908f21ac3e26dff8285f69 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Mon, 21 Nov 2022 16:21:36 +0100
Subject: [PATCH] fix requirements.txt

---
 .../custom_bert_token_embedder.cpython-37.pyc | Bin 9434 -> 9434 bytes
 .../custom_conll_reader.cpython-37.pyc        | Bin 7306 -> 7306 bytes
 .../custom_disrpt_reader.cpython-37.pyc       | Bin 7627 -> 7627 bytes
 .../custom_simple_tagger.cpython-37.pyc       | Bin 8475 -> 8475 bytes
 code/config_global_1.json                     | 14 +++++++-------
 code/config_global_3.json                     | 12 ++++++------
 code/discut22_1.py                            | 13 +++++++------
 7 files changed, 20 insertions(+), 19 deletions(-)

diff --git a/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc b/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc
index 7ba9802b113f8fe5be38ab83deb4daf2695ec9ff..165899baf4e577c29f30b67b7ffc7e0f9ddeb9db 100644
GIT binary patch
delta 20
bcmccRdCQaAiI<m)fq{X+JF<Ku_jwfnJT?V_

delta 20
bcmccRdCQaAiI<m)fq{X+p)YJB_jwfnJc0#_

diff --git a/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc
index 61f75bd353d9d94355e7f854fbb639bdca3a0c6f..e1cd9b9e6975f5ec5f52e93593be0b21fec841a6 100644
GIT binary patch
delta 20
bcmeCO?6Tx`;^pOHU|?YIjx68ET`vOwEp7yA

delta 20
bcmeCO?6Tx`;^pOHU|?Wy=nLD(T`vOwExH7A

diff --git a/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc
index b5fed7c4fd030664682f83ecb79a6980d3d2cf6b..5effd2460e7a8c5f97dcecd3a560c3dba932e235 100644
GIT binary patch
delta 20
bcmX?YecGDaiI<m)fq{X+JF<Ku_W@Y|H{S&^

delta 20
bcmX?YecGDaiI<m)fq{X+p)YJB_W@Y|I4cD^

diff --git a/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc b/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc
index c1647c89fbf6170d62b81ac688416a44a176c2c6..0eb4cd9a434b2331b7a46a62b763bccb2235e80c 100644
GIT binary patch
delta 20
bcmbR3G~0>WiI<m)fq{X+JF<Kuw}2u5Fkb{T

delta 20
bcmbR3G~0>WiI<m)fq{X+p)YJBw}2u5FslST

diff --git a/code/config_global_1.json b/code/config_global_1.json
index cd6c01c..e767ec2 100644
--- a/code/config_global_1.json
+++ b/code/config_global_1.json
@@ -1,22 +1,22 @@
 {
-    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.",
+    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
     "input": {
-        "name": "chaperontest",
-        "file": ".ss",
-        "language": "fr"
+        "name": "eng_annotation",
+        "file": ".conllu",
+        "language": "en"
     },
     "steps":{
         "main": "annotation",
         "pre-processing": {
-            "tokenization": true,
-            "tokenization_tool" : "spacy",
+            "tokenization": false,
+            "tokenization_tool" : null,
             "sentence_split": false,
             "sentence_split_splitor": null,
             "syntactic_parsing": false,
             "NER_format_initialisation": true
         },
         "discourse_segmenter": {
-            "model": "tony",
+            "model": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/results_2022-11-21_15_42_42.923648/model.tar.gz",
             "training": {
                 "toolkit": null,
                 "pre_trained_lm": null,
diff --git a/code/config_global_3.json b/code/config_global_3.json
index 9703e8d..c1efee8 100644
--- a/code/config_global_3.json
+++ b/code/config_global_3.json
@@ -1,7 +1,7 @@
 {
     "usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
     "input": {
-        "name": "eng.rst.rstdt",
+        "name": "eng.sdrt.stac",
         "file": ".conllu",
         "language": "en"
     },
@@ -9,9 +9,9 @@
         "main": "train",
         "pre-processing": {
             "tokenization": false,
-            "tokenization_tool" : "spacy",
+            "tokenization_tool" : null,
             "sentence_split": false,
-            "sentence_split_splitor": "stanza",
+            "sentence_split_splitor": null,
             "syntactic_parsing": false,
             "NER_format_initialisation": true
         },
@@ -21,8 +21,8 @@
             "toolkit": "allennlp",
             "pre_trained_lm": "bert",
             "config_file": "../model/config_training_bert.jsonnet",
-            "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
-            "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
+            "train_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
+            "validation_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
         }
     },
     "post-processing": {
@@ -30,7 +30,7 @@
         "tab_to_bracket":false
     },
     "evaluation": true,
-    "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
+    "gold_test_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_test.conllu"
     }
 }
 
diff --git a/code/discut22_1.py b/code/discut22_1.py
index 1a40cfe..60c543c 100644
--- a/code/discut22_1.py
+++ b/code/discut22_1.py
@@ -15,7 +15,7 @@ import json
 
 from classes_def import Input, Process
 import utils
-import utils.fr_tokenize as tk
+#import utils.fr_tokenize as tk
 import utils.conv2ner as c2n
 import utils.json2conll as j2c
 import utils.conll2bracket as c2bracket
@@ -37,16 +37,20 @@ def get_config_infos(stamp, config_file):
 # fonction to load existing model -> only tony for now
 def get_model(model_name):
     name = model_name
+    output = ""
 
     if name == "tony":
         arch = "french_tokens.tar.gz"
         if not os.path.isfile(f"../model/{name}/{arch}"):
             dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
             os.system(dl)
+            output = f"../model/{name}/{arch}"
         else:
             print("Tony already in place !")
+    else:
+        output = model_name
 
-    return f"../model/{name}/{arch}"
+    return output
 
 
 def text_tokenization(f_in, f_out, lang, tool):
@@ -106,12 +110,9 @@ def main(steps):
 
 
     if steps.main == "train":
-        #model_config = steps.model_config
-        #cmd = "bash utils/expes.sh eng.rst.rstdt model/config_training.jsonnet bert train"
-        #os.system(cmd)
         if steps.toolkit == "allennlp":
             print("toolkit allennlp for training")
-            # tr_allen.main(steps)
+            tr_allen.main(steps)
             # set the value of model from null to what was just created by training
             steps.model = f"{steps.data.resu}/model.tar.gz"
         elif steps.toolkit == "jiant":
-- 
GitLab