diff --git a/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc b/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc index 7ba9802b113f8fe5be38ab83deb4daf2695ec9ff..165899baf4e577c29f30b67b7ffc7e0f9ddeb9db 100644 Binary files a/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc and b/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc differ diff --git a/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc index 61f75bd353d9d94355e7f854fbb639bdca3a0c6f..e1cd9b9e6975f5ec5f52e93593be0b21fec841a6 100644 Binary files a/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc and b/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc differ diff --git a/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc index b5fed7c4fd030664682f83ecb79a6980d3d2cf6b..5effd2460e7a8c5f97dcecd3a560c3dba932e235 100644 Binary files a/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc and b/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc differ diff --git a/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc b/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc index c1647c89fbf6170d62b81ac688416a44a176c2c6..0eb4cd9a434b2331b7a46a62b763bccb2235e80c 100644 Binary files a/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc and b/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc differ diff --git a/code/config_global_1.json b/code/config_global_1.json index cd6c01ca6812e45cf87d284f4ee323677800a07b..e767ec2093f974e37945535c3577818ccb2be08c 100644 --- a/code/config_global_1.json +++ b/code/config_global_1.json @@ -1,22 +1,22 @@ { - "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.", + "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.", "input": { - "name": "chaperontest", - "file": ".ss", - "language": "fr" + "name": "eng_annotation", + "file": ".conllu", + "language": "en" }, "steps":{ "main": "annotation", "pre-processing": { - "tokenization": true, - "tokenization_tool" : "spacy", + "tokenization": false, + "tokenization_tool" : null, "sentence_split": false, "sentence_split_splitor": null, "syntactic_parsing": false, "NER_format_initialisation": true }, "discourse_segmenter": { - "model": "tony", + "model": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/results_2022-11-21_15_42_42.923648/model.tar.gz", "training": { "toolkit": null, "pre_trained_lm": null, diff --git a/code/config_global_3.json b/code/config_global_3.json index 9703e8d798bbb98f1cf02f1dd40f7c2c061e3473..c1efee85638fee908c6c15a36ed4b4766e70d0a9 100644 --- a/code/config_global_3.json +++ b/code/config_global_3.json @@ -1,7 +1,7 @@ { "usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.", "input": { - "name": "eng.rst.rstdt", + "name": "eng.sdrt.stac", "file": ".conllu", "language": "en" }, @@ -9,9 +9,9 @@ "main": "train", "pre-processing": { "tokenization": false, - "tokenization_tool" : "spacy", + "tokenization_tool" : null, "sentence_split": false, - "sentence_split_splitor": "stanza", + "sentence_split_splitor": null, "syntactic_parsing": false, "NER_format_initialisation": true }, @@ -21,8 +21,8 @@ "toolkit": "allennlp", "pre_trained_lm": "bert", "config_file": "../model/config_training_bert.jsonnet", - "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu", - "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu" + "train_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_train.conllu", + "validation_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu" } }, "post-processing": { @@ -30,7 +30,7 @@ "tab_to_bracket":false }, "evaluation": true, - "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu" + "gold_test_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_test.conllu" } } diff --git a/code/discut22_1.py b/code/discut22_1.py index 1a40cfe5c0206ff3826bf173ef805dcb0345c92b..60c543c91db77ee70a92113a220e9f8a9fc0c670 100644 --- a/code/discut22_1.py +++ b/code/discut22_1.py @@ -15,7 +15,7 @@ import json from classes_def import Input, Process import utils -import utils.fr_tokenize as tk +#import utils.fr_tokenize as tk import utils.conv2ner as c2n import utils.json2conll as j2c import utils.conll2bracket as c2bracket @@ -37,16 +37,20 @@ def get_config_infos(stamp, config_file): # fonction to load existing model -> only tony for now def get_model(model_name): name = model_name + output = "" if name == "tony": arch = "french_tokens.tar.gz" if not os.path.isfile(f"../model/{name}/{arch}"): dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar" os.system(dl) + output = f"../model/{name}/{arch}" else: print("Tony already in place !") + else: + output = model_name - return f"../model/{name}/{arch}" + return output def text_tokenization(f_in, f_out, lang, tool): @@ -106,12 +110,9 @@ def main(steps): if steps.main == "train": - #model_config = steps.model_config - #cmd = "bash utils/expes.sh eng.rst.rstdt model/config_training.jsonnet bert train" - #os.system(cmd) if steps.toolkit == "allennlp": print("toolkit allennlp for training") - # tr_allen.main(steps) + tr_allen.main(steps) # set the value of model from null to what was just created by training steps.model = f"{steps.data.resu}/model.tar.gz" elif steps.toolkit == "jiant":