Commit ccb38671 authored by laura.riviere
fix requirements.txt

parent 42f75de9
 {
-    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter.",
+    "usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU bracket.",
     "input": {
-        "name": "chaperontest",
-        "file": ".ss",
-        "language": "fr"
+        "name": "eng_annotation",
+        "file": ".conllu",
+        "language": "en"
     },
     "steps":{
         "main": "annotation",
         "pre-processing": {
-            "tokenization": true,
-            "tokenization_tool" : "spacy",
+            "tokenization": false,
+            "tokenization_tool" : null,
             "sentence_split": false,
             "sentence_split_splitor": null,
             "syntactic_parsing": false,
             "NER_format_initialisation": true
         },
         "discourse_segmenter": {
-            "model": "tony",
+            "model": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/results_2022-11-21_15_42_42.923648/model.tar.gz",
             "training": {
                 "toolkit": null,
                 "pre_trained_lm": null,
......
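The keys touched by this change can be read back with nothing more than the standard json module. A minimal sketch, assuming the config is saved as config_usecase_1.json (the helper name and file name are illustrative, not part of the repository):

    import json

    def load_config(path):
        # Read a usecase config such as the one shown above
        # (load_config and the file name below are illustrative, not repository code).
        with open(path, encoding="utf-8") as f:
            return json.load(f)

    config = load_config("config_usecase_1.json")
    pre = config["steps"]["pre-processing"]
    if pre["tokenization"]:
        print("tokenize with", pre["tokenization_tool"])
    else:
        print("use input as-is:", config["input"]["name"] + config["input"]["file"])
    print("segmenter model:", config["steps"]["discourse_segmenter"]["model"])

With the new settings, tokenization is skipped (the input is already .conllu) and the segmenter model is an absolute path to a trained archive rather than the "tony" shortcut.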
 {
     "usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
     "input": {
-        "name": "eng.rst.rstdt",
+        "name": "eng.sdrt.stac",
         "file": ".conllu",
         "language": "en"
     },
@@ -9,9 +9,9 @@
         "main": "train",
         "pre-processing": {
             "tokenization": false,
-            "tokenization_tool" : "spacy",
+            "tokenization_tool" : null,
             "sentence_split": false,
-            "sentence_split_splitor": "stanza",
+            "sentence_split_splitor": null,
             "syntactic_parsing": false,
             "NER_format_initialisation": true
         },
@@ -21,8 +21,8 @@
             "toolkit": "allennlp",
             "pre_trained_lm": "bert",
             "config_file": "../model/config_training_bert.jsonnet",
-            "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
-            "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
+            "train_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
+            "validation_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
             }
         },
         "post-processing": {
@@ -30,7 +30,7 @@
             "tab_to_bracket":false
         },
         "evaluation": true,
-        "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
+        "gold_test_data_path": "../data/eng.sdrt.stac/eng.sdrt.stac_test.conllu"
     }
 }
......
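All three corpus paths in this config follow the same naming convention, so switching from eng.rst.rstdt to eng.sdrt.stac only changes the corpus name. A small sketch of that convention (split_paths is an illustrative helper, not part of the repository):

    from pathlib import Path

    def split_paths(corpus, data_dir="../data"):
        # Expand a corpus name into the train/dev/test paths used above,
        # following the pattern ../data/<corpus>/<corpus>_<split>.conllu.
        return {split: Path(data_dir) / corpus / f"{corpus}_{split}.conllu"
                for split in ("train", "dev", "test")}

    paths = split_paths("eng.sdrt.stac")
    # paths["train"] -> ../data/eng.sdrt.stac/eng.sdrt.stac_train.conllu
    # paths["test"]  -> ../data/eng.sdrt.stac/eng.sdrt.stac_test.conllu (the gold_test_data_path above)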
@@ -15,7 +15,7 @@ import json
 from classes_def import Input, Process
 import utils
-import utils.fr_tokenize as tk
+#import utils.fr_tokenize as tk
 import utils.conv2ner as c2n
 import utils.json2conll as j2c
 import utils.conll2bracket as c2bracket
@@ -37,16 +37,20 @@ def get_config_infos(stamp, config_file):
 # fonction to load existing model -> only tony for now
 def get_model(model_name):
     name = model_name
+    output = ""
     if name == "tony":
         arch = "french_tokens.tar.gz"
         if not os.path.isfile(f"../model/{name}/{arch}"):
             dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
             os.system(dl)
+            output = f"../model/{name}/{arch}"
         else:
             print("Tony already in place !")
+    else:
+        output = model_name
-    return f"../model/{name}/{arch}"
+    return output


 def text_tokenization(f_in, f_out, lang, tool):
@@ -106,12 +110,9 @@ def main(steps):
     if steps.main == "train":
-        #model_config = steps.model_config
-        #cmd = "bash utils/expes.sh eng.rst.rstdt model/config_training.jsonnet bert train"
-        #os.system(cmd)
         if steps.toolkit == "allennlp":
             print("toolkit allennlp for training")
-            # tr_allen.main(steps)
+            tr_allen.main(steps)
+            # set the value of model from null to what was just created by training
+            steps.model = f"{steps.data.resu}/model.tar.gz"
         elif steps.toolkit == "jiant":
......
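Taken together, the get_model change means the segmenter model is now resolved in two ways: the literal name "tony" still maps to the pre-trained French model downloaded from Zenodo, while any other value is passed through as a path to an already trained model.tar.gz, which is what lets the usecase_1 config above point directly at a results_*/model.tar.gz archive. A standalone sketch of that behaviour (resolve_model is an illustrative name, and urllib stands in for the repository's wget call):

    import os
    import urllib.request

    def resolve_model(model_name, model_dir="../model"):
        # Sketch of the resolution logic shown in the diff: "tony" means the
        # pre-trained French model, fetched from Zenodo when it is not on disk;
        # any other value is treated as a path to a trained model.tar.gz and
        # returned unchanged.
        if model_name == "tony":
            target = os.path.join(model_dir, "tony", "french_tokens.tar.gz")
            if not os.path.isfile(target):
                os.makedirs(os.path.dirname(target), exist_ok=True)
                urllib.request.urlretrieve(
                    "https://zenodo.org/record/4235850/files/french_tokens.tar.gz",
                    target,
                )
            return target
        return model_name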