Commit 1c404fc4 authored by laura.riviere

add config with path to en model in stout

parent 58b29c78
Merge request: !2 Dev expes
@@ -14,9 +14,11 @@ Code: https://gitlab.inria.fr/andiamo/tony
# Usage
## Usecases
-- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation.
+- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation (illustrated below). --> config_1
-- **Segmentation Evaluation:** Take an EDU gold segmented text as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies.
+- **Segmentation Evaluation:** Take an EDU gold-segmented text as input, use a loaded model to make predictions. Output scores of the model's predictions against the gold standard, along with their discrepancies. --> config_2
+- **Custom Model Creation:** Fine-tune (over one or two levels) a pretrained language model with a specific dataset or a combination of datasets, then make predictions and run evaluation. --> config_3
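As a hypothetical illustration (the bracket placement here is invented; the exact output format depends on the `tab_to_bracket` post-processing step), usecase 1 maps raw text to the same text with EDU brackets:
```
Input : Le chaperon rouge traverse la forêt pour voir sa grand-mère.
Output: [Le chaperon rouge traverse la forêt] [pour voir sa grand-mère.]
```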
## Content description
@@ -40,17 +42,17 @@ pip install -r requirements.txt
## Configuration file: to choose or to complete
- `code/config_1.json` Config for usecase 1: take a sentence-split text, apply ToNy, output the same text but with EDU brackets.
- [TBD: model-training configs and all sorts of cool options]
## Run usecase 1
(go to the `code` directory)
Run this command, where `config_XX.json` is one of the config files described above (e.g. `config_1.json` for usecase 1):
```
-python discut22.py --config config_1.json
+python discut22.py --config config_XX.json
```
## Authors and acknowledgment
Morteza Ezzabady
Amir Zeldes
<!---
@@ -6,8 +6,7 @@
"folder_path": "../data/chaperontest",
"format": "raw_sentences",
"language": "fr",
"gold": false,
"results_path": "../data/chaperontest/results"
"gold": false
},
"output": {
"format": "bracket",
@@ -27,7 +26,8 @@
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
}
},
"evaluation": false
}
}
{
"usecase_description": "Config file for usecase_1 : from a text, get the same text but with EDU using an existing model.",
"input": {
"name": "my.cool.project",
"file": [".conllu", ".tok"],
"folder_path": "../data/my.cool.project",
"format": "truc",
"language": "en",
"gold": true
},
"output": {
"format": "ner_tok",
"framework": "rst"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": [false, true],
"sentence_split": [false, true],
"sentence_split_splitor": "stanza",
"syntactic_parsing": [false],
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": [false, true],
"tab_to_bracket": [false, true]
},
"evaluation": false,
"gold_test_data_path": null
}
}
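For orientation, here is a minimal sketch of how a config file like the one above might be loaded into the `steps` object that `discut22.py` reads (`steps.ssplit`, `steps.ssplitor`, `steps.toke`, `steps.data.path`, ...). The real `get_config_infos` is not part of this diff, so the loader below is an assumption about its behavior, not the actual implementation:
```
import json
from types import SimpleNamespace

def load_config(path):
    """Hypothetical stand-in for get_config_infos: maps a config_X.json
    file onto the attributes used in main() below."""
    with open(path, encoding="utf-8") as f:
        cfg = json.load(f)
    pre = cfg["steps"]["pre-processing"]
    return SimpleNamespace(
        data=SimpleNamespace(
            path=cfg["input"]["folder_path"],
            name=cfg["input"]["name"],
            file=cfg["input"]["file"],   # may be a list of extensions, as above
            lang=cfg["input"]["language"],
        ),
        toke=pre["tokenization"],
        ssplit=pre["sentence_split"],
        ssplitor=pre["sentence_split_splitor"],
        model=cfg["steps"]["discourse_segmenter"]["model"],
        evaluation=cfg["steps"]["evaluation"],
    )

# Usage (hypothetical): steps = load_config("config_1.json")
```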
@@ -6,8 +6,7 @@
"folder_path": "../data/fra.sdrt.annodis_dev",
"format": "truc",
"language": "fr",
"gold": true,
"results_path": "../data/fra.sdrt.annodis_dev/results"
"gold": true
},
"output": {
"format": "ner_tok",
......
@@ -7,8 +7,7 @@
"folder_path": "../data/fra.sdrt.annodis_dev",
"format": "truc",
"language": "fr",
"gold": true,
"results_path": "../data/fra.sdrt.annodis_dev/results"
"gold": true
},
"output": {
"format": "ner_tok",
......
@@ -48,7 +48,6 @@ def get_model(model_name):
# main call
def main(steps):
-    #steps = get_config_infos(config) # we get the list of steps
@@ -60,13 +59,12 @@ def main(steps):
    # FN: either we need sentence splitting, or tokenization, or neither of the two
    if steps.ssplit == True : # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
        #### Split text into sentences: not in usecase 1
        if not steps.ssplitor == "stanza" :
            print("please define a sentence splitor") # TODO: raise an error and kill the process instead
        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
        data_tok = f"{steps.data.path}/{steps.data.name}.tok"
        print(f"Starting sentence splitting... output to {steps.data.path}/{steps.data.name}")
-        # ssent.main(data_in, data_tok, "stanza", steps.data.lang)
-        ssent.main(data_in, data_tok, "stanza", steps.data.lang)
+        ssent.main(data_in, data_tok, steps.ssplitor, steps.data.lang)
    elif steps.toke == True :
        #### Tokenize the text # python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
        data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
@@ -74,7 +72,6 @@ def main(steps):
-        # sys.exit("check path")
        print(f"Starting tokenization... output to {data_tok}")
        tk.main(data_in, data_tok) # .ss -> .tok
    else:
        data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
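As a companion to the ssplit branch above, here is a minimal sketch of stanza-based sentence splitting of the kind `ssent.main` is called for. This is a hypothetical stand-in: the real `ssent` module is not part of this diff, and the file paths in the usage comment are invented.
```
# Hypothetical stand-in for ssent.main(data_in, data_tok, "stanza", lang):
# reads raw text, splits it into sentences with stanza, and writes one
# sentence per line to the .tok output file.
import stanza

def split_sentences(data_in, data_out, lang):
    stanza.download(lang, verbose=False)  # fetch the model on first run
    nlp = stanza.Pipeline(lang=lang, processors="tokenize")
    with open(data_in, encoding="utf-8") as f:
        doc = nlp(f.read())
    with open(data_out, "w", encoding="utf-8") as out:
        for sent in doc.sentences:
            out.write(sent.text + "\n")

# split_sentences("../data/chaperontest/my.text.txt",
#                 "../data/chaperontest/my.text.tok", "fr")
```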