From b30aaabe02296e0ad263370db2801077e79678f7 Mon Sep 17 00:00:00 2001
From: "laura.riviere" <laura.riviere@irit.fr>
Date: Mon, 23 Jan 2023 18:04:05 +0100
Subject: [PATCH] add discut

---
 DISCUT/README.md                            | 123 ++++++
 DISCUT/code/config_global_1.1.json          |  50 +++
 DISCUT/code/config_global_1.2.json          |  50 +++
 DISCUT/code/config_global_2.json            |  50 +++
 DISCUT/code/config_global_3.json            |  50 +++
 DISCUT/code/config_global_4.json            |  50 +++
 DISCUT/code/discut22_2.py                   | 423 ++++++++++++++++++++
 DISCUT/{.gitkeep => code/utils/__init__.py} |   0
 DISCUT/code/utils/conll2bracket.py          |  75 ++++
 DISCUT/code/utils/conv2ner.py               | 128 ++++++
 DISCUT/code/utils/fr_tokenize.py            |  91 +++++
 DISCUT/code/utils/json2conll.py             |  98 +++++
 DISCUT/code/utils/seg_eval.py               | 224 +++++++++++
 DISCUT/code/utils/sent_split.py             | 220 ++++++++++
 DISCUT/code/utils/syntactic_parsing.py      |  65 +++
 DISCUT/code/utils/training_allennlp.py      |  57 +++
 DISCUT/data/chaperonrouge/chaperonrouge.txt |   1 +
 DISCUT/requirements.txt                     |  98 +++++
 18 files changed, 1853 insertions(+)
 create mode 100644 DISCUT/README.md
 create mode 100644 DISCUT/code/config_global_1.1.json
 create mode 100644 DISCUT/code/config_global_1.2.json
 create mode 100644 DISCUT/code/config_global_2.json
 create mode 100644 DISCUT/code/config_global_3.json
 create mode 100644 DISCUT/code/config_global_4.json
 create mode 100644 DISCUT/code/discut22_2.py
 rename DISCUT/{.gitkeep => code/utils/__init__.py} (100%)
 create mode 100644 DISCUT/code/utils/conll2bracket.py
 create mode 100644 DISCUT/code/utils/conv2ner.py
 create mode 100644 DISCUT/code/utils/fr_tokenize.py
 create mode 100644 DISCUT/code/utils/json2conll.py
 create mode 100644 DISCUT/code/utils/seg_eval.py
 create mode 100644 DISCUT/code/utils/sent_split.py
 create mode 100644 DISCUT/code/utils/syntactic_parsing.py
 create mode 100644 DISCUT/code/utils/training_allennlp.py
 create mode 100644 DISCUT/data/chaperonrouge/chaperonrouge.txt
 create mode 100644 DISCUT/requirements.txt

diff --git a/DISCUT/README.md b/DISCUT/README.md
new file mode 100644
index 0000000..c6021e3
--- /dev/null
+++ b/DISCUT/README.md
@@ -0,0 +1,123 @@
+# Project DisCut22 : Discourse Annotator Tool
+
+A tool for Discourse Annotation. Inheritor of ToNy and DisCut, segmentors for DISRPT 2019 and 2021. The goal of this version is to be easy to use with or without IT knowledge.
+
+__2021__  
+*[Multi-lingual Discourse Segmentation and Connective Identification: MELODI at Disrpt2021](https://aclanthology.org/2021.disrpt-1.3.pdf)*  
+Code: https://gitlab.irit.fr/melodi/andiamo/discoursesegmentation/discut
+
+__2019__  
+*[ToNy: Contextual embeddings for accurate multilingual discourse segmentation of full documents](https://www.aclweb.org/anthology/W19-2715.pdf)*  
+Code: https://gitlab.inria.fr/andiamo/tony
+
+
+
+
+## Usecases 
+- **Discourse Segmentation: "annotation"** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation.  
+&rarr; `config_global_1.1.json` and `config_global_1.2.json`.  
+- **Segmentation Evaluation: "test"** Take an EDU gold segmented text as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies.  
+&rarr; `config_global_2.json`.  
+- **Custom Model Creation: "train"** Train a new model using a pretrained Language Model (BERT, etc) and a specific dataset or combination of datasets. Then make predictions and evaluation.  
+&rarr; `config_global_3.json`.
+- **Custom Model fine-tuning: "fine_tune"** Fine-tune an existing model using a pretrained Language Model (BERT, etc) and a specific dataset or combination of datasets. Then make predictions and evaluation.  
+&rarr; `config_global_4.json`.
+
+## Content description  
+
+- `README.md` Description of project.  
+- `global_config_file_guideline.md` Contains detailed documentation to build well formed config_global file.
+- `data/my_cool_dataset/` Contains the raw data; the data file must have the same name as its directory.
+- `code/` Contains main scripts.  
+    - `config_global_XX.json` A file to be completed for your specific project.  
+    - `utils/` Contains useful scripts to be called.  
+    - `discut22_2.py` One python script to run them all.  
+- `model/` Contains model to be loaded.
+    - `config_training.jsonnet` A file to be completed for usecases 3 and 4.
+- `projects/` This directory will be created automatically.
+    - `my_cool_exp_v1/` Name of your run. This directory will be created automatically. (see Usage)
+        - `logs_global.json` Logs of all processes, data and results. This file will be created automatically.
+        - `data_converted/` Contains pre-processed data if needed. This directory will be created automatically.
+        - `results/` Contains output files, logs and metrics, if any. This directory will be created automatically.
+            - `train/` Contains specific output related to train (like the model created), if any.
+            - `fine_tune/` Contains specific output related to fine-tuning (like the model created), if any.
+
+
+## Set up environment
+- DisCut22 runs on Python 3.7. Advice: create a dedicated virtual environment (Miniconda...).
+- Install all required libraries with the following command:
+```
+pip install -r requirements.txt
+```
+- Install pytorch:
+```
+pip install torch==1.9.0+cu111 torchvision==0.10.0+cu111 torchaudio===0.9.0 -f https://download.pytorch.org/whl/torch_stable.html
+```
+
+
+## Configuration file: to choose or to complete
+- `code/config_global_X.json` &rarr; See `global_config_file_guideline.md`.
+
+
+# Usage
+(go to `code` directory)  
+Run the command:
+```
+python discut22_2.py --config config_global_XX.json [--name my_run_name] [-o]
+```
+--config <> &nbsp; &nbsp; &nbsp; &nbsp; Your config file. (Mandatory)  
+--name <> &nbsp; &nbsp; &nbsp; &nbsp; A name for your run. (Optional)  
+-o, --overwrite &nbsp; &nbsp; &nbsp; &nbsp; Allow overwriting of `data_converted/` and `results/`. (optional)
+
+
+## Support
+laura.riviere@irit.fr
+
+
+## Authors and acknowledgment
+Morteza Ezzabady  
+Laura Rivière  
+Amir Zeldes  
+
+
+
+## License
+Copyright 2023 IRIT-MELODI
+<!---
+
+## Test and Deploy
+
+## Description
+Let people know what your project can do specifically. Provide context and add a link to any reference visitors might be unfamiliar with. A list of Features or a Background subsection can also be added here. If there are alternatives to your project, this is a good place to list differentiating factors.
+
+## Badges
+On some READMEs, you may see small images that convey metadata, such as whether or not all the tests are passing for the project. You can use Shields to add some to your README. Many services also have instructions for adding a badge.
+
+## Visuals
+Depending on what you are making, it can be a good idea to include screenshots or even a video (you'll frequently see GIFs rather than actual videos). Tools like ttygif can help, but check out Asciinema for a more sophisticated method.
+
+## Installation
+Within a particular ecosystem, there may be a common way of installing things, such as using Yarn, NuGet, or Homebrew. However, consider the possibility that whoever is reading your README is a novice and would like more guidance. Listing specific steps helps remove ambiguity and gets people to using your project as quickly as possible. If it only runs in a specific context like a particular programming language version or operating system or has dependencies that have to be installed manually, also add a Requirements subsection.
+
+## Usage
+Use examples liberally, and show the expected output if you can. It's helpful to have inline the smallest example of usage that you can demonstrate, while providing links to more sophisticated examples if they are too long to reasonably include in the README.
+
+## Support
+Tell people where they can go to for help. It can be any combination of an issue tracker, a chat room, an email address, etc.
+
+## Roadmap
+If you have ideas for releases in the future, it is a good idea to list them in the README.
+
+## Contributing
+State if you are open to contributions and what your requirements are for accepting them.
+
+For people who want to make changes to your project, it's helpful to have some documentation on how to get started. Perhaps there is a script that they should run or some environment variables that they need to set. Make these steps explicit. These instructions could also be useful to your future self.
+
+You can also document commands to lint the code or run tests. These steps help to ensure high code quality and reduce the likelihood that the changes inadvertently break something. Having instructions for running tests is especially helpful if it requires external setup, such as starting a Selenium server for testing in a browser.
+
+## Authors and acknowledgment
+## License
+## Project status
+If you have run out of energy or time for your project, put a note at the top of the README saying that development has slowed down or stopped completely. Someone may choose to fork your project or volunteer to step in as a maintainer or owner, allowing your project to keep going. You can also make an explicit request for maintainers.
+
+--->
\ No newline at end of file
diff --git a/DISCUT/code/config_global_1.1.json b/DISCUT/code/config_global_1.1.json
new file mode 100644
index 0000000..f0b5843
--- /dev/null
+++ b/DISCUT/code/config_global_1.1.json
@@ -0,0 +1,50 @@
+{
+    "usecase_description": "Config file for usecase_1 : from a raw text, get the same text but with EDU bracket.",
+    "data_raw": {
+        "name": "edgar_poe_en",
+        "exte": ".txt",
+        "language": "en",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "to_do": true,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": false,
+            "create_metadata": {
+                "to_do": true,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "evaluation": true,
+        "gold_test_data_path": null
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": false
+        },
+        "txt_file":{
+            "to_do": false,
+            "metadata": true
+        }
+    }
+}
+
+
+
diff --git a/DISCUT/code/config_global_1.2.json b/DISCUT/code/config_global_1.2.json
new file mode 100644
index 0000000..7a6b920
--- /dev/null
+++ b/DISCUT/code/config_global_1.2.json
@@ -0,0 +1,50 @@
+{
+    "usecase_description": "Config file for usecase_1 : from a tokenized text, get the same text but with EDU bracket.",
+    "data_raw": {
+        "name": "edgar_poe_short",
+        "exte": ".conll",
+        "language": "en",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": true,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/discut/Results_conllu/results_eng.rst.gum-eng_bert/model.tar.gz",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "evaluation": true,
+        "gold_test_data_path": null
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
+    }
+}
+
+
+
diff --git a/DISCUT/code/config_global_2.json b/DISCUT/code/config_global_2.json
new file mode 100644
index 0000000..988e706
--- /dev/null
+++ b/DISCUT/code/config_global_2.json
@@ -0,0 +1,50 @@
+{
+    "usecase_description": "Config file for usecase_2 : from a gold text, make predictions with an existing model, compare gold and predictions with metrics.",
+    "data_raw": {
+        "name": "eng.pdtb.pdtb_dev",
+        "exte": ".conllu",
+        "language": "en",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "test",
+        "pre-processing": {
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": true,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/morteza/discut/Results_conllu/results_eng.pdtb.pdtb_bert/model.tar.gz",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "evaluation": true,
+        "gold_test_data_path": null
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": true,
+            "metadata": true
+        }
+    }
+}
+
+
+
diff --git a/DISCUT/code/config_global_3.json b/DISCUT/code/config_global_3.json
new file mode 100644
index 0000000..c24b4ff
--- /dev/null
+++ b/DISCUT/code/config_global_3.json
@@ -0,0 +1,50 @@
+{
+    "usecase_description": "Config file for usecase_3 : from a dataset, splited in train/dev/test, train a model (= fine-tune a LM) and test on testset.",
+    "data_raw": {
+        "name": "eng.rst.rstdt",
+        "exte": ".conllu",
+        "language": "en",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "train",
+        "pre-processing": {
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": false,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": null,
+            "training": {
+                "toolkit": "allennlp",
+                "pre_trained_lm": "bert",
+                "config_file": "/home/lriviere/andiamo/discut22/model/config_training_bert_m.jsonnet",
+                "train_data_path": "eng.rst.rstdt_train",
+                "validation_data_path": "eng.rst.rstdt_dev"
+            }
+        },
+        "evaluation": true,
+        "gold_test_data_path": "eng.rst.rstdt_dev"
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": false,
+            "metadata": true
+        }
+    }
+}
+
+
+
diff --git a/DISCUT/code/config_global_4.json b/DISCUT/code/config_global_4.json
new file mode 100644
index 0000000..52e75d3
--- /dev/null
+++ b/DISCUT/code/config_global_4.json
@@ -0,0 +1,50 @@
+{
+    "usecase_description": "Config file for usecase_4 : from a dataset, splited in train/dev/test, fine-tune a model (= made of fine-tune of a LM) and test on testset.",
+    "data_raw": {
+        "name": "eng.rst.rstdt",
+        "exte": ".conllu",
+        "language": "en",
+        "existing_metadata": true
+    },
+    "steps":{
+        "main": "fine_tune",
+        "pre-processing": {
+            "to_do": false,
+            "syntactic_tool": "stanza",
+            "sentence_split": true,
+            "tokenization": true,
+            "syntactic_parsing": true,
+            "create_metadata": {
+                "to_do": false,
+                "line": "paragraph",
+                "sent": "sent"
+            }
+        },
+        "discourse_segmenter": {
+            "model": "/home/lriviere/andiamo/discut22/data/eng.rst.rstdt/results_lundi9/model.tar.gz",
+            "training": {
+                "toolkit": "allennlp",
+                "pre_trained_lm": "bert",
+                "config_file": "/home/lriviere/andiamo/discut22/model/config_training_bert_m.jsonnet",
+                "train_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_train.conllu",
+                "validation_data_path": "/home/lriviere/andiamo/discut22/data/eng.sdrt.stac/eng.sdrt.stac_dev.conllu"
+            }
+        },
+        "evaluation": true,
+        "gold_test_data_path": "eng.rst.rstdt_dev"
+    },
+    "output":{
+        "conll_file":{
+            "to_do": true,
+            "metadata": true,
+            "with_gold_labels": true
+        },
+        "txt_file":{
+            "to_do": false,
+            "metadata": true
+        }
+    }
+}
+
+
+
diff --git a/DISCUT/code/discut22_2.py b/DISCUT/code/discut22_2.py
new file mode 100644
index 0000000..4c5caaa
--- /dev/null
+++ b/DISCUT/code/discut22_2.py
@@ -0,0 +1,423 @@
+######################################
+###### DISCOURSE SEGMENTOR 2022 ######
+######################################
+""" This is the main script
+    And the only one to run,
+    after completion of config.json 
+    Discut22 uses the allennlp toolkit. For that, it needs the NER intermediary format.
+"""
+
+import argparse
+from datetime import datetime
+import os
+import re
+import json
+import utils.syntactic_parsing as synt_pars
+import utils.conv2ner as conv_to_ner # TODO clean it
+import utils.json2conll as json_to_connl # TODO clean it
+import utils.training_allennlp as tr_allen
+import utils.conll2bracket as c2bracket
+import utils.seg_eval as seg_eval
+
+
+        
+
+class Data:
+    def __init__(self, infos, stamp, stamp_time, overwrite):
+        self.name = infos['name']
+        self.lang = infos['language']
+        self.path = f"../data/{self.name}"
+        self.exte = infos['exte']
+        self.raw = f"{self.path}/{self.name}{self.exte}"
+        self.stamp = stamp
+        self.stamp_time = stamp_time
+        self.proj = "../projects"
+        self.run = f"{self.proj}/{stamp}"
+        self.conv = f"{self.run}/data_converted"
+        self.resu = f"{self.run}/results"
+        self.over = overwrite
+        self.meta = infos['existing_metadata']
+        
+
+    def create_folders(self): 
+        print(f"----> Checking/creating folders.")
+        if not os.path.isdir(self.proj):
+            os.mkdir(self.proj)
+        if not os.path.isdir(self.run):
+            os.mkdir(self.run)
+
+        if not os.path.isdir(self.conv):
+            os.mkdir(self.conv)
+        elif self.over == False:
+            self.conv = f"{self.conv}_{stamp_time}"
+            os.mkdir(self.conv)
+
+        if not os.path.isdir(self.resu):
+            os.mkdir(self.resu)
+        elif self.over == False:
+            self.resu = f"{self.resu}_{stamp_time}"
+            os.mkdir(self.resu)
+
+        self.resu_fine = f"{self.resu}/fine_tune"
+        if os.path.isdir(self.resu_fine) and self.over == False:
+            self.resu_fine = f"{self.resu_fine}_{stamp_time}"
+        elif os.path.isdir(self.resu_fine) and self.over == True:
+            os.rmdir(self.resu_fine)
+
+        self.resu_train = f"{self.resu}/train"
+        if os.path.isdir(self.resu_train) and self.over == False:
+            self.resu_train = f"{self.resu_train}_{stamp_time}"
+            
+
+    def pre_processing(self, steps, file_in=None):
+        file_in = self.raw if file_in == None else file_in
+        if steps.pre_process_to_do == True:
+            print(f"----> Preprocessing {self.raw}.")
+            file_out = f"{self.conv}/{self.name}.conll"
+            if steps.synt_tool == "stanza":
+                processors = []
+                metadata = {}
+                if steps.toke == True:
+                    processors.extend(['tokenize', 'mwt'])
+                if steps.synt_parse == True:
+                    processors.extend(['pos', 'lemma', 'depparse'])
+                #if steps.ssplit == True:
+                #    processors.append('constituency')
+                if steps.crea_meta == True:
+                    metadata['line'] = steps.meta_line
+                    metadata['sent'] = steps.meta_sent
+                if data.meta == True:
+                    metadata['meta'] = True
+                processors_str = ",".join(processors)
+                synt_pars.with_stanza(data.lang, file_in, file_out, processors_str, metadata)
+            else:
+                exit(f"Exited. Not valid syntactic tool: \"{steps.synt_tool}\". Options: \"stanza\". Change your config file.")
+        else:
+            file_out = file_in
+        logs.add_infos('data_preprocessed', file_out)
+        self.preprocessed = file_out 
+    
+    def make_ner_format(self):
+        """
+        This fonction build the NER format upon the Segmentor works.
+        INPUT: Tokenized text with whatever number of columns.
+        OUTPUT: Tokenized text with just 4 columns.
+        """
+        self.ner = f"{self.preprocessed}.ner"
+        self.ner = f"{self.conv}/{self.name}.ner"
+        print(f"----> Making NER format {self.ner}.")
+        conv_to_ner.main(self.preprocessed, self.ner, "conll") # <-- TODO faire en relatif#TODO add same for train/dev/test for config train
+        logs.add_infos('data_ner', self.ner)
+
+    def make_predictions(self, steps, js_name=None, fi_ner=None, model=None):
+        js_name = self.name if js_name == None else js_name
+        fi_ner = self.ner if fi_ner == None else fi_ner
+        model = steps.model_path if model == None else model
+        self.pred_json = f"{self.resu}/{js_name}_pred.json"
+        cmd = f"allennlp predict --use-dataset-reader --output-file {self.pred_json} {model} {fi_ner} &> {self.resu}/logs_predictions.txt"
+        print(f"----> Making predictions: {cmd}.")
+        os.system(cmd)
+        logs.add_infos('predictions_cmd', cmd)
+
+    def pred_json_to_conll_w_metadata_w_gold(self, name=None): # here and 3 below..sorry..factorsation TBD
+        name = self.name if name == None else name
+        self.pred_conll_meta_gold = f"{self.resu}/{name}_pred_meta_gold.conll"
+        json_to_connl.js2conllNmetaNgold(self.pred_json, self.pred_conll_meta_gold, "conll", self.preprocessed)
+        return self.pred_conll_meta_gold
+
+    def pred_json_to_conll_w_metadata(self, name=None):
+        name = self.name if name == None else name
+        self.pred_meta_conll = f"{self.resu}/{name}_pred_meta.conll"
+        json_to_connl.js2conllNmeta(self.pred_json, self.pred_meta_conll, "conll", self.preprocessed) 
+        return self.pred_meta_conll
+
+    def pred_json_to_conll_w_gold(self, name=None):
+        name = self.name if name == None else name
+        self.pred_conll_gold = f"{self.resu}/{name}_pred_gold.conll"
+        json_to_connl.js2conll(self.pred_json, self.pred_conll_gold, "conll") 
+        return self.pred_conll_gold
+
+    def pred_json_to_conll(self, name=None):
+        name = self.name if name == None else name
+        self.pred_conll = f"{self.resu}/{name}_pred.conll"
+        json_to_connl.js2conll(self.pred_json, self.pred_conll, "conll") 
+        return self.pred_conll
+
+    def brackets_txt(self, name=None):
+        name = self.name if name == None else name
+        self.brack = f"{self.resu}/{name}_brac.txt"
+        c2bracket.conll2brackets(self.pred_conll, self.brack)
+        return self.brack
+
+    def brackets_txt_with_metadata(self, name=None):
+        name = self.name if name == None else name
+        self.brack_meta = f"{self.resu}/{name}_brac_meta.txt"
+        c2bracket.conll2brackets_with_meta(self.pred_meta_conll, self.brack_meta)
+        return self.brack_meta
+
+    def evaluation(self, steps, prod, gold=None, name=None, model=None):
+        self.basic_metrics = f"{self.resu}/Evaluation_metrics.json"
+
+        gold = self.preprocessed if gold == None else gold
+        name = self.name if name == None else name
+        model = steps.model_path if model == None else model
+
+        if prod.conll_todo == False:    # get pred_file to compute metrics with seg_eval
+            pred = self.pred_json_to_conll(name)
+        else:                       
+            if prod.conll_meta == True:
+                if prod.conll_w_gold == True:
+                    pred = self.pred_json_to_conll_w_metadata_w_gold(name)
+                else:
+                    pred = self.pred_json_to_conll_w_metadata(name)
+            else:
+                if prod.conll_w_gold == True:
+                    pred = self.pred_json_to_conll_w_gold(name)
+                else:
+                    pred = self.pred_json_to_conll(name)
+
+        print(f"----> Predictions to file {pred}.")
+        print(f"----> Evaluation scores to file {self.basic_metrics}.")
+        scores_dict = seg_eval.get_scores(gold, pred)
+        scores_dict['model'] = model
+        logs.add_infos('basic_metrics', scores_dict)
+        logs.add_infos('output_conll_file', pred)
+
+        with open(self.basic_metrics, 'w') as fo:
+            json.dump(scores_dict, fo, indent=4)
+
+        if prod.txt_todo == True:
+            if prod.txt_meta == True:
+                pred = f"{self.resu}/{name}_pred_meta.conll"
+                if not os.path.isfile(pred):
+                    self.pred_json_to_conll_w_metadata(name)
+                pred_txt = self.brackets_txt_with_metadata(name)
+                # os.system(f"rm {pred})
+            else:
+                pred = f"{self.resu}/{name}_pred.conll"
+                if not os.path.isfile(pred):
+                    self.pred_json_to_conll
+                pred_txt = self.brackets_txt(name)
+                # os.system(f"rm {pred})
+            print(f"----> Predictions to file {pred_txt}.")
+            logs.add_infos('output_txt_file', pred_txt)
+
+    def make_output(self, prod):
+        if prod.conll_todo == True:
+            if prod.conll_meta == True:
+                pred = self.pred_json_to_conll_w_metadata()
+            else:
+                pred = self.pred_json_to_conll()
+            print(f"----> Predictions to file {pred}.")
+            logs.add_infos('output_conll_file', pred)
+        if prod.txt_todo == True:
+            if prod.txt_meta == True:
+                pred = self.pred_meta_conll = f"{self.resu}/{self.name}_pred_meta.conll"
+                if not os.path.isfile(pred):
+                    self.pred_json_to_conll_w_metadata()
+                pred_txt = self.brackets_txt_with_metadata()
+                # os.system(f"rm {pred})
+            else:
+                pred = self.pred_conll = f"{self.resu}/{self.name}_pred.conll"
+                if not os.path.isfile(pred):
+                    self.pred_json_to_conll()
+                pred_txt = self.brackets_txt()
+                # os.system(f"rm {pred})
+            print(f"----> Predictions to file {pred_txt}.")
+            logs.add_infos('output_txt_file', pred_txt)
+
+
+class Output:
+    def __init__(self, infos):
+        self.conll_todo = infos['conll_file']['to_do']
+        self.conll_meta = infos['conll_file']['metadata']
+        self.conll_w_gold = infos['conll_file']['with_gold_labels']
+        self.txt_todo = infos['txt_file']['to_do']
+        self.txt_meta = infos['txt_file']['metadata']
+
+
+class Process:
+    def __init__(self, infos):
+        self.main = infos["main"] # train test annotation
+
+        self.pre_process_to_do = infos['pre-processing']['to_do']
+        self.synt_tool = infos['pre-processing']['syntactic_tool']
+        self.synt_parse = infos['pre-processing']['syntactic_parsing']
+        self.toke = infos['pre-processing']['tokenization']
+        self.ssplit = infos['pre-processing']['sentence_split']
+        self.crea_meta = infos['pre-processing']['create_metadata']['to_do']
+        self.meta_line = infos['pre-processing']['create_metadata']['line']
+        self.meta_sent = infos['pre-processing']['create_metadata']['sent']
+        self.eval = infos['evaluation']
+
+        if self.main == "train" or "fine_tune":
+            self.set_train = infos['discourse_segmenter']['training']['train_data_path']
+            self.set_dev = infos['discourse_segmenter']['training']['validation_data_path']
+            self.set_test = infos['gold_test_data_path']
+
+        self.toolkit = infos['discourse_segmenter']['training']['toolkit']
+        self.tr_config = infos['discourse_segmenter']['training']['config_file']
+        self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
+
+        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony 
+        self.test_data = infos['gold_test_data_path']
+
+
+    def get_model(self):
+        self.model_path = ""
+        if self.model == "tony": 
+            arch = "french_tokens.tar.gz"
+            if not os.path.isfile(f"../model/tony/{arch}"):
+                dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
+                os.system(dl)
+                self.model_path = f"../model/tony/{arch}"
+            else:
+                print("----> Tony already in place !")
+                self.model_path = f"../model/tony/{arch}"
+        else:
+            self.model_path = self.model
+
+    def get_data_for_train(self, data):
+        # from names get path to input
+        self.train_raw = f"{data.path}/{self.set_train}{data.exte}"
+        self.dev_raw = f"{data.path}/{self.set_dev}{data.exte}"
+        self.test_raw = f"{data.path}/{self.set_test}{data.exte}"
+
+    def get_data_for_fine_tune(self, data):
+        """
+        spec: testset is the same that data_raw_name / 
+              trainset & devset are elsewhere and config fill with path not just name
+        """
+        self.train_raw = self.set_train
+        self.dev_raw = self.set_dev
+        self.test_raw = f"{data.path}/{self.set_test}{data.exte}"
+        # reset names to go ez pz for ner formatage
+        self.set_train = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.set_train))
+        self.set_dev = re.sub('\.[^\.]+$', '', re.sub('^.*/', '', self.dev_raw))
+
+    def make_sets_ner_format(self, data): # convert the raw train/dev/test files (self.*_raw) to NER files (self.*_ner)
+        self.train_ner = f"{data.conv}/{self.set_train}{data.exte}.ner"  # outputs live in data.conv, named <set><exte>.ner
+        self.dev_ner = f"{data.conv}/{self.set_dev}{data.exte}.ner" 
+        self.test_ner = f"{data.conv}/{self.set_test}{data.exte}.ner" 
+        print(f"----> Making NER format {self.train_ner}.")
+        conv_to_ner.main(self.train_raw, self.train_ner, "conll")  # the three inputs are always declared as "conll"
+        print(f"----> Making NER format {self.dev_ner}.")
+        conv_to_ner.main(self.dev_raw, self.dev_ner, "conll")
+        print(f"----> Making NER format {self.test_ner}.")
+        conv_to_ner.main(self.test_raw, self.test_ner, "conll")
+
+    def update_training_config(self):  # write a copy of the training config that points to the NER train/dev files
+        logs.add_json('training_config', self.tr_config)
+        self.tr_config_updated = "_up".join(os.path.splitext(self.tr_config))  # x.jsonnet -> x_up.jsonnet; never the original path
+        with open(self.tr_config, 'r') as js:
+            tr_conf = json.load(js)  # the config must therefore be plain JSON, whatever its extension
+        tr_conf['train_data_path'] = self.train_ner
+        tr_conf['validation_data_path'] = self.dev_ner
+        with open(self.tr_config_updated, 'w') as js:
+            json.dump(tr_conf, js)
+        logs.add_json('training_config_updated', self.tr_config_updated)
+
+    def training(self, data):
+        """Train a model with `allennlp train` (adding -f when data.over is set) and record the archive in self.model_path."""
+        cmd = f"allennlp train -s {data.resu_train} {self.tr_config_updated}{' -f' if data.over else ''} > {data.resu}/logs_training_{data.stamp_time}.txt 2>&1"
+        print(f"----> Training : {cmd}")
+        os.system(cmd)  # runs through /bin/sh: "> file 2>&1" is portable, bash-only "&>" would background the job under dash
+        self.model_path = f"{data.resu_train}/model.tar.gz"  # set on self, not on the module-level `steps` object
+        logs.add_infos('model_to make predictions', self.model_path)
+        logs.add_infos('logs_trainning_file', f"{data.resu}/logs_training_{data.stamp_time}.txt" )
+
+    def fine_tuning(self, data):  # fine-tune the archive at self.model_path; the result is recorded in self.model_ft_path
+        logs.add_infos('model_to be fine-tuned', self.model)
+        cmd = f"allennlp fine-tune -m {self.model_path} -c {self.tr_config_updated} -s {data.resu_fine} > {data.resu}/logs_fine-tuning_{data.stamp_time}.txt 2>&1"
+        print(f"----> Fine-tuning : {cmd}")
+        os.system(cmd)  # runs through /bin/sh: "> file 2>&1" is portable, bash-only "&>" would background the job under dash
+        self.model_ft_path = f"{data.resu_fine}/model.tar.gz"
+        logs.add_infos('model_to make predictions', self.model_ft_path)
+        logs.add_infos('logs_fine-tuning_file', f"{data.resu}/logs_fine-tuning_{data.stamp_time}.txt")
+
+
+def get_stamp():
+    """Return the current local date-time as a string in which whitespace and ':' are replaced by '_'."""
+    now = datetime.now()
+    return re.sub(r'[\s:]', '_', str(now))
+
+def get_config_infos(config, stamp, stamp_time, logs, overwrite):  # load the JSON config and build the (Data, Process, Output) objects
+    with open(config, 'r', encoding='utf-8') as f:
+        infos = json.load(f)
+        data = Data(infos['data_raw'], stamp, stamp_time, overwrite)  # 'data_raw' section + run identifiers
+        steps = Process(infos['steps'])  # 'steps' section
+        prod = Output(infos['output'])  # 'output' section
+        logs.add_infos('config', infos)  # keep the whole config in the run log
+    return data, steps, prod
+
+
+class Logs:  # in-memory run log: a dict of key/value entries dumped to a JSON file by print()
+    def __init__(self):
+        self.dict = {}  # every logged entry, by key
+
+    def add_infos(self, key, value):  # store (or overwrite) one entry
+        self.dict[key] = value
+
+    def add_json(self, key, jsonf):  # store the parsed content of the JSON file `jsonf` under `key`
+        with open(jsonf, 'r', encoding='utf-8') as f:
+            infos = json.load(f)
+            self.dict[key] = infos
+
+    def print(self, stamp_time):  # write the log as JSON and remember its path in self.file_path
+        self.file_path = f"{data.run}/logs_global_{stamp_time}.json"  # NOTE(review): reads the module-level `data`, not an argument
+        with open(self.file_path, 'w', encoding='utf-8') as fl:
+            json.dump(self.dict, fl, indent=4)
+
+
+if __name__ == '__main__':
+    stamp = stamp_time = get_stamp()  # stamp_time always identifies this run; stamp can be replaced by --name
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--config', help='Config file in JSON.')
+    parser.add_argument('--name',default=stamp , help='Run name.')
+    parser.add_argument('-o', '--overwrite', action='store_true', help='Overwite output.')
+    args = parser.parse_args()
+    config = args.config
+    stamp = args.name
+    overwrite = args.overwrite
+    
+    logs = Logs()  # module-level: Process methods and Logs.print rely on the globals `logs` and `data`
+    data, steps, prod = get_config_infos(config, stamp, stamp_time, logs, overwrite)
+    data.create_folders()
+    
+    logs.add_infos("stamp", stamp)
+    logs.add_infos("stamp_time", stamp_time)
+    logs.add_infos("overwrite", overwrite)
+
+    if steps.main == "annotation" or steps.main == "test":  # use an existing model on the raw data
+        data.pre_processing(steps)
+        data.make_ner_format()
+        steps.get_model()
+        data.make_predictions(steps) # output allennlp JSON
+        if steps.eval == True:
+            data.evaluation(steps, prod)
+        else:
+            data.make_output(prod)
+    
+    elif steps.main == "train":  # train a new model, then predict on the test set
+        steps.get_data_for_train(data) #[steps.set_train, steps.set_dev, steps.set_test] 
+        data.pre_processing(steps, file_in=steps.test_raw)
+        steps.make_sets_ner_format(data)
+        steps.update_training_config()
+        steps.training(data)
+        data.make_predictions(steps, js_name=steps.set_test, fi_ner=steps.test_ner)
+        if steps.eval == True:
+            data.evaluation(steps, prod, name=steps.test_data)  # NOTE(review): no visible Process method sets `test_data` (they set set_test/test_raw) - confirm
+    
+    elif steps.main == "fine_tune":  # fine-tune an existing model, then predict on the test set
+        steps.get_data_for_fine_tune(data)
+        data.pre_processing(steps, file_in=steps.test_raw)
+        steps.make_sets_ner_format(data)
+        steps.get_model() # model to be fine-tune
+        steps.update_training_config()
+        steps.fine_tuning(data)
+        data.make_predictions(steps, js_name=steps.set_test, fi_ner=steps.test_ner, model=steps.model_ft_path)
+        if steps.eval == True:
+            data.evaluation(steps, prod, name=steps.test_data, model=steps.model_ft_path)  # NOTE(review): same `test_data` question as above
+
+    logs.print(stamp_time)  # dump the run log; sets logs.file_path
+    print(f"----> All logs saved in {logs.file_path}")
\ No newline at end of file
diff --git a/DISCUT/.gitkeep b/DISCUT/code/utils/__init__.py
similarity index 100%
rename from DISCUT/.gitkeep
rename to DISCUT/code/utils/__init__.py
diff --git a/DISCUT/code/utils/conll2bracket.py b/DISCUT/code/utils/conll2bracket.py
new file mode 100644
index 0000000..ec597fd
--- /dev/null
+++ b/DISCUT/code/utils/conll2bracket.py
@@ -0,0 +1,75 @@
+"""24brièvement________
+
+1ok_______BeginSeg=Yes
+2bonjour________
+3tout________
+4le________
+5monde________
+6je________
+7suis________
+8Ilyes________
+9Rebai________
+10euh________
+11je________
+12suis________
+13un________
+14ingénieur________
+15de________
+16recherche________
+17chez________
+18Linagora________
+
+1bonjour_______BeginSeg=Yes
+"""
+
+
+import sys
+import codecs
+
+#input =  open(sys.argv[1],encoding="utf8").readlines()
+
+def conll2brackets(in_f, out_f):
+    """Rewrite a CoNLL file as bracketed text: one line per sentence, one "[ ... ]" group per segment."""
+    start = True  # True until a token of the current sentence has been written
+    with open(out_f, 'w', encoding='utf-8') as file_out:
+        with open(in_f, 'r', encoding='utf-8') as conll:
+            for line in conll:
+                if line.strip() == "":
+                    if not start:  # close the sentence once; repeated blank lines add nothing
+                        file_out.write("]\n\n")
+                    start = True
+                else:
+                    _, word, *_, tag = line.split()  # columns: id, word, ..., segmentation tag
+                    if tag == "BeginSeg=Yes":
+                        if not start:
+                            file_out.write("] ")
+                        file_out.write(f"[ {word} ")
+                    else:
+                        file_out.write(f"{word} ")
+                    start = False
+            if not start:  # the input did not end with a blank line
+                file_out.write("]\n\n")
+            
+def conll2brackets_with_meta(in_f, out_f):
+    """Same bracketed rewriting as conll2brackets, but lines starting with "#" are copied to the output."""
+    start = True  # True until a token of the current sentence has been written
+    with open(out_f, 'w', encoding='utf-8') as file_out:
+        with open(in_f, 'r', encoding='utf-8') as conll:
+            for line in conll:
+                if line.startswith("#"):
+                    file_out.write(f"{line}\n")  # `line` keeps its own newline, so a blank line follows the comment
+                elif line.strip() == "":
+                    if not start:  # close the sentence once; repeated blank lines add nothing
+                        file_out.write("]\n\n")
+                    start = True
+                else:
+                    _, word, *_, tag = line.split()  # columns: id, word, ..., segmentation tag
+                    if tag == "BeginSeg=Yes":
+                        if not start:
+                            file_out.write("] ")
+                        file_out.write(f"[ {word} ")
+                    else:
+                        file_out.write(f"{word} ")
+                    start = False
+            if not start:  # the input did not end with a blank line
+                file_out.write("]\n\n")
\ No newline at end of file
diff --git a/DISCUT/code/utils/conv2ner.py b/DISCUT/code/utils/conv2ner.py
new file mode 100644
index 0000000..4e6edbe
--- /dev/null
+++ b/DISCUT/code/utils/conv2ner.py
@@ -0,0 +1,128 @@
+"""
+Convert to the NER CoNLL format so that the allennlp dataset reader can be used
+
+basically, just skip lines between docs, strip to 4 fields with words as 1st and tag as last, and format as BIO
+
+TODO: try BIOUL (L=last, U=unit entity = 1 token)
+"""
+import sys
+import argparse 
+
+maptags = {"_":"O",
+           "BeginSeg=Yes": "B-S",
+           "Seg=B-Conn":"B-Conn",
+           "Seg=I-Conn":"I-Conn",
+           "SpaceAfter=No":"O",
+           "Typo=Yes":"O",
+           }
+# NOTE(review): this dict mirrors the module-level settings below; conversion2ner reads those names, not this dict.
+parameters = {
+        "LEMMATIZE": False,
+        "MARK_END": False,
+        "SPLIT_TOO_LONG": False,
+        "THRESHOLD": int(180),
+        "input_format": "tok"
+    }
+
+LEMMATIZE=False  # take column 3 instead of the token form (and "NAME" for PROPN in conll input)
+MARK_END=False  # relabel the token preceding a segment start as "B-E"
+SPLIT_TOO_LONG=False  # insert a sequence break when the token number exceeds THRESHOLD
+THRESHOLD=int(180)
+input_format="tok"  # compared with "ner" and "conll" in conversion2ner
+
+""" def get_param():
+
+    #parser = argparse.ArgumentParser()
+    #parser.add_argument("filepath", help="path to file to convert")
+    #parser.add_argument("--lemmatize", default=False, action='store_true', help="to use with conll input: replace token with its lemma (useful for turk)")
+    #parser.add_argument("--mark-end", default=False, action='store_true', help="add explicit label for end of segment")
+    #parser.add_argument("--split-too-long", default=[False,180], help="split sentences longer than threshold",nargs=2)
+    #parser.add_argument("--input-format",default="tok",help="input format: tok, split.tok, conll ner")
+    #args = parser.parse_args()
+    #MARK_END = args.mark_end
+    # take lemmas instead of token forms (useful for turkish)
+    # also tag all proper nouns with same token
+    #LEMMATIZE = args.lemmatize
+    # split for too long sentences (default 180) for bert
+    #SPLIT_TOO_LONG= args.split_too_long[0]
+    ##THRESHOLD = int(args.split_too_long[1])
+
+    ## for now all params set to default
+    parameters = {
+        "LEMMATIZE": False,
+        "MARK_END": False,
+        "SPLIT_TOO_LONG": False,
+        "THRESHOLD": int(180),
+        "input_format": "tok"
+    }
+
+#filepath = sys.argv[1]
+#filepath = args.filepath
+
+input_format = args.input_format """
+
+
+if SPLIT_TOO_LONG:
+    print("warning: too-long sentence splitting mode = ON ",file=sys.stderr)
+
+token_number=0
+
+def conversion2ner(input, output, params=None):
+    """Convert the file `input` into 4-column NER rows (word, pos, "O", tag) in `output`; `params` is not used yet."""
+    with open(output, 'w', encoding='utf-8') as out_f:
+        with open(input, 'r', encoding='utf-8') as f:
+            start_doc = True
+            res = []
+            token_number = 0  # local counter: it is assigned below, so it must be initialised in this scope
+            for line in f:
+                if input_format=="ner":
+                    token_number +=1
+                    if SPLIT_TOO_LONG and token_number>THRESHOLD:
+                        # sentence too long: insert a newline to make a separate sequence
+                        res.append([])
+                        token_number = 0
+                    res.append(line.split())
+                elif "\t" not in line:
+                    # blank or comment line: emit one sequence break, and only after token rows
+                    if not(start_doc):
+                        res.append([])
+                    start_doc = True
+                else:
+                    fields = line.strip().split()
+                    # the id column may be a range such as "3-4": keep its first number
+                    token_number = int(fields[0].split("-")[0])
+                    if SPLIT_TOO_LONG and token_number>THRESHOLD:
+                        # sentence too long: insert a newline to make a separate sequence
+                        res.append([])
+                    w = fields[1] if not(LEMMATIZE) else fields[2]
+                    # only the first pipe-separated value of the last column is looked up
+                    label = fields[-1].split("|")[0]
+                    if input_format=="conll":
+                        if LEMMATIZE and fields[3]=="PROPN":
+                            w = "NAME"
+                        pos = "NN"
+                    else:
+                        pos = "NN"
+                    # labels missing from maptags are written as "O"
+                    tag = maptags.get(label,"O")
+                    if not(start_doc) and MARK_END and tag=="B-S" and res[-1][-1]!="B-S":
+                        # then, previous token label is set to B-E to signal end of previous segment
+                        res[-1][-1] = "B-E"
+                    start_doc = False
+                    # the pos column is a constant placeholder and the third column is always "O"
+                    res.append([w,pos,"O",tag])
+
+            # one row per line; an empty row is written as an empty line (sequence break)
+            # rows are only written once the whole input has been read
+            for line in res:
+                out_f.write("\t".join(line))
+                out_f.write("\n")
+
+
+
+def main(f_in, f_out, f):
+    """Convert f_in to the NER format in f_out; f is the input format ("tok", "conll" or "ner")."""
+    global input_format
+    # conversion2ner reads the module-level setting, so the requested format has to be stored there
+    input_format = f
+    conversion2ner(f_in, f_out) # add param
\ No newline at end of file
diff --git a/DISCUT/code/utils/fr_tokenize.py b/DISCUT/code/utils/fr_tokenize.py
new file mode 100644
index 0000000..0efedec
--- /dev/null
+++ b/DISCUT/code/utils/fr_tokenize.py
@@ -0,0 +1,91 @@
+"""take a French document and 
+ 
+ 1) use spacy to tokenize it
+ 2) format it as disrpt input 
+
+TODO: conll option to output spacy analysis 
+"""
+
+import sys
+import codecs
+import spacy
+
+fr = spacy.load('fr_core_news_sm')
+#extra_fields = 8
+# WIP ...
+CONLL = False
+
+#input_file = sys.argv[1]
+#input = codecs.open(input_file,encoding="utf8").read()
+
+
+
+# watch the max length because of Bert restrictions to 512 subword units
+#current = max_length = 0
+#cutoff = 200
+
+
+# build conll stuff. bits from spacy_conll, adapted
+# WIP
+tagmap = fr.Defaults.tag_map
+
+def get_morphology(self, tag):  # NOTE(review): WIP - module-level function that takes `self`; word_tuple calls it with one argument
+    if not self.tagmap or tag not in self.tagmap:
+        return '_'  # no feature known for this tag
+    else:
+        feats = [f'{prop}={val}' for prop, val in self.tagmap[tag].items() if not Spacy2ConllParser._is_number(prop)]  # NOTE(review): Spacy2ConllParser is neither defined nor imported in this file
+        if feats:
+            return '|'.join(feats)  # prop=val pairs joined with '|'
+        else:
+            return '_'
+
+def head_idx(idx,word):  # head index of `word`: 0 for the root, else offset of the head from the sentence start, plus 1
+    if word.dep_.lower().strip() == 'root':
+        head_idx = 0
+    else:
+        head_idx = word.head.i + 1 - sent[0].i  # NOTE(review): `sent` is not defined in this file - WIP code
+    return head_idx 
+
+def word_tuple(idx,word):  # 10-field tuple for one spaCy token (id, text, lemma, pos, tag, morphology, head, dep, '_', '_')
+    return (idx,
+        word.text,
+        word.lemma_,
+        word.pos_,
+        word.tag_,
+        get_morphology(word.tag_),  # NOTE(review): get_morphology is declared with two parameters (self, tag)
+        head_idx(idx,word),
+        word.dep_,
+        '_',
+        '_'
+    )
+######################
+
+
+def main(f_in, f_out):
+    """Tokenize the French text file f_in with spaCy and write one token per line to f_out.
+
+    Each token row has 10 tab-separated columns: the index of the token in the document,
+    the token text, then 8 "_" placeholders. Whitespace-only tokens become empty lines.
+    """
+    # Read the whole input first so that the source handle is closed before parsing.
+    with codecs.open(f_in, encoding="utf8") as source:
+        text = source.read()
+    doc = fr(text)
+
+    current = max_length = 0
+    extra_fields = 8
+    # Explicit UTF-8: the French output must not depend on the platform default encoding.
+    with open(f_out, 'w', encoding="utf8") as file:
+        for i, token in enumerate(doc):
+            if token.text.strip() != "":
+                line = [str(i), token.text] + ["_"] * extra_fields
+                file.write("\t".join(line))
+                file.write("\n")
+                current = current + 1
+            else:
+                file.write("\n")
+                max_length = max(max_length, current)
+                current = 0
+    # Count the final sequence too when it is not followed by a whitespace token.
+    max_length = max(max_length, current)
+    print("max length sequence = %s"%max_length,file=sys.stderr)
diff --git a/DISCUT/code/utils/json2conll.py b/DISCUT/code/utils/json2conll.py
new file mode 100644
index 0000000..37c5a0f
--- /dev/null
+++ b/DISCUT/code/utils/json2conll.py
@@ -0,0 +1,98 @@
+"""
+reexports allennlp predictions from json to 
+conll format
+"""
+
+import json
+import sys
+import re
+
+#filepath = sys.argv[1]
+#config = sys.argv[2]
+# conll ou tok
+
+map = {"O":"_",
+       "B-S":"BeginSeg=Yes",
+       "U-S":"BeginSeg=Yes",
+       "U-Conn":"Seg=B-Conn",
+       "L-Conn":"Seg=I-Conn",
+       "I-Conn":"Seg=I-Conn",
+       "B-Conn":"Seg=B-Conn",
+       "B-E":"_",
+       "U-E":"_",
+       }
+
+
+def js2conll(filepath, fileoutpath, config):
+    """Write allennlp JSON-lines predictions (filepath) as 10-column CoNLL rows (fileoutpath)."""
+    with open(filepath, 'r', encoding='utf-8') as f_in:  # closed as soon as every prediction is loaded
+        data = [json.loads(line) for line in f_in]
+    with open(fileoutpath, 'w', encoding='utf-8') as f_out:
+        for doc in data:
+            tokens = zip(doc["words"],doc["tags"])
+            # columns: 1-based token id, word, 7 "_" placeholders, mapped tag (unknown tags are kept as they are)
+            out = "\n".join(("%s\t%s\t%s%s"%(i+1,word,"_\t"*7,map.get(tag,tag)) for (i,(word,tag)) in enumerate(tokens)))
+            if config=="tok":
+                print("# blabla")
+            f_out.write(f'{out}\n')
+            f_out.write("\n")
+
+def js2conllNmeta(data_pred_json, data_out, config, data_meta):
+    """Merge allennlp predictions into a CoNLL file, keeping its comment (metadata) lines.
+
+    data_pred_json: JSON-lines predictions, one object with a 'tags' list per sentence.
+    data_out: path of the CoNLL file to write.
+    config: not used here.
+    data_meta: CoNLL file the predictions were made on; its comment lines are copied and
+        each token row gets its last column replaced by the mapped predicted tag.
+    """
+    # Load every prediction up front; the handle is closed when the block ends.
+    with open(data_pred_json, 'r', encoding='utf-8') as f_pred:
+        data = [json.loads(line) for line in f_pred]
+    sent_pred_count = 0  # index of the prediction that matches the current sentence
+    tok = 0  # index of the next token inside that sentence
+
+    with open(data_out, 'w', encoding='utf-8') as fo, open(data_meta, 'r', encoding='utf-8') as fm:
+        for line in fm:
+            line = line.strip()
+            # comment/metadata lines are copied unchanged
+            if line.startswith("#"):
+                fo.write(f"{line}\n")
+            elif line == "":
+                # only a blank line that closes a sentence moves on to the next prediction
+                if tok > 0:
+                    sent_pred_count += 1
+                tok = 0
+                fo.write("\n")
+            else:
+                tag = data[sent_pred_count]['tags'][tok]
+                tok += 1
+                # drop the last column and append the predicted label instead
+                new_line = re.sub(r'\t[^\t]+$', '', line)
+                fo.write(f"{new_line}\t{map[tag]}\n")
+                
+def js2conllNmetaNgold(data_pred_json, data_out, config, gold_n_meta):
+    """Append predicted tags as an extra column to a gold CoNLL file, keeping its comment lines.
+
+    data_pred_json holds one JSON prediction (with a 'tags' list) per sentence of gold_n_meta.
+    Each token row is kept whole and the mapped predicted tag is added after it; `config` is not used.
+    """
+    with open(data_pred_json, 'r', encoding='utf-8') as f_pred:
+        data = [json.loads(line) for line in f_pred]
+    sent_pred_count = 0  # index of the prediction that matches the current sentence
+    tok = 0  # index of the next token inside that sentence
+    with open(data_out, 'w', encoding='utf-8') as fo, open(gold_n_meta, 'r', encoding='utf-8') as fm:
+        for line in fm:
+            line = line.strip()
+            if line.startswith("#"):
+                fo.write(f"{line}\n")
+            elif line == "":
+                # only a blank line that closes a sentence moves on to the next prediction
+                if tok > 0:
+                    sent_pred_count += 1
+                tok = 0
+                fo.write("\n")
+            else:
+                tag = data[sent_pred_count]['tags'][tok]
+                tok += 1
+                fo.write(f"{line}\t{map[tag]}\n")
\ No newline at end of file
diff --git a/DISCUT/code/utils/seg_eval.py b/DISCUT/code/utils/seg_eval.py
new file mode 100644
index 0000000..d61d2ef
--- /dev/null
+++ b/DISCUT/code/utils/seg_eval.py
@@ -0,0 +1,224 @@
+import io, os, sys, argparse
+
+"""
+Script to evaluate segmentation f-score and perfect discourse unit segmentation proportion from two files. Two input formats are permitted:
+
+  * One token per line, with ten columns, no sentence breaks (default *.tok format) - segmentation indicated in column 10
+  * The same, but with blank lines between sentences (*.conll format)
+
+Token columns follow the CoNLL-U format, with token IDs in the first column and pipe separated key=value pairs in the last column. 
+
+Document boundaries are indicated by a comment: # newdoc id = ...
+
+The evaluation uses micro-averaged F-Scores per corpus (not document macro average).
+
+Example:
+
+```
+# newdoc id = GUM_bio_byron
+1	Education	_	_	_	_	_	_	_	BeginSeg=Yes
+2	and	_	_	_	_	_	_	_	_
+3	early	_	_	_	_	_	_	_	_
+4	loves	_	_	_	_	_	_	_	_
+5	Byron	_	_	_	_	_	_	_	BeginSeg=Yes
+6	received	_	_	_	_	_	_	_	_
+...
+```
+
+Or:
+
+```
+# newdoc id = GUM_bio_byron
+# sent_id = GUM_bio_byron-1
+# text = Education and early loves
+1	Education	education	NOUN	NN	Number=Sing	0	root	_	BeginSeg=Yes
+2	and	and	CCONJ	CC	_	4	cc	_	_
+3	early	early	ADJ	JJ	Degree=Pos	4	amod	_	_
+4	loves	love	NOUN	NNS	Number=Plur	1	conj	_	_
+
+# sent_id = GUM_bio_byron-2
+# text = Byron received his early formal education at Aberdeen Grammar School, and in August 1799 entered the school of Dr. William Glennie, in Dulwich. [17]
+1	Byron	Byron	PROPN	NNP	Number=Sing	2	nsubj	_	BeginSeg=Yes
+2	received	receive	VERB	VBD	Mood=Ind|Tense=Past|VerbForm=Fin	0	root	_	_
+```
+
+For PDTB-style corpora, we calculate exact span-wise f-scores for BIO encoding, without partial credit. In other words, 
+predicting an incorrect span with partial overlap is the same as missing a gold span and predicting an incorrect span
+somewhere else in the corpus. Note also that spans must begin with B-Conn - predicted spans beginning with I-Conn are ignored.
+
+The file format for PDTB style corpora is similar, but with different labels:
+
+```
+1	Fidelity	Fidelity	PROPN	NNP	_	6	nsubj	_	_
+2	,	,	PUNCT	,	_	6	punct	_	_
+3	for	for	ADP	IN	_	4	case	_	Seg=B-Conn
+4	example	example	NOUN	NN	_	6	obl	_	Seg=I-Conn
+5	,	,	PUNCT	,	_	6	punct	_	_
+6	prepared	prepare	VERB	VBN	_	0	root	_	_
+7	ads	ad	NOUN	NNS	_	6	obj	_	_
+...
+```
+
+
+Arguments:
+ * goldfile: shared task gold test data
+ * predfile: same format, with predicted segments positions in column 10 - note **number of tokens must match**  
+ * string_input: if specified, files are replaced by strings with file contents instead of file names
+
+
+"""
+
+#__author__ = "Amir Zeldes"
+__license__ = "Apache 2.0"
+#__version__ = "1.0.1"
+
+def parse_data(infile, string_input=False):
+	"""Read a .tok/.conll file (or its content if string_input) and return (tokens, labels, connective spans)."""
+	if not string_input:
+		with io.open(infile, encoding="utf8") as f:  # close the handle instead of leaking it
+			data = f.read().strip().replace("\r", "")
+	else:
+		data = infile.strip()
+	tokens = []
+	labels = []
+	spans = []
+	counter = 0
+	span_start = -1
+	span_end = -1
+	for line in data.split("\n"):
+		if "\t" in line:  # Token
+			fields = line.split("\t")
+			if "-" in fields[0]:
+				continue
+			label = fields[-1]
+			# Ensure correct labeling even if other pipe-delimited annotations found in column 10
+			if "BeginSeg=Yes" in label:
+				label = "BeginSeg=Yes"
+			elif "Seg=B-Conn" in label:
+				label ="Seg=B-Conn"
+				span_start = counter
+			elif "Seg=I-Conn" in label:
+				label = "Seg=I-Conn"
+				span_end = counter
+			else:
+				label = "_"
+				if span_start > -1:  # Add span
+					if span_end == -1:
+						span_end = span_start
+					spans.append((span_start,span_end))
+					span_start = -1
+					span_end = -1
+			tokens.append(fields[1])
+			labels.append(label)
+			counter +=1
+
+	if span_start > -1:  # Add last span; a lone final B-Conn is a one-token span, as in the loop above
+		spans.append((span_start, span_end if span_end > -1 else span_start))
+
+	return tokens, labels, spans
+
+
+def get_scores(gold_file, pred_file, string_input=False):
+	"""Score a predicted segmentation against the gold one (precision, recall, F-score).
+
+	:param gold_file: Gold shared task file
+	:param pred_file: File with predictions
+	:param string_input: If True, files are replaced by strings with file contents (for import inside other scripts)
+	:return: dictionary of scores for printing
+	:raises SystemExit: with status 1 when gold and pred do not contain the same number of tokens
+	"""
+
+	report = ""
+	gold_tokens, gold_labels, gold_spans = parse_data(gold_file, string_input)
+	pred_tokens, pred_labels, pred_spans = parse_data(pred_file, string_input)
+
+	if os.path.isfile(gold_file):
+		doc_name = os.path.basename(gold_file)
+	else:
+		# Use first few tokens to identify file
+		doc_name = " ".join(gold_tokens[0:10]) + "..."
+
+	# Check same number of tokens in both files
+	if len(gold_tokens) != len(pred_tokens):
+		report += "\nFATAL: different number of tokens detected in gold and pred:\n"
+		report += "  o In " + doc_name + ": " + str(len(gold_tokens)) + " gold tokens but " + str(len(pred_tokens)) + " predicted tokens\n\n"
+		sys.stderr.write(report)
+		sys.exit(1)  # a fatal mismatch must not be reported as a successful exit
+
+	# Check tokens are identical
+	for i, tok in enumerate(gold_tokens):
+		if tok != pred_tokens[i]:
+			report += "\nWARN: token strings do not match in gold and pred:\n"
+			report += " o First instance in " + doc_name + " token " + str(i) + "\n"
+			report += "Gold: " + tok + " but Pred: " + pred_tokens[i] + "\n\n"
+			sys.stderr.write(report)
+			break
+
+	# Check if this is EDU or Conn-style data
+	if "BeginSeg=Yes" in gold_labels:
+		mode = "edu"
+		seg_type = "EDUs"
+	#elif "B-S" in gold_labels:
+	#	mode = "edu"
+	#	seg_type = "EDUs"
+	else:
+		mode = "conn"
+		seg_type = "conn spans"
+
+	true_positive = 0
+	false_positive = 0
+	false_negative = 0
+
+	if mode == "edu":
+		for i, gold_label in enumerate(gold_labels):
+			pred_label = pred_labels[i]
+			if gold_label == pred_label:
+				if gold_label == "_":
+					continue
+				else:
+					true_positive += 1
+			else:
+				if pred_label == "_":
+					false_negative += 1
+				else:
+					if gold_label == "_":
+						false_positive += 1
+					else:  # I-Conn/B-Conn mismatch
+						false_positive +=1
+	else:
+		for span in gold_spans:
+			if span in pred_spans:
+				true_positive +=1
+			else:
+				false_negative +=1
+		for span in pred_spans:
+			if span not in gold_spans:
+				false_positive += 1
+
+	try:
+		precision = true_positive / (float(true_positive) + false_positive)
+	except ZeroDivisionError:  # nothing was predicted
+		precision = 0
+
+	try:
+		recall = true_positive / (float(true_positive) + false_negative)
+	except ZeroDivisionError:  # nothing to find in gold
+		recall = 0
+
+	try:
+		f_score = 2 * (precision * recall) / (precision + recall)
+	except ZeroDivisionError:  # precision and recall are both 0
+		f_score = 0
+
+	score_dict = {}
+	score_dict["doc_name"] = doc_name
+	score_dict["tok_count"] = len(gold_tokens)
+	score_dict["seg_type"] = seg_type
+	score_dict["gold_seg_count"] = true_positive+false_negative
+	score_dict["pred_seg_count"] = true_positive+false_positive
+	score_dict["prec"] = precision
+	score_dict["rec"] = recall
+	score_dict["f_score"] = f_score
+
+	return score_dict
+
diff --git a/DISCUT/code/utils/sent_split.py b/DISCUT/code/utils/sent_split.py
new file mode 100644
index 0000000..d7dbb85
--- /dev/null
+++ b/DISCUT/code/utils/sent_split.py
@@ -0,0 +1,220 @@
+import spacy
+import stanza
+import numpy as np
+from tqdm import tqdm
+from stanza.utils.conll import CoNLL
+
+
+def spliter_stanza(f_in, fi_out, lang, treebank=None):
+    """Split each document of a .tok file into sentences with stanza.
+
+    The tokens and labels of f_in are kept as they are; only the sentence boundaries (found by stanza's
+    tokenizer and aligned on non-whitespace character offsets) are added to the CoNLL-U output fi_out.
+    """
+    # get language model
+    if treebank is not None:
+        stanza.download(lang, package=treebank)
+    else:
+        stanza.download(lang)
+    processors = 'tokenize'
+    nlp = stanza.Pipeline(lang, processors=processors, use_gpu=True)
+    # for each doc, get the list of tokens and labels
+    tok_tok_lbls = list(tok_tokens_labels(f_in))
+    # for each doc, get the character offset of tokens
+    with open(f_in, encoding='utf-8') as f_tok:
+        tok_str = f_tok.read()
+    tok_tok_begs = [(doc_id, doc_chars, tok_begs) for doc_id, doc_chars, tok_begs, _ in begin_toks_sents(tok_str)]
+    with open(fi_out, mode='w', encoding='utf-8') as f_out:
+        # parse each doc in turn
+        for (doc_id, doc_toks, doc_lbls), (_, doc_chars, tok_begs) in tqdm(zip(tok_tok_lbls, tok_tok_begs), total=min(len(tok_tok_lbls), len(tok_tok_begs))):
+            doc_text = rebuild_text(doc_toks, lang=lang)
+            ann = nlp(doc_text)
+            conll_str = CoNLL.conll_as_string(CoNLL.convert_dict(ann.to_dict()))
+            conll_tok_begs = list(begin_toks_sents(conll_str, True))
+            # we parse one doc at a time
+            assert len(conll_tok_begs) == 1
+            _, p_doc_chars, p_tok_begs, p_sent_begs = conll_tok_begs[0]
+            if p_doc_chars != doc_chars:
+                # show the context of the first differing character, if any (the texts may differ in length only)
+                for i, (pdc, dc) in enumerate(zip(p_doc_chars, doc_chars)):
+                    if pdc != dc:
+                        print(f_in, i, p_doc_chars[i - 10:i + 10], doc_chars[i - 10:i + 10])
+                        break
+                raise AssertionError(f"{f_in}: parsed text does not match the original tokens of doc {doc_id}")
+            # for each beginning of sentence (in the parser output), find the corresponding token index in the original .tok
+            sent_beg_idc = np.searchsorted(tok_begs, p_sent_begs, side='left')
+            sent_beg_idc = set(sent_beg_idc)
+            # output CONLL-U file
+            f_out.write('# newdoc id = ' + doc_id + '\n')
+            tok_sent_idx = 1
+            for tok_doc_idx, (tok, lbl) in enumerate(zip(doc_toks, doc_lbls), start=0):
+                if tok_doc_idx in sent_beg_idc:
+                    if tok_doc_idx > 0:
+                        # add an empty line after the previous sentence (not for the first token in doc)
+                        f_out.write('\n')
+                    tok_sent_idx = 1
+                else:
+                    tok_sent_idx += 1
+                row = (str(tok_sent_idx), tok, '_', '_', '_', '_', '_', '_', '_', lbl)
+                f_out.write('\t'.join(row)+'\n')
+            f_out.write('\n')
+
+
+
+
+def spliter_spacy(f_in, f_out, lg):  # NOTE(review): stub - only loads a spaCy model; f_in and f_out are never used
+    lm = f"{lg}_core_web_sm"  # NOTE(review): presumably wrong for French (fr_tokenize.py loads 'fr_core_news_sm') - confirm
+    nlp = spacy.load(lm)
+    #nlp = spacy.load("en_core_web_sm")
+
+"""    doc = nlp(text)
+    for sent in doc.sents:
+        print(sent)
+ """
+
+def tok_tokens_labels(tok_filename):
+    """Retrieve the id, tokens and (target) labels of each doc in a .tok file.
+
+    Parameters
+    ----------
+    tok_filename : str
+        Filename of the .tok file
+
+    Yields
+    ------
+    doc_id : str -- id read from the "# newdoc" line (None if tokens come before any such line).
+    doc_toks : List[str] -- tokens of the document (2nd column).
+    doc_lbls : List[str] -- labels of the document (10th column), same length as doc_toks.
+    The last document is always yielded, even when it has no token.
+    """
+    with open(tok_filename, encoding='utf-8') as f:
+        doc_id = None
+        doc_toks = []
+        doc_lbls = []
+        for line in f:
+            if line.startswith('# newdoc'):
+                if doc_toks:
+                    yield (doc_id, doc_toks, doc_lbls)
+                doc_id = line.split('id = ')[1].strip()
+                doc_toks = []
+                doc_lbls = []
+            elif line.strip() == '':
+                continue
+            else:
+                fields = line.strip().split('\t')
+                tok = fields[1]
+                lbl = fields[9]
+                doc_toks.append(tok)
+                doc_lbls.append(lbl)
+        else:
+            # yield last doc
+            yield (doc_id, doc_toks, doc_lbls)
+
+
+def begin_toks_sents(conll_str, stanza=False):
+    """Get beginning positions of tokens and sentences as offsets on the non-whitespace characters of a document text.
+
+    Parameters
+    ----------
+    conll_str : str
+        CONLL-U string for the file.
+    stanza : bool
+        If True, token rows whose 10th column does not start with 'start' are skipped.
+
+    Yields
+    ------
+    doc_id : str
+        Document id.
+    doc_chars : str
+        Document text excluding whitespaces.
+    tok_begs : List[int]
+        Beginning position (index in doc_chars) of each token in the doc.
+    sent_begs : List[int]
+        Beginning position (index in doc_chars) of each sentence in the doc.
+    """
+    doc_id = None
+    doc_chars = ''
+    tok_begs = []
+    sent_begs = []
+    in_sent = False
+    cur_idx = 0  # current (non-whitespace) character index
+    for line in conll_str.split('\n'):
+        if line.startswith('# newdoc id = '):
+            if sent_begs:
+                # yield previous doc
+                yield (doc_id, doc_chars, tok_begs, sent_begs)
+            # reset for a new doc
+            doc_id = line.split('# newdoc id = ')[1]
+            doc_chars = ''
+            tok_begs = []
+            sent_begs = []
+            in_sent = False
+            cur_idx = 0
+        elif line.startswith('#'):
+            continue
+        elif line == '':
+            # an empty line marks doc or sentence split
+            in_sent = False
+        else:
+            fields = line.split('\t')
+            assert len(fields) == 10
+            if stanza and not fields[9].startswith('start'):
+                continue
+            # token line
+            tok_begs.append(cur_idx)
+            if not in_sent:
+                # first token in sentence
+                sent_begs.append(cur_idx)
+                in_sent = True
+            
+            # delete whitespaces internal to the token
+            tok_chars = fields[1].replace(' ', '').replace('\xa0', '')
+            cur_idx += len(tok_chars)
+            doc_chars += tok_chars
+    else:
+        # yield last document
+        if sent_begs:
+            yield (doc_id, doc_chars, tok_begs, sent_begs)
+
+def rebuild_text(doc_toks, lang=None):
+    """Reconstruct a plain text from the tokens of a document.
+
+    Nothing but the token strings is used: in particular, a "SpaceAfter=No"
+    annotation available in some CONLL-U files is not taken into account.
+
+    Parameters
+    ----------
+    doc_toks : List[str]
+        Tokens of the document, in order.
+    lang : str
+        Language code. With "zh" the tokens are glued together without any
+        whitespace; with any other value (or None) they are joined with single
+        spaces and the spaces around common punctuation marks are then removed.
+    """
+    if lang == "zh":
+        return ''.join(doc_toks)
+    # crude but reasonable heuristic: ordered (old, new) pairs, applied one after the other
+    fixes = (
+        (' : " ', ': "'),
+        (' ,', ','), (' .', '.'), (' !', '!'), (' ?', '?'), (' :', ':'),
+        ('“ ', '“'), (' ”', '”'),
+        (' ;', ';'), (' ’', '’'),
+        ('( ', '('), (' )', ')'), ('[ ', '['), (' ]', ']'),
+    )
+    joined = ' '.join(doc_toks)
+    for old, new in fixes:
+        joined = joined.replace(old, new)
+    return joined
+
+
+def main(f_in, f_out, tool, lg):
+    """Run the sentence splitter named by `tool` ("spacy" or "stanza") on f_in; any other value only prints a message."""
+    if tool == "spacy" :
+        spliter_spacy(f_in,f_out, lg)
+    elif tool == "stanza":
+        spliter_stanza(f_in, f_out, lg, treebank=None)
+    else:
+        print(" pls defined sentence spliter tool : spacy, blabla")
+
+
+    #return output
\ No newline at end of file
diff --git a/DISCUT/code/utils/syntactic_parsing.py b/DISCUT/code/utils/syntactic_parsing.py
new file mode 100644
index 0000000..8515d7d
--- /dev/null
+++ b/DISCUT/code/utils/syntactic_parsing.py
@@ -0,0 +1,65 @@
+import stanza
+from stanza.utils.conll import CoNLL
+
+
+
+
+
+
+
+def with_stanza(lang, f_in, f_out, process, meta):
+    """Run a stanza pipeline on every text line of `f_in` and write CoNLL rows.
+
+    Lines starting with "#" are copied to `f_out` only when meta['meta'] == True,
+    empty lines are copied as empty lines, and any other line is analysed by
+    stanza: one row per token (the fields returned by
+    `CoNLL.convert_token_dict`, tab-separated) and an empty line after each
+    sentence.
+
+    Parameters
+    ----------
+    lang
+        Language identifier given to `stanza.download` and `stanza.Pipeline`.
+    f_in
+        Path of the input file, read as UTF-8.
+    f_out
+        Path of the output file, written as UTF-8 (opened in 'w' mode).
+    process
+        Value forwarded as `processors` to `stanza.Pipeline`.
+    meta
+        Mapping of options. 'meta' == True keeps the "#" lines of the input ;
+        'line', when present, emits "#<meta['line']>-<line number>" before each
+        analysed line ; 'sent', when present, emits
+        "#<meta['sent']>-<sentence number>" and '#text="<sentence>"' before each
+        sentence.
+
+    `stanza.download(lang)` is called at every invocation and the pipeline is
+    created with use_gpu=True.
+    """
+
+    stanza.download(lang)
+    nlp = stanza.Pipeline(lang, processors=process, use_gpu=True)
+    with open(f_in, 'r', encoding='utf-8') as fi, open(f_out, 'w', encoding='utf-8') as fo:
+        for line_no, raw in enumerate(fi, start=1):
+            text = raw.strip()
+            if text.startswith("#"):
+                # comment line of the input: copied only on explicit request
+                if "meta" in meta and meta['meta'] == True:
+                    fo.write(f"{text}\n")
+                continue
+            if text == "":
+                fo.write("\n")
+                continue
+            if "line" in meta:
+                fo.write(f"#{meta['line']}-{line_no}\n")
+            # sentence numbering restarts at 1 for every input line
+            for sent_no, sent in enumerate(nlp(text).sentences, start=1):
+                if "sent" in meta:
+                    fo.write(f"#{meta['sent']}-{sent_no}\n#text=\"{sent.text}\"\n")
+                for token in sent.tokens:
+                    # only the first dict returned by token.to_dict() is converted
+                    row = CoNLL.convert_token_dict(token.to_dict()[0])
+                    fo.write("\t".join(row) + "\n")
+                fo.write("\n")
+
+            
\ No newline at end of file
diff --git a/DISCUT/code/utils/training_allennlp.py b/DISCUT/code/utils/training_allennlp.py
new file mode 100644
index 0000000..0aa40d4
--- /dev/null
+++ b/DISCUT/code/utils/training_allennlp.py
@@ -0,0 +1,57 @@
+####### Python version of expes.sh
+
+import os
+
+
+def main(steps):
+    """Train a model by running `allennlp train` in a shell (Python version of expes.sh).
+
+    Reads steps.data.name, steps.data.file, steps.data.resu, steps.pretr_lm,
+    steps.dev_data, steps.tr_config and, only when the language model is
+    "bert_custom", steps.ner_init. Prints the dev and train sets, then prints
+    and executes with os.system the command
+    "allennlp train -s <steps.data.resu> <steps.tr_config>".
+    """
+    dataset = steps.data.name
+    config = steps.data.file # .tok .conllu
+    lmodel = steps.pretr_lm #options: bert xlm elmo elmo_aligned
+    action = "train" # unused!
+    evalset = steps.dev_data
+    print(f"dev set : {evalset} \t trainset : {dataset}")
+    has_parent = False # ?? TODO: get this value some other way.
+    tr_config = steps.tr_config 
+
+    # case 1: no "parent", no "toolong"
+    # case 2: toolong == true, so the data has to be split
+    # case 3: parent == true, no toolong
+    if lmodel == "xlm":
+        bert_vocab = "xlm-roberta-base"
+        bert_weights = "xlm-roberta-base"
+    else : 
+        bert_vocab = "bert-base-multilingual-cased"
+        bert_weights = "bert-base-multilingual-cased"
+    # NOTE(review): bert_vocab / bert_weights are assigned but not used below in this function.
+    if lmodel == "bert_custom" and steps.ner_init == True :
+        # TODO raise error
+        print("You choose bert_custom so 'NER_format_initialisation' shall be set to false.")
+
+    #### train, has_per == False
+    # allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder 
+    # allennlp train -s Results_conllu/results_eng.rst.rstdt_bert ../code/utils/configs/bert.jsonnet ....
+    # Discut - morteza's repo (this is the command that is actually run below)
+    #allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/bert.jsonnet
+    cmd2 = f"allennlp train -s {steps.data.resu} {tr_config}"
+    # Discut-gitlab (NOTE(review): `cmd` is built but never executed ; only cmd2 is run)
+    cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder" 
+    print(cmd2)
+    os.system(cmd2)
+    # then... (NOTE(review): the exit status returned by os.system is not checked)
+
+    # TODO:
+    #### train, has_par == true: in fact we fine-tune...
+    #allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder 
+    # allennlp fine-tune -m MODEL_ARCHIVE -c CONFIG_FILE -s SERIALIZATION_DIR -o overrides
+
+    # TODO
+    ### then predict on the validation set, or "parent test", or "finetune test"... ??
+    #allennlp predict --use-dataset-reader --output-file Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json Results_${CONFIG}/results_${OUTPUT}/model.tar.gz ${TEST_A_PATH} --silent --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder 
diff --git a/DISCUT/data/chaperonrouge/chaperonrouge.txt b/DISCUT/data/chaperonrouge/chaperonrouge.txt
new file mode 100644
index 0000000..017db77
--- /dev/null
+++ b/DISCUT/data/chaperonrouge/chaperonrouge.txt
@@ -0,0 +1 @@
+Il était une fois une petite fille que tout le monde aimait bien, surtout sa grand-mère. Elle ne savait qu'entreprendre pour lui faire plaisir. Un jour, elle lui offrit un petit bonnet de velours rouge, qui lui allait si bien qu'elle ne voulut plus en porter d'autre. Du coup, on l'appela Chaperon Rouge. Un jour, sa mère lui dit: “Viens voir, Chaperon Rouge : voici un morceau de gâteau et une bouteille de vin. Porte-les à ta grand-mère; elle est malade et faible; elle s'en délectera; fais vite, avant qu'il ne fasse trop chaud. Et quand tu seras en chemin, sois bien sage et ne t'écarte pas de ta route, sinon tu casserais la bouteille et ta grand-mère n'aurait plus rien. Et quand tu arriveras chez elle, n'oublie pas de dire “Bonjour” et ne va pas fureter dans tous les coins.” “Je ferai tout comme il faut,” dit le Petit Chaperon Rouge à sa mère. La fillette lui dit au revoir. La grand-mère habitait loin, au milieu de la forêt, à une demi-heure du village. Lorsque le Petit Chaperon Rouge arriva dans le bois, il rencontra le Loup. Mais il ne savait pas que c'était une vilaine bête et ne le craignait point. “Bonjour, Chaperon Rouge,” dit le Loup. “Bonjour, Loup,” dit le Chaperon Rouge. “Où donc vas-tu si tôt, Chaperon Rouge?” - “Chez ma grand-mère.” - “Que portes-tu dans ton panier?” - “Du gâteau et du vin. Hier nous avons fait de la pâtisserie, et ça fera du bien à ma grand-mère. Ça la fortifiera.” - “Où habite donc ta grand-mère, Chaperon Rouge?” - “Oh! à un bon quart d'heure d'ici, dans la forêt. Sa maison se trouve sous les trois gros chênes. En dessous, il y a une haie de noisetiers, tu sais bien?” dit le petit Chaperon Rouge. Le Loup se dit: “Voilà un mets bien jeune et bien tendre, un vrai régal! Il sera encore bien meilleur que la vieille. Il faut que je m'y prenne adroitement pour les attraper toutes les eux!” Il l'accompagna un bout de chemin et dit: “Chaperon Rouge, vois ces belles fleurs autour de nous. Pourquoi ne les regardes-tu pas? 
J'ai l'impression que tu n'écoutes même pas comme les oiseaux chantent joliment. Tu marches comme si tu allais à l'école, alors que tout est si beau, ici, dans la forêt!” Le Petit Chaperon Rouge ouvrit les yeux et lorsqu'elle vit comment les rayons du soleil dansaient de-ci, de-là à travers les arbres, et combien tout était plein de fleurs, elle pensa: “Si j'apportais à ma grand- mère un beau bouquet de fleurs, ça lui ferait bien plaisir. Il est encore si tôt que j'arriverai bien à l'heure.” Elle quitta le chemin, pénétra dans le bois et cueillit des fleurs. Et, chaque fois qu'elle en avait cueilli une, elle se disait: “Plus loin, j'en vois une plus belle,” et elle y allait et s'enfonçait toujours plus profondément dans la forêt. Le Loup lui, courait tout droit vers la maison de la grand-mère. Il frappa à la porte. “Qui est là?” - “C'est le Petit Chaperon Rouge qui t'apporte du gâteau et du vin.” - “Tire la chevillette,” dit la grand-mère. “Je suis trop faible et ne peux me lever.” Le Loup tire la chevillette, la porte s'ouvre et sans dire un mot, il s'approche du lit de la grand-mère et l'avale. Il enfile ses habits, met sa coiffe, se couche dans son lit et tire les rideaux. Pendant ce temps, le petit Chaperon Rouge avait fait la chasse aux fleurs. Lorsque la fillette en eut tant qu'elle pouvait à peine les porter, elle se souvint soudain de sa grand-mère et reprit la route pour se rendre auprès d'elle. Elle fut très étonnée de voir la porte ouverte. Et lorsqu'elle entra dans la chambre, cela lui sembla si curieux qu'elle se dit: “Mon dieu, comme je suis craintive aujourd'hui. Et, cependant, d'habitude, je suis si contente d'être auprès de ma grand-mère!” Elle s'écria: “Bonjour!” Mais nulle réponse. Elle s'approcha du lit et tira les rideaux. La grand-mère y était couchée, sa coiffe tirée très bas sur son visage. Elle avait l'air bizarre. “Oh, grand-mère, comme tu as de grandes oreilles.” - “C'est pour mieux t'entendre!” - “Oh! 
grand-mère, comme tu as de grands yeux!” - “C'est pour mieux te voir!” - “Oh! grand-mère, comme tu as de grandes mains!” - “C'est pour mieux t'étreindre!” - “Mais, grand-mère, comme tu as une horrible et grande bouche!” - “C'est pour mieux te manger!” À peine le Loup eut-il prononcé ces mots, qu'il bondit hors du lit et avala le pauvre Petit Chaperon Rouge. Lorsque le Loup eut apaisé sa faim, il se recoucha, s'endormit et commença à ronfler bruyamment. Un chasseur passait justement devant la maison. Il se dit: “Comme cette vieille femme ronfle! Il faut que je voie si elle a besoin de quelque chose.” Il entre dans la chambre et quand il arrive devant le lit, il voit que c'est un Loup qui y est couché. “Ah! c'est toi, bandit!” dit-il. “Voilà bien longtemps que je te cherche.” Il se prépare à faire feu lorsque tout à coup l'idée lui vient que le Loup pourrait bien avoir avalé la grand-mère et qu'il serait peut-être encore possible de la sauver. Il ne tire pas, mais prend des ciseaux et commence à ouvrir le ventre du Loup endormi. À peine avait-il donné quelques coups de ciseaux qu'il aperçoit le Chaperon Rouge. Quelques coups encore et la voilà qui sort du Loup et dit: “Ah! comme j'ai eu peur! Comme il faisait sombre dans le ventre du Loup!” Et voilà que la grand-mère sort à son tour, pouvant à peine respirer. Le Petit Chaperon Rouge se hâte de chercher de grosses pierres. Ils en remplissent le ventre du Loup. Lorsque celui-ci se réveilla, il voulut s'enfuir. Mais les pierres étaient si lourdes qu'il s'écrasa par terre et mourut. Ils étaient bien contents tous les trois: le chasseur dépouilla le Loup et l'emporta chez lui. La grand- mère mangea le gâteau et but le vin que le Petit Chaperon Rouge avait apportés. Elle s'en trouva toute ragaillardie. 
Le Petit Chaperon Rouge cependant pensait: “Je ne quitterai plus jamais mon chemin pour aller me promener dans la forêt, quand ma maman me l'aura interdit.” On raconte encore qu’une autre fois, quand le Petit Chaperon Rouge apportait de nouveau de la galette à sa vieille grand-mère, un autre loup essaya de la distraire et de la faire sortir du chemin. Mais elle s’en garda bien et continua à marcher tout droit. Arrivée chez sa grand-mère, elle lui raconta bien vite que le loup était venu à sa rencontre et qu’il lui avait souhaité le bonjour, mais qu’il l’avait regardée avec des yeux si méchants: “Si je n’avais pas été sur la grand-route, il m’aurait dévorée!” ajouta-t’elle. “Viens,” lui dit sa grand-mère, “nous allons fermer la porte et bien la cadenasser pour qu’il ne puisse pas entrer ici.” Peu après, le loup frappait à la porte et criait: “Ouvre-moi, grand-mère! c’est moi, le Petit Chaperon Rouge, qui t’apporte des gâteaux!” Mais les deux gardèrent le silence et n’ouvrirent point la porte. Tête-Grise fit alors plusieurs fois le tour de la maison à pas feutrés, et, pour finir, il sauta sur le toit, décidé à attendre jusqu’au soir, quand le Petit Chaperon Rouge sortirait, pour profiter de l’obscurité et l’engloutir. Mais la grand-mère se douta bien de ses intentions. “Prends le seau, mon enfant,” dit-elle au Petit Chaperon Rouge, “j’ai fait cuire des saucisses hier, et tu vas porter l’eau de cuisson dans la grande auge de pierre qui est devant l’entrée de la maison.” Le Petit Chaperon Rouge en porta tant et tant de seaux que, pour finir, l’auge était pleine. Alors la bonne odeur de la saucisse vint caresser les narines du loup jusque sur le toit. Il se pencha si bien en tendant le cou, qu’à la fin il glissa et ne put plus se retenir. Il glissa du toit et tomba droit dans l’auge de pierre où il se noya. Allègrement, le Petit Chaperon Rouge regagna sa maison, et personne ne lui fit le moindre mal. FIN
diff --git a/DISCUT/requirements.txt b/DISCUT/requirements.txt
new file mode 100644
index 0000000..3a79a09
--- /dev/null
+++ b/DISCUT/requirements.txt
@@ -0,0 +1,98 @@
+alabaster==0.7.12
+allennlp==0.9.0
+atomicwrites==1.4.0
+attrs==21.2.0
+Babel==2.9.1
+blis==0.2.4
+boto3==1.17.109
+botocore==1.20.109
+cached-property==1.5.2
+certifi==2021.5.30
+cffi==1.14.6
+chardet==4.0.0
+click==8.0.1
+colorama==0.4.4
+conllu==1.3.1
+cycler==0.10.0
+cymem==2.0.5
+docutils==0.17.1
+editdistance==0.5.3
+flaky==3.7.0
+Flask==2.0.1
+Flask-Cors==3.0.10
+ftfy==6.0.3
+gevent==21.1.2
+greenlet==1.1.0
+h5py==3.3.0
+idna==2.10
+imagesize==1.2.0
+importlib-metadata==4.6.1
+iniconfig==1.1.1
+itsdangerous==2.0.1
+Jinja2==3.0.1
+jmespath==0.10.0
+joblib==1.0.1
+jsonnetbin==0.16.0
+jsonpickle==2.0.0
+kiwisolver==1.3.1
+MarkupSafe==2.0.1
+matplotlib==3.4.2
+murmurhash==1.0.5
+nltk==3.6.2
+numpy==1.21.0
+numpydoc==1.1.0
+overrides==3.1.0
+packaging==21.0
+pandas==1.3.1
+parsimonious==0.8.1
+Pillow==8.3.1
+plac==0.9.6
+pluggy==0.13.1
+preshed==2.0.1
+protobuf==3.17.3
+py==1.10.0
+pycparser==2.20
+Pygments==2.9.0
+pyparsing==2.4.7
+pytest==6.2.4
+python-dateutil==2.8.1
+pytorch-pretrained-bert==0.6.2
+pytorch-transformers==1.1.0
+pytz==2021.1
+regex==2021.7.6
+requests==2.25.1
+responses==0.13.3
+s3transfer==0.4.2
+scikit-learn==0.24.2
+scipy==1.7.0
+sentencepiece==0.1.96
+six==1.16.0
+snowballstemmer==2.1.0
+spacy==2.1.9
+Sphinx==4.0.3
+sphinxcontrib-applehelp==1.0.2
+sphinxcontrib-devhelp==1.0.2
+sphinxcontrib-htmlhelp==2.0.0
+sphinxcontrib-jsmath==1.0.1
+sphinxcontrib-qthelp==1.0.3
+sphinxcontrib-serializinghtml==1.1.5
+sqlparse==0.4.1
+srsly==1.0.5
+stanfordnlp==0.2.0
+stanza==1.2.2
+tensorboardX==2.4
+thinc==7.0.8
+threadpoolctl==2.2.0
+toml==0.10.2
+tqdm==4.61.2
+typing-extensions==3.10.0.0
+typing-utils==0.1.0
+Unidecode==1.2.0
+urllib3==1.26.6
+wasabi==0.8.2
+wcwidth==0.2.5
+Werkzeug==2.0.1
+word2number==1.1
+zipp==3.5.0
+zope.event==4.5.0
+zope.interface==5.4.0
\ No newline at end of file
-- 
GitLab