Commit 42f75de9 authored by larivier

Merge branch 'dev-expes' into 'main'

Dev expes

See merge request !2
parents d6975588 2ecf6186
Showing with 1372 additions and 84 deletions
@@ -14,42 +14,50 @@ Code: https://gitlab.inria.fr/andiamo/tony
# Usage
## Usecases
- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions, and output the same text with EDU segmentation. --> config_1
- **Segmentation Evaluation:** Take an EDU gold-segmented text as input, use a loaded model to make predictions, and output the scores of the model predictions against the gold, along with the discrepancies. --> config_2
- **Custom Model Creation:** Fine-tune (over one or two levels) a pretrained language model with a specific dataset or combination of datasets, then make predictions and evaluate. --> config_3
## Content description
[TBD : explain directories automatically created during script runs]
- `data/my.cool.dataset/` Contains input data, in raw and/or pre-processed format(s).
- `results.{stamp}/` Contains output data, scores and post-processed data (also AllenNLP logs).
- `code/` Contains main scripts.
- `discut22_1.py` One python script to run them all.
- `config_XX.json` A file to be completed for your specific project (or a dir with a choice between simple use_case configs and a template for a custom config). See `global_config_file_guideline.md`.
- `utils/` Contains useful scripts to be called.
- `model/` Contains model to be loaded or created.
- `documentation.md` Contains detailed documentation (TBD?)
- `config_training.jsonnet` A file to be completed. (TBD : automatically saved with the model when done)
- `global_config_file_guideline.md` Contains detailed documentation on how to build a well-formed config file.
## Set up environment
- Conda environment for Python 3.7 (TBD ?)
- Install all required libraries with the following command:
```
pip install -r requirements.txt
```
## Configuration file: to choose or to complete
- `code/config_1.json` Config for usecase_1 : take a sentence-split text, apply ToNy, output the same text with EDU brackets.
- [TBD : train model configs and all sorts of cool options]
- `code/config_global_X.json` See global_config_file_guideline.md.
## Run usecase 1
(go to the `code` directory)
Run this command:
```
python discut22.py --config config_XX.json
```
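For reference, here is a minimal sketch (assuming the JSON layout used in the config templates and documented in `global_config_file_guideline.md`) of how the script consumes the file passed to `--config`:
```
# Minimal sketch, assuming the documented config layout;
# discut22.py does much more than this.
import json

with open("config_1.json") as f:
    infos = json.load(f)

print(infos["input"]["name"])   # dataset folder under data/, e.g. "chaperontest"
print(infos["steps"]["main"])   # "annotation", "test" or "train"
```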
## Support
laura.riviere@irit.fr
## Authors and acknowledgment
Morteza Ezzabady
Laura Rivière
Amir Zeldes
<!---
......
File added
File added
File added
File added
"""
A ``TokenEmbedder`` which uses one of the BERT models
(https://github.com/google-research/bert)
to produce embeddings.
At its core it uses Hugging Face's PyTorch implementation
(https://github.com/huggingface/pytorch-pretrained-BERT),
so thanks to them!
"""
from typing import Dict, List
import logging
import torch
import torch.nn.functional as F
from pytorch_pretrained_bert.modeling import BertModel
from allennlp.modules.scalar_mix import ScalarMix
from allennlp.modules.token_embedders.token_embedder import TokenEmbedder
from allennlp.nn import util
logger = logging.getLogger(__name__)
class PretrainedBertModel:
"""
In some instances you may want to load the same BERT model twice
(e.g. to use as a token embedder and also as a pooling layer).
This factory provides a cache so that you don't actually have to load the model twice.
"""
_cache: Dict[str, BertModel] = {}
@classmethod
def load(cls, model_name: str, cache_model: bool = True) -> BertModel:
if model_name in cls._cache:
return PretrainedBertModel._cache[model_name]
model = BertModel.from_pretrained(model_name)
if cache_model:
cls._cache[model_name] = model
return model
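# Usage sketch (illustrative, not part of the original file): repeated loads
# of the same model name return the cached instance, so the weights are only
# read from disk once.
#
#   model_a = PretrainedBertModel.load("bert-base-multilingual-cased")
#   model_b = PretrainedBertModel.load("bert-base-multilingual-cased")
#   assert model_a is model_b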
class CustomBertEmbedder(TokenEmbedder):
"""
A ``TokenEmbedder`` that produces BERT embeddings for your tokens.
Should be paired with a ``BertIndexer``, which produces wordpiece ids.
Most likely you want to use ``PretrainedBertEmbedder``
for one of the named pretrained models, not this base class.
Parameters
----------
bert_model: ``BertModel``
The BERT model being wrapped.
top_layer_only: ``bool``, optional (default = ``False``)
If ``True``, then only return the top layer instead of applying the scalar mix.
max_pieces : int, optional (default: 512)
The BERT embedder uses positional embeddings and so has a corresponding
maximum length for its input ids. Assuming the inputs are windowed
and padded appropriately by this length, the embedder will split them into a
large batch, feed them into BERT, and recombine the output as if it was a
longer sequence.
num_start_tokens : int, optional (default: 1)
The number of starting special tokens input to BERT (usually 1, i.e., [CLS])
num_end_tokens : int, optional (default: 1)
The number of ending tokens input to BERT (usually 1, i.e., [SEP])
scalar_mix_parameters: ``List[float]``, optional, (default = None)
If not ``None``, use these scalar mix parameters to weight the representations
produced by different layers. These mixing weights are not updated during
training.
"""
def __init__(self,
bert_model: BertModel,
aligning_files = None,
top_layer_only: bool = False,
max_pieces: int = 512,
num_start_tokens: int = 1,
num_end_tokens: int = 1,
scalar_mix_parameters: List[float] = None) -> None:
super().__init__()
self.bert_model = bert_model
#self.aligning_fr = "saved_mappings/fra.sdrt.annodis_eng.rst.gum.pth"
self.aligning_fr = "../MUSE/results/stac-annodis_all/best_mapping.pth"
#self.aligning_fr = 'saved_mappings/eng.rst.gum_fra.sdrt.annodis.pth'
print("ALIGN", self.aligning_fr)
self.aligning_fr = torch.from_numpy(torch.load(self.aligning_fr))
self.aligning_fr_t = torch.transpose(self.aligning_fr, 0, 1) #.to(torch.device('cuda:0'))
print(self.aligning_fr.shape)
self.output_dim = bert_model.config.hidden_size
self.max_pieces = max_pieces
self.num_start_tokens = num_start_tokens
self.num_end_tokens = num_end_tokens
if not top_layer_only:
self._scalar_mix = ScalarMix(bert_model.config.num_hidden_layers,
do_layer_norm=False,
initial_scalar_parameters=scalar_mix_parameters,
trainable=scalar_mix_parameters is None)
else:
self._scalar_mix = None
def get_output_dim(self) -> int:
return self.output_dim
def forward(self,
input_ids: torch.LongTensor,
offsets: torch.LongTensor = None,
token_type_ids: torch.LongTensor = None) -> torch.Tensor:
"""
Parameters
----------
input_ids : ``torch.LongTensor``
The (batch_size, ..., max_sequence_length) tensor of wordpiece ids.
offsets : ``torch.LongTensor``, optional
The BERT embeddings are one per wordpiece. However it's possible/likely
you might want one per original token. In that case, ``offsets``
represents the indices of the desired wordpiece for each original token.
Depending on how your token indexer is configured, this could be the
position of the last wordpiece for each token, or it could be the position
of the first wordpiece for each token.
For example, if you had the sentence "Definitely not", and if the corresponding
wordpieces were ["Def", "##in", "##ite", "##ly", "not"], then the input_ids
would be 5 wordpiece ids, and the "last wordpiece" offsets would be [3, 4].
If offsets are provided, the returned tensor will contain only the wordpiece
embeddings at those positions, and (in particular) will contain one embedding
per token. If offsets are not provided, the entire tensor of wordpiece embeddings
will be returned.
token_type_ids : ``torch.LongTensor``, optional
If an input consists of two sentences (as in the BERT paper),
tokens from the first sentence should have type 0 and tokens from
the second sentence should have type 1. If you don't provide this
(the default BertIndexer doesn't) then it's assumed to be all 0s.
"""
# pylint: disable=arguments-differ
batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1)
initial_dims = list(input_ids.shape[:-1])
# The embedder may receive an input tensor that has a sequence length longer than can
# be fit. In that case, we should expect the wordpiece indexer to create padded windows
# of length `self.max_pieces` for us, and have them concatenated into one long sequence.
# E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..."
# We can then split the sequence into sub-sequences of that length, and concatenate them
# along the batch dimension so we effectively have one huge batch of partial sentences.
# This can then be fed into BERT without any sentence length issues. Keep in mind
# that the memory consumption can dramatically increase for large batches with extremely
# long sentences.
needs_split = full_seq_len > self.max_pieces
last_window_size = 0
if needs_split:
# Split the flattened list by the window size, `max_pieces`
split_input_ids = list(input_ids.split(self.max_pieces, dim=-1))
# We want all sequences to be the same length, so pad the last sequence
last_window_size = split_input_ids[-1].size(-1)
padding_amount = self.max_pieces - last_window_size
split_input_ids[-1] = F.pad(split_input_ids[-1], pad=[0, padding_amount], value=0)
# Now combine the sequences along the batch dimension
input_ids = torch.cat(split_input_ids, dim=0)
if token_type_ids is not None:
# Same for token_type_ids
split_token_type_ids = list(token_type_ids.split(self.max_pieces, dim=-1))
last_window_size = split_token_type_ids[-1].size(-1)
padding_amount = self.max_pieces - last_window_size
split_token_type_ids[-1] = F.pad(split_token_type_ids[-1], pad=[0, padding_amount], value=0)
token_type_ids = torch.cat(split_token_type_ids, dim=0)
if token_type_ids is None:
token_type_ids = torch.zeros_like(input_ids)
input_mask = (input_ids != 0).long()
# input_ids may have extra dimensions, so we reshape down to 2-d
# before calling the BERT model and then reshape back at the end.
all_encoder_layers, _ = self.bert_model(input_ids=util.combine_initial_dims(input_ids),
token_type_ids=util.combine_initial_dims(token_type_ids),
attention_mask=util.combine_initial_dims(input_mask))
all_encoder_layers = torch.stack(all_encoder_layers)
# ======ROTATION===== #
#all_encoder_layers = torch.matmul(all_encoder_layers, self.aligning_fr_t)
if needs_split:
# First, unpack the output embeddings into one long sequence again
unpacked_embeddings = torch.split(all_encoder_layers, batch_size, dim=1)
unpacked_embeddings = torch.cat(unpacked_embeddings, dim=2)
# Next, select indices of the sequence such that it will result in embeddings representing the original
# sentence. To capture maximal context, the indices will be the middle part of each embedded window
# sub-sequence (plus any leftover start and final edge windows), e.g.,
# 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
# "[CLS] I went to the very fine [SEP] [CLS] the very fine store to eat [SEP]"
# with max_pieces = 8 should produce max context indices [2, 3, 4, 10, 11, 12] with additional start
# and final windows with indices [0, 1] and [14, 15] respectively.
# Find the stride as half the max pieces, ignoring the special start and end tokens
# Calculate an offset to extract the centermost embeddings of each window
stride = (self.max_pieces - self.num_start_tokens - self.num_end_tokens) // 2
stride_offset = stride // 2 + self.num_start_tokens
first_window = list(range(stride_offset))
max_context_windows = [i for i in range(full_seq_len)
if stride_offset - 1 < i % self.max_pieces < stride_offset + stride]
# Lookback what's left, unless it's the whole self.max_pieces window
if full_seq_len % self.max_pieces == 0:
lookback = self.max_pieces
else:
lookback = full_seq_len % self.max_pieces
final_window_start = full_seq_len - lookback + stride_offset + stride
final_window = list(range(final_window_start, full_seq_len))
select_indices = first_window + max_context_windows + final_window
initial_dims.append(len(select_indices))
recombined_embeddings = unpacked_embeddings[:, :, select_indices]
else:
recombined_embeddings = all_encoder_layers
# Recombine the outputs of all layers
# (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim)
# recombined = torch.cat(combined, dim=2)
input_mask = (recombined_embeddings != 0).long()
if self._scalar_mix is not None:
mix = self._scalar_mix(recombined_embeddings, input_mask)
else:
mix = recombined_embeddings[-1]
# At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim)
if offsets is None:
# Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim)
dims = initial_dims if needs_split else input_ids.size()
return util.uncombine_initial_dims(mix, dims)
else:
# offsets is (batch_size, d1, ..., dn, orig_sequence_length)
offsets2d = util.combine_initial_dims(offsets)
# now offsets is (batch_size * d1 * ... * dn, orig_sequence_length)
range_vector = util.get_range_vector(offsets2d.size(0),
device=util.get_device_of(mix)).unsqueeze(1)
# selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length)
selected_embeddings = mix[range_vector, offsets2d]
return util.uncombine_initial_dims(selected_embeddings, offsets.size())
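# Worked example (illustrative) of the max-context selection in forward(),
# with max_pieces = 8, num_start_tokens = num_end_tokens = 1 and
# full_seq_len = 16 (two windows), following the arithmetic above:
#   stride        = (8 - 1 - 1) // 2  -> 3
#   stride_offset = 3 // 2 + 1        -> 2
#   first_window        -> [0, 1]
#   max_context_windows -> [2, 3, 4, 10, 11, 12]  (i % 8 in {2, 3, 4})
#   final_window        -> range(13, 16) -> [13, 14, 15]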
@TokenEmbedder.register("custom-bert-pretrained")
class CustomPretrainedBertEmbedder(CustomBertEmbedder):
# pylint: disable=line-too-long
"""
Parameters
----------
pretrained_model: ``str``
Either the name of the pretrained model to use (e.g. 'bert-base-uncased'),
or the path to the .tar.gz file with the model weights.
If the name is a key in the list of pretrained models at
https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L41
the corresponding path will be used; otherwise it will be interpreted as a path or URL.
requires_grad : ``bool``, optional (default = False)
If True, compute gradient of BERT parameters for fine tuning.
top_layer_only: ``bool``, optional (default = ``False``)
If ``True``, then only return the top layer instead of applying the scalar mix.
scalar_mix_parameters: ``List[float]``, optional, (default = None)
If not ``None``, use these scalar mix parameters to weight the representations
produced by different layers. These mixing weights are not updated during
training.
"""
def __init__(self, pretrained_model: str, aligning_files, requires_grad: bool = False, top_layer_only: bool = False,
scalar_mix_parameters: List[float] = None) -> None:
model = PretrainedBertModel.load(pretrained_model)
print("ALIGN", aligning_files['fr'])
for param in model.parameters():
param.requires_grad = requires_grad
print("CHECKPOINT")
super().__init__(bert_model=model, top_layer_only=top_layer_only, scalar_mix_parameters=scalar_mix_parameters)
from typing import Dict, List, Sequence, Iterable
import itertools
import logging
import os
from overrides import overrides
from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def _is_divider(line: str) -> bool:
empty_line = line.strip() == ''
if empty_line:
return True
else:
first_token = line.split()[0]
if first_token == "-DOCSTART-": # pylint: disable=simplifiable-if-statement
return True
else:
return False
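# e.g. (illustrative): _is_divider("") -> True,
# _is_divider("-DOCSTART- -X- -X- O") -> True,
# _is_divider("Pierre NNP B-NP B-PER") -> False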
@DatasetReader.register("custom_conll_reader")
class CustomConllDatasetReader(DatasetReader):
"""
Reads instances from a pretokenised file where each line is in the following format:
WORD POS-TAG CHUNK-TAG NER-TAG
with a blank line indicating the end of each sentence
and '-DOCSTART- -X- -X- O' indicating the end of each article,
and converts it into a ``Dataset`` suitable for sequence tagging.
Each ``Instance`` contains the words in the ``"tokens"`` ``TextField``.
The values corresponding to the ``tag_label``
values will get loaded into the ``"tags"`` ``SequenceLabelField``.
And if you specify any ``feature_labels`` (you probably shouldn't),
the corresponding values will get loaded into their own ``SequenceLabelField`` s.
This dataset reader ignores the "article" divisions and simply treats
each sentence as an independent ``Instance``. (Technically the reader splits sentences
on any combination of blank lines and "DOCSTART" tags; in particular, it does the right
thing on well formed inputs.)
Parameters
----------
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
We use this to define the input representation for the text. See :class:`TokenIndexer`.
tag_label: ``str``, optional (default=``ner``)
Specify `ner`, `pos`, or `chunk` to have that tag loaded into the instance field `tag`.
feature_labels: ``Sequence[str]``, optional (default=``()``)
These labels will be loaded as features into the corresponding instance fields:
``pos`` -> ``pos_tags``, ``chunk`` -> ``chunk_tags``, ``ner`` -> ``ner_tags``
Each will have its own namespace: ``pos_tags``, ``chunk_tags``, ``ner_tags``.
If you want to use one of the tags as a `feature` in your model, it should be
specified here.
coding_scheme: ``str``, optional (default=``IOB1``)
Specifies the coding scheme for ``ner_labels`` and ``chunk_labels``.
Valid options are ``IOB1`` and ``BIOUL``. The ``IOB1`` default maintains
the original IOB1 scheme in the CoNLL 2003 NER data.
In the IOB1 scheme, I is a token inside a span, O is a token outside
a span and B is the beginning of span immediately following another
span of the same type.
label_namespace: ``str``, optional (default=``labels``)
Specifies the namespace for the chosen ``tag_label``.
"""
_VALID_LABELS = {'ner', 'pos', 'chunk'}
def __init__(self,
token_indexers: Dict[str, TokenIndexer] = None,
tag_label: str = "ner",
feature_labels: Sequence[str] = (),
lazy: bool = False,
coding_scheme: str = "IOB1",
label_namespace: str = "labels") -> None:
super().__init__(lazy)
self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
if tag_label is not None and tag_label not in self._VALID_LABELS:
raise ConfigurationError("unknown tag label type: {}".format(tag_label))
for label in feature_labels:
if label not in self._VALID_LABELS:
raise ConfigurationError("unknown feature label type: {}".format(label))
if coding_scheme not in ("IOB1", "BIOUL"):
raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))
self.tag_label = tag_label
self.feature_labels = set(feature_labels)
self.coding_scheme = coding_scheme
self.label_namespace = label_namespace
self._original_coding_scheme = "IOB1"
@overrides
def _read(self, file_path: str) -> Iterable[Instance]:
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
# Group into alternative divider / sentence chunks.
for is_divider, lines in itertools.groupby(data_file, _is_divider):
# Ignore the divider chunks, so that `lines` corresponds to the words
# of a single sentence.
if not is_divider:
fields = [line.strip().split() for line in lines]
# unzipping trick returns tuples, but our Fields need lists
fields = [list(field) for field in zip(*fields)]
tokens_, pos_tags, chunk_tags, ner_tags = fields
# TextField requires ``Token`` objects
tokens = [Token(token) for token in tokens_]
yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags, file_path)
def get_lang(self, file_path):
_, file_name = os.path.split(file_path)
lang = file_name[:2]
if lang == 'po':
lang = 'pt'
if lang not in ['en','de','it','fr','pt','sv']:
raise ConfigurationError(f"Language {lang} not supported by ELMo")
return lang
def text_to_instance(self, # type: ignore
tokens: List[Token],
pos_tags: List[str] = None,
chunk_tags: List[str] = None,
ner_tags: List[str] = None,
file_path: str = None) -> Instance:
"""
We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
"""
# pylint: disable=arguments-differ
sequence = TextField(tokens, self._token_indexers)
instance_fields: Dict[str, Field] = {'tokens': sequence}
instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "lang": self.get_lang(file_path)})
# Recode the labels if necessary.
if self.coding_scheme == "BIOUL":
coded_chunks = to_bioul(chunk_tags,
encoding=self._original_coding_scheme) if chunk_tags is not None else None
coded_ner = to_bioul(ner_tags,
encoding=self._original_coding_scheme) if ner_tags is not None else None
else:
# the default IOB1
coded_chunks = chunk_tags
coded_ner = ner_tags
# Add "feature labels" to instance
if 'pos' in self.feature_labels:
if pos_tags is None:
raise ConfigurationError("Dataset reader was specified to use pos_tags as "
"features. Pass them to text_to_instance.")
instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
if 'chunk' in self.feature_labels:
if coded_chunks is None:
raise ConfigurationError("Dataset reader was specified to use chunk tags as "
"features. Pass them to text_to_instance.")
instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
if 'ner' in self.feature_labels:
if coded_ner is None:
raise ConfigurationError("Dataset reader was specified to use NER tags as "
" features. Pass them to text_to_instance.")
instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")
# Add "tag label" to instance
if self.tag_label == 'ner' and coded_ner is not None:
instance_fields['tags'] = SequenceLabelField(coded_ner, sequence,
self.label_namespace)
elif self.tag_label == 'pos' and pos_tags is not None:
instance_fields['tags'] = SequenceLabelField(pos_tags, sequence,
self.label_namespace)
elif self.tag_label == 'chunk' and coded_chunks is not None:
instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence,
self.label_namespace)
return Instance(instance_fields)
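# Example input (illustrative tokens and tags) in the four-column
# CoNLL format this reader expects:
#
#   -DOCSTART- -X- -X- O
#
#   Pierre NNP B-NP B-PER
#   Vinken NNP I-NP I-PER
#   joined VBD B-VP O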
from typing import Dict, List, Sequence, Iterable
import itertools
import logging
import os
from overrides import overrides
from allennlp.common.checks import ConfigurationError
from allennlp.common.file_utils import cached_path
from allennlp.data.dataset_readers.dataset_reader import DatasetReader
from allennlp.data.dataset_readers.dataset_utils import to_bioul
from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField
from allennlp.data.instance import Instance
from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer
from allennlp.data.tokenizers import Token
logger = logging.getLogger(__name__) # pylint: disable=invalid-name
def _is_divider(line: str) -> bool:
empty_line = line.strip() == ''
if empty_line:
return True
else:
first_token = line.split()[0]
if first_token == "#":
return True
else:
return False
@DatasetReader.register("custom_disrpt_reader")
class CustomDisrptDatasetReader(DatasetReader):
"""
Reads instances from a pretokenised DISRPT-style file where each line has ten
tab-separated columns (TOKID TOK _ POS _ _ _ _ _ TAG),
with a blank line indicating the end of each sentence
and lines starting with '#' treated as dividers,
and converts it into a ``Dataset`` suitable for sequence tagging.
Each ``Instance`` contains the words in the ``"tokens"`` ``TextField``.
The values corresponding to the ``tag_label``
values will get loaded into the ``"tags"`` ``SequenceLabelField``.
And if you specify any ``feature_labels`` (you probably shouldn't),
the corresponding values will get loaded into their own ``SequenceLabelField`` s.
This dataset reader ignores the "article" divisions and simply treats
each sentence as an independent ``Instance``. (Technically the reader splits sentences
on any combination of blank lines and lines starting with '#'; in particular, it does the right
thing on well formed inputs.)
Parameters
----------
token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``)
We use this to define the input representation for the text. See :class:`TokenIndexer`.
tag_label: ``str``, optional (default=``ner``)
Specify `ner`, `pos`, or `chunk` to have that tag loaded into the instance field `tag`.
feature_labels: ``Sequence[str]``, optional (default=``()``)
These labels will be loaded as features into the corresponding instance fields:
``pos`` -> ``pos_tags``, ``chunk`` -> ``chunk_tags``, ``ner`` -> ``ner_tags``
Each will have its own namespace: ``pos_tags``, ``chunk_tags``, ``ner_tags``.
If you want to use one of the tags as a `feature` in your model, it should be
specified here.
coding_scheme: ``str``, optional (default=``IOB1``)
Specifies the coding scheme for ``ner_labels`` and ``chunk_labels``.
Valid options are ``IOB1`` and ``BIOUL``. The ``IOB1`` default maintains
the original IOB1 scheme in the CoNLL 2003 NER data.
In the IOB1 scheme, I is a token inside a span, O is a token outside
a span and B is the beginning of span immediately following another
span of the same type.
label_namespace: ``str``, optional (default=``labels``)
Specifies the namespace for the chosen ``tag_label``.
"""
_VALID_LABELS = {'ner', 'pos', 'chunk'}
def __init__(self,
token_indexers: Dict[str, TokenIndexer] = None,
tag_label: str = "ner",
feature_labels: Sequence[str] = (),
lazy: bool = False,
coding_scheme: str = "IOB1",
label_namespace: str = "labels") -> None:
super().__init__(lazy)
self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()}
if tag_label is not None and tag_label not in self._VALID_LABELS:
raise ConfigurationError("unknown tag label type: {}".format(tag_label))
for label in feature_labels:
if label not in self._VALID_LABELS:
raise ConfigurationError("unknown feature label type: {}".format(label))
if coding_scheme not in ("IOB1", "BIOUL"):
raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme))
self.tag_label = tag_label
self.feature_labels = set(feature_labels)
self.coding_scheme = coding_scheme
self.label_namespace = label_namespace
self._original_coding_scheme = "IOB1"
@overrides
def _read(self, file_path: str) -> Iterable[Instance]:
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(file_path)
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
# Group into alternative divider / sentence chunks.
for is_divider, lines in itertools.groupby(data_file, _is_divider):
# Ignore the divider chunks, so that `lines` corresponds to the words
# of a single sentence.
if not is_divider:
fields = [line.strip().split() for line in lines]
# unzipping trick returns tuples, but our Fields need lists
fields = [list(field) for field in zip(*fields)]
#TOKID TOK _ POS _ _ _ _ _ TAG
chunk_tags, tokens_, _, pos_tags, _, _, _, _, _, ner_tags = fields
chunk_tags = list(map(lambda _: "O", chunk_tags))
ner_tags = list(map(lambda x: "B-S" if x.startswith("BeginSeg=Yes") else "O", ner_tags))
# TextField requires ``Token`` objects
tokens = [Token(token) for token in tokens_]
yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags, file_path)
def get_lang(self, file_path):
_, file_name = os.path.split(file_path)
lang = file_name[:2]
if lang == 'po':
lang = 'pt'
if lang not in ['en','de','it','fr','pt','sv']:
raise ConfigurationError(f"Language {lang} not supported by ELMo")
return lang
def text_to_instance(self, # type: ignore
tokens: List[Token],
pos_tags: List[str] = None,
chunk_tags: List[str] = None,
ner_tags: List[str] = None,
file_path: str = None) -> Instance:
"""
We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
"""
# pylint: disable=arguments-differ
sequence = TextField(tokens, self._token_indexers)
instance_fields: Dict[str, Field] = {'tokens': sequence}
instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "lang": self.get_lang(file_path)})
# Recode the labels if necessary.
if self.coding_scheme == "BIOUL":
coded_chunks = to_bioul(chunk_tags,
encoding=self._original_coding_scheme) if chunk_tags is not None else None
coded_ner = to_bioul(ner_tags,
encoding=self._original_coding_scheme) if ner_tags is not None else None
else:
# the default IOB1
coded_chunks = chunk_tags
coded_ner = ner_tags
# Add "feature labels" to instance
if 'pos' in self.feature_labels:
if pos_tags is None:
raise ConfigurationError("Dataset reader was specified to use pos_tags as "
"features. Pass them to text_to_instance.")
instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags")
if 'chunk' in self.feature_labels:
if coded_chunks is None:
raise ConfigurationError("Dataset reader was specified to use chunk tags as "
"features. Pass them to text_to_instance.")
instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
if 'ner' in self.feature_labels:
if coded_ner is None:
raise ConfigurationError("Dataset reader was specified to use NER tags as "
" features. Pass them to text_to_instance.")
instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags")
# Add "tag label" to instance
if self.tag_label == 'ner' and coded_ner is not None:
instance_fields['tags'] = SequenceLabelField(coded_ner, sequence,
self.label_namespace)
elif self.tag_label == 'pos' and pos_tags is not None:
instance_fields['tags'] = SequenceLabelField(pos_tags, sequence,
self.label_namespace)
elif self.tag_label == 'chunk' and coded_chunks is not None:
instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence,
self.label_namespace)
return Instance(instance_fields)
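# Example input (illustrative) in the ten-column DISRPT format unpacked in
# _read() above; columns 2, 4 and 10 carry the token, POS tag and
# segmentation tag, and a TAG starting with "BeginSeg=Yes" is mapped to "B-S":
#
#   1   Le     _   DET    _   _   _   _   _   BeginSeg=Yes
#   2   chat   _   NOUN   _   _   _   _   _   _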
from typing import Dict, Optional, List, Any
import random
import numpy
from overrides import overrides
import torch
from torch.nn.modules.linear import Linear
import torch.nn.functional as F
from allennlp.common.checks import check_dimensions_match, ConfigurationError
from allennlp.data import Vocabulary
from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder
from allennlp.models.model import Model
from allennlp.nn import InitializerApplicator, RegularizerApplicator
from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
from allennlp.training.metrics import CategoricalAccuracy, SpanBasedF1Measure
@Model.register("custom_simple_tagger")
class CustomSimpleTagger(Model):
"""
This ``SimpleTagger`` simply encodes a sequence of text with a stacked ``Seq2SeqEncoder``, then
predicts a tag for each token in the sequence.
Parameters
----------
vocab : ``Vocabulary``, required
A Vocabulary, required in order to compute sizes for input/output projections.
text_field_embedder : ``TextFieldEmbedder``, required
Used to embed the ``tokens`` ``TextField`` we get as input to the model.
encoder : ``Seq2SeqEncoder``
The encoder (with its own internal stacking) that we will use in between embedding tokens
and predicting output tags.
calculate_span_f1 : ``bool``, optional (default=``None``)
Calculate span-level F1 metrics during training. If this is ``True``, then
``label_encoding`` is required. If ``None`` and
label_encoding is specified, this is set to ``True``.
If ``None`` and label_encoding is not specified, it defaults
to ``False``.
label_encoding : ``str``, optional (default=``None``)
Label encoding to use when calculating span f1.
Valid options are "BIO", "BIOUL", "IOB1", "BMES".
Required if ``calculate_span_f1`` is true.
label_namespace : ``str``, optional (default=``labels``)
This is needed to compute the SpanBasedF1Measure metric, if desired.
Unless you did something unusual, the default value should be what you want.
verbose_metrics : ``bool``, optional (default = False)
If true, metrics will be returned per label class in addition
to the overall statistics.
initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
Used to initialize the model parameters.
regularizer : ``RegularizerApplicator``, optional (default=``None``)
If provided, will be used to calculate the regularization penalty during training.
"""
def __init__(self, vocab: Vocabulary,
text_field_embedder: TextFieldEmbedder,
encoder: Seq2SeqEncoder,
calculate_span_f1: bool = None,
label_encoding: Optional[str] = None,
label_namespace: str = "labels",
verbose_metrics: bool = False,
initializer: InitializerApplicator = InitializerApplicator(),
regularizer: Optional[RegularizerApplicator] = None) -> None:
super(CustomSimpleTagger, self).__init__(vocab, regularizer)
self.label_namespace = label_namespace
self.text_field_embedder = text_field_embedder
self.num_classes = self.vocab.get_vocab_size(label_namespace)
self.encoder = encoder
self._verbose_metrics = verbose_metrics
self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
self.num_classes))
check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
"text field embedding dim", "encoder input dim")
# We keep calculate_span_f1 as a constructor argument for API consistency with
# the CrfTagger, even though it is redundant in this class
# (label_encoding serves the same purpose).
if calculate_span_f1 and not label_encoding:
raise ConfigurationError("calculate_span_f1 is True, but "
"no label_encoding was specified.")
self.metrics = {
"accuracy": CategoricalAccuracy(),
"accuracy3": CategoricalAccuracy(top_k=3)
}
if calculate_span_f1 or label_encoding:
self._f1_metric = SpanBasedF1Measure(vocab,
tag_namespace=label_namespace,
label_encoding=label_encoding)
else:
self._f1_metric = None
initializer(self)
@overrides
def forward(self, # type: ignore
tokens: Dict[str, torch.LongTensor],
tags: torch.LongTensor = None,
metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
# pylint: disable=arguments-differ
"""
Parameters
----------
tokens : Dict[str, torch.LongTensor], required
The output of ``TextField.as_array()``, which should typically be passed directly to a
``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
for the ``TokenIndexers`` when you created the ``TextField`` representing your
sequence. The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
which knows how to combine different word representations into a single vector per
token in your input.
tags : torch.LongTensor, optional (default = None)
A torch tensor representing the sequence of integer gold class labels of shape
``(batch_size, num_tokens)``.
metadata : ``List[Dict[str, Any]]``, optional, (default = None)
metadata containing the original words in the sentence to be tagged under a 'words' key.
Returns
-------
An output dictionary consisting of:
logits : torch.FloatTensor
A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
unnormalised log probabilities of the tag classes.
class_probabilities : torch.FloatTensor
A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
a distribution of the tag classes per word.
loss : torch.FloatTensor, optional
A scalar loss to be optimised.
"""
embedded_text_input = self.text_field_embedder(tokens, lang=metadata[0]['lang']) #tokens)
batch_size, sequence_length, _ = embedded_text_input.size()
mask = get_text_field_mask(tokens)
encoded_text = self.encoder(embedded_text_input, mask)
logits = self.tag_projection_layer(encoded_text)
reshaped_log_probs = logits.view(-1, self.num_classes)
class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
sequence_length,
self.num_classes])
output_dict = {"logits": logits, "class_probabilities": class_probabilities}
if tags is not None:
loss = sequence_cross_entropy_with_logits(logits, tags, mask)
for metric in self.metrics.values():
metric(logits, tags, mask.float())
if self._f1_metric is not None:
self._f1_metric(logits, tags, mask.float())
output_dict["loss"] = loss
if metadata is not None:
output_dict["words"] = [x["words"] for x in metadata]
return output_dict
@overrides
def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
"""
Does a simple position-wise argmax over each token, converts indices to string labels, and
adds a ``"tags"`` key to the dictionary with the result.
"""
all_predictions = output_dict['class_probabilities']
all_predictions = all_predictions.cpu().data.numpy()
if all_predictions.ndim == 3:
predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
else:
predictions_list = [all_predictions]
all_tags = []
for predictions in predictions_list:
argmax_indices = numpy.argmax(predictions, axis=-1)
tags = [self.vocab.get_token_from_index(x, namespace="labels")
for x in argmax_indices]
all_tags.append(tags)
output_dict['tags'] = all_tags
return output_dict
@overrides
def get_metrics(self, reset: bool = False) -> Dict[str, float]:
metrics_to_return = {metric_name: metric.get_metric(reset) for
metric_name, metric in self.metrics.items()}
if self._f1_metric is not None:
f1_dict = self._f1_metric.get_metric(reset=reset)
if self._verbose_metrics:
metrics_to_return.update(f1_dict)
else:
metrics_to_return.update({
x: y for x, y in f1_dict.items() if
"overall" in x})
return metrics_to_return
@@ -3,23 +3,45 @@
class Input:
def __init__(self, infos, stamp):
self.name = infos['name']
self.lang = infos['language']
# self.path = infos['folder_path'] # misused
self.path = f"../data/{self.name}"
self.file = infos['file']
self.stamp = stamp
self.conv = f"{self.path}/data_converted_{stamp}" # to be integrated
self.resu = f"{self.path}/results_{stamp}"
class Process:
def __init__(self, infos, data):
self.main = infos["main"]
self.toke = infos['pre-processing']['tokenization'] # not used
self.data = data
self.main = infos["main"] # train test annotation
self.toke = infos['pre-processing']['tokenization']
self.toke_tool = infos['pre-processing']['tokenization_tool']
self.ssplit = infos['pre-processing']['sentence_split']
self.ssplitor = infos['pre-processing']['sentence_split_splitor']
self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
if self.main == "train":
if self.ner_init == True : # to be made relative !! split stuff
self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
else :
self.train_data = infos['discourse_segmenter']['training']['train_data_path']
self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
self.toolkit = infos['discourse_segmenter']['training']['toolkit']
self.tr_config = infos['discourse_segmenter']['training']['config_file']
self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
self.post_tab = infos['post-processing']['json_to_tab']
self.eval = infos['evaluation']
self.test_data = infos['gold_test_data_path']
self.post_bracket = infos['post-processing']['tab_to_bracket']
\ No newline at end of file
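# Usage sketch (illustrative, mirroring get_config_infos in discut22_1.py):
# both classes are built straight from the parsed global config file.
#
#   with open("config_1.json") as f:
#       infos = json.load(f)
#   data_in = Input(infos["input"], stamp)   # stamp: run timestamp string
#   steps = Process(infos["steps"], data_in)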
@@ -3,31 +3,35 @@
"input": {
"name": "chaperontest",
"file": ".ss",
"folder_path": "../data/chaperontest",
"format": "raw_sentences",
"language": "fr",
"gold": false,
"results_path": "../data/chaperontest/results"
},
"output": {
"format": "bracket",
"framework": "sdrt"
"language": "fr"
},
"steps":{
"main": "annotation",
"pre-processing": {
"tokenization": true,
"tokenization_tool" : "spacy",
"sentence_split": false,
"syntactic_parsing": false,
"sentence_split_splitor": null,
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "tony"
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":true
},
"evaluation": false,
"gold_test_data_path": null
}
}
{
"usecase_description": "Config file for usecase_2 : Take a EDU gold segmented text au format tok as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on annodis dev set.",
"usecase_description": "Config file for usecase_2",
"input": {
"name": "fra.sdrt.annodis_dev",
"file": ".ttok",
"folder_path": "../data/fra.sdrt.annodis_dev",
"format": "truc",
"language": "fr",
"gold": true,
"results_path": "../data/fra.sdrt.annodis_dev/results"
},
"output": {
"format": "ner_tok",
"framework": "sdrt"
"language": "fr"
},
"steps":{
"main": "test",
"main": "annotation",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : "spacy",
"sentence_split": true,
"sentence_split_splitor": "stanza",
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "tony"
"model": "tony",
"training": {
"toolkit": null,
"pre_trained_lm": null,
"config_file": null,
"train_data_path": null,
"validation_data_path": null
}
},
"post-processing": {
"json_to_tab": true,
"tab_to_bracket":false
"tab_to_bracket":true
},
"evaluation": true
"evaluation": false,
"gold_test_data_path": null
}
}
......
{
"usecase_description": "Config file for usecase_2.2 : Take a EDU gold segmented text au format conll as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on annodis dev set.",
"usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.",
"input": {
"name": "fra.sdrt.annodis_dev",
"file": ".conllu",
"file_options": [".conllu", ".tok"],
"folder_path": "../data/fra.sdrt.annodis_dev",
"format": "truc",
"language": "fr",
"gold": true,
"results_path": "../data/fra.sdrt.annodis_dev/results"
},
"output": {
"format": "ner_tok",
"framework": "sdrt"
"name": "eng.rst.rstdt",
"file": ".conllu",
"language": "en"
},
"steps":{
"main": "test",
"main": "train",
"pre-processing": {
"tokenization": false,
"tokenization_tool" : "spacy",
"sentence_split": false,
"sentence_split_splitor": "stanza",
"syntactic_parsing": false,
"NER_format_initialisation": true
},
"discourse_segmenter": {
"model": "tony"
"model": null,
"training": {
"toolkit": "allennlp",
"pre_trained_lm": "bert",
"config_file": "../model/config_training_bert.jsonnet",
"train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
"validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
}
},
"post-processing": {
"json_to_tab": true,
"json_to_tab": false,
"tab_to_bracket":false
},
"evaluation": true
"evaluation": true,
"gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
}
}
......
@@ -8,6 +8,8 @@
import os
import sys
import argparse
import re
from datetime import datetime
import pandas as pd # for future clean output in a df
import json
@@ -18,19 +20,17 @@ import utils.conv2ner as c2n
import utils.json2conll as j2c
import utils.conll2bracket as c2bracket
import utils.sent_split as ssent
#import utils.ssplit.parse_corpus as ssent
#import utils.ssplit.parse_stanza as ssent
import utils.training_allennlp as tr_allen
# function to get config info
def get_config_infos(stamp, config_file):
with open(config_file) as f:
infos = json.load(f)
data_in = Input(infos['input'], stamp)
actions = Process(infos['steps'], data_in)
print("data to be process : {}".format(data_in.name))
print(f"data to be process : {data_in.name}")
return actions
@@ -40,20 +40,26 @@ def get_model(model_name):
if name == "tony":
arch = "french_tokens.tar.gz"
if not os.path.isfile("../model/{}".format(arch)):
if not os.path.isfile(f"../model/{name}/{arch}"):
dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar"
os.system(dl)
else:
print("Tony already in place !")
return "../model/{}".format(arch)
return f"../model/{name}/{arch}"
def text_tokenization(f_in, f_out, lang, tool):
if lang == "fr" :
if tool == "spacy" :
tk.main(f_in, f_out) # .ss -> .tok
# main call
def main(steps):
#steps = get_config_infos(config) # we get the list of things to do,
# given by the Process class
#print([x for x in enumerate(steps)])
#following the ordered list, do the steps (for now simple usecase1):
@@ -62,69 +68,123 @@ def main(config):
# FN: either sentence splitting is needed, or tokenization, or neither
if steps.ssplit == True : # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data
#### Split text into sentences : not in usecase1
if not steps.ssplitor == "stanza" :
print("pls define sentence splitter") # raise error n kill process
data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
data_tok = f"{steps.data.path}/{steps.data.name}.tok"
print(f"Starting sentence splitting...to {steps.data.path}/{steps.data.name}")
ssent.main(data_in, data_tok, steps.ssplitor, steps.data.lang)
elif steps.toke == True :
#### Tokenization of the text # python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok
data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
data_tok = f"{steps.data.path}/{steps.data.name}.tok"
print(f"Starting Tokenization...to {data_tok}")
#tk.main(f_in, f_out) # .ss -> .tok
text_tokenization(data_in, data_tok, steps.data.lang, steps.toke_tool) # .ss -> .tok
else:
data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}"
if steps.ner_init == True:
if steps.main == "test" or steps.main =="annotation":
#### Conversion to NER problem # python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok
data_ner = f"{steps.data.path}/{steps.data.name}.ner.tok"
print(f"Starting conversion to NER format...to {data_ner}")
c2n.main(data_tok, data_ner, steps.data.file)
elif steps.main == "train":
for part in ["train", "dev", "test"]:
data_tok = f"{steps.data.path}/{steps.data.name}_{part}{steps.data.file}"
data_ner = f"{steps.data.path}/{steps.data.name}_{part}.ner{steps.data.file}"
print("Starting conversion to NER format...to {}".format(data_ner))
c2n.main(data_tok, data_ner, steps.data.file)
# Create the results directory
if not os.path.isdir(steps.data.resu):
print(" result directory does not exist yet")
os.mkdir(steps.data.resu)
if steps.main == "train":
#model_config = steps.model_config
#cmd = "bash utils/expes.sh eng.rst.rstdt model/config_training.jsonnet bert train"
#os.system(cmd)
if steps.toolkit == "allennlp":
print("toolkit allennlp for training")
# tr_allen.main(steps)
# set the value of model from null to what was just created by training
steps.model = f"{steps.data.resu}/model.tar.gz"
elif steps.toolkit == "jiant":
print("Jiant toolkit not ready")
else :
print("toolkit unknown")
#check config train file
elif steps.main == "test" or steps.main =="annotation":
#### Apply the chosen model, output the JSON with the predictions: score, proba, tags
# #allennlp predict --use-dataset-reader --output-file ${RESULT_DIR}/${FILE}.json ${MODEL} ${RAW}.ner.tok
print("Checking for model...{}".format(steps.model))
model_path = get_model(steps.model)
data_json = "{}/{}.json".format(steps.data.resu, steps.data.name)
cmd = "allennlp predict --use-dataset-reader --output-file {} {} {} &> {}/logs.txt".format(data_json, model_path, data_ner, steps.data.resu)
if not os.path.isdir(steps.data.resu):
print(" result directory does not exist")
os.mkdir(steps.data.resu)
print("Starting Prediction...")
os.system(cmd)
print(f"Checking for model...{steps.model}")
model_path = get_model(steps.model)
data_json = f"{steps.data.resu}/{steps.data.name}.json"
cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_ner} &> {steps.data.resu}/logs.txt"
print("Starting Prediction...")
os.system(cmd)
#### ------------------------------- TBD do the same but with python script (or JIANT ??)
else:
print(" pb define model")
if steps.post_tab == True :
#### Apply the predictions to the text and output the tokenized text with the predicted-tags column
# # python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok
data_conll = f"{steps.data.resu}/{steps.data.name}.split.tok"
format = "split.tok" # to retrieve from config file !!!
print(f"Starting Formatting from json to tok format...to {data_conll}")
j2c.main(data_json, format, data_conll)
####### EVALUATION AGAINST GOLD
# python discut/code/utils/seg_eval.py data_gold data_pred (-s)
if steps.eval == True :
if steps.main == "train":
data_gold = steps.test_data # == data NER because of ner_init == true
if steps.ner_init == True :
data_gold_ner = f"{steps.data.path}/{steps.data.name}_test.ner.conllu"
# make predictions on test_data
model_path = steps.model # model just been created
# data_json about to be created by predict cmd
data_json = f"{steps.data.resu}/{steps.data.name}_test.predictions.json" ## à faire en relatif !! [opt : --silent ??]
cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold_ner} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder &> {steps.data.resu}/logs.txt"
#cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold} &> {steps.data.resu} /logs.txt"
print("Starting Prediction...")
print(f"cmd prediction: {cmd}")
os.system(cmd)
data_conll = f"{steps.data.resu}/{steps.data.name}_test.predictions.conll" ## to be made relative
print(f"Starting Formating from json to tok format...to {data_conll}")
j2c.main(data_json, "split.tok", data_conll)
#data_pred_ner = f"{steps.data.resu}/eng.rst.rstdt_test.predictions.conll.ner"
#c2n.main(data_conll, data_pred_ner, steps.data.file)
print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}")
data_g = "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
data_p = "../data/eng.rst.rstdt/results/eng.rst.rstdt_test.predictions.conll" # == data_conll
cmd = f"python utils/seg_eval.py {data_gold} {data_conll} &> {steps.data.resu}/Evaluation.txt"
os.system(cmd)
else :
data_gold = data_tok # rename these variables, it's not clear !
data_pred = data_conll #
cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt"
os.system(cmd)
if steps.post_bracket == True :
#### take the tokenized text + predicted tags and output the plain text (the initial format, for now a sequence of sentences) with brackets
# # python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket
data_bracket = f"{steps.data.resu}/{steps.data.name}.split.tok.bracket"
print(f"Starting formatting into bracket text...to {data_bracket}")
c2bracket.main(data_conll, data_bracket)
@@ -137,5 +197,13 @@ if __name__ == '__main__':
args = parser.parse_args()
config = args.config
now = datetime.now()
stamp = re.sub(r'[\s:]', '_', str(now))
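# e.g. (illustrative): str(now) == "2023-05-04 16:03:27.123456"
# gives stamp == "2023-05-04_16_03_27.123456"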
my_logs = {}
my_logs['stamp'] = stamp
steps = get_config_infos(stamp, config)
print(stamp)
main(steps)
print("Done.")
\ No newline at end of file
@@ -158,6 +158,9 @@ def get_scores(gold_file, pred_file, string_input=False):
if "BeginSeg=Yes" in gold_labels:
mode = "edu"
seg_type = "EDUs"
#elif "B-S" in gold_labels:
# mode = "edu"
# seg_type = "EDUs"
else:
mode = "conn"
seg_type = "conn spans"
......
####### Python version of expes.sh
import os
def main(steps):
dataset = steps.data.name
config = steps.data.file # .tok .conllu
lmodel = steps.pretr_lm #options: bert xlm elmo elmo_aligned
action = "train" # inutile !
evalset = steps.dev_data
print(f"dev set : {evalset} \t trainset : {dataset}")
has_parent = False # ?? get this var some other way.
tr_config = steps.tr_config
# case 1 : no "parent", no "toolong"
# case 2 : toolong == true, so it must be split
# case 3 : parent == true, no toolong
if lmodel == "xlm":
bert_vocab = "xlm-roberta-base"
bert_weights = "xlm-roberta-base"
else :
bert_vocab = "bert-base-multilingual-cased"
bert_weights = "bert-base-multilingual-cased"
if lmodel == "bert_custom" and steps.ner_init == True :
# TODO raise error
print("You choose bert_custom so 'NER_format_initialisation' shall be set to false.")
#### train, has_par == False
# allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
# allennlp train -s Resultts_conllu/results_eng.rst.rstdt_bert ../code/utils/configs/bert.jsonnet ....
cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder"
print(cmd)
os.system(cmd)
# then...
# TODO:
#### train, has_par == true; in fact we fine-tune...
#allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
# TODO
### then prediction on valset or "parent test" or "finetune test"... ??
#allennlp predict --use-dataset-reader --output-file Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json Results_${CONFIG}/results_${OUTPUT}/model.tar.gz ${TEST_A_PATH} --silent --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder
# DisCut22 - Global Config File Guideline
## Good practice tips
- You can rename the `config_global.json` file as convenient: a good practice is one experiment = one global config file. If you do so, do not forget to pass your file name when running the main command: `python discut22.py --config config_XX.json`
- Values can only be:
    - [boolean] `true`, `false`,
    - [string] `"my_string_in_between_quote_marks"`,
    - or `null`.
- In this documentation, field values in **bold** cannot be changed and are specific to the usecase.
- Keep commas as in the templates to avoid JSON format errors.
## For Usecase 1 : **Discourse Segmentation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. e.g. ```"Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter."```
- `input:`{ These fields are mandatory for every Usecases.
- `"name":` [string] The name of your input dataset, without the extension. This is also the same name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"annotation"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` [string] Here the name or the path to the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
- `"training":` {
- `"toolkit":` **null**
- `"pre_trained_lm":` **null**
- `"config_file":` **null**
- `"train_data_path":` **null**
- `"validation_data_path":` **null**
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] Set to true if you want also a conll-style output with predictions as last column.
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **false**
- `"gold_test_data_path":` [string] **null**
## For Usecase 2 : **Segmentation Evaluation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. ```"Config file for usecase_2 : Take a EDU gold segmented text au format tok as input, use a loaded model to make predictions. Output scores of model predictions against gold"```
- `input:`{ These fields are mandatory for every Usecases.
- `"name":` [string] The name of your input dataset, without the extension. This is also the same name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"test"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` [string] Here the name or the path to the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
- `"training":` {
- `"toolkit":` **null**
- `"pre_trained_lm":` **null**
- `"config_file":` **null**
- `"train_data_path":` **null**
- `"validation_data_path":` **null**
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] : **true**
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **true**
- `"gold_test_data_path":` [string] The path to your gold dataset to make predictions to, and to evaluate against.
## For Usecase 3 : **Custom Model Creation**
- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. ```"Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores."```
- `input:`{ These fields are mandatory for every Usecases.
- `"name":` [string] The name of your input dataset, without the extension. This is also the same name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
- `"file":` [string] The extension of your input dataset that reflects its format.
- OPTIONS :[".conllu", ".tok", ".ttok", ".ss"]
- `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"```
- `"steps":` {
- `"main":` [string] : **"train"**
- `"pre-processing":` {
- `"tokenization":` [false, true] *available for FR*
- `"sentence_split":` [false, true] *available for FR*
- `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting.
- OPTIONS : ["stanza"]
- `"syntactic_parsing":` [boolean] : **false** *Not yet available*
- `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. *Set to true anyway ??*
- `"discourse_segmenter":` {
- `"model":` **null**
- `"training":` {
- `"toolkit":` [string] The toolkit to build your model (to be added : "jiant").
- OPTIONS : ["allennlp"]
- `"pre_trained_lm":` **bert** (to be added : roberta..)
- `"config_file":` [string] The path to the config file for training. e.g. `"../model/config_training.jsonnet"`
- `"train_data_path":` [string] The path to your training dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu"` *conflict with training_config ??*
- `"validation_data_path":` [string] The path to your development dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"` *idem*
- `"post-processing":` { The toolkit AllenNlp output a JSON.
- `"json_to_tab":` [boolean] : **true**
- `"tab_to_bracket":` [boolean] Set to true if you want also an output as the raw text with brackets as EDU delimiter. If so, `"json_to_tab"` has to be set to true too.
- `"evaluation":` [boolean] : **true**
- `"gold_test_data_path":` [string] The path to your gold test dataset to make predictions on, and to evaluate against.
{
"dataset_reader": {
"type": "conll2003",
"coding_scheme": "BIOUL",
"tag_label": "ner",
"token_indexers": {
"bert": {
"type": "bert-pretrained",
"do_lowercase": false,
"pretrained_model": "bert-base-multilingual-cased",
"use_starting_offsets": true
},
"token_characters": {
"type": "characters",
"min_padding_length": 3
}
}
},
"iterator": {
"type": "basic",
"batch_size": 2
},
"model": {
"type": "simple_tagger",
"encoder": {
"type": "lstm",
"bidirectional": true,
"dropout": 0.5,
"hidden_size": 100,
"input_size": 896,
"num_layers": 2
},
"text_field_embedder": {
"allow_unmatched_keys": true,
"embedder_to_indexer_map": {
"bert": [
"bert",
"bert-offsets"
],
"token_characters": [
"token_characters"
]
},
"token_embedders": {
"bert": {
"type": "bert-pretrained",
"pretrained_model": "bert-base-multilingual-cased"
},
"token_characters": {
"type": "character_encoding",
"embedding": {
"embedding_dim": 16
},
"encoder": {
"type": "cnn",
"conv_layer_activation": "relu",
"embedding_dim": 16,
"ngram_filter_sizes": [
3
],
"num_filters": 128
}
}
}
}
},
"train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.ner.conllu",
"validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.ner.conllu",
"trainer": {
"cuda_device": 1,
"grad_norm": 5,
"num_epochs": 4,
"num_serialized_models_to_keep": 3,
"optimizer": {
"type": "bert_adam",
"lr": 0.001
}
}
}
\ No newline at end of file