diff --git a/README.md b/README.md
index f45e4bad457241af37a6f2ffd953f1b6255259b4..ec7a9a9eb1fd83a24fe4138544f97db61e0e8c93 100644
--- a/README.md
+++ b/README.md
@@ -14,42 +14,50 @@ Code: https://gitlab.inria.fr/andiamo/tony
 # Usage
 ## Usecases
-- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation.
+- **Discourse Segmentation:** Take a raw text as input, use a loaded model to make predictions. Output the same text but with EDU segmentation. --> config_1
-- **Segmentation Evaluation:** Take an EDU gold segmented text as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies.
+- **Segmentation Evaluation:** Take an EDU gold segmented text as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. --> config_2
+
+- **Custom Model Creation:** Fine-tune (over one or two levels) a pretrained Language Model with a specific dataset or a combination of datasets, then make predictions and run evaluation. --> config_3

 ## Content description
 [TBD : explain directories automatically created during script runs]
-- `data/MyProjet/` Contains input data, raw and/or pre-processed format(s).
-  - `results/` Contains output data, scores and post-processed data. (Also logs of allennlp)
+- `data/my.cool.dataset/` Contains input data, raw and/or pre-processed format(s).
+  - `results.{stamp}/` Contains output data, scores and post-processed data. (Also the allennlp logs.)
 - `code/` Contains main scripts.
   - `discut22_1.py` One python script to run them all.
-  - `config_XX.json` A file to be completed (or a dir with choise between simple use_case configs and a template for a custom config).
+  - `config_XX.json` A file to be completed for your specific project (or a dir with a choice between simple use_case configs and a template for a custom config). See `global_config_file_guideline.md`.
   - `utils/` Contains useful scripts to be called.
 - `model/` Contains model to be loaded or created.
-- `documentation.md` Contains detailed documentation (TBD?)
+  - `config_training.jsonnet` A file to be completed. (TBD: automatically saved with the model when done)
+- `global_config_file_guideline.md` Contains detailed documentation on how to build a well-formed config file.

 ## Set up environment
 - Conda stuff for Python 3.7 (TBD?)
 - Install all required libraries with the following command:
 ```
-pip install -r <dir?>requirements.txt
+pip install -r requirements.txt
 ```

 ## Configuration file: to choose or to complete
-- `code/config_1.json` Config for usecase_1 : take a sentence splited text, apply ToNy, output same text but with EDU brackets.
-- [TBD : train models config and all sort of cool options]
+- `code/config_global_X.json` See `global_config_file_guideline.md`.
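+
+For example, the usecase 1 template shipped in this repo (`code/config_global_1.json`) looks like this. It is a sketch: the dataset name and field values are only illustrations, see `global_config_file_guideline.md` for the available options.
+```
+{
+    "input": {
+        "name": "chaperontest",
+        "file": ".ss",
+        "language": "fr"
+    },
+    "steps":{
+        "main": "annotation",
+        "pre-processing": {
+            "tokenization": true,
+            "tokenization_tool" : "spacy",
+            "sentence_split": false,
+            "sentence_split_splitor": null,
+            "syntactic_parsing": false,
+            "NER_format_initialisation": true
+        },
+        "discourse_segmenter": {
+            "model": "tony",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "post-processing": {
+            "json_to_tab": true,
+            "tab_to_bracket": true
+        },
+        "evaluation": false,
+        "gold_test_data_path": null
+    }
+}
+```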
+
 ## Run usecase 1
 (go to the `code` directory)
 Run this command:
 ```
-python code/discut22.py --config code/config_1.json
+python discut22_1.py --config config_XX.json
 ```

+## Support
+laura.riviere@irit.fr
+
+
 ## Authors and acknowledgment
-Morteza Ezzabady
+Morteza Ezzabady
+Laura Rivière
 Amir Zeldes

 <!---
diff --git a/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc b/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ba9802b113f8fe5be38ab83deb4daf2695ec9ff
Binary files /dev/null and b/code/allen_custom/__pycache__/custom_bert_token_embedder.cpython-37.pyc differ
diff --git a/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61f75bd353d9d94355e7f854fbb639bdca3a0c6f
Binary files /dev/null and b/code/allen_custom/__pycache__/custom_conll_reader.cpython-37.pyc differ
diff --git a/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc b/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5fed7c4fd030664682f83ecb79a6980d3d2cf6b
Binary files /dev/null and b/code/allen_custom/__pycache__/custom_disrpt_reader.cpython-37.pyc differ
diff --git a/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc b/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1647c89fbf6170d62b81ac688416a44a176c2c6
Binary files /dev/null and b/code/allen_custom/__pycache__/custom_simple_tagger.cpython-37.pyc differ
diff --git a/code/allen_custom/custom_bert_token_embedder.py b/code/allen_custom/custom_bert_token_embedder.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e36f12141e21ae104c359282874ea02abd03fb0
--- /dev/null
+++ b/code/allen_custom/custom_bert_token_embedder.py
@@ -0,0 +1,287 @@
+"""
+A ``TokenEmbedder`` which uses one of the BERT models
+(https://github.com/google-research/bert)
+to produce embeddings.
+
+At its core it uses Hugging Face's PyTorch implementation
+(https://github.com/huggingface/pytorch-pretrained-BERT),
+so thanks to them!
+"""
+from typing import Dict, List
+import logging
+
+import torch
+import torch.nn.functional as F
+
+from pytorch_pretrained_bert.modeling import BertModel
+
+from allennlp.modules.scalar_mix import ScalarMix
+from allennlp.modules.token_embedders.token_embedder import TokenEmbedder
+from allennlp.nn import util
+
+logger = logging.getLogger(__name__)
+
+
+class PretrainedBertModel:
+    """
+    In some instances you may want to load the same BERT model twice
+    (e.g. to use as a token embedder and also as a pooling layer).
+    This factory provides a cache so that you don't actually have to load the model twice.
+    """
+    _cache: Dict[str, BertModel] = {}
+
+    @classmethod
+    def load(cls, model_name: str, cache_model: bool = True) -> BertModel:
+        if model_name in cls._cache:
+            return PretrainedBertModel._cache[model_name]
+
+        model = BertModel.from_pretrained(model_name)
+        if cache_model:
+            cls._cache[model_name] = model
+
+        return model
+
+
+class CustomBertEmbedder(TokenEmbedder):
+    """
+    A ``TokenEmbedder`` that produces BERT embeddings for your tokens.
+    Should be paired with a ``BertIndexer``, which produces wordpiece ids.
+ + Most likely you probably want to use ``PretrainedBertEmbedder`` + for one of the named pretrained models, not this base class. + + Parameters + ---------- + bert_model: ``BertModel`` + The BERT model being wrapped. + top_layer_only: ``bool``, optional (default = ``False``) + If ``True``, then only return the top layer instead of apply the scalar mix. + max_pieces : int, optional (default: 512) + The BERT embedder uses positional embeddings and so has a corresponding + maximum length for its input ids. Assuming the inputs are windowed + and padded appropriately by this length, the embedder will split them into a + large batch, feed them into BERT, and recombine the output as if it was a + longer sequence. + num_start_tokens : int, optional (default: 1) + The number of starting special tokens input to BERT (usually 1, i.e., [CLS]) + num_end_tokens : int, optional (default: 1) + The number of ending tokens input to BERT (usually 1, i.e., [SEP]) + scalar_mix_parameters: ``List[float]``, optional, (default = None) + If not ``None``, use these scalar mix parameters to weight the representations + produced by different layers. These mixing weights are not updated during + training. + """ + def __init__(self, + bert_model: BertModel, + aligning_files = None, + top_layer_only: bool = False, + max_pieces: int = 512, + num_start_tokens: int = 1, + num_end_tokens: int = 1, + scalar_mix_parameters: List[float] = None) -> None: + super().__init__() + self.bert_model = bert_model + #self.aligning_fr = "saved_mappings/fra.sdrt.annodis_eng.rst.gum.pth" + self.aligning_fr = "../MUSE/results/stac-annodis_all/best_mapping.pth" + #self.aligning_fr = 'saved_mappings/eng.rst.gum_fra.sdrt.annodis.pth' + print("ALIGN", self.aligning_fr) + self.aligning_fr = torch.from_numpy(torch.load(self.aligning_fr)) + self.aligning_fr_t = torch.transpose(self.aligning_fr, 0, 1) #.to(torch.device('cuda:0')) + print(self.aligning_fr.shape) + self.output_dim = bert_model.config.hidden_size + self.max_pieces = max_pieces + self.num_start_tokens = num_start_tokens + self.num_end_tokens = num_end_tokens + + if not top_layer_only: + self._scalar_mix = ScalarMix(bert_model.config.num_hidden_layers, + do_layer_norm=False, + initial_scalar_parameters=scalar_mix_parameters, + trainable=scalar_mix_parameters is None) + else: + self._scalar_mix = None + + def get_output_dim(self) -> int: + return self.output_dim + + def forward(self, + input_ids: torch.LongTensor, + offsets: torch.LongTensor = None, + token_type_ids: torch.LongTensor = None) -> torch.Tensor: + """ + Parameters + ---------- + input_ids : ``torch.LongTensor`` The (batch_size, ..., max_sequence_length) tensor of wordpiece ids. + offsets : ``torch.LongTensor``, optional + The BERT embeddings are one per wordpiece. However it's possible/likely + you might want one per original token. In that case, ``offsets`` + represents the indices of the desired wordpiece for each original token. + Depending on how your token indexer is configured, this could be the + position of the last wordpiece for each token, or it could be the position + of the first wordpiece for each token. + + For example, if you had the sentence "Definitely not", and if the corresponding + wordpieces were ["Def", "##in", "##ite", "##ly", "not"], then the input_ids + would be 5 wordpiece ids, and the "last wordpiece" offsets would be [3, 4]. + If offsets are provided, the returned tensor will contain only the wordpiece + embeddings at those positions, and (in particular) will contain one embedding + per token. 
If offsets are not provided, the entire tensor of wordpiece embeddings + will be returned. + token_type_ids : ``torch.LongTensor``, optional + If an input consists of two sentences (as in the BERT paper), + tokens from the first sentence should have type 0 and tokens from + the second sentence should have type 1. If you don't provide this + (the default BertIndexer doesn't) then it's assumed to be all 0s. + """ + # pylint: disable=arguments-differ + batch_size, full_seq_len = input_ids.size(0), input_ids.size(-1) + initial_dims = list(input_ids.shape[:-1]) + + # The embedder may receive an input tensor that has a sequence length longer than can + # be fit. In that case, we should expect the wordpiece indexer to create padded windows + # of length `self.max_pieces` for us, and have them concatenated into one long sequence. + # E.g., "[CLS] I went to the [SEP] [CLS] to the store to [SEP] ..." + # We can then split the sequence into sub-sequences of that length, and concatenate them + # along the batch dimension so we effectively have one huge batch of partial sentences. + # This can then be fed into BERT without any sentence length issues. Keep in mind + # that the memory consumption can dramatically increase for large batches with extremely + # long sentences. + needs_split = full_seq_len > self.max_pieces + last_window_size = 0 + if needs_split: + # Split the flattened list by the window size, `max_pieces` + split_input_ids = list(input_ids.split(self.max_pieces, dim=-1)) + + # We want all sequences to be the same length, so pad the last sequence + last_window_size = split_input_ids[-1].size(-1) + padding_amount = self.max_pieces - last_window_size + split_input_ids[-1] = F.pad(split_input_ids[-1], pad=[0, padding_amount], value=0) + + # Now combine the sequences along the batch dimension + input_ids = torch.cat(split_input_ids, dim=0) + + if token_type_ids is not None: + # Same for token_type_ids + split_token_type_ids = list(token_type_ids.split(self.max_pieces, dim=-1)) + + last_window_size = split_token_type_ids[-1].size(-1) + padding_amount = self.max_pieces - last_window_size + split_token_type_ids[-1] = F.pad(split_token_type_ids[-1], pad=[0, padding_amount], value=0) + + token_type_ids = torch.cat(split_token_type_ids, dim=0) + + if token_type_ids is None: + token_type_ids = torch.zeros_like(input_ids) + + input_mask = (input_ids != 0).long() + + # input_ids may have extra dimensions, so we reshape down to 2-d + # before calling the BERT model and then reshape back at the end. + all_encoder_layers, _ = self.bert_model(input_ids=util.combine_initial_dims(input_ids), + token_type_ids=util.combine_initial_dims(token_type_ids), + attention_mask=util.combine_initial_dims(input_mask)) + all_encoder_layers = torch.stack(all_encoder_layers) + # ======ROTATION===== # + #all_encoder_layers = torch.matmul(all_encoder_layers, self.aligning_fr_t) + + if needs_split: + # First, unpack the output embeddings into one long sequence again + unpacked_embeddings = torch.split(all_encoder_layers, batch_size, dim=1) + unpacked_embeddings = torch.cat(unpacked_embeddings, dim=2) + + # Next, select indices of the sequence such that it will result in embeddings representing the original + # sentence. 
To capture maximal context, the indices will be the middle part of each embedded window + # sub-sequence (plus any leftover start and final edge windows), e.g., + # 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 + # "[CLS] I went to the very fine [SEP] [CLS] the very fine store to eat [SEP]" + # with max_pieces = 8 should produce max context indices [2, 3, 4, 10, 11, 12] with additional start + # and final windows with indices [0, 1] and [14, 15] respectively. + + # Find the stride as half the max pieces, ignoring the special start and end tokens + # Calculate an offset to extract the centermost embeddings of each window + stride = (self.max_pieces - self.num_start_tokens - self.num_end_tokens) // 2 + stride_offset = stride // 2 + self.num_start_tokens + + first_window = list(range(stride_offset)) + + max_context_windows = [i for i in range(full_seq_len) + if stride_offset - 1 < i % self.max_pieces < stride_offset + stride] + + # Lookback what's left, unless it's the whole self.max_pieces window + if full_seq_len % self.max_pieces == 0: + lookback = self.max_pieces + else: + lookback = full_seq_len % self.max_pieces + + final_window_start = full_seq_len - lookback + stride_offset + stride + final_window = list(range(final_window_start, full_seq_len)) + + select_indices = first_window + max_context_windows + final_window + + initial_dims.append(len(select_indices)) + + recombined_embeddings = unpacked_embeddings[:, :, select_indices] + else: + recombined_embeddings = all_encoder_layers + + # Recombine the outputs of all layers + # (layers, batch_size * d1 * ... * dn, sequence_length, embedding_dim) + # recombined = torch.cat(combined, dim=2) + input_mask = (recombined_embeddings != 0).long() + + if self._scalar_mix is not None: + mix = self._scalar_mix(recombined_embeddings, input_mask) + else: + mix = recombined_embeddings[-1] + + # At this point, mix is (batch_size * d1 * ... * dn, sequence_length, embedding_dim) + + if offsets is None: + # Resize to (batch_size, d1, ..., dn, sequence_length, embedding_dim) + dims = initial_dims if needs_split else input_ids.size() + return util.uncombine_initial_dims(mix, dims) + else: + # offsets is (batch_size, d1, ..., dn, orig_sequence_length) + offsets2d = util.combine_initial_dims(offsets) + # now offsets is (batch_size * d1 * ... * dn, orig_sequence_length) + range_vector = util.get_range_vector(offsets2d.size(0), + device=util.get_device_of(mix)).unsqueeze(1) + # selected embeddings is also (batch_size * d1 * ... * dn, orig_sequence_length) + selected_embeddings = mix[range_vector, offsets2d] + + return util.uncombine_initial_dims(selected_embeddings, offsets.size()) + + +@TokenEmbedder.register("custom-bert-pretrained") +class CustomPretrainedBertEmbedder(CustomBertEmbedder): + # pylint: disable=line-too-long + """ + Parameters + ---------- + pretrained_model: ``str`` + Either the name of the pretrained model to use (e.g. 'bert-base-uncased'), + or the path to the .tar.gz file with the model weights. + + If the name is a key in the list of pretrained models at + https://github.com/huggingface/pytorch-pretrained-BERT/blob/master/pytorch_pretrained_bert/modeling.py#L41 + the corresponding path will be used; otherwise it will be interpreted as a path or URL. + requires_grad : ``bool``, optional (default = False) + If True, compute gradient of BERT parameters for fine tuning. + top_layer_only: ``bool``, optional (default = ``False``) + If ``True``, then only return the top layer instead of apply the scalar mix. 
+ scalar_mix_parameters: ``List[float]``, optional, (default = None) + If not ``None``, use these scalar mix parameters to weight the representations + produced by different layers. These mixing weights are not updated during + training. + """ + def __init__(self, pretrained_model: str, aligning_files, requires_grad: bool = False, top_layer_only: bool = False, + scalar_mix_parameters: List[float] = None) -> None: + model = PretrainedBertModel.load(pretrained_model) + + print("ALIGN", aligning_files['fr']) + + for param in model.parameters(): + param.requires_grad = requires_grad + + print("CHECKPOINT") + super().__init__(bert_model=model, top_layer_only=top_layer_only, scalar_mix_parameters=scalar_mix_parameters) diff --git a/code/allen_custom/custom_conll_reader.py b/code/allen_custom/custom_conll_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..9d7a37b3b4b0a536d59401d1c0fbaa7e3db9167d --- /dev/null +++ b/code/allen_custom/custom_conll_reader.py @@ -0,0 +1,184 @@ +from typing import Dict, List, Sequence, Iterable +import itertools +import logging +import os + +from overrides import overrides + +from allennlp.common.checks import ConfigurationError +from allennlp.common.file_utils import cached_path +from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_utils import to_bioul +from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField +from allennlp.data.instance import Instance +from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer +from allennlp.data.tokenizers import Token + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +def _is_divider(line: str) -> bool: + empty_line = line.strip() == '' + if empty_line: + return True + else: + first_token = line.split()[0] + if first_token == "-DOCSTART-": # pylint: disable=simplifiable-if-statement + return True + else: + return False + + +@DatasetReader.register("custom_conll_reader") +class CustomConllDatasetReader(DatasetReader): + """ + Reads instances from a pretokenised file where each line is in the following format: + + WORD POS-TAG CHUNK-TAG NER-TAG + + with a blank line indicating the end of each sentence + and '-DOCSTART- -X- -X- O' indicating the end of each article, + and converts it into a ``Dataset`` suitable for sequence tagging. + + Each ``Instance`` contains the words in the ``"tokens"`` ``TextField``. + The values corresponding to the ``tag_label`` + values will get loaded into the ``"tags"`` ``SequenceLabelField``. + And if you specify any ``feature_labels`` (you probably shouldn't), + the corresponding values will get loaded into their own ``SequenceLabelField`` s. + + This dataset reader ignores the "article" divisions and simply treats + each sentence as an independent ``Instance``. (Technically the reader splits sentences + on any combination of blank lines and "DOCSTART" tags; in particular, it does the right + thing on well formed inputs.) + + Parameters + ---------- + token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``) + We use this to define the input representation for the text. See :class:`TokenIndexer`. + tag_label: ``str``, optional (default=``ner``) + Specify `ner`, `pos`, or `chunk` to have that tag loaded into the instance field `tag`. 
+ feature_labels: ``Sequence[str]``, optional (default=``()``) + These labels will be loaded as features into the corresponding instance fields: + ``pos`` -> ``pos_tags``, ``chunk`` -> ``chunk_tags``, ``ner`` -> ``ner_tags`` + Each will have its own namespace: ``pos_tags``, ``chunk_tags``, ``ner_tags``. + If you want to use one of the tags as a `feature` in your model, it should be + specified here. + coding_scheme: ``str``, optional (default=``IOB1``) + Specifies the coding scheme for ``ner_labels`` and ``chunk_labels``. + Valid options are ``IOB1`` and ``BIOUL``. The ``IOB1`` default maintains + the original IOB1 scheme in the CoNLL 2003 NER data. + In the IOB1 scheme, I is a token inside a span, O is a token outside + a span and B is the beginning of span immediately following another + span of the same type. + label_namespace: ``str``, optional (default=``labels``) + Specifies the namespace for the chosen ``tag_label``. + """ + _VALID_LABELS = {'ner', 'pos', 'chunk'} + + def __init__(self, + token_indexers: Dict[str, TokenIndexer] = None, + tag_label: str = "ner", + feature_labels: Sequence[str] = (), + lazy: bool = False, + coding_scheme: str = "IOB1", + label_namespace: str = "labels") -> None: + super().__init__(lazy) + self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} + if tag_label is not None and tag_label not in self._VALID_LABELS: + raise ConfigurationError("unknown tag label type: {}".format(tag_label)) + for label in feature_labels: + if label not in self._VALID_LABELS: + raise ConfigurationError("unknown feature label type: {}".format(label)) + if coding_scheme not in ("IOB1", "BIOUL"): + raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme)) + + self.tag_label = tag_label + self.feature_labels = set(feature_labels) + self.coding_scheme = coding_scheme + self.label_namespace = label_namespace + self._original_coding_scheme = "IOB1" + + @overrides + def _read(self, file_path: str) -> Iterable[Instance]: + # if `file_path` is a URL, redirect to the cache + file_path = cached_path(file_path) + + with open(file_path, "r") as data_file: + logger.info("Reading instances from lines in file at: %s", file_path) + + # Group into alternative divider / sentence chunks. + for is_divider, lines in itertools.groupby(data_file, _is_divider): + # Ignore the divider chunks, so that `lines` corresponds to the words + # of a single sentence. + if not is_divider: + fields = [line.strip().split() for line in lines] + # unzipping trick returns tuples, but our Fields need lists + fields = [list(field) for field in zip(*fields)] + tokens_, pos_tags, chunk_tags, ner_tags = fields + # TextField requires ``Token`` objects + tokens = [Token(token) for token in tokens_] + + yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags, file_path) + + def get_lang(self, file_path): + _, file_name = os.path.split(file_path) + lang = file_name[:2] + if lang == 'po': + lang = 'pt' + if lang not in ['en','de','it','fr','pt','sv']: + raise ConfigurationError(f"Language {lang} not supported by ELMo") + return lang + + def text_to_instance(self, # type: ignore + tokens: List[Token], + pos_tags: List[str] = None, + chunk_tags: List[str] = None, + ner_tags: List[str] = None, + file_path: str = None) -> Instance: + """ + We take `pre-tokenized` input here, because we don't have a tokenizer in this class. 
+ """ + # pylint: disable=arguments-differ + sequence = TextField(tokens, self._token_indexers) + instance_fields: Dict[str, Field] = {'tokens': sequence} + instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "lang": self.get_lang(file_path)}) + + # Recode the labels if necessary. + if self.coding_scheme == "BIOUL": + coded_chunks = to_bioul(chunk_tags, + encoding=self._original_coding_scheme) if chunk_tags is not None else None + coded_ner = to_bioul(ner_tags, + encoding=self._original_coding_scheme) if ner_tags is not None else None + else: + # the default IOB1 + coded_chunks = chunk_tags + coded_ner = ner_tags + + # Add "feature labels" to instance + if 'pos' in self.feature_labels: + if pos_tags is None: + raise ConfigurationError("Dataset reader was specified to use pos_tags as " + "features. Pass them to text_to_instance.") + instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags") + if 'chunk' in self.feature_labels: + if coded_chunks is None: + raise ConfigurationError("Dataset reader was specified to use chunk tags as " + "features. Pass them to text_to_instance.") + instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags") + if 'ner' in self.feature_labels: + if coded_ner is None: + raise ConfigurationError("Dataset reader was specified to use NER tags as " + " features. Pass them to text_to_instance.") + instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags") + + # Add "tag label" to instance + if self.tag_label == 'ner' and coded_ner is not None: + instance_fields['tags'] = SequenceLabelField(coded_ner, sequence, + self.label_namespace) + elif self.tag_label == 'pos' and pos_tags is not None: + instance_fields['tags'] = SequenceLabelField(pos_tags, sequence, + self.label_namespace) + elif self.tag_label == 'chunk' and coded_chunks is not None: + instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence, + self.label_namespace) + + return Instance(instance_fields) diff --git a/code/allen_custom/custom_disrpt_reader.py b/code/allen_custom/custom_disrpt_reader.py new file mode 100644 index 0000000000000000000000000000000000000000..68189e92ae1e2471b5e71181f9729b5b2b2d3e7f --- /dev/null +++ b/code/allen_custom/custom_disrpt_reader.py @@ -0,0 +1,187 @@ +from typing import Dict, List, Sequence, Iterable +import itertools +import logging +import os + +from overrides import overrides + +from allennlp.common.checks import ConfigurationError +from allennlp.common.file_utils import cached_path +from allennlp.data.dataset_readers.dataset_reader import DatasetReader +from allennlp.data.dataset_readers.dataset_utils import to_bioul +from allennlp.data.fields import TextField, SequenceLabelField, Field, MetadataField +from allennlp.data.instance import Instance +from allennlp.data.token_indexers import TokenIndexer, SingleIdTokenIndexer +from allennlp.data.tokenizers import Token + +logger = logging.getLogger(__name__) # pylint: disable=invalid-name + +def _is_divider(line: str) -> bool: + empty_line = line.strip() == '' + if empty_line: + return True + else: + first_token = line.split()[0] + if first_token == "#": + return True + else: + return False + + +@DatasetReader.register("custom_disrpt_reader") +class CustomDisrptDatasetReader(DatasetReader): + """ + Reads instances from a pretokenised file where each line is in the following format: + + WORD POS-TAG CHUNK-TAG NER-TAG + + with a blank line indicating the end of each sentence + and '-DOCSTART- -X- -X- O' indicating the 
end of each article, + and converts it into a ``Dataset`` suitable for sequence tagging. + + Each ``Instance`` contains the words in the ``"tokens"`` ``TextField``. + The values corresponding to the ``tag_label`` + values will get loaded into the ``"tags"`` ``SequenceLabelField``. + And if you specify any ``feature_labels`` (you probably shouldn't), + the corresponding values will get loaded into their own ``SequenceLabelField`` s. + + This dataset reader ignores the "article" divisions and simply treats + each sentence as an independent ``Instance``. (Technically the reader splits sentences + on any combination of blank lines and "DOCSTART" tags; in particular, it does the right + thing on well formed inputs.) + + Parameters + ---------- + token_indexers : ``Dict[str, TokenIndexer]``, optional (default=``{"tokens": SingleIdTokenIndexer()}``) + We use this to define the input representation for the text. See :class:`TokenIndexer`. + tag_label: ``str``, optional (default=``ner``) + Specify `ner`, `pos`, or `chunk` to have that tag loaded into the instance field `tag`. + feature_labels: ``Sequence[str]``, optional (default=``()``) + These labels will be loaded as features into the corresponding instance fields: + ``pos`` -> ``pos_tags``, ``chunk`` -> ``chunk_tags``, ``ner`` -> ``ner_tags`` + Each will have its own namespace: ``pos_tags``, ``chunk_tags``, ``ner_tags``. + If you want to use one of the tags as a `feature` in your model, it should be + specified here. + coding_scheme: ``str``, optional (default=``IOB1``) + Specifies the coding scheme for ``ner_labels`` and ``chunk_labels``. + Valid options are ``IOB1`` and ``BIOUL``. The ``IOB1`` default maintains + the original IOB1 scheme in the CoNLL 2003 NER data. + In the IOB1 scheme, I is a token inside a span, O is a token outside + a span and B is the beginning of span immediately following another + span of the same type. + label_namespace: ``str``, optional (default=``labels``) + Specifies the namespace for the chosen ``tag_label``. + """ + _VALID_LABELS = {'ner', 'pos', 'chunk'} + + def __init__(self, + token_indexers: Dict[str, TokenIndexer] = None, + tag_label: str = "ner", + feature_labels: Sequence[str] = (), + lazy: bool = False, + coding_scheme: str = "IOB1", + label_namespace: str = "labels") -> None: + super().__init__(lazy) + self._token_indexers = token_indexers or {'tokens': SingleIdTokenIndexer()} + if tag_label is not None and tag_label not in self._VALID_LABELS: + raise ConfigurationError("unknown tag label type: {}".format(tag_label)) + for label in feature_labels: + if label not in self._VALID_LABELS: + raise ConfigurationError("unknown feature label type: {}".format(label)) + if coding_scheme not in ("IOB1", "BIOUL"): + raise ConfigurationError("unknown coding_scheme: {}".format(coding_scheme)) + + self.tag_label = tag_label + self.feature_labels = set(feature_labels) + self.coding_scheme = coding_scheme + self.label_namespace = label_namespace + self._original_coding_scheme = "IOB1" + + @overrides + def _read(self, file_path: str) -> Iterable[Instance]: + # if `file_path` is a URL, redirect to the cache + file_path = cached_path(file_path) + + with open(file_path, "r") as data_file: + logger.info("Reading instances from lines in file at: %s", file_path) + + # Group into alternative divider / sentence chunks. + for is_divider, lines in itertools.groupby(data_file, _is_divider): + # Ignore the divider chunks, so that `lines` corresponds to the words + # of a single sentence. 
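+                # Illustrative walk-through (hypothetical row, not from a real corpus):
+                #   "3  Madame  _  PROPN  _  _  _  _  _  BeginSeg=Yes"
+                # unzips below into 10 columns: column 1 feeds chunk_tags (all reset
+                # to "O"), column 2 the tokens, column 4 the POS tags, and column 10
+                # the label, which is mapped to "B-S" iff it starts with "BeginSeg=Yes".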
+ if not is_divider: + fields = [line.strip().split() for line in lines] + # unzipping trick returns tuples, but our Fields need lists + fields = [list(field) for field in zip(*fields)] + #TOKID TOK _ POS _ _ _ _ _ TAG + chunk_tags, tokens_, _, pos_tags, _, _, _, _, _, ner_tags = fields + chunk_tags = list(map(lambda _: "O", chunk_tags)) + ner_tags = list(map(lambda x: "B-S" if x.startswith("BeginSeg=Yes") else "O", ner_tags)) + # TextField requires ``Token`` objects + tokens = [Token(token) for token in tokens_] + + yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags, file_path) + + def get_lang(self, file_path): + _, file_name = os.path.split(file_path) + lang = file_name[:2] + if lang == 'po': + lang = 'pt' + if lang not in ['en','de','it','fr','pt','sv']: + raise ConfigurationError(f"Language {lang} not supported by ELMo") + return lang + + def text_to_instance(self, # type: ignore + tokens: List[Token], + pos_tags: List[str] = None, + chunk_tags: List[str] = None, + ner_tags: List[str] = None, + file_path: str = None) -> Instance: + """ + We take `pre-tokenized` input here, because we don't have a tokenizer in this class. + """ + # pylint: disable=arguments-differ + sequence = TextField(tokens, self._token_indexers) + instance_fields: Dict[str, Field] = {'tokens': sequence} + instance_fields["metadata"] = MetadataField({"words": [x.text for x in tokens], "lang": self.get_lang(file_path)}) + + # Recode the labels if necessary. + if self.coding_scheme == "BIOUL": + coded_chunks = to_bioul(chunk_tags, + encoding=self._original_coding_scheme) if chunk_tags is not None else None + coded_ner = to_bioul(ner_tags, + encoding=self._original_coding_scheme) if ner_tags is not None else None + else: + # the default IOB1 + coded_chunks = chunk_tags + coded_ner = ner_tags + + # Add "feature labels" to instance + if 'pos' in self.feature_labels: + if pos_tags is None: + raise ConfigurationError("Dataset reader was specified to use pos_tags as " + "features. Pass them to text_to_instance.") + instance_fields['pos_tags'] = SequenceLabelField(pos_tags, sequence, "pos_tags") + if 'chunk' in self.feature_labels: + if coded_chunks is None: + raise ConfigurationError("Dataset reader was specified to use chunk tags as " + "features. Pass them to text_to_instance.") + instance_fields['chunk_tags'] = SequenceLabelField(coded_chunks, sequence, "chunk_tags") + if 'ner' in self.feature_labels: + if coded_ner is None: + raise ConfigurationError("Dataset reader was specified to use NER tags as " + " features. 
Pass them to text_to_instance.") + instance_fields['ner_tags'] = SequenceLabelField(coded_ner, sequence, "ner_tags") + + # Add "tag label" to instance + if self.tag_label == 'ner' and coded_ner is not None: + instance_fields['tags'] = SequenceLabelField(coded_ner, sequence, + self.label_namespace) + elif self.tag_label == 'pos' and pos_tags is not None: + instance_fields['tags'] = SequenceLabelField(pos_tags, sequence, + self.label_namespace) + elif self.tag_label == 'chunk' and coded_chunks is not None: + instance_fields['tags'] = SequenceLabelField(coded_chunks, sequence, + self.label_namespace) + + return Instance(instance_fields) diff --git a/code/allen_custom/custom_simple_tagger.py b/code/allen_custom/custom_simple_tagger.py new file mode 100644 index 0000000000000000000000000000000000000000..f4cc3da10d070f02ac42b480fe1c53e98ae917bc --- /dev/null +++ b/code/allen_custom/custom_simple_tagger.py @@ -0,0 +1,196 @@ +from typing import Dict, Optional, List, Any + +import random + +import numpy +from overrides import overrides +import torch +from torch.nn.modules.linear import Linear +import torch.nn.functional as F + +from allennlp.common.checks import check_dimensions_match, ConfigurationError +from allennlp.data import Vocabulary +from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder +from allennlp.models.model import Model +from allennlp.nn import InitializerApplicator, RegularizerApplicator +from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits +from allennlp.training.metrics import CategoricalAccuracy, SpanBasedF1Measure + + +@Model.register("custom_simple_tagger") +class CustomSimpleTagger(Model): + """ + This ``SimpleTagger`` simply encodes a sequence of text with a stacked ``Seq2SeqEncoder``, then + predicts a tag for each token in the sequence. + + Parameters + ---------- + vocab : ``Vocabulary``, required + A Vocabulary, required in order to compute sizes for input/output projections. + text_field_embedder : ``TextFieldEmbedder``, required + Used to embed the ``tokens`` ``TextField`` we get as input to the model. + encoder : ``Seq2SeqEncoder`` + The encoder (with its own internal stacking) that we will use in between embedding tokens + and predicting output tags. + calculate_span_f1 : ``bool``, optional (default=``None``) + Calculate span-level F1 metrics during training. If this is ``True``, then + ``label_encoding`` is required. If ``None`` and + label_encoding is specified, this is set to ``True``. + If ``None`` and label_encoding is not specified, it defaults + to ``False``. + label_encoding : ``str``, optional (default=``None``) + Label encoding to use when calculating span f1. + Valid options are "BIO", "BIOUL", "IOB1", "BMES". + Required if ``calculate_span_f1`` is true. + label_namespace : ``str``, optional (default=``labels``) + This is needed to compute the SpanBasedF1Measure metric, if desired. + Unless you did something unusual, the default value should be what you want. + verbose_metrics : ``bool``, optional (default = False) + If true, metrics will be returned per label class in addition + to the overall statistics. + initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``) + Used to initialize the model parameters. + regularizer : ``RegularizerApplicator``, optional (default=``None``) + If provided, will be used to calculate the regularization penalty during training. 
+ """ + + def __init__(self, vocab: Vocabulary, + text_field_embedder: TextFieldEmbedder, + encoder: Seq2SeqEncoder, + calculate_span_f1: bool = None, + label_encoding: Optional[str] = None, + label_namespace: str = "labels", + verbose_metrics: bool = False, + initializer: InitializerApplicator = InitializerApplicator(), + regularizer: Optional[RegularizerApplicator] = None) -> None: + super(CustomSimpleTagger, self).__init__(vocab, regularizer) + + self.label_namespace = label_namespace + self.text_field_embedder = text_field_embedder + self.num_classes = self.vocab.get_vocab_size(label_namespace) + self.encoder = encoder + self._verbose_metrics = verbose_metrics + self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(), + self.num_classes)) + + check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(), + "text field embedding dim", "encoder input dim") + + # We keep calculate_span_f1 as a constructor argument for API consistency with + # the CrfTagger, even it is redundant in this class + # (label_encoding serves the same purpose). + if calculate_span_f1 and not label_encoding: + raise ConfigurationError("calculate_span_f1 is True, but " + "no label_encoding was specified.") + self.metrics = { + "accuracy": CategoricalAccuracy(), + "accuracy3": CategoricalAccuracy(top_k=3) + } + + if calculate_span_f1 or label_encoding: + self._f1_metric = SpanBasedF1Measure(vocab, + tag_namespace=label_namespace, + label_encoding=label_encoding) + else: + self._f1_metric = None + + initializer(self) + + @overrides + def forward(self, # type: ignore + tokens: Dict[str, torch.LongTensor], + tags: torch.LongTensor = None, + metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]: + # pylint: disable=arguments-differ + """ + Parameters + ---------- + tokens : Dict[str, torch.LongTensor], required + The output of ``TextField.as_array()``, which should typically be passed directly to a + ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer`` + tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens": + Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used + for the ``TokenIndexers`` when you created the ``TextField`` representing your + sequence. The dictionary is designed to be passed directly to a ``TextFieldEmbedder``, + which knows how to combine different word representations into a single vector per + token in your input. + tags : torch.LongTensor, optional (default = None) + A torch tensor representing the sequence of integer gold class labels of shape + ``(batch_size, num_tokens)``. + metadata : ``List[Dict[str, Any]]``, optional, (default = None) + metadata containing the original words in the sentence to be tagged under a 'words' key. + + Returns + ------- + An output dictionary consisting of: + logits : torch.FloatTensor + A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing + unnormalised log probabilities of the tag classes. + class_probabilities : torch.FloatTensor + A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing + a distribution of the tag classes per word. + loss : torch.FloatTensor, optional + A scalar loss to be optimised. 
+
+        """
+        embedded_text_input = self.text_field_embedder(tokens, lang=metadata[0]['lang']) #tokens)
+        batch_size, sequence_length, _ = embedded_text_input.size()
+        mask = get_text_field_mask(tokens)
+        encoded_text = self.encoder(embedded_text_input, mask)
+
+        logits = self.tag_projection_layer(encoded_text)
+        reshaped_log_probs = logits.view(-1, self.num_classes)
+        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
+                                                                          sequence_length,
+                                                                          self.num_classes])
+
+
+        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
+
+        if tags is not None:
+            loss = sequence_cross_entropy_with_logits(logits, tags, mask)
+            for metric in self.metrics.values():
+                metric(logits, tags, mask.float())
+            if self._f1_metric is not None:
+                self._f1_metric(logits, tags, mask.float())
+            output_dict["loss"] = loss
+
+        if metadata is not None:
+            output_dict["words"] = [x["words"] for x in metadata]
+        return output_dict
+
+    @overrides
+    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
+        """
+        Does a simple position-wise argmax over each token, converts indices to string labels, and
+        adds a ``"tags"`` key to the dictionary with the result.
+        """
+        all_predictions = output_dict['class_probabilities']
+        all_predictions = all_predictions.cpu().data.numpy()
+        if all_predictions.ndim == 3:
+            predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
+        else:
+            predictions_list = [all_predictions]
+        all_tags = []
+        for predictions in predictions_list:
+            argmax_indices = numpy.argmax(predictions, axis=-1)
+            tags = [self.vocab.get_token_from_index(x, namespace="labels")
+                    for x in argmax_indices]
+            all_tags.append(tags)
+        output_dict['tags'] = all_tags
+        return output_dict
+
+    @overrides
+    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
+        metrics_to_return = {metric_name: metric.get_metric(reset) for
+                             metric_name, metric in self.metrics.items()}
+
+        if self._f1_metric is not None:
+            f1_dict = self._f1_metric.get_metric(reset=reset)
+            if self._verbose_metrics:
+                metrics_to_return.update(f1_dict)
+            else:
+                metrics_to_return.update({
+                        x: y for x, y in f1_dict.items() if
+                        "overall" in x})
+        return metrics_to_return
diff --git a/code/classes_def.py b/code/classes_def.py
index 7e9857c207ee0467f3d48b790ed14100e10e49c7..baf173327ca66311592349889ab1658895466d0c 100644
--- a/code/classes_def.py
+++ b/code/classes_def.py
@@ -3,23 +3,45 @@
 class Input:
-    def __init__(self, infos):
+    def __init__(self, infos, stamp):
         self.name = infos['name']
         self.lang = infos['language']
-        self.path = infos['folder_path'] # misused
-        self.file = infos['file']
-        self.form = infos['format'] # not used
-        self.gold = infos['gold'] # not used
-        self.resu = infos['results_path'] # misused : le créer automatiquement
+#        self.path = infos['folder_path'] # misused
+        self.path = f"../data/{self.name}"
+        self.file = infos['file']
+        self.stamp = stamp
+        self.conv = f"{self.path}/data_converted_{stamp}" # to be integrated
+        self.resu = f"{self.path}/results_{stamp}"

 class Process:
     def __init__(self, infos, data):
-        self.main = infos["main"]
-        self.toke = infos['pre-processing']['tokenization'] # not used
         self.data = data
-        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
+        self.main = infos["main"] # train test annotation
+
+        self.toke = infos['pre-processing']['tokenization']
+        self.toke_tool = infos['pre-processing']['tokenization_tool']
         self.ssplit = infos['pre-processing']['sentence_split']
         self.ssplitor = infos['pre-processing']['sentence_split_splitor']
+        self.ner_init = infos['pre-processing']['NER_format_initialisation'] # useless because done anyway
+
+        if self.main == "train":
+            if self.ner_init == True : # TODO: build these paths relative !! split stuff
+                self.train_data = f"{self.data.path}/{self.data.name}_train.ner{self.data.file}"
+                self.dev_data = f"{self.data.path}/{self.data.name}_dev.ner{self.data.file}"
+            else :
+                self.train_data = infos['discourse_segmenter']['training']['train_data_path']
+                self.dev_data = infos['discourse_segmenter']['training']['validation_data_path']
+            self.toolkit = infos['discourse_segmenter']['training']['toolkit']
+            self.tr_config = infos['discourse_segmenter']['training']['config_file']
+            self.pretr_lm = infos['discourse_segmenter']['training']['pre_trained_lm']
+
+        self.model = infos['discourse_segmenter']['model'] # ezpz for Tony
+
         self.post_tab = infos['post-processing']['json_to_tab']
-        self.post_bracket = infos['post-processing']['tab_to_bracket']
\ No newline at end of file
+
+        self.eval = infos['evaluation']
+        self.test_data = infos['gold_test_data_path']
+
+        self.post_bracket = infos['post-processing']['tab_to_bracket']
+        
\ No newline at end of file
diff --git a/code/config_2.json b/code/config_2.json
deleted file mode 100644
index 5876381e9f607355baf1c99350ffbdc27bcc8bed..0000000000000000000000000000000000000000
--- a/code/config_2.json
+++ /dev/null
@@ -1,36 +0,0 @@
-{
-    "usecase_description": "Config file for usecase_2 : Take a EDU gold segmented text au format tok as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies. To start, we evaluate tony on annodis dev set.",
-    "input": {
-        "name": "fra.sdrt.annodis_dev",
-        "file": ".ttok",
-        "folder_path": "../data/fra.sdrt.annodis_dev",
-        "format": "truc",
-        "language": "fr",
-        "gold": true,
-        "results_path": "../data/fra.sdrt.annodis_dev/results"
-    },
-    "output": {
-        "format": "ner_tok",
-        "framework": "sdrt"
-    },
-    "steps":{
-        "main": "test",
-        "pre-processing": {
-            "tokenization": false,
-            "sentence_split": true,
-            "sentence_split_splitor": "stanza",
-            "syntactic_parsing": false,
-            "NER_format_initialisation": true
-        },
-        "discourse_segmenter": {
-            "model": "tony"
-        },
-        "post-processing": {
-            "json_to_tab": true,
-            "tab_to_bracket":false
-        },
-        "evaluation": true
-    }
-}
-
-
diff --git a/code/config_3.json b/code/config_3.json
deleted file mode 100644
index f28b55a0b3f79ad27888ebaa74b8ca8b70d88e95..0000000000000000000000000000000000000000
--- a/code/config_3.json
+++ /dev/null
@@ -1,37 +0,0 @@
-{
-    "usecase_description": "Config file for usecase_2.2 : Take a EDU gold segmented text au format conll as input, use a loaded model to make predictions. Output scores of model predictions against gold, and output discrepancies.
To start, we evaluate tony on annodis dev set.", - "input": { - "name": "fra.sdrt.annodis_dev", - "file": ".conllu", - "file_options": [".conllu", ".tok"], - "folder_path": "../data/fra.sdrt.annodis_dev", - "format": "truc", - "language": "fr", - "gold": true, - "results_path": "../data/fra.sdrt.annodis_dev/results" - }, - "output": { - "format": "ner_tok", - "framework": "sdrt" - }, - "steps":{ - "main": "test", - "pre-processing": { - "tokenization": false, - "sentence_split": false, - "sentence_split_splitor": "stanza", - "syntactic_parsing": false, - "NER_format_initialisation": true - }, - "discourse_segmenter": { - "model": "tony" - }, - "post-processing": { - "json_to_tab": true, - "tab_to_bracket":false - }, - "evaluation": true - } -} - - diff --git a/code/config_1.json b/code/config_global_1.json similarity index 53% rename from code/config_1.json rename to code/config_global_1.json index 712bdd1903198bc16ecfa13fa5e20b4027a60af0..cd6c01ca6812e45cf87d284f4ee323677800a07b 100644 --- a/code/config_1.json +++ b/code/config_global_1.json @@ -3,31 +3,35 @@ "input": { "name": "chaperontest", "file": ".ss", - "folder_path": "../data/chaperontest", - "format": "raw_sentences", - "language": "fr", - "gold": false, - "results_path": "../data/chaperontest/results" - }, - "output": { - "format": "bracket", - "framework": "sdrt" + "language": "fr" }, "steps":{ "main": "annotation", "pre-processing": { "tokenization": true, + "tokenization_tool" : "spacy", "sentence_split": false, - "syntactic_parsing": false, + "sentence_split_splitor": null, + "syntactic_parsing": false, "NER_format_initialisation": true }, "discourse_segmenter": { - "model": "tony" + "model": "tony", + "training": { + "toolkit": null, + "pre_trained_lm": null, + "config_file": null, + "train_data_path": null, + "validation_data_path": null + } }, "post-processing": { "json_to_tab": true, "tab_to_bracket":true - } + }, + "evaluation": false, + "gold_test_data_path": null } } + diff --git a/code/config_global_2.json b/code/config_global_2.json new file mode 100644 index 0000000000000000000000000000000000000000..e46805bab828ba094025a51396c0448c1f6c3064 --- /dev/null +++ b/code/config_global_2.json @@ -0,0 +1,37 @@ +{ + "usecase_description": "Config file for usecase_2", + "input": { + "name": "fra.sdrt.annodis_dev", + "file": ".ttok", + "language": "fr" + }, + "steps":{ + "main": "annotation", + "pre-processing": { + "tokenization": false, + "tokenization_tool" : "spacy", + "sentence_split": true, + "sentence_split_splitor": "stanza", + "syntactic_parsing": false, + "NER_format_initialisation": true + }, + "discourse_segmenter": { + "model": "tony", + "training": { + "toolkit": null, + "pre_trained_lm": null, + "config_file": null, + "train_data_path": null, + "validation_data_path": null + } + }, + "post-processing": { + "json_to_tab": true, + "tab_to_bracket":true + }, + "evaluation": false, + "gold_test_data_path": null + } +} + + diff --git a/code/config_global_3.json b/code/config_global_3.json new file mode 100644 index 0000000000000000000000000000000000000000..9703e8d798bbb98f1cf02f1dd40f7c2c061e3473 --- /dev/null +++ b/code/config_global_3.json @@ -0,0 +1,37 @@ +{ + "usecase_description": "Config file for usecase_3 : Take a EDU gold segmented set of train/dev/test of texts au format conll as input, train a model, output scores.", + "input": { + "name": "eng.rst.rstdt", + "file": ".conllu", + "language": "en" + }, + "steps":{ + "main": "train", + "pre-processing": { + "tokenization": false, + "tokenization_tool" : 
"spacy", + "sentence_split": false, + "sentence_split_splitor": "stanza", + "syntactic_parsing": false, + "NER_format_initialisation": true + }, + "discourse_segmenter": { + "model": null, + "training": { + "toolkit": "allennlp", + "pre_trained_lm": "bert", + "config_file": "../model/config_training_bert.jsonnet", + "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu", + "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu" + } + }, + "post-processing": { + "json_to_tab": false, + "tab_to_bracket":false + }, + "evaluation": true, + "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu" + } +} + + diff --git a/code/discut22_1.py b/code/discut22_1.py index 5a24a5cef8eaa5d2d86340a8896ca491453837b3..1a40cfe5c0206ff3826bf173ef805dcb0345c92b 100644 --- a/code/discut22_1.py +++ b/code/discut22_1.py @@ -8,6 +8,8 @@ import os import sys import argparse +import re +from datetime import datetime import pandas as pd # for futur clean output in df import json @@ -18,19 +20,17 @@ import utils.conv2ner as c2n import utils.json2conll as j2c import utils.conll2bracket as c2bracket import utils.sent_split as ssent -#import utils.ssplit.parse_corpus as ssent -#import utils.ssplit.parse_corpus as ssent -#import utils.ssplit.parse_stanza as ssent +import utils.training_allennlp as tr_allen # fonction to get config stuffs -def get_config_infos(config_file): +def get_config_infos(stamp, config_file): with open(config_file) as f: infos = json.load(f) - data_in = Input(infos['input']) + data_in = Input(infos['input'], stamp) actions = Process(infos['steps'], data_in) - print("data to be process : {}".format(data_in.name)) + print(f"data to be process : {data_in.name}") return actions @@ -40,20 +40,26 @@ def get_model(model_name): if name == "tony": arch = "french_tokens.tar.gz" - if not os.path.isfile("../model/{}".format(arch)): + if not os.path.isfile(f"../model/{name}/{arch}"): dl = "wget https://zenodo.org/record/4235850/files/french_tokens.tar.gz -P ../model --progress=bar" os.system(dl) else: print("Tony already in place !") - return "../model/{}".format(arch) + return f"../model/{name}/{arch}" +def text_tokenization(f_in, f_out, lang, tool): + if lang == "fr" : + if tool == "spacy" : + tk.main(f_in, f_out) # .ss -> .tok -# main call -def main(config): + + + +def main(steps): - steps = get_config_infos(config) # on obtient la liste des trucs + #steps = get_config_infos(config) # on obtient la liste des trucs # à faire, donnée par la classe Process #print([x for x in enumerate(steps)]) #suivant la liste ordonnée, faire les trucs (for now simple usecase1): @@ -62,69 +68,123 @@ def main(config): # FN: soit besoin sent split, soit besoin tokenizer, soit aucun des deux if steps.ssplit == True : # python code/ssplit/parse_corpus.py ${dataset} --parser stanza --out_dir data #### Split text into sentence : not in usecase1 - data_in = "{}/{}{}".format(steps.data.path, steps.data.name, steps.data.file) - data_tok = "{}/{}.tok".format(steps.data.path, steps.data.name) - print("Starting sentence spliting...to {}".format(steps.data.path, steps.data.name)) - # ssent.main(data_in, data_tok, "stanza", steps.data.lang) - - ssent.main(data_in, data_tok, "stanza", steps.data.lang) - + if not steps.ssplitor == "stanza" : + print("pls define sentence splitor") # raise error n kill process + data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}" + data_tok = f"{steps.data.path}/{steps.data.name}.tok" + print(f"Starting sentence spliting...to 
{steps.data.path}/steps.data.name") + ssent.main(data_in, data_tok, steps.ssplitor, steps.data.lang) elif steps.toke == True : #### Tokenization du text # #python ${SEG_DIR}/code/utils/fr_tokenize.py $RAW > ${RAW}.tok - data_in = "{}/{}{}".format(steps.data.path, steps.data.name, steps.data.file) - data_tok = "{}/{}.tok".format(steps.data.path, steps.data.name) - # sys.exit("check path") - print("Starting Tokenization...to {}".format(data_tok)) - tk.main(data_in, data_tok) # .ss -> .tok - + data_in = f"{steps.data.path}/{steps.data.name}{steps.data.file}" + data_tok = f"{steps.data.path}/{steps.data.name}.tok" + print(f"Starting Tokenization...to {data_tok}") + #tk.main(f_in, f_out) # .ss -> .tok + text_tokenization(data_in, data_tok, steps.data.lang, steps.toke_tool) # .ss -> .tok else: data_tok = f"{steps.data.path}/{steps.data.name}{steps.data.file}" + if steps.ner_init == True: + if steps.main == "test" or steps.main =="annotation": #### Conversion en NER pb # #python $RUNTIME/conv2ner.py ${RAW}.tok > ${RAW}.ner.tok - data_ner = "{}/{}.ner.tok".format(steps.data.path, steps.data.name) - print("Starting conversion to NER format...to {}".format(data_ner)) - c2n.main(data_tok, data_ner, steps.data.file) + data_ner = f"{steps.data.path}/{steps.data.name}.ner.tok" + print(f"Starting conversion to NER format...to {data_ner}") + c2n.main(data_tok, data_ner, steps.data.file) + elif steps.main == "train": + for part in ["train", "dev", "test"]: + data_tok = f"{steps.data.path}/{steps.data.name}_{part}{steps.data.file}" + data_ner = f"{steps.data.path}/{steps.data.name}_{part}.ner{steps.data.file}" + print("Starting conversion to NER format...to {}".format(data_ner)) + c2n.main(data_tok, data_ner, steps.data.file) + + + # Create the results directory + if not os.path.isdir(steps.data.resu): + print(" result directory does not exist yet") + os.mkdir(steps.data.resu) + if steps.main == "train": + #model_config = steps.model_config + #cmd = "bash utils/expes.sh eng.rst.rstdt model/config_training.jsonnet bert train" + #os.system(cmd) + if steps.toolkit == "allennlp": + print("toolkit allennlp for training") + # tr_allen.main(steps) + # set the value of model from null to what was just created by training + steps.model = f"{steps.data.resu}/model.tar.gz" + elif steps.toolkit == "jiant": + print("Jiant toolkit not ready") + else : + print("toolkit unknown") + + #check config train file + elif steps.main == "test" or steps.main =="annotation": #### Appliquer le model choisi, sortir le JSON avec les predictions :score, proba, tags # #allennlp predict --use-dataset-reader --output-file ${RESULT_DIR}/${FILE}.json ${MODEL} ${RAW}.ner.tok - print("Checking for model...{}".format(steps.model)) - model_path = get_model(steps.model) - data_json = "{}/{}.json".format(steps.data.resu, steps.data.name) - cmd = "allennlp predict --use-dataset-reader --output-file {} {} {} &> {}/logs.txt".format(data_json, model_path, data_ner, steps.data.resu) - if not os.path.isdir(steps.data.resu): - print(" result directory does not exist") - os.mkdir(steps.data.resu) - print("Starting Prediction...") - os.system(cmd) + print(f"Checking for model...{steps.model}") + model_path = get_model(steps.model) + data_json = f"{steps.data.resu}/{steps.data.name}.json" + cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_ner} &> {steps.data.resu}/logs.txt" + print("Starting Prediction...") + os.system(cmd) #### ------------------------------- TBD do the same but with python script (or JIANT ??) 
- - - - + else: + print(" pb define model") if steps.post_tab == True : - #### Appliquer les predictions au texte et sortir le texte tokenisé avec la colone des tags-prédis # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok - data_conll = "{}/{}.split.tok".format(steps.data.resu, steps.data.name) + #### Appliquer les predictions au texte et sortir le texte tokenisé avec la colone des tags-prédis + # # #python $RUNTIME/json2conll.py ${RESULT_DIR}/${FILE}.json split.tok > ${RESULT_DIR}/${FILE}.split.tok + data_conll = f"{steps.data.resu}/{steps.data.name}.split.tok" format = "split.tok" # to retrive from config file !!! - print("Starting Formating from json to tok format...to {}".format(data_conll)) + print(f"Starting Formating from json to tok format...to {data_conll}") j2c.main(data_json, format, data_conll) ####### EVALUATION AGAINST GOLD # python discut/code/utils/seg_eval.py data_gold data_pred (-s) - data_gold = data_tok - data_pred = data_conll - cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt" - os.system(cmd) + if steps.eval == True : + if steps.main == "train": + data_gold = steps.test_data # (())== data NER because of ner_init == true((deleted)) + if steps.ner_init == True : + data_gold_ner = f"{steps.data.path}/{steps.data.name}_test.ner.conllu" + + # make predictions on test_data + model_path = steps.model # model just been created + # data_json about to be created by predict cmd + data_json = f"{steps.data.resu}/{steps.data.name}_test.predictions.json" ## à faire en relatif !! [opt : --silent ??] + cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold_ner} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder &> {steps.data.resu}/logs.txt" + #cmd = f"allennlp predict --use-dataset-reader --output-file {data_json} {model_path} {data_gold} &> {steps.data.resu} /logs.txt" + print("Starting Prediction...") + print(f"cmd prediction: {cmd}") + os.system(cmd) + + data_conll = f"{steps.data.resu}/{steps.data.name}_test.predictions.conll" ## à faire en relatif + print(f"Starting Formating from json to tok format...to {data_conll}") + j2c.main(data_json, "split.tok", data_conll) + #data_pred_ner = f"{steps.data.resu}/eng.rst.rstdt_test.predictions.conll.ner" + #c2n.main(data_conll, data_pred_ner, steps.data.file) + print(f"starting eval, gold={data_gold}, predictions={data_conll}, model={model_path}") + data_g = "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu" + data_p = "../data/eng.rst.rstdt/results/eng.rst.rstdt_test.predictions.conll" # == data_conll + cmd = f"python utils/seg_eval.py {data_gold} {data_conll} &> {steps.data.resu}/Evaluation.txt" + os.system(cmd) + + + else : + data_gold = data_tok # changer les noms des var, c'est pas clair ! 
+            data_pred = data_conll
+            cmd = f"python utils/seg_eval.py {data_gold} {data_pred} &> {steps.data.resu}/Evaluation.txt"
+            os.system(cmd)
 
     if steps.post_bracket == True :
-        #### Take the tokenized text with its predicted tags and output plain text (the initial format, for now a sequence of sentences) with EDU brackets
         # #python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket
-        data_bracket = "{}/{}.split.tok.bracket".format(steps.data.resu, steps.data.name)
-        print("Starting formatting into bracket text...to {}".format(data_bracket))
+        #### Take the tokenized text with its predicted tags and output plain text (the initial format, for now a sequence of sentences) with EDU brackets
+        # # #python $RUNTIME/conll2bracket.py ${RESULT_DIR}/${FILE}.split.tok > ${RESULT_DIR}/${FILE}.split.tok.bracket
+        data_bracket = f"{steps.data.resu}/{steps.data.name}.split.tok.bracket"
+        print(f"Starting formatting into bracket text...to {data_bracket}")
         c2bracket.main(data_conll, data_bracket)
@@ -137,5 +197,13 @@ if __name__ == '__main__':
     args = parser.parse_args()
     config = args.config
 
-    main(config)
+    now = datetime.now()
+    stamp = re.sub(r'[\s:]', '_', str(now))
+    my_logs = {}
+    my_logs['stamp'] = stamp
+
+    steps = get_config_infos(stamp, config)
+    print(stamp)
+    main(steps)
+    print("Done.")
\ No newline at end of file
diff --git a/code/utils/seg_eval.py b/code/utils/seg_eval.py
index 3083f4b50a73366144def531cd2bba22c9a30c49..1808782d74c9c5c9e8961409fa558af9a07bce1f 100644
--- a/code/utils/seg_eval.py
+++ b/code/utils/seg_eval.py
@@ -158,6 +158,9 @@ def get_scores(gold_file, pred_file, string_input=False):
     if "BeginSeg=Yes" in gold_labels:
         mode = "edu"
         seg_type = "EDUs"
+    #elif "B-S" in gold_labels:
+    #    mode = "edu"
+    #    seg_type = "EDUs"
     else:
         mode = "conn"
         seg_type = "conn spans"
diff --git a/code/utils/training_allennlp.py b/code/utils/training_allennlp.py
new file mode 100644
index 0000000000000000000000000000000000000000..65d4dfdb42e4a371c22858534ebefc6ce94038f0
--- /dev/null
+++ b/code/utils/training_allennlp.py
@@ -0,0 +1,47 @@
+####### Python version of expes.sh
+
+import os
+
+
+def main(steps):
+    dataset = steps.data.name
+    config = steps.data.file # .tok .conllu
+    lmodel = steps.pretr_lm # options: bert xlm elmo elmo_aligned
+    action = "train" # unused!
+    evalset = steps.dev_data
+    print(f"dev set : {evalset} \t trainset : {dataset}")
+    has_parent = False # ?? TODO: get this variable some other way.
+
+    tr_config = steps.tr_config
+
+    # case 1: no "parent", no "toolong"
+    # case 2: toolong == true, so the data must be split
+    # case 3: parent == true, no toolong
+
+
+    if lmodel == "xlm":
+        bert_vocab = "xlm-roberta-base"
+        bert_weights = "xlm-roberta-base"
+    else :
+        bert_vocab = "bert-base-multilingual-cased"
+        bert_weights = "bert-base-multilingual-cased"
+
+    if lmodel == "bert_custom" and steps.ner_init == True :
+        # TODO raise an error instead of just printing
+        print("You chose bert_custom, so 'NER_format_initialisation' must be set to false.")
+
+    #### train, has_parent == False
+    # allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
+    # allennlp train -s Results_conllu/results_eng.rst.rstdt_bert ../code/utils/configs/bert.jsonnet ....
+    cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder"
+    print(cmd)
+    os.system(cmd)
+    # then...
+
+    # TODO:
+    #### train, has_parent == true: in fact we fine-tune...
+    #allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder
+
+    # TODO
+    ### then prediction on the dev set, or on the "parent test" / "finetune test" set... ??
+    #allennlp predict --use-dataset-reader --output-file Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json Results_${CONFIG}/results_${OUTPUT}/model.tar.gz ${TEST_A_PATH} --silent --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder
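+
+
+# A minimal sketch of the "has_parent == true" branch from the TODO above,
+# assembling the fine-tune command shown in the comment. `parent_model` is a
+# hypothetical parameter (the path to the parent model.tar.gz); nothing calls
+# this function yet.
+def finetune_cmd(steps, parent_model):
+    packages = " ".join(
+        f"--include-package allen_custom.{m}"
+        for m in ["custom_conll_reader", "custom_simple_tagger",
+                  "custom_disrpt_reader", "custom_bert_token_embedder"])
+    return (f"allennlp fine-tune -m {parent_model} -c {steps.tr_config} "
+            f"-s {steps.data.resu} {packages}")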
+ cmd = f"allennlp train -s {steps.data.resu} {tr_config} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder" + print(cmd) + os.system(cmd) + # then... + + # TODO: + #### train, has_par == true, en fait on fine_tune... + #allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL} --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --include-package allen_custom.custom_bert_token_embedder + + # TODO + ### ensuite prediction sur valset ou "parent test" ou "finetune test"... ?? + #allennlp predict --use-dataset-reader --output-file Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json Results_${CONFIG}/results_${OUTPUT}/model.tar.gz ${TEST_A_PATH} --silent --include-package allen_custom.custom_conll_reader --include-package allen_custom.custom_simple_tagger --include-package allen_custom.custom_disrpt_reader --predictor sentence-tagger --include-package allen_custom.custom_bert_token_embedder diff --git a/global_config_file_guideline.md b/global_config_file_guideline.md new file mode 100644 index 0000000000000000000000000000000000000000..580bd7d576dfa97043b48369e38324a56455c724 --- /dev/null +++ b/global_config_file_guideline.md @@ -0,0 +1,129 @@ +# DisCut22 - Global Config File Guideline + +## Good practice tips + +- You can rename the `config_global.json` file as convenient : a good practice is to make one experiment = one global config file. If so, do not forget to type your file name when you will run the main command `python discut22.py --config **config_XX.json**``` +- Data can only be : + - [boolean] `true`, `false`, + - [string] `"my_string_in_between_quote_marks"`, + - or `null`. +- In this documentation, values of fields in **bold** can not be changed and are specific to the usecase. +- Keep comas as in the templates to avoid errors on JSON format. + + +## For Usecase 1 : **Discourse Segmentation** + +- `"usecase_description":` [string] This field is not a fonctional one. You can describe your project or keep the default text. e.g. ```"Config file for usecase_1 : from a text, get the same text but with EDU bracket using ToNy segmenter."``` + +- `input:`{ These fields are mandatory for every Usecases. + + - `"name":` [string] The name of your input dataset, without the extension. This is also the same name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"``` + - `"file":` [string] The extension of your input dataset that reflects its format. + - OPTIONS :[".conllu", ".tok", ".ttok", ".ss"] + - `"language":` [string] Language ID of your dataset following the ISO 639-1 Code. e.g. ```"en"``` + +- `"steps":` { + - `"main":` [string] : **"annotation"** + + - `"pre-processing":` { + - `"tokenization":` [false, true] *available for FR* + - `"sentence_split":` [false, true] *available for FR* + - `"sentence_split_splitor":` [string] This is the toolkit you want for sentence spliting. + - OPTIONS : ["stanza"] + - `"syntactic_parsing":` [boolean] : **false** *Not yet available* + - `"NER_format_initialisation":` [boolean] Set to true if your are working with ToNy. 
+
+
+## For Usecase 2 : **Segmentation Evaluation**
+
+- `"usecase_description":` [string] This field is not a functional one. You can describe your project or keep the default text. e.g. ```"Config file for usecase_2 : Take an EDU gold segmented text in .tok format as input, use a loaded model to make predictions. Output scores of model predictions against gold."```
+
+- `"input":` { These fields are mandatory for every usecase.
+
+  - `"name":` [string] The name of your input dataset, without the extension. This is also the name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
+  - `"file":` [string] The extension of your input dataset, which reflects its format.
+    - OPTIONS: [".conllu", ".tok", ".ttok", ".ss"]
+  - `"language":` [string] Language ID of your dataset, following the ISO 639-1 code. e.g. ```"en"```
+
+- `"steps":` {
+  - `"main":` [string] : **"test"**
+
+  - `"pre-processing":` {
+    - `"tokenization":` [boolean] *available for FR*
+    - `"sentence_split":` [boolean] *available for FR*
+    - `"sentence_split_splitor":` [string] The toolkit you want to use for sentence splitting.
+      - OPTIONS: ["stanza"]
+    - `"syntactic_parsing":` [boolean] : **false** *Not yet available*
+    - `"NER_format_initialisation":` [boolean] Set to true if you are working with ToNy. *Set to true anyway ??*
+
+  - `"discourse_segmenter":` {
+    - `"model":` [string] The name of, or the path to, the existing model you want to use. e.g. `"tony"`, `"/moredata/andiamo/discut/Results_split.tok/results_eng.rst.gum_bert/model.tar.gz"`
+    - `"training":` {
+      - `"toolkit":` **null**
+      - `"pre_trained_lm":` **null**
+      - `"config_file":` **null**
+      - `"train_data_path":` **null**
+      - `"validation_data_path":` **null**
+
+  - `"post-processing":` { The AllenNLP toolkit outputs a JSON file.
+    - `"json_to_tab":` [boolean] : **true**
+    - `"tab_to_bracket":` [boolean] Set to true if you also want an output of the raw text with brackets as EDU delimiters. If so, `"json_to_tab"` has to be set to true too.
+
+  - `"evaluation":` [boolean] : **true**
+  - `"gold_test_data_path":` [string] The path to your gold dataset, on which predictions are made and against which they are evaluated.
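+
+As above, a sketch of a complete Usecase 2 config assembled from these fields (nesting inferred, so check it against the template; the dataset name, model and gold path are placeholder values):
+
+```json
+{
+    "usecase_description": "Config file for usecase_2 : Take an EDU gold segmented text in .tok format as input, use a loaded model to make predictions. Output scores of model predictions against gold.",
+    "input": {
+        "name": "my.cool.dataset",
+        "file": ".tok",
+        "language": "en"
+    },
+    "steps": {
+        "main": "test",
+        "pre-processing": {
+            "tokenization": false,
+            "sentence_split": false,
+            "sentence_split_splitor": null,
+            "syntactic_parsing": false,
+            "NER_format_initialisation": true
+        },
+        "discourse_segmenter": {
+            "model": "tony",
+            "training": {
+                "toolkit": null,
+                "pre_trained_lm": null,
+                "config_file": null,
+                "train_data_path": null,
+                "validation_data_path": null
+            }
+        },
+        "post-processing": {
+            "json_to_tab": true,
+            "tab_to_bracket": false
+        },
+        "evaluation": true,
+        "gold_test_data_path": "../data/my.cool.dataset/my.cool.dataset_test.conllu"
+    }
+}
+```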
+
+
+## For Usecase 3 : **Custom Model Creation**
+
+- `"usecase_description":` [string] This field is not a functional one. You can describe your project or keep the default text. e.g. ```"Config file for usecase_3 : Take an EDU gold segmented set of train/dev/test texts in .conll format as input, train a model, output scores."```
+
+- `"input":` { These fields are mandatory for every usecase.
+
+  - `"name":` [string] The name of your input dataset, without the extension. This is also the name of the directory where you put your input dataset. e.g. ```"my.cool.dataset"```
+  - `"file":` [string] The extension of your input dataset, which reflects its format.
+    - OPTIONS: [".conllu", ".tok", ".ttok", ".ss"]
+  - `"language":` [string] Language ID of your dataset, following the ISO 639-1 code. e.g. ```"en"```
+
+- `"steps":` {
+  - `"main":` [string] : **"train"**
+
+  - `"pre-processing":` {
+    - `"tokenization":` [boolean] *available for FR*
+    - `"sentence_split":` [boolean] *available for FR*
+    - `"sentence_split_splitor":` [string] The toolkit you want to use for sentence splitting.
+      - OPTIONS: ["stanza"]
+    - `"syntactic_parsing":` [boolean] : **false** *Not yet available*
+    - `"NER_format_initialisation":` [boolean] Set to true if you are working with ToNy. *Set to true anyway ??*
+
+  - `"discourse_segmenter":` {
+    - `"model":` **null**
+    - `"training":` {
+      - `"toolkit":` [string] The toolkit used to build your model (to be added: "jiant").
+        - OPTIONS: ["allennlp"]
+      - `"pre_trained_lm":` **bert** (to be added: roberta...)
+      - `"config_file":` [string] The path to the config file for training. e.g. `"../model/config_training.jsonnet"`
+      - `"train_data_path":` [string] The path to your training dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu"` *conflict with training_config ??*
+      - `"validation_data_path":` [string] The path to your development dataset. e.g. `"../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"` *same remark*
+
+  - `"post-processing":` { The AllenNLP toolkit outputs a JSON file.
+    - `"json_to_tab":` [boolean] : **true**
+    - `"tab_to_bracket":` [boolean] Set to true if you also want an output of the raw text with brackets as EDU delimiters. If so, `"json_to_tab"` has to be set to true too.
+
+  - `"evaluation":` [boolean] : **true**
+  - `"gold_test_data_path":` [string] The path to your gold test dataset, on which predictions are made and against which they are evaluated.
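+
+Finally, a sketch of a complete Usecase 3 config (nesting inferred, so check it against the template; the eng.rst.rstdt paths reuse the example paths given above):
+
+```json
+{
+    "usecase_description": "Config file for usecase_3 : Take an EDU gold segmented set of train/dev/test texts in .conll format as input, train a model, output scores.",
+    "input": {
+        "name": "eng.rst.rstdt",
+        "file": ".conllu",
+        "language": "en"
+    },
+    "steps": {
+        "main": "train",
+        "pre-processing": {
+            "tokenization": false,
+            "sentence_split": false,
+            "sentence_split_splitor": null,
+            "syntactic_parsing": false,
+            "NER_format_initialisation": true
+        },
+        "discourse_segmenter": {
+            "model": null,
+            "training": {
+                "toolkit": "allennlp",
+                "pre_trained_lm": "bert",
+                "config_file": "../model/config_training.jsonnet",
+                "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.conllu",
+                "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.conllu"
+            }
+        },
+        "post-processing": {
+            "json_to_tab": true,
+            "tab_to_bracket": false
+        },
+        "evaluation": true,
+        "gold_test_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_test.conllu"
+    }
+}
+```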
diff --git a/model/config_training_bert.jsonnet b/model/config_training_bert.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..a656ba4f8bb1104aaea62e5acd0fe6a5af66a46b
--- /dev/null
+++ b/model/config_training_bert.jsonnet
@@ -0,0 +1,79 @@
+{
+    "dataset_reader": {
+        "type": "conll2003",
+        "coding_scheme": "BIOUL",
+        "tag_label": "ner",
+        "token_indexers": {
+            "bert": {
+                "type": "bert-pretrained",
+                "do_lowercase": false,
+                "pretrained_model": "bert-base-multilingual-cased",
+                "use_starting_offsets": true
+            },
+            "token_characters": {
+                "type": "characters",
+                "min_padding_length": 3
+            }
+        }
+    },
+    "iterator": {
+        "type": "basic",
+        "batch_size": 2
+    },
+    "model": {
+        "type": "simple_tagger",
+        "encoder": {
+            "type": "lstm",
+            "bidirectional": true,
+            "dropout": 0.5,
+            "hidden_size": 100,
+            "input_size": 896,
+            "num_layers": 2
+        },
+        "text_field_embedder": {
+            "allow_unmatched_keys": true,
+            "embedder_to_indexer_map": {
+                "bert": [
+                    "bert",
+                    "bert-offsets"
+                ],
+                "token_characters": [
+                    "token_characters"
+                ]
+            },
+            "token_embedders": {
+                "bert": {
+                    "type": "bert-pretrained",
+                    "pretrained_model": "bert-base-multilingual-cased"
+                },
+                "token_characters": {
+                    "type": "character_encoding",
+                    "embedding": {
+                        "embedding_dim": 16
+                    },
+                    "encoder": {
+                        "type": "cnn",
+                        "conv_layer_activation": "relu",
+                        "embedding_dim": 16,
+                        "ngram_filter_sizes": [
+                            3
+                        ],
+                        "num_filters": 128
+                    }
+                }
+            }
+        }
+    },
+    "train_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_train.ner.conllu",
+    "validation_data_path": "../data/eng.rst.rstdt/eng.rst.rstdt_dev.ner.conllu",
+    "trainer": {
+        "cuda_device": 1,
+        "grad_norm": 5,
+        "num_epochs": 4,
+        "num_serialized_models_to_keep": 3,
+        "optimizer": {
+            "type": "bert_adam",
+            "lr": 0.001
+        }
+    }
+}
\ No newline at end of file