diff --git a/.gitignore b/.gitignore
index 9e49a4d5214f8889f3da509dbb6a44c5ed1aa230..31a9e25abafa1f60ace5e12854fdb997c3401672 100644
--- a/.gitignore
+++ b/.gitignore
@@ -206,6 +206,5 @@ Results_tok/*
 data/*
 
 recover_underscores/*
-code/contextual_embeddings/configs/*
 results_ignore/*
 scripts/*
diff --git a/align/README.md b/align/README.md
index b42badb2459498d4917f3dd57d86996604b5b239..ce6fecf9b1b4c91057354b4c2804046b8a5456b3 100644
--- a/align/README.md
+++ b/align/README.md
@@ -21,8 +21,8 @@ By default:
 
 - for ELMo models: Glove embeddings under **discut/embeddings**:
 
-`wget  -P embeddings/ "http://nlp.stanford.edu/data/glove.6B.zip"
-gunzip embeddings/glove.6B.zip`
+`wget  -P embeddings/ "http://nlp.stanford.edu/data/glove.6B.zip"`
+`unzip embeddings/glove.6B.zip -d embeddings/`
 
 # Compute alignment matrix
 
diff --git a/align/elmo_get_tokid.py b/align/elmo_get_tokid.py
deleted file mode 100644
index 814a04852a6ced126a3f6d2efac6a03c9a971ff2..0000000000000000000000000000000000000000
--- a/align/elmo_get_tokid.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
-from allennlp.data.tokenizers.token import Token
-from allennlp.data.vocabulary import Vocabulary
-from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
-
-indexer = SingleIdTokenIndexer()
-tokens = ['.', 'lfdss', 'dfsd', 'oui', 'a']
-index = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary(), 'baba')
-
-print(index)
diff --git a/align/get_tok_ids.py b/align/get_tok_ids.py
deleted file mode 100644
index 6f006bf5ad8d45c44431c3d071a9ee5e8dd66548..0000000000000000000000000000000000000000
--- a/align/get_tok_ids.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import numpy as np
-from transformers import BertTokenizer
-import torch
-from torch import nn
-
-bert = 'bert-base-multilingual-cased'
-tokenizer = BertTokenizer.from_pretrained(bert)
-
-tokens = ['fdsfsdfkc', 'dsfdsdf', 'The', 'to', 'because', 'since', 'that', '.', ';', ',', ':', 'is', 'a', 'he', 'said', 'the']
-for token in tokens:
-    print(token, tokenizer.convert_tokens_to_ids(token))
-
diff --git a/align/misc/check_parse.py b/align/misc/check_parse.py
deleted file mode 100644
index ecb1fb3d2cb58657026c4655e38a23ed36278632..0000000000000000000000000000000000000000
--- a/align/misc/check_parse.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import sys
-import numpy as np
-
-path = sys.argv[1]
-data = np.load(path, allow_pickle=True)
-tok_ids, toks, labels = [data[f] for f in data.files]
-
-for i, tok in enumerate(toks):
-    if isinstance(tok, str):
-        if len(tok) > 35:
-            print(tok)
-    else:
-        print(tok_ids[i])
-        print(type(tok))
-
-
diff --git a/align/misc/check_rich.py b/align/misc/check_rich.py
deleted file mode 100644
index 6233fe54f6418a7afc24cfa88c9409a32c576de3..0000000000000000000000000000000000000000
--- a/align/misc/check_rich.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import sys
-import numpy as np
-
-path = sys.argv[1]
-data = np.load(path, allow_pickle=True)
-tok_ids, toks, labels, upos, idheads, deprels = [data[f] for f in data.files]
-
-print(len(toks))
-print(len(upos))
-
-labs_upos = np.unique(upos)
-labs_idheads = np.unique(idheads)
-labs_deprel = np.unique(deprels)
-print(labs_upos)
-print(labs_idheads)
-print(labs_deprel)
-
-for i, tok in enumerate(toks):
-    if isinstance(tok, str):
-        if len(tok) > 35:
-            print(tok)
-    else:
-        print(tok_ids[i])
-        print(type(tok))
-
-for i, tok in enumerate(upos):
-    if isinstance(tok, str):
-        if len(tok) > 35:
-            print(tok)
-    else:
-        print(tok_ids[i])
-        print(type(tok))
-
-
diff --git a/align/misc/plot_mapping.py b/align/misc/plot_mapping.py
deleted file mode 100644
index 53cb9c109b0fafb033b2884cebe03badeb8cc7a9..0000000000000000000000000000000000000000
--- a/align/misc/plot_mapping.py
+++ /dev/null
@@ -1,183 +0,0 @@
-import numpy as np
-import sys
-import os
-from transformers import BertModel
-import torch
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-from train_model_baseline import generate_sentence_list, collate_batch
-from sklearn.decomposition import PCA
-import matplotlib.pyplot as plt
-import argparse
-from scipy.spatial.distance import cosine
-from scipy.spatial.transform import Rotation
-
-def mapping(src_corpus, tgt_corpus, mapping, sset, fmt):
-    
-    data = {}
-    mapping = torch.load(mapping)
-    mapping_tr = mapping.transpose()
-    rotation = Rotation.from_matrix(mapping)
-    rotation_inv = rotation.inv()
-    print("rotation_inv", rotation_inv.as_matrix().shape)
-    #mapping = - mapping
-
-    if sset == 'all':
-        ssets = ['train', 'test', 'dev']
-        for corpus in [src_corpus, tgt_corpus]:
-            for s in ssets:
-                data[corpus][s] = generate_sentence_list(corpus, s, fmt)
-            data[corpus] = data[corpus]['train'] + data[corpus]['test'] + data[corpus]['dev']
-    else: 
-        for corpus in [src_corpus, tgt_corpus]:    
-            data[corpus] = generate_sentence_list(corpus, sset, fmt)
-
-    data = data[src_corpus] + data[tgt_corpus]
-    batch_size = 64
-    
-    dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collate_batch)
-    bert = 'bert-base-multilingual-cased'
-    bert_embeddings = BertModel.from_pretrained(bert)
-
-    #words for which to collect contextual embeddings in order to plot point clouds
-    clouds = ['est','que','is','that']
-    clouds_en = clouds[2:]
-    cloud_embeddings = {}
-    aligned_embeddings = {}
-    for cloud in clouds:
-        cloud_embeddings[cloud] = []
-    for cloud in clouds_en:
-        aligned_embeddings[cloud] = []
-
-    words_fr = ['est','que','le','pour','mais','et']
-    words_en = ['is','that','the','for','but','and']
-    fr = {}
-    fr_al = {}
-    en = {}
-    en_al = {}
-    for word in words_fr:
-        fr[word] = []
-        fr_al[word] = []
-    for word in words_en:
-        en[word] = []
-        en_al[word] = []
-    
-    #write this as function
-    for sentence_batch in tqdm(dataloader):
-        bert_output = bert_embeddings(**sentence_batch.getBatchEncoding()).last_hidden_state
-
-        for i, sentence in enumerate(sentence_batch.tokens):
-            bert_sentence_output = bert_output[i]
-                
-            for j, token in enumerate(sentence):
-                bert_token_output = bert_sentence_output[j].detach().numpy()
-                if token in clouds:
-                    cloud_embeddings[token].append(bert_token_output)
-                if token in clouds_en:
-                    aligned_emb = np.matmul(bert_token_output, mapping_tr)
-                    aligned_embeddings[token].append(aligned_emb)
-
-                if token in words_fr:
-                    fr[token].append(bert_token_output)
-                    aligned = np.matmul(bert_token_output, mapping_tr)
-                    #aligned = np.matmul(mapping_tr, bert_token_output.transpose())
-                    #aligned = np.matmul(mapping, bert_token_output.transpose())
-                    fr_al[token].append(aligned)
-                if token in words_en:
-                    en[token].append(bert_token_output)
-                    aligned = np.matmul(bert_token_output, mapping_tr)
-                    #aligned = np.matmul(mapping_tr, bert_token_output.transpose())
-                    #aligned = np.matmul(mapping, bert_token_output.transpose())
-                    en_al[token].append(aligned)
-
-    pca = PCA(n_components=2)
-    #plot_clouds(cloud_embeddings, pca, 'before')
-    for cloud in clouds[:2]:
-        aligned_embeddings[cloud] = cloud_embeddings[cloud] #add unchanged target vectors
-    #plot_clouds(aligned_embeddings, pca, 'after')
-    analyze(words_fr, words_en, fr, en, fr_al, en_al)
-
-def plot_clouds(cloud_embeddings, pca, text):
-    tok_en0, tok_en1, tok_fr0, tok_fr1 = cloud_embeddings.keys()
-
-    print(f'b= {tok_en0}')
-    print(f'c= {tok_en1}')
-    print(f'm= {tok_fr0}')
-    print(f'r= {tok_fr1}')
-    
-    colors = ['b', 'c', 'm', 'r'] 
-    embs_en0 = np.array([emb for emb in cloud_embeddings[tok_en0]])
-    embs_en1 = np.array([emb for emb in cloud_embeddings[tok_en1]])
-    embs_fr0 = np.array([emb for emb in cloud_embeddings[tok_fr0]])
-    embs_fr1 = np.array([emb for emb in cloud_embeddings[tok_fr1]])
-
-    n_en0 = embs_en0.shape[0]
-    n_en1 = embs_en1.shape[0]
-    n_fr0 = embs_fr0.shape[0]
-    n_fr1 = embs_fr1.shape[0]
-
-    full_embs = np.concatenate((embs_en0, embs_en1, embs_fr0, embs_fr1), axis=0)
-    embs_reduced = pca.fit_transform(full_embs)
-
-    print(n_en0, n_en1, n_fr0, n_fr1)
-    transp = embs_reduced.transpose()
-    red_en0 = transp[:,:n_en0]
-    sep = n_en0 + n_en1
-    red_en1 = transp[:,n_en0:sep]
-    sep1 = sep + n_fr0
-    red_fr0 = transp[:,sep:sep1]
-    red_fr1 = transp[:,sep1:]
-    
-    plt.scatter(red_en0[0], red_en0[1], marker='.', color=colors[0], label = tok_en0)
-    plt.scatter(red_en1[0], red_en1[1], marker='.', color=colors[1], label = tok_en1)
-    plt.scatter(red_fr0[0], red_fr0[1], marker='.', color=colors[2], label = tok_fr0)
-    plt.scatter(red_fr1[0], red_fr1[1], marker='.', color=colors[3], label = tok_fr1)
-
-    plt.title(f'{text} alignment')
-    plt.legend()
-    plt.show()
-    plt.savefig(f'{text}.png')
-    plt.clf()
-
-def analyze(words_fr, words_en, fr, en, fr_al, en_al):
-    
-    print("EN aligned")
-    for word_fr in words_fr:
-        for word_en in words_en:
-            #dist_before = cosine(np.array(fr[word_fr]).mean(axis=0),np.array(en[word_en]).mean(axis=0))
-            dist_before = np.linalg.norm(np.array(fr[word_fr]).mean(axis=0) - np.array(en[word_en]).mean(axis=0))
-            #dist_after = cosine(np.array(fr[word_fr]).mean(axis=0),np.array(en_al[word_en]).mean(axis=0))
-            dist_after = np.linalg.norm(np.array(fr[word_fr]).mean(axis=0) - np.array(en_al[word_en]).mean(axis=0))
-            print(f'{word_fr} -- {word_en}')
-            print(f'Distance before = {dist_before}')
-            print(f'Distance after = {dist_after}')
-            print(f'Deplacement = {dist_before - dist_after}')
- 
-    print('\n========\nFR aligned')
-    for word_fr in words_fr:
-        for word_en in words_en:
-            #dist_before = cosine(np.array(fr[word_fr]).mean(axis=0),np.array(en[word_en]).mean(axis=0))
-            dist_before = np.linalg.norm(np.array(fr[word_fr]).mean(axis=0) - np.array(en[word_en]).mean(axis=0))
-            #dist_after = cosine(np.array(fr_al[word_fr]).mean(axis=0),np.array(en[word_en]).mean(axis=0))
-            dist_after = np.linalg.norm(np.array(fr_al[word_fr]).mean(axis=0) - np.array(en[word_en]).mean(axis=0))
-            print(f'{word_fr} -- {word_en}')
-            print(f'Distance before = {dist_before}')
-            print(f'Distance after = {dist_after}')
-            print(f'Deplacement = {dist_before - dist_after}')
-
-def main():
-    parser = argparse.ArgumentParser(description='Compute anchors for a given dataset and a given word list')
-    parser.add_argument('--src_corpus', required=True, help='corpus to align')
-    parser.add_argument('--tgt_corpus', required=True, help='corpus on which to align')
-    parser.add_argument('--mapping', required=True, help='path to .pth mapping file')
-    parser.add_argument('--set', default='train', help='portion of the corpus to test on (train/test/dev/all)')
-    parser.add_argument('--format', default='conllu', help='tok or conllu')
-    params = parser.parse_args()
-
-    if not os.path.isfile(params.mapping):
-        print(f'no file at {params.mapping}.')
-        sys.exit()
-    mapping(params.src_corpus, params.tgt_corpus, params.mapping, params.set, params.format)
-
-if __name__ == '__main__':
-    main()
diff --git a/align/misc/stupid_tagger.py b/align/misc/stupid_tagger.py
deleted file mode 100644
index e0c31b28e7980f2d2a6da385d33bbd5c94aee4a8..0000000000000000000000000000000000000000
--- a/align/misc/stupid_tagger.py
+++ /dev/null
@@ -1,202 +0,0 @@
-from typing import Dict, Optional, List, Any
-
-import numpy
-from overrides import overrides
-import torch
-from torch.nn.modules.linear import Linear
-import torch.nn.functional as F
-
-from allennlp.common.checks import check_dimensions_match, ConfigurationError
-from allennlp.data import Vocabulary
-from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder
-from allennlp.models.model import Model
-from allennlp.nn import InitializerApplicator, RegularizerApplicator
-from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
-from allennlp.training.metrics import CategoricalAccuracy, SpanBasedF1Measure
-
-
-@Model.register("stupid_tagger")
-class StupidTagger(Model):
-    """
-    This ``SimpleTagger`` simply encodes a sequence of text with a stacked ``Seq2SeqEncoder``, then
-    predicts a tag for each token in the sequence.
-
-    Parameters
-    ----------
-    vocab : ``Vocabulary``, required
-        A Vocabulary, required in order to compute sizes for input/output projections.
-    text_field_embedder : ``TextFieldEmbedder``, required
-        Used to embed the ``tokens`` ``TextField`` we get as input to the model.
-    encoder : ``Seq2SeqEncoder``
-        The encoder (with its own internal stacking) that we will use in between embedding tokens
-        and predicting output tags.
-    calculate_span_f1 : ``bool``, optional (default=``None``)
-        Calculate span-level F1 metrics during training. If this is ``True``, then
-        ``label_encoding`` is required. If ``None`` and
-        label_encoding is specified, this is set to ``True``.
-        If ``None`` and label_encoding is not specified, it defaults
-        to ``False``.
-    label_encoding : ``str``, optional (default=``None``)
-        Label encoding to use when calculating span f1.
-        Valid options are "BIO", "BIOUL", "IOB1", "BMES".
-        Required if ``calculate_span_f1`` is true.
-    label_namespace : ``str``, optional (default=``labels``)
-        This is needed to compute the SpanBasedF1Measure metric, if desired.
-        Unless you did something unusual, the default value should be what you want.
-    verbose_metrics : ``bool``, optional (default = False)
-        If true, metrics will be returned per label class in addition
-        to the overall statistics.
-    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
-        Used to initialize the model parameters.
-    regularizer : ``RegularizerApplicator``, optional (default=``None``)
-        If provided, will be used to calculate the regularization penalty during training.
-    """
-
-    def __init__(self, vocab: Vocabulary,
-                 text_field_embedder: TextFieldEmbedder,
-                 encoder: Seq2SeqEncoder,
-                 calculate_span_f1: bool = None,
-                 label_encoding: Optional[str] = None,
-                 label_namespace: str = "labels",
-                 verbose_metrics: bool = False,
-                 initializer: InitializerApplicator = InitializerApplicator(),
-                 regularizer: Optional[RegularizerApplicator] = None) -> None:
-        super().__init__(vocab, regularizer)
-
-        self.label_namespace = label_namespace
-        self.text_field_embedder = text_field_embedder
-        self.num_classes = self.vocab.get_vocab_size(label_namespace)
-        self.encoder = encoder
-        self._verbose_metrics = verbose_metrics
-        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
-                                                           self.num_classes))
-
-        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
-                               "text field embedding dim", "encoder input dim")
-
-        # We keep calculate_span_f1 as a constructor argument for API consistency with
-        # the CrfTagger, even it is redundant in this class
-        # (label_encoding serves the same purpose).
-        if calculate_span_f1 and not label_encoding:
-            raise ConfigurationError("calculate_span_f1 is True, but "
-                                     "no label_encoding was specified.")
-        self.metrics = {
-                "accuracy": CategoricalAccuracy(),
-                "accuracy3": CategoricalAccuracy(top_k=3)
-        }
-
-        if calculate_span_f1 or label_encoding:
-            self._f1_metric = SpanBasedF1Measure(vocab,
-                                                 tag_namespace=label_namespace,
-                                                 label_encoding=label_encoding)
-        else:
-            self._f1_metric = None
-
-        initializer(self)
-
-    @overrides
-    def forward(self,  # type: ignore
-                tokens: Dict[str, torch.LongTensor],
-                tags: torch.LongTensor = None,
-                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
-        # pylint: disable=arguments-differ
-        """
-        Parameters
-        ----------
-        tokens : Dict[str, torch.LongTensor], required
-            The output of ``TextField.as_array()``, which should typically be passed directly to a
-            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
-            tensors.  At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
-            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
-            for the ``TokenIndexers`` when you created the ``TextField`` representing your
-            sequence.  The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
-            which knows how to combine different word representations into a single vector per
-            token in your input.
-        tags : torch.LongTensor, optional (default = None)
-            A torch tensor representing the sequence of integer gold class labels of shape
-            ``(batch_size, num_tokens)``.
-        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
-            metadata containing the original words in the sentence to be tagged under a 'words' key.
-
-        Returns
-        -------
-        An output dictionary consisting of:
-        logits : torch.FloatTensor
-            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
-            unnormalised log probabilities of the tag classes.
-        class_probabilities : torch.FloatTensor
-            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
-            a distribution of the tag classes per word.
-        loss : torch.FloatTensor, optional
-            A scalar loss to be optimised.
-
-        """
-        embedded_text_input = self.text_field_embedder(tokens) 
-        batch_size, sequence_length, _ = embedded_text_input.size()
-        mask = get_text_field_mask(tokens)
-        encoded_text = self.encoder(embedded_text_input, mask)
-
-        logits = self.tag_projection_layer(encoded_text)
-        reshaped_log_probs = logits.view(-1, self.num_classes)
-        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
-                                                                          sequence_length,
-                                                                          self.num_classes])
-
-        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
-
-        if tags is not None:
-            loss = sequence_cross_entropy_with_logits(logits, tags, mask)
-            for metric in self.metrics.values():
-                metric(logits, tags, mask.float())
-            if self._f1_metric is not None:
-                self._f1_metric(logits, tags, mask.float())
-            output_dict["loss"] = loss
-
-        if metadata is not None:
-            output_dict["words"] = [x["words"] for x in metadata]
-
-        #OVERWRITE VALUES
-        #output_dict['loss'] = torch.tensor(0).to(torch.long)
-        probas = torch.zeros(batch_size, sequence_length, self.num_classes)
-        probas[:,0,-1] = 1
-        probas[:,1:,0] = 1
-        output_dict['logits'] = probas
-        output_dict['class_probabilities'] = probas
-
-        return output_dict
-
-    @overrides
-    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-        """
-        Does a simple position-wise argmax over each token, converts indices to string labels, and
-        adds a ``"tags"`` key to the dictionary with the result.
-        """
-        all_predictions = output_dict['class_probabilities']
-        all_predictions = all_predictions.cpu().data.numpy()
-        if all_predictions.ndim == 3:
-            predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
-        else:
-            predictions_list = [all_predictions]
-        all_tags = []
-        for predictions in predictions_list:
-            argmax_indices = numpy.argmax(predictions, axis=-1)
-            tags = [self.vocab.get_token_from_index(x, namespace="labels")
-                    for x in argmax_indices]
-            all_tags.append(tags)
-        output_dict['tags'] = all_tags
-        return output_dict
-
-    @overrides
-    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
-        metrics_to_return = {metric_name: metric.get_metric(reset) for
-                             metric_name, metric in self.metrics.items()}
-
-        if self._f1_metric is not None:
-            f1_dict = self._f1_metric.get_metric(reset=reset)
-            if self._verbose_metrics:
-                metrics_to_return.update(f1_dict)
-            else:
-                metrics_to_return.update({
-                        x: y for x, y in f1_dict.items() if
-                        "overall" in x})
-        return metrics_to_return
diff --git a/align/post_lstm.py b/align/post_lstm.py
deleted file mode 100644
index c9aa4c7ddd344752a305509009d3d28c7f4dadb7..0000000000000000000000000000000000000000
--- a/align/post_lstm.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import sys, os
-import torch
-from visualize import plot_heatmap_anchors
-from transformers import BertTokenizer
-
-tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
-
-def make_anchor(tok_id):
-    input_file = os.path.join('saved_post_lstm', f'{tok_id}.pth')
-    matrix = torch.load(input_file)
-    return torch.mean(matrix, dim=0)
-
-def plot_post_lstm(tokens):
-    dico = {}
-
-    dico['First'] = make_anchor('fst')
-    dico['Random'] = make_anchor('rand')
-
-    #for token in tokens:
-    #    tok_id = tokenizer.convert_tokens_to_ids(token)
-    #    dico[token] = make_anchor(tok_id)
-        
-    plot_heatmap_anchors(tokens, dico, 'post_lstm_rstdt_fst_rand', n_dim=200, emb_dim=200)
-
-def main():
-    if len(sys.argv) < 2:
-        print('Please enter at least one token id.')
-        sys.exit()
-    tokens = sys.argv[1:]
-    print(tokens)
-    plot_post_lstm(tokens)
-
-if __name__ == '__main__':
-    main()
diff --git a/code/contextual_embeddings/configs/bert_custom.jsonnet b/code/contextual_embeddings/configs/bert_custom.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..8763334aa3e5274a22ff901f30f99cc50e7ef183
--- /dev/null
+++ b/code/contextual_embeddings/configs/bert_custom.jsonnet
@@ -0,0 +1,72 @@
+{
+  "dataset_reader": {
+    "type": "custom_disrpt_reader",
+    "tag_label": "ner",
+    "coding_scheme": "BIOUL",
+    "token_indexers": {
+      "bert": {
+          "type": "bert-pretrained",
+          "pretrained_model": std.extVar("BERT_VOCAB"),
+          "do_lowercase": false,
+          "use_starting_offsets": true
+      },
+      "token_characters": {
+        "type": "characters",
+        "min_padding_length": 3
+      },
+    }
+  },
+  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
+  "validation_data_path": std.extVar("TEST_A_PATH"),
+  "model": {
+    "type": "simple_tagger",
+    "text_field_embedder": {
+        "allow_unmatched_keys": true,
+        "embedder_to_indexer_map": {
+            "bert": ["bert", "bert-offsets"],
+            "token_characters": ["token_characters"]
+        },
+        "token_embedders": {
+            "bert": {
+                "type": "bert-pretrained",
+                "pretrained_model": std.extVar("BERT_WEIGHTS")
+            },
+            "token_characters": {
+                "type": "character_encoding",
+                "embedding": {
+                    "embedding_dim": 16
+                },
+                "encoder": {
+                    "type": "cnn",
+                    "embedding_dim": 16,
+                    "num_filters": 128,
+                    "ngram_filter_sizes": [3],
+                    "conv_layer_activation": "relu"
+                }
+            }
+        }
+    },
+    "encoder": {
+        "type": "lstm",
+        "input_size": 896,
+        "hidden_size": 100,
+        "num_layers": 2,
+        "dropout": 0.5,
+        "bidirectional": true
+    }
+  },
+  "iterator": {
+    "type": "basic",
+    "batch_size": 2
+  },
+  "trainer": {
+    "optimizer": {
+        "type": "bert_adam",
+        "lr": 0.001
+    },
+    "num_serialized_models_to_keep": 3,
+    "num_epochs": 4,
+    "grad_norm": 5.0,
+    "cuda_device": 0
+  }
+}
diff --git a/code/contextual_embeddings/configs/elmo.jsonnet b/code/contextual_embeddings/configs/elmo.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..f09fe098f194104b8605b8b0521ca1878a8bffea
--- /dev/null
+++ b/code/contextual_embeddings/configs/elmo.jsonnet
@@ -0,0 +1,93 @@
+// Configuration for the NER model with ELMo, modified slightly from
+// the version included in "Deep Contextualized Word Representations",
+// taken from AllenNLP examples
+// modified for the disrpt discourse segmentation shared task -- 2019 
+{
+
+  "dataset_reader": {
+    "type": "conll2003",
+    "tag_label": "ner",
+    "coding_scheme": "BIOUL",
+    "token_indexers": {
+      "tokens": {
+        "type": "single_id",
+        "lowercase_tokens": true
+      },
+      "token_characters": {
+        "type": "characters",
+        "min_padding_length": 3
+      },
+      "elmo": {
+        "type": "elmo_characters"
+     }
+    }
+  },
+  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
+  "validation_data_path": std.extVar("TEST_A_PATH"),
+  "model": {
+    "type": "simple_tagger",
+    "text_field_embedder": {
+      "token_embedders": {
+        "tokens": {
+            "type": "embedding",
+            "embedding_dim": 50,
+            "pretrained_file": "../tony/embeddings/glove.6B.50d.txt",
+            "trainable": true
+        },
+        "elmo":{
+            "type": "elmo_token_embedder",
+        "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
+        "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
+            "do_layer_norm": false,
+            "dropout": 0.0
+        },
+        "token_characters": {
+            "type": "character_encoding",
+            "embedding": {
+            "embedding_dim": 16
+            },
+            "encoder": {
+            "type": "cnn",
+            "embedding_dim": 16,
+            "num_filters": 128,
+            "ngram_filter_sizes": [3],
+            "conv_layer_activation": "relu"
+            }
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 1202,
+      "hidden_size": 100,
+      "num_layers": 1,
+      "dropout": 0.5,
+      "bidirectional": true
+    },
+    "regularizer": [
+      [
+        "scalar_parameters",
+        {
+          "type": "l2",
+          "alpha": 0.1
+        }
+      ]
+    ]
+  },
+  "iterator": {
+    "type": "basic",
+    "batch_size": 2
+  },
+  "trainer": {
+    "optimizer": {
+        "type": "adam",
+        "lr": 0.001
+    },
+    //"validation_metric": "+f1_measure",
+    "num_serialized_models_to_keep": 3,
+    "num_epochs": 10,
+    "grad_norm": 5.0,
+    "patience": 2,
+    "cuda_device": 0
+  }
+}
diff --git a/code/contextual_embeddings/configs/melmo.jsonnet b/code/contextual_embeddings/configs/melmo.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..4d3581793f7fbe7c8b7a02aa5f0430ec89a8a539
--- /dev/null
+++ b/code/contextual_embeddings/configs/melmo.jsonnet
@@ -0,0 +1,116 @@
+// Configuration for the tagger with multilingual ELMo (one weight file per language),
+// modified from the NER model with ELMo included in "Deep Contextualized Word Representations",
+// taken from AllenNLP examples
+// and adapted for the DISRPT 2019 discourse segmentation shared task
+{
+
+  "dataset_reader": {
+    "type": "custom_conll_reader",
+    "tag_label": "ner",
+    "coding_scheme": "BIOUL",
+    "token_indexers": {
+      "tokens": {
+        "type": "single_id",
+        "lowercase_tokens": true
+      },
+      "token_characters": {
+        "type": "characters",
+        "min_padding_length": 3
+      },
+      "elmo": {
+        "type": "elmo_characters"
+     }
+    }
+  },
+  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
+  "validation_data_path": std.extVar("TEST_A_PATH"),
+  "model": {
+    "type": "custom_simple_tagger",
+    "text_field_embedder": {
+      "token_embedders": {
+        "tokens": {
+            "type": "embedding",
+            "embedding_dim": 50,
+            "pretrained_file": "embeddings/glove.6B.50d.txt",
+            "trainable": true
+        },
+	"elmo": {
+	    "type": "elmo_token_embedder_multilang",
+	    "do_layer_norm": false,
+	    "dropout": 0.3,
+	    "scalar_mix_parameters": [
+		-9e10,
+		1,
+		-9e10
+	    ],
+	    "options_files": {
+		"en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json"
+	    },
+	    "weight_files": {
+		"en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/en_weights.hdf5",
+		"es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/es_weights.hdf5",
+		"fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/fr_weights.hdf5",
+		"it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/it_weights.hdf5",
+		"pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/pt_weights.hdf5",
+		"sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/sv_weights.hdf5",
+		"de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/de_weights.hdf5"
+	    } 
+	},
+       "token_characters": {
+            "type": "character_encoding",
+            "embedding": {
+            "embedding_dim": 16
+            },
+            "encoder": {
+            "type": "cnn",
+            "embedding_dim": 16,
+            "num_filters": 128,
+            "ngram_filter_sizes": [3],
+            "conv_layer_activation": "relu"
+            }
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 1202,
+      "hidden_size": 100,
+      "num_layers": 1,
+      "dropout": 0.5,
+      "bidirectional": true
+    },
+    "regularizer": [
+      [
+        "scalar_parameters",
+        {
+          "type": "l2",
+          "alpha": 0.1
+        }
+      ]
+    ]
+  },
+  "iterator": {
+    "type": "same_language",
+    "batch_size": 2,
+    "sorting_keys": [["words", "num_tokens"]],
+    "instances_per_epoch": 32000
+  },
+  "trainer": {
+    "optimizer": {
+        "type": "adam",
+        "lr": 0.001
+    },
+    //"validation_metric": "+f1_measure",
+    "num_serialized_models_to_keep": 3,
+    "num_epochs": 4,
+    "grad_norm": 5.0,
+    "patience": 2,
+    "cuda_device": 0
+  }
+}
diff --git a/code/contextual_embeddings/configs/melmo_aligned.jsonnet b/code/contextual_embeddings/configs/melmo_aligned.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..637c04c2b69b511d64375ee023bd81fa0b788b91
--- /dev/null
+++ b/code/contextual_embeddings/configs/melmo_aligned.jsonnet
@@ -0,0 +1,125 @@
+// Configuration for a sequence tagger with aligned multilingual ELMo embeddings,
+// adapted from the AllenNLP example config for the NER model with ELMo
+// described in "Deep Contextualized Word Representations",
+// and modified for the DISRPT 2019 discourse segmentation shared task.
+{
+
+  "dataset_reader": {  // reader settings: tag column, tag encoding and token indexers
+    "type": "custom_conll_reader",  // NOTE(review): "custom_*" types are presumably registered by this project's own code — confirm it is importable at train time
+    "tag_label": "ner",
+    "coding_scheme": "BIOUL",
+    "token_indexers": {  // indexer names match the keys under model.text_field_embedder.token_embedders
+      "tokens": {
+        "type": "single_id",
+        "lowercase_tokens": true
+      },
+      "token_characters": {
+        "type": "characters",
+        "min_padding_length": 3  // same value as the character CNN's ngram_filter_sizes ([3]); presumably keeps tokens at least as long as the filter — confirm
+      },
+      "elmo": {
+        "type": "elmo_characters"
+     }
+    }
+  },
+  "train_data_path": std.extVar("TRAIN_DATA_PATH"),  // Jsonnet external variable: must be supplied by the caller
+  "validation_data_path": std.extVar("TEST_A_PATH"),  // validation data is read from the TEST_A_PATH external variable
+  "model": {
+    "type": "custom_simple_tagger",
+    "text_field_embedder": {
+      "token_embedders": {  // one embedder per indexer declared in dataset_reader.token_indexers
+        "tokens": {
+            "type": "embedding",
+            "embedding_dim": 50,  // matches the 50-d vectors named in pretrained_file
+            "pretrained_file": "embeddings/glove.6B.50d.txt",  // relative path; presumably resolved against the working directory of the training run
+            "trainable": true
+        },
+	"elmo": {
+	    "type": "elmo_token_embedder_multilang",
+	    "do_layer_norm": false,
+	    "dropout": 0.3,
+	    "scalar_mix_parameters": [  // NOTE(review): presumably one softmax-normalised weight per ELMo layer, so the -9e10 entries leave only the middle layer — confirm against the embedder
+		-9e10,
+		1,
+		-9e10
+	    ],
+	    "options_files": {  // every language points at the same options file
+		"en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+		"de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json"
+	    },
+	    "weight_files": {  // one weights file per language code
+		"en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/en_weights.hdf5",
+		"es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/es_weights.hdf5",
+		"fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/fr_weights.hdf5",
+		"it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/it_weights.hdf5",
+		"pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/pt_weights.hdf5",
+		"sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/sv_weights.hdf5",
+		"de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/de_weights.hdf5"
+	    },
+	    "aligning_files": {  // one *_best_mapping.pth file per language code, same language set as weight_files
+			"en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/en_best_mapping.pth",
+                        "es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/es_best_mapping.pth",
+                        "fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/fr_best_mapping.pth",
+                        "it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/it_best_mapping.pth",
+                        "pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/pt_best_mapping.pth",
+                        "sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/sv_best_mapping.pth",
+                        "de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/de_best_mapping.pth"
+            },
+	},
+       "token_characters": {
+            "type": "character_encoding",
+            "embedding": {
+            "embedding_dim": 16
+            },
+            "encoder": {
+            "type": "cnn",
+            "embedding_dim": 16,  // same width as the character embedding above
+            "num_filters": 128,
+            "ngram_filter_sizes": [3],
+            "conv_layer_activation": "relu"
+            }
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 1202,  // 50 (tokens) + 128 (char CNN num_filters) + 1024; the 1024 is presumably the ELMo output width — confirm
+      "hidden_size": 100,
+      "num_layers": 1,
+      "dropout": 0.5,
+      "bidirectional": true
+    },
+    "regularizer": [
+      [
+        "scalar_parameters",  // presumably a pattern matched against parameter names to select what gets the L2 penalty — confirm
+        {
+          "type": "l2",
+          "alpha": 0.1
+        }
+      ]
+    ]
+  },
+  "iterator": {
+    "type": "same_language",  // NOTE(review): presumably builds each batch from instances of a single language — confirm
+    "batch_size": 2,
+    "sorting_keys": [["words", "num_tokens"]],  // NOTE(review): refers to a field named "words"; confirm the reader emits that field name
+    "instances_per_epoch": 32000  // 16000 batches per epoch at batch_size 2
+  },
+  "trainer": {
+    "optimizer": {
+        "type": "adam",
+        "lr": 0.001
+    },
+    //"validation_metric": "+f1_measure",  (disabled; NOTE(review): the trainer then falls back to its default validation metric — confirm that is intended)
+    "num_serialized_models_to_keep": 3,
+    "num_epochs": 4,
+    "grad_norm": 5.0,
+    "patience": 2,  // presumably early-stopping patience in epochs; note num_epochs is only 4
+    "cuda_device": 0  // NOTE(review): presumably GPU index 0, so this config expects a GPU — confirm before running on CPU-only hosts
+  }
+}
diff --git a/code/contextual_embeddings/expes_elmo.sh b/code/contextual_embeddings/expes_elmo.sh
deleted file mode 100644
index 4a4755ac770ee4cd6eba98db551507049b9fb005..0000000000000000000000000000000000000000
--- a/code/contextual_embeddings/expes_elmo.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-# usage
-# bash expes.sh dataset config model action [parent]
-
-echo "data=$1, config=$2, model=$3, action=$4"
-   
-export DATASET=${1}
-# eg "eng.rst.gum"
-
-export CONFIG=${2}
-# options: conll tok split.tok
-
-export MODEL=${3}
-# options: bert xlm
-
-export ACTION=${4}
-# options: train test
- 
-if [ -z "$5" ];
-then
-    export HAS_PAR=false
-    export TOOLONG=false
-elif [ "${5}" = "--s" ];
-then
-    export TOOLONG=true
-    export SPLIT=${6}
-else
-    export HAS_PAR=true
-    export TOOLONG=false
-    export PARENT=${5}
-fi
-
-if [ $# -eq 7 ] && [ "${6}" = "--s" ]; 
-then
-    export TOOLONG=true
-    export SPLIT=${7}
-fi
-
-#if [ "$MODEL" = "xlm" ]; 
-#then 
-#    export BERT_VOCAB="xlm-roberta-base"
-#    export BERT_WEIGHTS="xlm-roberta-base"
-#else
-#    export BERT_VOCAB="bert-base-multilingual-cased"
-#    export BERT_WEIGHTS="bert-base-multilingual-cased"                                                                                                   
-#fi
-
-if [ "$ACTION" = "train" ];
-then
-    export EVAL=dev
-else
-    export EVAL=test
-fi
-
-export GOLD_BASE="data/"
-export CONV="data_converted/"
-export CODE="code/contextual_embeddings/"
-export TRAIN_DATA_PATH=${CONV}${DATASET}"_train.ner."${CONFIG}
-export TEST_A_PATH=${CONV}${DATASET}"_"${EVAL}".ner."${CONFIG}
-export OUTPUT=${DATASET}"_"${MODEL}
-export GOLD=${GOLD_BASE}${DATASET}"/"${DATASET}"_"${EVAL}"."${CONFIG}
-
-mkdir -p ${CONV}
-
-for val in "train" ${EVAL}; do
-    export original=${GOLD_BASE}"/"${DATASET}"/"${DATASET}"_"${val}"."${CONFIG}
-    export converted=${CONV}/${DATASET}"_"${val}".ner."${CONFIG}
-    # conversion of datasets to NER / BIO format by first testing the existence of files so as not to redo it each time
-    if [ ! -f ${converted} ]; then
-        echo "converting "${val}" to ner format -> in data_converted ..."
-        if [ $TOOLONG = true ];
-        then 
-            python ${CODE}conv2ner.py ${original} ${converted} --split-too-long True ${SPLIT}
-        else
-            python ${CODE}conv2ner.py ${original} ${converted}
-        fi
-    fi
-
-done
-
-if [ "$ACTION" = "train" ]; 
-then
-    if [ $HAS_PAR = true ]; 
-    then
-        echo "fine tune"
-        # fine tune
-        allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL}
-    else
-        echo "train"
-        # train with config in bert.jsonnet; the config references explicitely variables TRAIN_DATA_PATH and TEST_A_PATH
-        allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet
-    fi
-elif [ $HAS_PAR = true ];
-then
-    if [ "$ACTION" = "test" ];
-    then
-        echo "parent test"
-        export TRAIN_DATA_PATH=${CONV}${PARENT}"_train.ner."${CONFIG}
-        export OUTPUT=${PARENT}"_"${MODEL}
-    else
-        echo "finetune test"
-        export OUTPUT=${DATASET}"-"${PARENT}"_"${MODEL}
-    fi
-fi
-
-# predict with model -> outputs json
-allennlp predict --use-dataset-reader --output-file Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json Results_${CONFIG}/results_${OUTPUT}/model.tar.gz ${TEST_A_PATH} --silent
-# convert to disrpt format 
-python ${CODE}json2conll.py Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json ${CONFIG} Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.${CONFIG}
-# eval with disrpt script
-python code/utils/seg_eval.py $GOLD Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.${CONFIG} >> Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.scores