diff --git a/.gitignore b/.gitignore
index 9e49a4d5214f8889f3da509dbb6a44c5ed1aa230..31a9e25abafa1f60ace5e12854fdb997c3401672 100644
--- a/.gitignore
+++ b/.gitignore
@@ -206,6 +206,5 @@
 Results_tok/*
 data/*
 recover_underscores/*
-code/contextual_embeddings/configs/*
 results_ignore/*
 scripts/*
diff --git a/align/README.md b/align/README.md
index b42badb2459498d4917f3dd57d86996604b5b239..ce6fecf9b1b4c91057354b4c2804046b8a5456b3 100644
--- a/align/README.md
+++ b/align/README.md
@@ -21,8 +21,8 @@
 By default:
 
 - for ELMo models: Glove embeddings under **discut/embeddings**:
 
-`wget -P embeddings/ "http://nlp.stanford.edu/data/glove.6B.zip"
-gunzip embeddings/glove.6B.zip`
+`wget -P embeddings/ "http://nlp.stanford.edu/data/glove.6B.zip"`
+`unzip embeddings/glove.6B.zip`
 
 # Compute alignment matrix
diff --git a/align/elmo_get_tokid.py b/align/elmo_get_tokid.py
deleted file mode 100644
index 814a04852a6ced126a3f6d2efac6a03c9a971ff2..0000000000000000000000000000000000000000
--- a/align/elmo_get_tokid.py
+++ /dev/null
@@ -1,10 +0,0 @@
-from allennlp.data.token_indexers.elmo_indexer import ELMoTokenCharactersIndexer
-from allennlp.data.tokenizers.token import Token
-from allennlp.data.vocabulary import Vocabulary
-from allennlp.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
-
-indexer = SingleIdTokenIndexer()
-tokens = ['.', 'lfdss', 'dfsd', 'oui', 'a']
-index = indexer.tokens_to_indices([Token(token) for token in tokens], Vocabulary(), 'baba')
-
-print(index)
diff --git a/align/get_tok_ids.py b/align/get_tok_ids.py
deleted file mode 100644
index 6f006bf5ad8d45c44431c3d071a9ee5e8dd66548..0000000000000000000000000000000000000000
--- a/align/get_tok_ids.py
+++ /dev/null
@@ -1,12 +0,0 @@
-import numpy as np
-from transformers import BertTokenizer
-import torch
-from torch import nn
-
-bert = 'bert-base-multilingual-cased'
-tokenizer = BertTokenizer.from_pretrained(bert)
-
-tokens = ['fdsfsdfkc', 'dsfdsdf', 'The', 'to', 'because', 'since', 'that', '.', ';', ',', ':', 'is', 'a', 'he', 'said', 'the']
-for token in tokens:
-    print(token, tokenizer.convert_tokens_to_ids(token))
-
diff --git a/align/misc/check_parse.py b/align/misc/check_parse.py
deleted file mode 100644
index ecb1fb3d2cb58657026c4655e38a23ed36278632..0000000000000000000000000000000000000000
--- a/align/misc/check_parse.py
+++ /dev/null
@@ -1,16 +0,0 @@
-import sys
-import numpy as np
-
-path = sys.argv[1]
-data = np.load(path, allow_pickle=True)
-tok_ids, toks, labels = [data[f] for f in data.files]
-
-for i, tok in enumerate(toks):
-    if isinstance(tok, str):
-        if len(tok) > 35:
-            print(tok)
-    else:
-        print(tok_ids[i])
-        print(type(tok))
-
-
diff --git a/align/misc/check_rich.py b/align/misc/check_rich.py
deleted file mode 100644
index 6233fe54f6418a7afc24cfa88c9409a32c576de3..0000000000000000000000000000000000000000
--- a/align/misc/check_rich.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import sys
-import numpy as np
-
-path = sys.argv[1]
-data = np.load(path, allow_pickle=True)
-tok_ids, toks, labels, upos, idheads, deprels = [data[f] for f in data.files]
-
-print(len(toks))
-print(len(upos))
-
-labs_upos = np.unique(upos)
-labs_idheads = np.unique(idheads)
-labs_deprel = np.unique(deprels)
-print(labs_upos)
-print(labs_idheads)
-print(labs_deprel)
-
-for i, tok in enumerate(toks):
-    if isinstance(tok, str):
-        if len(tok) > 35:
-            print(tok)
-    else:
-        print(tok_ids[i])
-        print(type(tok))
-
-for i, tok in enumerate(upos):
-    if isinstance(tok, str):
-        if len(tok) > 35:
-            print(tok)
-    else:
-        print(tok_ids[i])
-        print(type(tok))
-
-
diff --git a/align/misc/plot_mapping.py b/align/misc/plot_mapping.py
deleted file mode 100644
index 53cb9c109b0fafb033b2884cebe03badeb8cc7a9..0000000000000000000000000000000000000000
--- a/align/misc/plot_mapping.py
+++ /dev/null
@@ -1,183 +0,0 @@
-import numpy as np
-import sys
-import os
-from transformers import BertModel
-import torch
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-from train_model_baseline import generate_sentence_list, collate_batch
-from sklearn.decomposition import PCA
-import matplotlib.pyplot as plt
-import argparse
-from scipy.spatial.distance import cosine
-from scipy.spatial.transform import Rotation
-
-def mapping(src_corpus, tgt_corpus, mapping, sset, fmt):
-
-    data = {}
-    mapping = torch.load(mapping)
-    mapping_tr = mapping.transpose()
-    rotation = Rotation.from_matrix(mapping)
-    rotation_inv = rotation.inv()
-    print("rotation_inv", rotation_inv.as_matrix().shape)
-    #mapping = - mapping
-
-    if sset == 'all':
-        ssets = ['train', 'test', 'dev']
-        for corpus in [src_corpus, tgt_corpus]:
-            for s in ssets:
-                data[corpus][s] = generate_sentence_list(corpus, s, fmt)
-            data[corpus] = data[corpus]['train'] + data[corpus]['test'] + data[corpus]['dev']
-    else:
-        for corpus in [src_corpus, tgt_corpus]:
-            data[corpus] = generate_sentence_list(corpus, sset, fmt)
-
-    data = data[src_corpus] + data[tgt_corpus]
-    batch_size = 64
-
-    dataloader = DataLoader(data, batch_size=batch_size, collate_fn=collate_batch)
-    bert = 'bert-base-multilingual-cased'
-    bert_embeddings = BertModel.from_pretrained(bert)
-
-    #words for which to collect contextual embeddings in order to plot point clouds
-    clouds = ['est','que','is','that']
-    clouds_en = clouds[2:]
-    cloud_embeddings = {}
-    aligned_embeddings = {}
-    for cloud in clouds:
-        cloud_embeddings[cloud] = []
-    for cloud in clouds_en:
-        aligned_embeddings[cloud] = []
-
-    words_fr = ['est','que','le','pour','mais','et']
-    words_en = ['is','that','the','for','but','and']
-    fr = {}
-    fr_al = {}
-    en = {}
-    en_al = {}
-    for word in words_fr:
-        fr[word] = []
-        fr_al[word] = []
-    for word in words_en:
-        en[word] = []
-        en_al[word] = []
-
-    #write this as function
-    for sentence_batch in tqdm(dataloader):
-        bert_output = bert_embeddings(**sentence_batch.getBatchEncoding()).last_hidden_state
-
-        for i, sentence in enumerate(sentence_batch.tokens):
-            bert_sentence_output = bert_output[i]
-
-            for j, token in enumerate(sentence):
-                bert_token_output = bert_sentence_output[j].detach().numpy()
-                if token in clouds:
-                    cloud_embeddings[token].append(bert_token_output)
-                if token in clouds_en:
-                    aligned_emb = np.matmul(bert_token_output, mapping_tr)
-                    aligned_embeddings[token].append(aligned_emb)
-
-                if token in words_fr:
-                    fr[token].append(bert_token_output)
-                    aligned = np.matmul(bert_token_output, mapping_tr)
-                    #aligned = np.matmul(mapping_tr, bert_token_output.transpose())
-                    #aligned = np.matmul(mapping, bert_token_output.transpose())
-                    fr_al[token].append(aligned)
-                if token in words_en:
-                    en[token].append(bert_token_output)
-                    aligned = np.matmul(bert_token_output, mapping_tr)
-                    #aligned = np.matmul(mapping_tr, bert_token_output.transpose())
-                    #aligned = np.matmul(mapping, bert_token_output.transpose())
-                    en_al[token].append(aligned)
-
-    pca = PCA(n_components=2)
-    #plot_clouds(cloud_embeddings, pca, 'before')
-    for cloud in clouds[:2]:
-        aligned_embeddings[cloud] = cloud_embeddings[cloud] #add unchanged target vectors
-    #plot_clouds(aligned_embeddings, pca, 'after')
-    analyze(words_fr, words_en, fr, en, fr_al, en_al)
-
-def plot_clouds(cloud_embeddings, pca, text):
-    tok_en0, tok_en1, tok_fr0, tok_fr1 = cloud_embeddings.keys()
-
-    print(f'b= {tok_en0}')
-    print(f'c= {tok_en1}')
-    print(f'm= {tok_fr0}')
-    print(f'r= {tok_fr1}')
-
-    colors = ['b', 'c', 'm', 'r']
-    embs_en0 = np.array([emb for emb in cloud_embeddings[tok_en0]])
-    embs_en1 = np.array([emb for emb in cloud_embeddings[tok_en1]])
-    embs_fr0 = np.array([emb for emb in cloud_embeddings[tok_fr0]])
-    embs_fr1 = np.array([emb for emb in cloud_embeddings[tok_fr1]])
-
-    n_en0 = embs_en0.shape[0]
-    n_en1 = embs_en1.shape[0]
-    n_fr0 = embs_fr0.shape[0]
-    n_fr1 = embs_fr1.shape[0]
-
-    full_embs = np.concatenate((embs_en0, embs_en1, embs_fr0, embs_fr1), axis=0)
-    embs_reduced = pca.fit_transform(full_embs)
-
-    print(n_en0, n_en1, n_fr0, n_fr1)
-    transp = embs_reduced.transpose()
-    red_en0 = transp[:,:n_en0]
-    sep = n_en0 + n_en1
-    red_en1 = transp[:,n_en0:sep]
-    sep1 = sep + n_fr0
-    red_fr0 = transp[:,sep:sep1]
-    red_fr1 = transp[:,sep1:]
-
-    plt.scatter(red_en0[0], red_en0[1], marker='.', color=colors[0], label = tok_en0)
-    plt.scatter(red_en1[0], red_en1[1], marker='.', color=colors[1], label = tok_en1)
-    plt.scatter(red_fr0[0], red_fr0[1], marker='.', color=colors[2], label = tok_fr0)
-    plt.scatter(red_fr1[0], red_fr1[1], marker='.', color=colors[3], label = tok_fr1)
-
-    plt.title(f'{text} alignment')
-    plt.legend()
-    plt.show()
-    plt.savefig(f'{text}.png')
-    plt.clf()
-
-def analyze(words_fr, words_en, fr, en, fr_al, en_al):
-
-    print("EN aligned")
-    for word_fr in words_fr:
-        for word_en in words_en:
-            #dist_before = cosine(np.array(fr[word_fr]).mean(axis=0),np.array(en[word_en]).mean(axis=0))
-            dist_before = np.linalg.norm(np.array(fr[word_fr]).mean(axis=0) - np.array(en[word_en]).mean(axis=0))
-            #dist_after = cosine(np.array(fr[word_fr]).mean(axis=0),np.array(en_al[word_en]).mean(axis=0))
-            dist_after = np.linalg.norm(np.array(fr[word_fr]).mean(axis=0) - np.array(en_al[word_en]).mean(axis=0))
-            print(f'{word_fr} -- {word_en}')
-            print(f'Distance before = {dist_before}')
-            print(f'Distance after = {dist_after}')
-            print(f'Displacement = {dist_before - dist_after}')
-
-    print('\n========\nFR aligned')
-    for word_fr in words_fr:
-        for word_en in words_en:
-            #dist_before = cosine(np.array(fr[word_fr]).mean(axis=0),np.array(en[word_en]).mean(axis=0))
-            dist_before = np.linalg.norm(np.array(fr[word_fr]).mean(axis=0) - np.array(en[word_en]).mean(axis=0))
-            #dist_after = cosine(np.array(fr_al[word_fr]).mean(axis=0),np.array(en[word_en]).mean(axis=0))
-            dist_after = np.linalg.norm(np.array(fr_al[word_fr]).mean(axis=0) - np.array(en[word_en]).mean(axis=0))
-            print(f'{word_fr} -- {word_en}')
-            print(f'Distance before = {dist_before}')
-            print(f'Distance after = {dist_after}')
-            print(f'Displacement = {dist_before - dist_after}')
-
-def main():
-    parser = argparse.ArgumentParser(description='Compute anchors for a given dataset and a given word list')
-    parser.add_argument('--src_corpus', required=True, help='corpus to align')
-    parser.add_argument('--tgt_corpus', required=True, help='corpus on which to align')
-    parser.add_argument('--mapping', required=True, help='path to .pth mapping file')
-    parser.add_argument('--set', default='train', help='portion of the corpus to test on (train/test/dev/all)')
-    parser.add_argument('--format', default='conllu', help='tok or conllu')
-    params = parser.parse_args()
-
-    if not os.path.isfile(params.mapping):
-        print(f'no file at {params.mapping}.')
-        sys.exit()
-    mapping(params.src_corpus, params.tgt_corpus, params.mapping, params.set, params.format)
-
-if __name__ == '__main__':
-    main()
diff --git a/align/misc/stupid_tagger.py b/align/misc/stupid_tagger.py
deleted file mode 100644
index e0c31b28e7980f2d2a6da385d33bbd5c94aee4a8..0000000000000000000000000000000000000000
--- a/align/misc/stupid_tagger.py
+++ /dev/null
@@ -1,202 +0,0 @@
-from typing import Dict, Optional, List, Any
-
-import numpy
-from overrides import overrides
-import torch
-from torch.nn.modules.linear import Linear
-import torch.nn.functional as F
-
-from allennlp.common.checks import check_dimensions_match, ConfigurationError
-from allennlp.data import Vocabulary
-from allennlp.modules import Seq2SeqEncoder, TimeDistributed, TextFieldEmbedder
-from allennlp.models.model import Model
-from allennlp.nn import InitializerApplicator, RegularizerApplicator
-from allennlp.nn.util import get_text_field_mask, sequence_cross_entropy_with_logits
-from allennlp.training.metrics import CategoricalAccuracy, SpanBasedF1Measure
-
-
-@Model.register("stupid_tagger")
-class StupidTagger(Model):
-    """
-    This ``SimpleTagger`` simply encodes a sequence of text with a stacked ``Seq2SeqEncoder``, then
-    predicts a tag for each token in the sequence.
-
-    Parameters
-    ----------
-    vocab : ``Vocabulary``, required
-        A Vocabulary, required in order to compute sizes for input/output projections.
-    text_field_embedder : ``TextFieldEmbedder``, required
-        Used to embed the ``tokens`` ``TextField`` we get as input to the model.
-    encoder : ``Seq2SeqEncoder``
-        The encoder (with its own internal stacking) that we will use in between embedding tokens
-        and predicting output tags.
-    calculate_span_f1 : ``bool``, optional (default=``None``)
-        Calculate span-level F1 metrics during training. If this is ``True``, then
-        ``label_encoding`` is required. If ``None`` and
-        label_encoding is specified, this is set to ``True``.
-        If ``None`` and label_encoding is not specified, it defaults
-        to ``False``.
-    label_encoding : ``str``, optional (default=``None``)
-        Label encoding to use when calculating span f1.
-        Valid options are "BIO", "BIOUL", "IOB1", "BMES".
-        Required if ``calculate_span_f1`` is true.
-    label_namespace : ``str``, optional (default=``labels``)
-        This is needed to compute the SpanBasedF1Measure metric, if desired.
-        Unless you did something unusual, the default value should be what you want.
-    verbose_metrics : ``bool``, optional (default = False)
-        If true, metrics will be returned per label class in addition
-        to the overall statistics.
-    initializer : ``InitializerApplicator``, optional (default=``InitializerApplicator()``)
-        Used to initialize the model parameters.
-    regularizer : ``RegularizerApplicator``, optional (default=``None``)
-        If provided, will be used to calculate the regularization penalty during training.
-    """
-
-    def __init__(self, vocab: Vocabulary,
-                 text_field_embedder: TextFieldEmbedder,
-                 encoder: Seq2SeqEncoder,
-                 calculate_span_f1: bool = None,
-                 label_encoding: Optional[str] = None,
-                 label_namespace: str = "labels",
-                 verbose_metrics: bool = False,
-                 initializer: InitializerApplicator = InitializerApplicator(),
-                 regularizer: Optional[RegularizerApplicator] = None) -> None:
-        super().__init__(vocab, regularizer)
-
-        self.label_namespace = label_namespace
-        self.text_field_embedder = text_field_embedder
-        self.num_classes = self.vocab.get_vocab_size(label_namespace)
-        self.encoder = encoder
-        self._verbose_metrics = verbose_metrics
-        self.tag_projection_layer = TimeDistributed(Linear(self.encoder.get_output_dim(),
-                                                           self.num_classes))
-
-        check_dimensions_match(text_field_embedder.get_output_dim(), encoder.get_input_dim(),
-                               "text field embedding dim", "encoder input dim")
-
-        # We keep calculate_span_f1 as a constructor argument for API consistency with
-        # the CrfTagger, even it is redundant in this class
-        # (label_encoding serves the same purpose).
-        if calculate_span_f1 and not label_encoding:
-            raise ConfigurationError("calculate_span_f1 is True, but "
-                                     "no label_encoding was specified.")
-        self.metrics = {
-                "accuracy": CategoricalAccuracy(),
-                "accuracy3": CategoricalAccuracy(top_k=3)
-        }
-
-        if calculate_span_f1 or label_encoding:
-            self._f1_metric = SpanBasedF1Measure(vocab,
-                                                 tag_namespace=label_namespace,
-                                                 label_encoding=label_encoding)
-        else:
-            self._f1_metric = None
-
-        initializer(self)
-
-    @overrides
-    def forward(self,  # type: ignore
-                tokens: Dict[str, torch.LongTensor],
-                tags: torch.LongTensor = None,
-                metadata: List[Dict[str, Any]] = None) -> Dict[str, torch.Tensor]:
-        # pylint: disable=arguments-differ
-        """
-        Parameters
-        ----------
-        tokens : Dict[str, torch.LongTensor], required
-            The output of ``TextField.as_array()``, which should typically be passed directly to a
-            ``TextFieldEmbedder``. This output is a dictionary mapping keys to ``TokenIndexer``
-            tensors. At its most basic, using a ``SingleIdTokenIndexer`` this is: ``{"tokens":
-            Tensor(batch_size, num_tokens)}``. This dictionary will have the same keys as were used
-            for the ``TokenIndexers`` when you created the ``TextField`` representing your
-            sequence. The dictionary is designed to be passed directly to a ``TextFieldEmbedder``,
-            which knows how to combine different word representations into a single vector per
-            token in your input.
-        tags : torch.LongTensor, optional (default = None)
-            A torch tensor representing the sequence of integer gold class labels of shape
-            ``(batch_size, num_tokens)``.
-        metadata : ``List[Dict[str, Any]]``, optional, (default = None)
-            metadata containing the original words in the sentence to be tagged under a 'words' key.
-
-        Returns
-        -------
-        An output dictionary consisting of:
-        logits : torch.FloatTensor
-            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
-            unnormalised log probabilities of the tag classes.
-        class_probabilities : torch.FloatTensor
-            A tensor of shape ``(batch_size, num_tokens, tag_vocab_size)`` representing
-            a distribution of the tag classes per word.
-        loss : torch.FloatTensor, optional
-            A scalar loss to be optimised.
-
-        """
-        embedded_text_input = self.text_field_embedder(tokens)
-        batch_size, sequence_length, _ = embedded_text_input.size()
-        mask = get_text_field_mask(tokens)
-        encoded_text = self.encoder(embedded_text_input, mask)
-
-        logits = self.tag_projection_layer(encoded_text)
-        reshaped_log_probs = logits.view(-1, self.num_classes)
-        class_probabilities = F.softmax(reshaped_log_probs, dim=-1).view([batch_size,
-                                                                          sequence_length,
-                                                                          self.num_classes])
-
-        output_dict = {"logits": logits, "class_probabilities": class_probabilities}
-
-        if tags is not None:
-            loss = sequence_cross_entropy_with_logits(logits, tags, mask)
-            for metric in self.metrics.values():
-                metric(logits, tags, mask.float())
-            if self._f1_metric is not None:
-                self._f1_metric(logits, tags, mask.float())
-            output_dict["loss"] = loss
-
-        if metadata is not None:
-            output_dict["words"] = [x["words"] for x in metadata]
-
-        #OVERWRITE VALUES
-        #output_dict['loss'] = torch.tensor(0).to(torch.long)
-        probas = torch.zeros(batch_size, sequence_length, self.num_classes)
-        probas[:,0,-1] = 1
-        probas[:,1:,0] = 1
-        output_dict['logits'] = probas
-        output_dict['class_probabilities'] = probas
-
-        return output_dict
-
-    @overrides
-    def decode(self, output_dict: Dict[str, torch.Tensor]) -> Dict[str, torch.Tensor]:
-        """
-        Does a simple position-wise argmax over each token, converts indices to string labels, and
-        adds a ``"tags"`` key to the dictionary with the result.
-        """
-        all_predictions = output_dict['class_probabilities']
-        all_predictions = all_predictions.cpu().data.numpy()
-        if all_predictions.ndim == 3:
-            predictions_list = [all_predictions[i] for i in range(all_predictions.shape[0])]
-        else:
-            predictions_list = [all_predictions]
-        all_tags = []
-        for predictions in predictions_list:
-            argmax_indices = numpy.argmax(predictions, axis=-1)
-            tags = [self.vocab.get_token_from_index(x, namespace="labels")
-                    for x in argmax_indices]
-            all_tags.append(tags)
-        output_dict['tags'] = all_tags
-        return output_dict
-
-    @overrides
-    def get_metrics(self, reset: bool = False) -> Dict[str, float]:
-        metrics_to_return = {metric_name: metric.get_metric(reset) for
-                             metric_name, metric in self.metrics.items()}
-
-        if self._f1_metric is not None:
-            f1_dict = self._f1_metric.get_metric(reset=reset)
-            if self._verbose_metrics:
-                metrics_to_return.update(f1_dict)
-            else:
-                metrics_to_return.update({
-                        x: y for x, y in f1_dict.items() if
-                        "overall" in x})
-        return metrics_to_return
diff --git a/align/post_lstm.py b/align/post_lstm.py
deleted file mode 100644
index c9aa4c7ddd344752a305509009d3d28c7f4dadb7..0000000000000000000000000000000000000000
--- a/align/post_lstm.py
+++ /dev/null
@@ -1,34 +0,0 @@
-import sys, os
-import torch
-from visualize import plot_heatmap_anchors
-from transformers import BertTokenizer
-
-tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
-
-def make_anchor(tok_id):
-    input_file = os.path.join('saved_post_lstm', f'{tok_id}.pth')
-    matrix = torch.load(input_file)
-    return torch.mean(matrix, dim=0)
-
-def plot_post_lstm(tokens):
-    dico = {}
-
-    dico['First'] = make_anchor('fst')
-    dico['Random'] = make_anchor('rand')
-
-    #for token in tokens:
-    #    tok_id = tokenizer.convert_tokens_to_ids(token)
-    #    dico[token] = make_anchor(tok_id)
-
-    plot_heatmap_anchors(tokens, dico, 'post_lstm_rstdt_fst_rand', n_dim=200, emb_dim=200)
-
-def main():
-    if len(sys.argv) < 2:
-        print('Please enter at least one token id.')
-        sys.exit()
-    tokens = sys.argv[1:]
-    print(tokens)
-    plot_post_lstm(tokens)
-
-if __name__ == '__main__':
-    main()
diff --git a/code/contextual_embeddings/configs/bert_custom.jsonnet b/code/contextual_embeddings/configs/bert_custom.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..8763334aa3e5274a22ff901f30f99cc50e7ef183
--- /dev/null
+++ b/code/contextual_embeddings/configs/bert_custom.jsonnet
@@ -0,0 +1,72 @@
+{
+  "dataset_reader": {
+    "type": "custom_disrpt_reader",
+    "tag_label": "ner",
+    "coding_scheme": "BIOUL",
+    "token_indexers": {
+      "bert": {
+        "type": "bert-pretrained",
+        "pretrained_model": std.extVar("BERT_VOCAB"),
+        "do_lowercase": false,
+        "use_starting_offsets": true
+      },
+      "token_characters": {
+        "type": "characters",
+        "min_padding_length": 3
+      },
+    }
+  },
+  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
+  "validation_data_path": std.extVar("TEST_A_PATH"),
+  "model": {
+    "type": "simple_tagger",
+    "text_field_embedder": {
+      "allow_unmatched_keys": true,
+      "embedder_to_indexer_map": {
+        "bert": ["bert", "bert-offsets"],
+        "token_characters": ["token_characters"]
+      },
+      "token_embedders": {
+        "bert": {
+          "type": "bert-pretrained",
+          "pretrained_model": std.extVar("BERT_WEIGHTS")
+        },
+        "token_characters": {
+          "type": "character_encoding",
+          "embedding": {
+            "embedding_dim": 16
+          },
+          "encoder": {
+            "type": "cnn",
+            "embedding_dim": 16,
+            "num_filters": 128,
+            "ngram_filter_sizes": [3],
+            "conv_layer_activation": "relu"
+          }
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 896,
+      "hidden_size": 100,
+      "num_layers": 2,
+      "dropout": 0.5,
+      "bidirectional": true
+    }
+  },
+  "iterator": {
+    "type": "basic",
+    "batch_size": 2
+  },
+  "trainer": {
+    "optimizer": {
+      "type": "bert_adam",
+      "lr": 0.001
+    },
+    "num_serialized_models_to_keep": 3,
+    "num_epochs": 4,
+    "grad_norm": 5.0,
+    "cuda_device": 0
+  }
+}
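Note: the configs added in this diff read their data paths and model names from environment variables via `std.extVar`, so they are normally driven by a wrapper such as the `expes_elmo.sh` script removed at the end of this diff. As a minimal sketch of a manual run of `bert_custom.jsonnet` (the corpus name and directory layout are illustrative, borrowed from that script, and `bert-base-multilingual-cased` is the value suggested by the script's commented-out block):

    export BERT_VOCAB="bert-base-multilingual-cased"
    export BERT_WEIGHTS="bert-base-multilingual-cased"
    export TRAIN_DATA_PATH="data_converted/eng.rst.gum_train.ner.tok"
    export TEST_A_PATH="data_converted/eng.rst.gum_dev.ner.tok"
    allennlp train -s Results_tok/results_eng.rst.gum_bert code/contextual_embeddings/configs/bert_custom.jsonnet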
diff --git a/code/contextual_embeddings/configs/elmo.jsonnet b/code/contextual_embeddings/configs/elmo.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..f09fe098f194104b8605b8b0521ca1878a8bffea
--- /dev/null
+++ b/code/contextual_embeddings/configs/elmo.jsonnet
@@ -0,0 +1,93 @@
+// Configuration for the NER model with ELMo, modified slightly from
+// the version included in "Deep Contextualized Word Representations",
+// taken from AllenNLP examples
+// modified for the disrpt discourse segmentation shared task -- 2019
+{
+
+  "dataset_reader": {
+    "type": "conll2003",
+    "tag_label": "ner",
+    "coding_scheme": "BIOUL",
+    "token_indexers": {
+      "tokens": {
+        "type": "single_id",
+        "lowercase_tokens": true
+      },
+      "token_characters": {
+        "type": "characters",
+        "min_padding_length": 3
+      },
+      "elmo": {
+        "type": "elmo_characters"
+      }
+    }
+  },
+  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
+  "validation_data_path": std.extVar("TEST_A_PATH"),
+  "model": {
+    "type": "simple_tagger",
+    "text_field_embedder": {
+      "token_embedders": {
+        "tokens": {
+          "type": "embedding",
+          "embedding_dim": 50,
+          "pretrained_file": "../tony/embeddings/glove.6B.50d.txt",
+          "trainable": true
+        },
+        "elmo":{
+          "type": "elmo_token_embedder",
+          "options_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_options.json",
+          "weight_file": "https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway/elmo_2x4096_512_2048cnn_2xhighway_weights.hdf5",
+          "do_layer_norm": false,
+          "dropout": 0.0
+        },
+        "token_characters": {
+          "type": "character_encoding",
+          "embedding": {
+            "embedding_dim": 16
+          },
+          "encoder": {
+            "type": "cnn",
+            "embedding_dim": 16,
+            "num_filters": 128,
+            "ngram_filter_sizes": [3],
+            "conv_layer_activation": "relu"
+          }
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 1202,
+      "hidden_size": 100,
+      "num_layers": 1,
+      "dropout": 0.5,
+      "bidirectional": true
+    },
+    "regularizer": [
+      [
+        "scalar_parameters",
+        {
+          "type": "l2",
+          "alpha": 0.1
+        }
+      ]
+    ]
+  },
+  "iterator": {
+    "type": "basic",
+    "batch_size": 2
+  },
+  "trainer": {
+    "optimizer": {
+      "type": "adam",
+      "lr": 0.001
+    },
+    //"validation_metric": "+f1_measure",
+    "num_serialized_models_to_keep": 3,
+    "num_epochs": 10,
+    "grad_norm": 5.0,
+    "patience": 2,
+    "cuda_device": 0
+  }
+}
diff --git a/code/contextual_embeddings/configs/melmo.jsonnet b/code/contextual_embeddings/configs/melmo.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..4d3581793f7fbe7c8b7a02aa5f0430ec89a8a539
--- /dev/null
+++ b/code/contextual_embeddings/configs/melmo.jsonnet
@@ -0,0 +1,116 @@
+// Configuration for the NER model with ELMo, modified slightly from
+// the version included in "Deep Contextualized Word Representations",
+// taken from AllenNLP examples
+// modified for the disrpt discourse segmentation shared task -- 2019
+{
+
+  "dataset_reader": {
+    "type": "custom_conll_reader",
+    "tag_label": "ner",
+    "coding_scheme": "BIOUL",
+    "token_indexers": {
+      "tokens": {
+        "type": "single_id",
+        "lowercase_tokens": true
+      },
+      "token_characters": {
+        "type": "characters",
+        "min_padding_length": 3
+      },
+      "elmo": {
+        "type": "elmo_characters"
+      }
+    }
+  },
+  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
+  "validation_data_path": std.extVar("TEST_A_PATH"),
+  "model": {
+    "type": "custom_simple_tagger",
+    "text_field_embedder": {
+      "token_embedders": {
+        "tokens": {
+          "type": "embedding",
+          "embedding_dim": 50,
+          "pretrained_file": "embeddings/glove.6B.50d.txt",
+          "trainable": true
+        },
+        "elmo": {
+          "type": "elmo_token_embedder_multilang",
+          "do_layer_norm": false,
+          "dropout": 0.3,
+          "scalar_mix_parameters": [
+            -9e10,
+            1,
+            -9e10
+          ],
+          "options_files": {
+            "en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json"
+          },
+          "weight_files": {
+            "en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/en_weights.hdf5",
+            "es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/es_weights.hdf5",
+            "fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/fr_weights.hdf5",
+            "it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/it_weights.hdf5",
+            "pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/pt_weights.hdf5",
+            "sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/sv_weights.hdf5",
+            "de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/de_weights.hdf5"
+          }
+        },
+        "token_characters": {
+          "type": "character_encoding",
+          "embedding": {
+            "embedding_dim": 16
+          },
+          "encoder": {
+            "type": "cnn",
+            "embedding_dim": 16,
+            "num_filters": 128,
+            "ngram_filter_sizes": [3],
+            "conv_layer_activation": "relu"
+          }
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 1202,
+      "hidden_size": 100,
+      "num_layers": 1,
+      "dropout": 0.5,
+      "bidirectional": true
+    },
+    "regularizer": [
+      [
+        "scalar_parameters",
+        {
+          "type": "l2",
+          "alpha": 0.1
+        }
+      ]
+    ]
+  },
+  "iterator": {
+    "type": "same_language",
+    "batch_size": 2,
+    "sorting_keys": [["words", "num_tokens"]],
+    "instances_per_epoch": 32000
+  },
+  "trainer": {
+    "optimizer": {
+      "type": "adam",
+      "lr": 0.001
+    },
+    //"validation_metric": "+f1_measure",
+    "num_serialized_models_to_keep": 3,
+    "num_epochs": 4,
+    "grad_norm": 5.0,
+    "patience": 2,
+    "cuda_device": 0
+  }
+}
diff --git a/code/contextual_embeddings/configs/melmo_aligned.jsonnet b/code/contextual_embeddings/configs/melmo_aligned.jsonnet
new file mode 100644
index 0000000000000000000000000000000000000000..637c04c2b69b511d64375ee023bd81fa0b788b91
--- /dev/null
+++ b/code/contextual_embeddings/configs/melmo_aligned.jsonnet
@@ -0,0 +1,125 @@
+// Configuration for the NER model with ELMo, modified slightly from
+// the version included in "Deep Contextualized Word Representations",
+// taken from AllenNLP examples
+// modified for the disrpt discourse segmentation shared task -- 2019
+{
+
+  "dataset_reader": {
+    "type": "custom_conll_reader",
+    "tag_label": "ner",
+    "coding_scheme": "BIOUL",
+    "token_indexers": {
+      "tokens": {
+        "type": "single_id",
+        "lowercase_tokens": true
+      },
+      "token_characters": {
+        "type": "characters",
+        "min_padding_length": 3
+      },
+      "elmo": {
+        "type": "elmo_characters"
+      }
+    }
+  },
+  "train_data_path": std.extVar("TRAIN_DATA_PATH"),
+  "validation_data_path": std.extVar("TEST_A_PATH"),
+  "model": {
+    "type": "custom_simple_tagger",
+    "text_field_embedder": {
+      "token_embedders": {
+        "tokens": {
+          "type": "embedding",
+          "embedding_dim": 50,
+          "pretrained_file": "embeddings/glove.6B.50d.txt",
+          "trainable": true
+        },
+        "elmo": {
+          "type": "elmo_token_embedder_multilang",
+          "do_layer_norm": false,
+          "dropout": 0.3,
+          "scalar_mix_parameters": [
+            -9e10,
+            1,
+            -9e10
+          ],
+          "options_files": {
+            "en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json",
+            "de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/options262.json"
+          },
+          "weight_files": {
+            "en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/en_weights.hdf5",
+            "es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/es_weights.hdf5",
+            "fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/fr_weights.hdf5",
+            "it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/it_weights.hdf5",
+            "pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/pt_weights.hdf5",
+            "sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/sv_weights.hdf5",
+            "de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/de_weights.hdf5"
+          },
+          "aligning_files": {
+            "en": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/en_best_mapping.pth",
+            "es": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/es_best_mapping.pth",
+            "fr": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/fr_best_mapping.pth",
+            "it": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/it_best_mapping.pth",
+            "pt": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/pt_best_mapping.pth",
+            "sv": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/sv_best_mapping.pth",
+            "de": "https://s3-us-west-2.amazonaws.com/allennlp/models/multilingual_elmo/de_best_mapping.pth"
+          },
+        },
+        "token_characters": {
+          "type": "character_encoding",
+          "embedding": {
+            "embedding_dim": 16
+          },
+          "encoder": {
+            "type": "cnn",
+            "embedding_dim": 16,
+            "num_filters": 128,
+            "ngram_filter_sizes": [3],
+            "conv_layer_activation": "relu"
+          }
+        }
+      }
+    },
+    "encoder": {
+      "type": "lstm",
+      "input_size": 1202,
+      "hidden_size": 100,
+      "num_layers": 1,
+      "dropout": 0.5,
+      "bidirectional": true
+    },
+    "regularizer": [
+      [
+        "scalar_parameters",
+        {
+          "type": "l2",
+          "alpha": 0.1
+        }
+      ]
+    ]
+  },
+  "iterator": {
+    "type": "same_language",
+    "batch_size": 2,
+    "sorting_keys": [["words", "num_tokens"]],
+    "instances_per_epoch": 32000
+  },
+  "trainer": {
+    "optimizer": {
+      "type": "adam",
+      "lr": 0.001
+    },
+    //"validation_metric": "+f1_measure",
+    "num_serialized_models_to_keep": 3,
+    "num_epochs": 4,
+    "grad_norm": 5.0,
+    "patience": 2,
+    "cuda_device": 0
+  }
+}
diff --git a/code/contextual_embeddings/expes_elmo.sh b/code/contextual_embeddings/expes_elmo.sh
deleted file mode 100644
index 4a4755ac770ee4cd6eba98db551507049b9fb005..0000000000000000000000000000000000000000
--- a/code/contextual_embeddings/expes_elmo.sh
+++ /dev/null
@@ -1,110 +0,0 @@
-# usage
-# bash expes.sh dataset config model action [parent]
-
-echo "data=$1, config=$2, model=$3, action=$4"
-
-export DATASET=${1}
-# eg "eng.rst.gum"
-
-export CONFIG=${2}
-# options: conll tok split.tok
-
-export MODEL=${3}
-# options: bert xlm
-
-export ACTION=${4}
-# options: train test
-
-if [ -z "$5" ];
-then
-    export HAS_PAR=false
-    export TOOLONG=false
-elif [ "${5}" = "--s" ];
-then
-    export TOOLONG=true
-    export SPLIT=${6}
-else
-    export HAS_PAR=true
-    export TOOLONG=false
-    export PARENT=${5}
-fi
-
-if [ $# -eq 7 ] && [ "${6}" = "--s" ];
-then
-    export TOOLONG=true
-    export SPLIT=${7}
-fi
-
-#if [ "$MODEL" = "xlm" ];
-#then
-#    export BERT_VOCAB="xlm-roberta-base"
-#    export BERT_WEIGHTS="xlm-roberta-base"
-#else
-#    export BERT_VOCAB="bert-base-multilingual-cased"
-#    export BERT_WEIGHTS="bert-base-multilingual-cased"
-#fi
-
-if [ "$ACTION" = "train" ];
-then
-    export EVAL=dev
-else
-    export EVAL=test
-fi
-
-export GOLD_BASE="data/"
-export CONV="data_converted/"
-export CODE="code/contextual_embeddings/"
-export TRAIN_DATA_PATH=${CONV}${DATASET}"_train.ner."${CONFIG}
-export TEST_A_PATH=${CONV}${DATASET}"_"${EVAL}".ner."${CONFIG}
-export OUTPUT=${DATASET}"_"${MODEL}
-export GOLD=${GOLD_BASE}${DATASET}"/"${DATASET}"_"${EVAL}"."${CONFIG}
-
-mkdir -p ${CONV}
-
-for val in "train" ${EVAL}; do
-    export original=${GOLD_BASE}"/"${DATASET}"/"${DATASET}"_"${val}"."${CONFIG}
-    export converted=${CONV}/${DATASET}"_"${val}".ner."${CONFIG}
-    # conversion of datasets to NER / BIO format, first checking whether the files exist so as not to redo it each time
-    if [ ! -f ${converted} ]; then
-        echo "converting "${val}" to ner format -> in data_converted ..."
-        if [ $TOOLONG = true ];
-        then
-            python ${CODE}conv2ner.py ${original} ${converted} --split-too-long True ${SPLIT}
-        else
-            python ${CODE}conv2ner.py ${original} ${converted}
-        fi
-    fi
-
-done
-
-if [ "$ACTION" = "train" ];
-then
-    if [ $HAS_PAR = true ];
-    then
-        echo "fine tune"
-        # fine tune
-        allennlp fine-tune -m Results_${CONFIG}/results_${PARENT}_${MODEL}/model.tar.gz -c ${CODE}configs/${MODEL}.jsonnet -s Results_${CONFIG}/results_${DATASET}-${PARENT}_${MODEL}
-    else
-        echo "train"
-        # train with the config in ${MODEL}.jsonnet; the config explicitly references the variables TRAIN_DATA_PATH and TEST_A_PATH
-        allennlp train -s Results_${CONFIG}/results_${OUTPUT} ${CODE}configs/${MODEL}.jsonnet
-    fi
-elif [ $HAS_PAR = true ];
-then
-    if [ "$ACTION" = "test" ];
-    then
-        echo "parent test"
-        export TRAIN_DATA_PATH=${CONV}${PARENT}"_train.ner."${CONFIG}
-        export OUTPUT=${PARENT}"_"${MODEL}
-    else
-        echo "finetune test"
-        export OUTPUT=${DATASET}"-"${PARENT}"_"${MODEL}
-    fi
-fi
-
-# predict with model -> outputs json
-allennlp predict --use-dataset-reader --output-file Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json Results_${CONFIG}/results_${OUTPUT}/model.tar.gz ${TEST_A_PATH} --silent
-# convert to disrpt format
-python ${CODE}json2conll.py Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.json ${CONFIG} Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.${CONFIG}
-# eval with disrpt script
-python code/utils/seg_eval.py $GOLD Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.predictions.${CONFIG} >> Results_${CONFIG}/results_${OUTPUT}/${DATASET}_${EVAL}.scores
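Note: melmo.jsonnet and melmo_aligned.jsonnet above are identical except for the "aligning_files" block, which supplies the per-language *_best_mapping.pth mapping matrices applied to the multilingual ELMo states (the same kind of mapping that the deleted plot_mapping.py loads with torch.load and applies with np.matmul). The deleted expes_elmo.sh driver was invoked as its own usage comment describes; a hypothetical run, reusing the corpus name from the script's example comment:

    bash code/contextual_embeddings/expes_elmo.sh eng.rst.gum tok melmo train
    bash code/contextual_embeddings/expes_elmo.sh eng.rst.gum tok melmo test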