diff --git a/notebooks/TP7_m2LiTL_RNN_2324.ipynb b/notebooks/TP7_m2LiTL_RNN_2324.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..fa072f7e5b113ef936d434780b0b47becccee203 --- /dev/null +++ b/notebooks/TP7_m2LiTL_RNN_2324.ipynb @@ -0,0 +1,944 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "toc_visible": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + }, + "accelerator": "GPU", + "gpuClass": "standard" + }, + "cells": [ + { + "cell_type": "markdown", + "source": [ + "\n", + "# TP7: RNNs\n", + "\n", + "# Part1: RNNs: implementing an LSTM with Pytorch\n", + "\n", + "* Modifications in the PyTorch code to use RNNs for classification\n", + "* POS tagging with RNNs\n", + "\n", + "You need to upload:\n", + "* the files from the Allocine corpus\n", + "* the file corresponding to the Fasttext embeddings: cc.fr.300.10000.vec\n", + "* the file *reader_pytorch_tp5.py* that contains some functions used during the TP (see the import line below). Organizing your code with modules improves readability!" + ], + "metadata": { + "id": "ov8EeZiWBpBC" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "tnlfNu2bBm2u" + }, + "outputs": [], + "source": [ + "from reader_pytorch_tp5 import Dataset, load_vectors, load_weights_matrix\n", + "#, load_weights_matrix, load_vectors\n", + "# torch and torch modules to deal with text data\n", + "import torch\n", + "import torch.nn as nn\n", + "from torchtext.data.utils import get_tokenizer\n", + "from torchtext.vocab import build_vocab_from_iterator\n", + "from torch.utils.data import DataLoader\n", + "# you can use scikit to print scores\n", + "from sklearn.metrics import classification_report" + ] + }, + { + "cell_type": "code", + "source": [ + "# CUDA for PyTorch\n", + "use_cuda = torch.cuda.is_available()\n", + "device = torch.device(\"cuda\" if use_cuda else \"cpu\")\n", + "print(device)" + ], + "metadata": { + "id": "KzNS86rVEv-M" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Paths to data:" + ], + "metadata": { + "id": "taGY9N-PJvWS" + } + }, + { + "cell_type": "code", + "source": [ + "# Data files\n", + "train_file = \"allocine_train.tsv\"\n", + "dev_file = \"allocine_dev.tsv\"\n", + "test_file = \"allocine_test.tsv\"\n", + "# embeddings\n", + "embed_file='cc.fr.300.10000.vec'" + ], + "metadata": { + "id": "kGty4hWCJurB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 0- Read and load the data (code given)\n", + "\n", + "We are going to make experiments using either an architecture based on:\n", + "* an embedding bag layer: word embeddings are summed or averaged.\n", + "* an LSTM layer: each sequence of word embeddings goes through the LSTM layer and the resulting vector is used as a new representation of the input sequence.\n", + "\n", + "**Note on batches:**\n", + "* to be able to use batches with the embedding bag layer, we need to use the *offsets* in the *collate_fn* function since the sequences are concatenated.\n", + "* but this is different with the LSTM, since we're not concatenating the sequences. However, we need to have all sequences of the same lenght. This is done by *padding*.\n", + "\n", + "--> We thus should have **2 different *collate_fn* functions, and 2 versions of the training and evaluating functions**, to take into acocunt the *offsets*. 
**Here, to make things simpler, we don't use batches.**" + ], + "metadata": { + "id": "Wv6H41YoFycw" + } + }, + { + "cell_type": "code", + "source": [ + "# Load the training and development data\n", + "trainset = Dataset( train_file )\n", + "devset = Dataset( dev_file, vocab=trainset.vocab )\n", + "\n", + "train_loader = DataLoader(trainset, shuffle=True)\n", + "dev_loader = DataLoader(devset, shuffle=False)" + ], + "metadata": { + "id": "2Sqqf4dQJmHB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Load embeddings\n", + "vectors = load_vectors( embed_file )\n", + "print( 'Version with', len( vectors), 'tokens')\n", + "print(vectors.keys() )\n", + "\n", + "# Compute weights matrix\n", + "weights_matrix = load_weights_matrix( trainset, vectors, emb_dim=300 )" + ], + "metadata": { + "id": "a0XTQXaKJ7IH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 1- Model definition\n", + "\n" + ], + "metadata": { + "id": "TJghdjf-Leun" + } + }, + { + "cell_type": "markdown", + "source": [ + "### 1.1 FFNN with an embedding bag layer (code given)\n", + "\n", + "Below is the code written during TP4 for a classic FeedForward Neural Network using pretrained word embeddings (but without offsets, see the note about batches above)." + ], + "metadata": { + "id": "avYLIUvWMZWL" + } + }, + { + "cell_type": "code", + "source": [ + "class FeedforwardNeuralNetModel(nn.Module):\n", + " def __init__(self, hidden_dim, output_dim, weights_matrix):\n", + " # calls the init function of nn.Module. Dont get confused by syntax,\n", + " # just always do it in an nn.Module\n", + " super(FeedforwardNeuralNetModel, self).__init__()\n", + "\n", + " # Embedding layer\n", + " # mode (string, optional) – \"sum\", \"mean\" or \"max\". Default=mean.\n", + " self.embedding_bag = nn.EmbeddingBag.from_pretrained(\n", + " weights_matrix,\n", + " mode='mean')\n", + " embed_dim = self.embedding_bag.embedding_dim\n", + "\n", + " # Linear function\n", + " self.fc1 = nn.Linear(embed_dim, hidden_dim)\n", + "\n", + " # Non-linearity\n", + " self.sigmoid = nn.Sigmoid()\n", + "\n", + " # Linear function (readout)\n", + " self.fc2 = nn.Linear(hidden_dim, output_dim)\n", + "\n", + " def forward(self, text):\n", + " # Embedding layer\n", + " embedded = self.embedding_bag(text)\n", + "\n", + " # Linear function\n", + " out = self.fc1(embedded)\n", + "\n", + " # Non-linearity\n", + " out = self.sigmoid(out)\n", + "\n", + " # Linear function (readout)\n", + " out = self.fc2(out)\n", + " return out" + ], + "metadata": { + "id": "hZKG5VRhJmJw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 1.2 Exercise: From FFNN to LSTM\n", + "\n", + "We want to replace our hidden layer with an LSTM layer: the LSTM layers takes the word embeddings as input and the output will be directly fed to the output layer.\n", + "* you thus need to replace the embeddingBag layer with a simple embedding layer taking pretrained word embeddings, see the documentation here (search 'from_pretrained') https://pytorch.org/docs/stable/generated/torch.nn.Embedding.html\n", + "* then you need to define an LSTM layer, see the doc here https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html.\n", + "\n", + "\n", + "The *forward* function is given. 
Note that:\n", + "* the LSTM has 2 outputs: ht and ct (hidden and memory state)\n", + "* the output of an LSTM, the 'y', is now the last hidden state computed for the entire sequence (classification task)\n", + "\n", + "In addition, note that in the forward pass, we need to reshape the data using:\n", + "```\n", + "x = x.view(len(x), 1, -1)\n", + "```\n", + "\n", + "We need to reshape our input data before passing it to the LSTM layer, because it takes a 3D tensor of shape *(Sequence length, Batch size, Input size)*.\n", + "This is done with the 'view' method, PyTorch's 'reshape' operation for tensors. (There is also a *batch_first* layout, which is often easier to read.)" + ], + "metadata": { + "id": "Ws0VVWYYMShP" + } + },
+ { + "cell_type": "markdown", + "source": [ + "-------------\n", + "SOLUTION" + ], + "metadata": { + "id": "7L7Zw3YS3icG" + } + },
+ { + "cell_type": "code", + "source": [ + "\n", + "class LSTMModel(nn.Module):\n", + " def __init__(self, hidden_dim, output_dim, weights_matrix, batch_first=True ):\n", + " super(LSTMModel, self).__init__()\n", + "\n", + " # Define an embedding layer\n", + " # -- SOLUTION (one possibility): load the pretrained vectors\n", + " self.embedding = nn.Embedding.from_pretrained(weights_matrix)\n", + " embedding_dim = self.embedding.embedding_dim\n", + "\n", + " # Define an LSTM layer\n", + " # -- SOLUTION: input size = embedding_dim, hidden size = hidden_dim\n", + " self.lstm = nn.LSTM(embedding_dim, hidden_dim)\n", + "\n", + " self.fc2 = nn.Linear(hidden_dim, output_dim) #out, (ht, ct) = self.lstm( embeds )\n", + "\n", + " def forward(self, text):\n", + " embeds = self.embedding(text)\n", + " # print( text) # a tensor of indices representing the tokens in a sequence\n", + " # print( embeds.shape) # (batch, seq, features) eg 1, 107, 300\n", + " # We need: (seq, batch, feature)\n", + " x = embeds.view(text.shape[1], 1, -1) # -1 lets PyTorch infer that dimension\n", + " # print( x.shape) # (seq, batch, features) eg 107, 1, 300\n", + " out, (ht, ct) = self.lstm( x ) # <--- here the real 'out' is ht\n", + " y = self.fc2(ht[-1]) # <--- we keep only the last hidden state\n", + " return y" + ], + "metadata": { + "id": "VV1vVgtmMShQ" + }, + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "markdown", + "source": [ + "## 2- Running experiments\n", + "\n", + "The code to train and evaluate your network is given below (again, the version without offsets)." + ], + "metadata": { + "id": "ykWFAXIFLs3W" + } + },
+ { + "cell_type": "code", + "source": [ + "from sklearn.metrics import classification_report, accuracy_score\n", + "\n", + "def train( model, train_loader, optimizer, num_epochs=5, trace=False ):\n", + " for epoch in range(num_epochs):\n", + " train_loss, total_acc, total_count = 0, 0, 0\n", + " for input, label in train_loader:\n", + " input = input.to(device)\n", + " label = label.to(device)\n", + " # Step 1. Clearing the accumulated gradients\n", + " optimizer.zero_grad()\n", + " # Step 2. Forward pass to get output/logits\n", + " outputs = model( input )\n", + " if trace:\n", + " print(input) # <---- call with trace=True to 'see' the input\n", + " trace=False\n", + " # Step 3. Compute the loss, gradients, and update the parameters by\n", + " # calling optimizer.step()\n", + " # - Calculate Loss: softmax --> cross entropy loss\n", + " loss = criterion(outputs, label)\n", + " # - Getting gradients w.r.t. 
parameters\n", + " loss.backward()\n", + " # - Updating parameters\n", + " optimizer.step()\n", + " # Accumulating the loss over time\n", + " train_loss += loss.item()\n", + " total_acc += (outputs.argmax(1) == label).sum().item()\n", + " total_count += label.size(0)\n", + " # Compute accuracy on train set at each epoch\n", + " print('Epoch: {}. Loss: {}. ACC {} '.format(epoch, train_loss/total_count, total_acc/total_count))\n", + " total_acc, total_count = 0, 0\n", + " train_loss = 0\n", + "\n", + "def evaluate( model, dev_loader ):\n", + " predictions = []\n", + " gold = []\n", + " with torch.no_grad():\n", + " for input, label in dev_loader:\n", + " input = input.to(device)\n", + " label = label.to(device)\n", + " probs = model(input)\n", + " predictions.append( torch.argmax(probs, dim=1).cpu().numpy()[0] )\n", + " gold.append(int(label))\n", + " print(classification_report(gold, predictions))\n", + " return gold, predictions" + ], + "metadata": { + "id": "B5_-75IOJmMY" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 2.1 Exercise: Experiment with an FFNN with an Embedding bag layer\n", + "\n", + "Run first experiments with the embedding bag layer architecture, using the values below for the hyper-parameters." + ], + "metadata": { + "id": "w3IbjQfOL29y" + } + }, + { + "cell_type": "code", + "source": [ + "# Set the values of the hyperparameters\n", + "hidden_dim = 4\n", + "learning_rate = 0.1\n", + "num_epochs = 10\n", + "criterion = nn.CrossEntropyLoss()\n", + "output_dim = 2" + ], + "metadata": { + "id": "FJ913fY3KTKq" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Initialize the model\n", + "model_ffnn = FeedforwardNeuralNetModel( hidden_dim, output_dim, weights_matrix)\n", + "optimizer = torch.optim.SGD(model_ffnn.parameters(), lr=learning_rate)\n", + "model_ffnn = model_ffnn.to(device)\n", + "# Train the model\n", + "train( model_ffnn, train_loader, optimizer, num_epochs=num_epochs )\n", + "# Evaluate on dev\n", + "gold, pred = evaluate( model_ffnn, dev_loader )" + ], + "metadata": { + "id": "SSjMfiIaKTKv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 2.2 Exercise: Experiment with an LSTM layer\n", + "\n", + "Run then experiments using the architecture based on an LSTM layer: Are results better? What about the computation time?\n", + "\n", + "Try a (very) few hyper-parameter variations to see if you can get better results with each model." 
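+ ,"\n", + "As a starting point, one possible way to initialize and run the LSTM model is sketched below (this assumes the LSTMModel class from section 1.2 has been completed; it mirrors the FFNN experiment in 2.1):\n", + "```\n", + "model_lstm = LSTMModel( hidden_dim, output_dim, weights_matrix )\n", + "optimizer = torch.optim.SGD(model_lstm.parameters(), lr=learning_rate)\n", + "model_lstm = model_lstm.to(device)\n", + "# Train the model\n", + "train( model_lstm, train_loader, optimizer, num_epochs=num_epochs )\n", + "# Evaluate on dev\n", + "gold, pred = evaluate( model_lstm, dev_loader )\n", + "```"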
+ ], + "metadata": { + "id": "p7eoYgQ5MA-y" + } + },
+ { + "cell_type": "code", + "source": [ + "# Set the values of the hyperparameters\n", + "hidden_dim = 4\n", + "learning_rate = 0.1\n", + "num_epochs = 30\n", + "criterion = nn.CrossEntropyLoss()\n", + "output_dim = 2" + ], + "metadata": { + "id": "84PEAiufH84d" + }, + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "markdown", + "source": [ + "-----------------\n", + "SOLUTION" + ], + "metadata": { + "id": "Xw5_SEO3nPCr" + } + },
+ { + "cell_type": "code", + "source": [ + "# Initialization of the model\n", + "\n", + "\n", + "# Train the model\n", + "\n", + "\n", + "# Evaluate on dev\n" + ], + "metadata": { + "id": "ZF6_VD4unSuQ" + }, + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "code", + "source": [], + "metadata": { + "id": "SW215Z9AnSxc" + }, + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "code", + "source": [], + "metadata": { + "id": "W6jY1_qgnS0O" + }, + "execution_count": null, + "outputs": [] + },
+ { + "cell_type": "markdown", + "source": [ + "# Part 2: LSTM for POS Tagging\n", + "\n", + "In the previous part, the LSTM was used to encode a sequence, i.e. as a way to get a better representation than a bag of embeddings.\n", + "\n", + "RNNs are powerful for sequence labelling tasks, where the goal is to output a label for each token in the input sequence, e.g. POS tagging, NER, sentence/discourse segmentation (anything annotated with a BIO scheme)..." + ], + "metadata": { + "id": "rTsfKWeP7Jdl" + } + },
+ { + "cell_type": "markdown", + "source": [ + "## 1- Small tutorial\n", + "\n", + "The code for an LSTM tagger is given below:\n", + "* the input is still word embeddings (here initialized randomly)\n", + "* the definition of the LSTM layer is the same as previously\n", + "* the output layer has (probably) more output dimensions, as given here by 'tagset_size', i.e. all the possible POS tags\n", + "\n", + "The difference is in the forward function:\n", + "* now we need to keep the hidden state of each input token in the sequence = tag_space (instead of outputting only the last hidden state)\n", + "\n", + "Note also that here we apply a log-softmax function at the end, because the loss used (NLLLoss) does not include it.\n", + "\n", + "From: https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html" + ], + "metadata": { + "id": "7OOuoZLlIJI2" + } + },
+ { + "cell_type": "code", + "source": [ + "import torch\n", + "import torch.nn as nn\n", + "import torch.nn.functional as F\n", + "\n", + "class LSTMTagger(nn.Module):\n", + "\n", + " def __init__(self, embedding_dim, hidden_dim, vocab_size, tagset_size):\n", + " super(LSTMTagger, self).__init__()\n", + " self.hidden_dim = hidden_dim\n", + "\n", + " self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)\n", + "\n", + " # The LSTM takes word embeddings as inputs, and outputs hidden states\n", + " # with dimensionality hidden_dim.\n", + " self.lstm = nn.LSTM(embedding_dim, hidden_dim)\n", + "\n", + " # The linear layer that maps from hidden state space to tag space\n", + " self.hidden2tag = nn.Linear(hidden_dim, tagset_size)\n", + "\n", + " def forward(self, sentence):\n", + " embeds = self.word_embeddings(sentence)\n", + " #print('embeds.shape', embeds.shape)\n", + " #print(embeds.view(len(sentence), 1, -1).shape)\n", + " lstm_out, _ = self.lstm(embeds.view(len(sentence), 1, -1))\n", + " tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1)) # the whole output, vs output[-1] for classif\n", + " tag_scores = 
F.log_softmax(tag_space, dim=1) # required with nn.NLLLoss()\n", + " return tag_scores" + ], + "metadata": { + "id": "7O_41eHi71SO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 1.1 Prepare data\n", + "\n", + "As usual, an important step is to well prepare the data to be given to our model.\n", + "Below is some code to prepare a small toy dataset.\n", + "\n", + "- we need to go over the input sequences (sentences) and retrieve the vocabulary (words) and the tag set (POS).\n", + "- we build dictionaries to map words and POS to indices, and transform our data to list of indices." + ], + "metadata": { + "id": "sfyJF7bIJv8M" + } + }, + { + "cell_type": "code", + "source": [ + "# Transform a list of tokens into a list of indices using the dict given\n", + "def prepare_sequence(seq, to_ix):\n", + " '''\n", + " - seq: an input sequence of tokens\n", + " - to_ix: a dictionary mapping token to indices\n", + " output: a tensor of indices representing the input sequence\n", + " '''\n", + " idxs = [to_ix[w] for w in seq]\n", + " return torch.tensor(idxs, dtype=torch.long)\n", + "\n", + "# Toy dataset\n", + "training_data = [\n", + " # Tags are: DET - determiner; NN - noun; V - verb\n", + " # For example, the word \"The\" is a determiner\n", + " (\"The dog ate the apple\".split(), [\"DET\", \"NN\", \"V\", \"DET\", \"NN\"]),\n", + " (\"Everybody read that book\".split(), [\"NN\", \"V\", \"DET\", \"NN\"])\n", + "]\n", + "\n", + "# Build the mapping from word to indices\n", + "word_to_ix = {}\n", + "# For each words-list (sentence) and tags-list in each tuple of training_data\n", + "for sent, tags in training_data:\n", + " for word in sent:\n", + " if word not in word_to_ix: # word has not been assigned an index yet\n", + " word_to_ix[word] = len(word_to_ix) # Assign each word with a unique index\n", + "print(word_to_ix)\n", + "\n", + "# Here the mapping for POS tags is given\n", + "tag_to_ix = {\"DET\": 0, \"NN\": 1, \"V\": 2} # Assign each tag with a unique index\n", + "ix_to_tag = {v: k for k, v in tag_to_ix.items()}" + ], + "metadata": { + "id": "6A5Z-EsT7w_V" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 1.3 Run the model\n", + "\n", + "Now we can train the POS tagger over the toy dataset." + ], + "metadata": { + "id": "bJiuDcmyK9Ba" + } + }, + { + "cell_type": "code", + "source": [ + "# These will usually be more like 32 or 64 dimensional.\n", + "# We will keep them small, so we can see how the weights change as we train.\n", + "EMBEDDING_DIM = 6\n", + "HIDDEN_DIM = 6\n", + "\n", + "\n", + "model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))\n", + "loss_function = nn.NLLLoss() # does not include the softmax\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=0.1)" + ], + "metadata": { + "id": "jiSYatB48Brn" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Look at the output of the code below: it corresponds to the scores for each POS (3 possibilities) for each token in the first training sentence (5 words) BEFORE TRAINING." 
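+ ,"\n", + "Since the forward pass ends with *F.log_softmax*, each row of the output contains log-probabilities over the 3 tags. As a quick sanity check (a small sketch, not part of the original tutorial), exponentiating the scores should give rows that sum to 1:\n", + "```\n", + "torch.exp(tag_scores).sum(dim=1)  # one value per token, each close to 1.0\n", + "```"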
+ ], + "metadata": { + "id": "TRY-H1iMLVmM" + } + }, + { + "cell_type": "code", + "source": [ + "# See what the scores are before training\n", + "# Note that element i,j of the output is the score for tag j for word i.\n", + "# Here we don't need to train, so the code is wrapped in torch.no_grad()\n", + "with torch.no_grad():\n", + " inputs = prepare_sequence(training_data[0][0], word_to_ix)\n", + " tag_scores = model(inputs)\n", + " print(tag_scores)" + ], + "metadata": { + "id": "XdC_GJufLN1d" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "Now we train the model:" + ], + "metadata": { + "id": "Igkyu3W1LlnM" + } + }, + { + "cell_type": "code", + "source": [ + "for epoch in range(300): # again, normally you would NOT do 300 epochs, it is toy data\n", + " for sentence, tags in training_data:\n", + " # Step 1. Remember that Pytorch accumulates gradients.\n", + " # We need to clear them out before each instance\n", + " model.zero_grad()\n", + "\n", + " # Step 2. Get our inputs ready for the network, that is, turn them into\n", + " # Tensors of word indices.\n", + " sentence_in = prepare_sequence(sentence, word_to_ix)\n", + " targets = prepare_sequence(tags, tag_to_ix)\n", + "\n", + " # Step 3. Run our forward pass.\n", + " tag_scores = model(sentence_in)\n", + "\n", + " # Step 4. Compute the loss, gradients, and update the parameters by\n", + " # calling optimizer.step()\n", + " loss = loss_function(tag_scores, targets)\n", + " loss.backward()\n", + " optimizer.step()" + ], + "metadata": { + "id": "hW9FiA4dLHRw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "And we look at the score after training: what do you see?" + ], + "metadata": { + "id": "4JmZd7P4Lp46" + } + }, + { + "cell_type": "code", + "source": [ + "# See what the scores are after training\n", + "with torch.no_grad():\n", + " inputs = prepare_sequence(training_data[0][0], word_to_ix)\n", + " tag_scores = model(inputs)\n", + " predictions = torch.argmax(tag_scores, dim=1).cpu().numpy()\n", + " print(tag_scores)\n", + " print(predictions)\n", + " print(training_data[0][0])\n", + " print( [ix_to_tag[p] for p in predictions])\n", + "\n", + " # The sentence is \"the dog ate the apple\". i,j corresponds to score for tag j\n", + " # for word i. 
The predicted tag is the maximum scoring tag.\n", + " # Here, we can see the predicted sequence below is 0 1 2 0 1\n", + " # since 0 is index of the maximum value of row 1,\n", + " # 1 is the index of maximum value of row 2, etc.\n", + " # Which is DET NOUN VERB DET NOUN, the correct sequence!\n", + " #print(tag_scores)" + ], + "metadata": { + "id": "Q_xgiu1MBAnB" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 2- Training a POS tagger on a large set of data" + ], + "metadata": { + "id": "OiMM4pDON_kB" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install torchdata\n", + "!pip install portalocker>=2.0.0" + ], + "metadata": { + "id": "qWakPj_QQwPx" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from torch.utils.data import Dataset\n", + "from torchtext import datasets" + ], + "metadata": { + "id": "VyyuXUhmUg5r" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "#ud_dataset = torchtext.datasets.UDPOS(root: str = '.data', split: Union[Tuple[str], str] = ('train', 'valid', 'test'))\n", + "\n", + "train_iter, test_iter = datasets.UDPOS(split=('train', 'valid'))\n" + ], + "metadata": { + "id": "1CszON2uJmUb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "ex = next(iter(train_iter))\n", + "print(ex[0])\n", + "print(ex[1])\n", + "print(ex[2])" + ], + "metadata": { + "id": "guxlgcFuRRR_" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "from collections import Counter\n", + "from torchtext.vocab import vocab\n", + "\n", + "# Build vocabulary\n", + "training_size = 0\n", + "counter = Counter()\n", + "for (tokens, tags, _) in train_iter:\n", + " training_size += 1\n", + " counter.update(tokens)\n", + "vocab = vocab(counter, min_freq=10, specials=('<unk>', '<BOS>', '<EOS>', '<PAD>'))\n", + "print( \"total number of example in training set:\", training_size)" + ], + "metadata": { + "id": "xdoeEWyPX8pO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(\"The length of the new vocab is\", len(vocab))\n", + "# token to indices\n", + "stoi = vocab.get_stoi()\n", + "print(\"The index of '<BOS>' is\", stoi['<BOS>'])\n", + "# indice to token\n", + "itos = vocab.get_itos()\n", + "print(\"The token at index 2 is\", itos[2])\n", + "print(\"The token at index 42 is\", itos[42])" + ], + "metadata": { + "id": "P-WrgNL0YCJK" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Transform a list of tokens into a list of indices using the dict given\n", + "def prepare_sequence(seq, to_ix):\n", + " '''\n", + " - seq: an input sequence of tokens\n", + " - to_ix: a dictionary mapping token to indices\n", + " output: a tensor of indices representing the input sequence\n", + " '''\n", + " idxs = [to_ix[w] for w in seq]\n", + " return torch.tensor(idxs, dtype=torch.long)\n", + "\n", + "\n", + "# Build the mapping from word to indices, and from POS to indices\n", + "word_to_ix, tag_to_ix = {}, {}\n", + "# For each words-list (sentence) and tags-list in each tuple of training_data\n", + "for sent, tags, _ in train_iter:\n", + " for word in sent:\n", + " if word not in word_to_ix: # word has not been assigned an index yet\n", + " word_to_ix[word] = len(word_to_ix) # Assign each word with a unique index\n", + " for tag in tags:\n", + " if tag not in tag_to_ix:\n", + " 
tag_to_ix[tag] = len(tag_to_ix)\n", + "\n", + "# Reverse mapping\n", + "ix_to_tag = {v: k for k, v in tag_to_ix.items()}\n", + "\n", + "# Print the mapping dictionaries\n", + "print(word_to_ix)\n", + "print(tag_to_ix)" + ], + "metadata": { + "id": "Va4-PPpSSPjc" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Train a model\n", + "\n", + "EMBEDDING_DIM = 6\n", + "HIDDEN_DIM = 6\n", + "\n", + "model = LSTMTagger(EMBEDDING_DIM, HIDDEN_DIM, len(word_to_ix), len(tag_to_ix))\n", + "loss_function = nn.NLLLoss() # does not include the softmax\n", + "optimizer = torch.optim.SGD(model.parameters(), lr=0.1)\n", + "\n", + "epoch_acc, epoch_loss = 0, 0\n", + "\n", + "total_count = 0\n", + "\n", + "for epoch in range(3): # again, normally you would NOT do 300 epochs, it is toy data\n", + " index = 0\n", + " for sentence, tags, _ in train_iter:\n", + " index += 1\n", + " # Step 1. Remember that Pytorch accumulates gradients.\n", + " # We need to clear them out before each instance\n", + " model.zero_grad()\n", + "\n", + " # Step 2. Get our inputs ready for the network, that is, turn them into\n", + " # Tensors of word indices.\n", + " sentence_in = prepare_sequence(sentence, word_to_ix)\n", + " targets = prepare_sequence(tags, tag_to_ix)\n", + "\n", + " # Step 3. Run our forward pass.\n", + " tag_scores = model(sentence_in)\n", + "\n", + " # Compute accuracy score per token\n", + " predictions = tag_scores.view(-1, tag_scores.shape[-1])\n", + " max_preds = predictions.argmax(dim = 1, keepdim = True) # get the index of the max probability\n", + " correct = max_preds.squeeze(1).eq(targets)\n", + " acc_sentence = correct.sum()\n", + " #acc_sentence = correct.sum() / targets.shape[0]\n", + "\n", + "\n", + " if index in [29,55, 930]: # selection of short sentences #, 1150\n", + " print( \"SENTENCE:\", sentence )\n", + " print(\"GOLD (original):\", tags )\n", + " print(\"GOLD (indices):\", targets)\n", + " print( \"SCORES\", tag_scores )\n", + " print(\"PRED (indices):\", list(max_preds.squeeze(1)) )\n", + " print(\"PRED (mapped):\", [ix_to_tag[i.item()] for i in list(max_preds.squeeze(1))] )\n", + " print( \"CORRECT / UNCORRECT list:\", correct)\n", + " print(\"# CORRECT PREDicted POS =\", acc_sentence.item() )\n", + "\n", + " print( '\\n')\n", + "\n", + " # Step 4. Compute the loss, gradients, and update the parameters by\n", + " # calling optimizer.step()\n", + " loss = loss_function(tag_scores, targets)\n", + " loss.backward()\n", + " optimizer.step()\n", + "\n", + " epoch_loss += loss.item()\n", + " epoch_acc += acc_sentence.item()\n", + " total_count += len(sentence)\n", + "\n", + " # Compute accuracy on train set at each epoch\n", + " print('Epoch: {}. Loss: {}. 
ACC {}.'.format(epoch, epoch_loss/total_count,\n", + " round( (epoch_acc/total_count)*100, 2)))\n", + " epoch_acc, epoch_loss, total_count = 0, 0, 0\n", + " #print(epoch_acc / len(train_iter))\n", + "\n", + "\n", + "# {'PROPN': 0, 'PUNCT': 1, 'ADJ': 2, 'NOUN': 3, 'VERB': 4, 'DET': 5, 'ADP': 6, 'AUX': 7, 'PRON': 8, 'PART': 9, 'SCONJ': 10, 'NUM': 11, 'ADV': 12, 'CCONJ': 13, 'X': 14, 'INTJ': 15, 'SYM': 16}" + ], + "metadata": { + "id": "M2zfqhVmSxIC" + }, + "execution_count": null, + "outputs": [] + } + ] +} \ No newline at end of file diff --git a/notebooks/TP8_m2LiTL_transformers_learning_2324.ipynb b/notebooks/TP8_m2LiTL_transformers_learning_2324.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..6f12fca34ec78751783bf91052ef6d2b5f1e8d67 --- /dev/null +++ b/notebooks/TP8_m2LiTL_transformers_learning_2324.ipynb @@ -0,0 +1,761 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "-bb49S7B50eh" + }, + "source": [ + "# TP 8: Transformers et transfert de modèles\n", + "\n", + "Dans cette séance, nous verrons comment utiliser un modèle pré-entrainé pour l'adapter à une nouvelle tâche (transfert). Ce TP fait suite au TP6.\n", + "\n", + "Rappel = le code ci-dessous vous permet d'installer : \n", + "- le module *transformers*, qui contient les modèles de langue https://pypi.org/project/transformers/\n", + "- la librairie de datasets pour accéder à des jeux de données\n", + "- la librairie *evaluate* : utilisée pour évaluer et comparer des modèles https://pypi.org/project/evaluate/" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "9UoSnFV250el" + }, + "outputs": [], + "source": [ + "!pip install -U transformers\n", + "!pip install accelerate -U\n", + "!pip install datasets\n", + "!pip install evaluate" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Finally, if the installation is successful, we can import the transformers library:" + ], + "metadata": { + "id": "StClx_Hh9PDm" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "ZBQcA9Ol50en" + }, + "outputs": [], + "source": [ + "import transformers\n", + "from datasets import load_dataset\n", + "import evaluate\n", + "import numpy as np\n", + "import sklearn" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "3TIXCS5P50en" + }, + "outputs": [], + "source": [ + "from transformers import AutoModelForSequenceClassification, AutoTokenizer\n", + "from transformers import AutoModelForTokenClassification" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "vCLf1g8z50ep" + }, + "outputs": [], + "source": [ + "import pandas as pds\n", + "from tqdm import tqdm" + ] + }, + { + "cell_type": "markdown", + "source": [ + "# 1- Transformers pipeline\n", + "\n", + "As seen during the course, the current state of the art for NLP is based on large language models trained using the Transformer architecture.\n", + "\n", + "In the next exercises, we will learn how to use pretrained models that are available in the HuggingFace library, starting with Trnasformers pipelines." 
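+ ,"\n", + "A pipeline bundles a tokenizer and a model behind a single call. As an illustration (using the library's default English sentiment model, not the FlauBERT checkpoint asked for in exercise 1-1), a call looks like this once *pipeline* is imported in the next cell:\n", + "```\n", + "classifier = pipeline(\"sentiment-analysis\")  # no model given: falls back to a default English checkpoint\n", + "classifier(\"This movie was great!\")          # -> a list of dicts with a 'label' and a 'score'\n", + "```"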
+ ], + "metadata": { + "id": "uGZBOXpTXA72" + } + }, + { + "cell_type": "code", + "source": [ + "from transformers import pipeline" + ], + "metadata": { + "id": "Od8TVRnQJ8TH" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 1-1 ▶▶ Exercise: use a pretrained model for French\n", + "\n", + "Load the adapted version of **FlauBERT** fine-tuned for sentiment analysis for French.\n" + ], + "metadata": { + "id": "dQo8pS93BJKf" + } + }, + { + "cell_type": "code", + "source": [ + "!pip install sacremoses" + ], + "metadata": { + "id": "i5t_Ik688rIX" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "-----------\n", + "SOLUTION" + ], + "metadata": { + "id": "qvxrerHI2MYs" + } + }, + { + "cell_type": "code", + "source": [ + "\n" + ], + "metadata": { + "id": "Qpuldij38AwO" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 1-2 Using our own dataset for evaluation\n", + "\n", + "Here, we're simply going to load our dataset and evaluate a pretrained language model on it.\n", + "\n", + "HuggingFace has a library dedicated to datasets:\n", + "* 'load_dataset' can load data from a tsv/csv file, see the code below\n", + "* it directly creates the training/validation/test sets from the dictionary of input files.\n", + "\n", + "https://huggingface.co/course/chapter5/2?fw=pt\n", + "https://huggingface.co/docs/datasets/tabular_load#csv-files\n", + "https://huggingface.co/docs/datasets/v2.8.0/en/package_reference/loading_methods#datasets.load_dataset.split" + ], + "metadata": { + "id": "-xFvKUiFBnL1" + } + }, + { + "cell_type": "code", + "source": [ + "from datasets import load_dataset\n", + "\n", + "file_dict = {\n", + " \"train\" : \"allocine_train.tsv\",\n", + " \"dev\" : \"allocine_dev.tsv\",\n", + " \"test\" : \"allocine_test.tsv\"\n", + "}\n", + "\n", + "dataset = load_dataset(\n", + " 'csv', #type of files\n", + " data_files=file_dict, #input files\n", + " delimiter='\\t', # delimiter in the csv format\n", + " column_names=['movie_id', 'user_id', 'sentiment', 'review'], #column names in the csv file\n", + " skiprows=1, #skip the first line\n", + ")\n", + "\n", + "print(dataset[\"train\"])\n", + "\n", + "# Print a few examples\n", + "sample = dataset[\"train\"].shuffle(seed=42).select(range(1000))\n", + "# Peek at the first few examples\n", + "sample[:3]\n" + ], + "metadata": { + "id": "gb5KqKSYJmW3" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### ▶▶ Exercise: evaluate the pretrained model on your data\n", + "\n", + "* Using the model FlauBERT for sentiment analysis for French and the *pipeline* method, make predictions on some examples in the dataset\n", + "* Take a look at the predictions: do you understand the output?\n", + "* Write a piece of code to compute the score obtained by this pretrained model on your validation / dev set.\n", + " * Compute the predicted labels for all samples: what are the labels used in the model?\n", + " * Define a mapping to the labels used in the dataset.\n", + " * Compare the predicted labels to the gold ones and compute an accuracy score.\n" + ], + "metadata": { + "id": "n1kbUmQ3H3H9" + } + }, + { + "cell_type": "markdown", + "source": [ + "-----------------------------------------\n", + "SOLUTION" + ], + "metadata": { + "id": "dqheBv3jIOX6" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "Ai4ZMcO6jnLJ" + }, + "execution_count": 
null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "fDNnNw9qjnS2" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "OQiv_oGMjrXv" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "xTWcz0lijrfC" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "# 2- Transfert / fine-tuning : analyse de sentiment\n", + "\n", + "Dans cette partie, nous allons fine-tuner / affiner un modèle de langue pré-entraîné (agnostique) pour l'adapter à la tâche d'analyse de sentiment.\n", + "\n", + "On travaillera sur des données en anglais (corpus IMDb, que l'on peut directement charger depuis HuggingFace)." + ], + "metadata": { + "id": "HUx1kHH8eUjE" + } + }, + { + "cell_type": "markdown", + "source": [ + "### 2-1 Charger un modèle pré-entraîné : DistilBERT\n", + "\n", + "Ici on ne va pas passer par la pipeline, pour pouvoir plus simplement gérer les éléments du modèle : le modèle et le tokenizer associé.\n", + "\n", + "On utilise ici le modèle DistilBERT, une version plus petite et rapide du modèle transformer BERT.\n", + "\n", + "Plus d'info ici: https://huggingface.co/distilbert-base-uncased.\n" + ], + "metadata": { + "id": "c40x3RDbB3Qo" + } + }, + { + "cell_type": "code", + "source": [ + "# Chosing the pre-trained model\n", + "# - distilBERT: specific, faster and lighter version of BERT\n", + "# - base vs large\n", + "# - uncased: ignore upper case\n", + "base_model = \"distilbert-base-uncased\"" + ], + "metadata": { + "id": "UtdppwkoB3Qp" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 2-2 Tokenizer\n", + "\n", + "Définir un tokenizer et un modèle associés au modèle pré-entraîné DistilBERT." + ], + "metadata": { + "id": "NUus9JUNB3Qq" + } + }, + { + "cell_type": "markdown", + "source": [ + "-------------------\n", + "SOLUTION\n" + ], + "metadata": { + "id": "xq9sUFYg9Wd7" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "MIPGuRrLju-6" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 2-3 ▶▶ Exercise: Load new data for transfer\n", + "\n", + "Charger l'ensemble de données IMDB qui correspond à de l'analyse de sentiment sur des reviews de films (en anglais).\n", + "On va utiliser ces données pour affiner notre modèle pré-entraîné (agnostique) sur la tâche d'analyse de sentiments." + ], + "metadata": { + "id": "8lt8MjqYIZCl" + } + }, + { + "cell_type": "markdown", + "source": [ + "---------------\n", + "SOLUTION" + ], + "metadata": { + "id": "yamXvQ3q9mbQ" + } + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "dJqulEuKjxjw" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [], + "metadata": { + "id": "imbWkM_cjx1f" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "### 2-4 Tokenization des données\n", + "\n", + "Le code ci-dessous permet d'obtenir une version tokenisée du corpus." 
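+ ,"\n", + "To get a feel for what the tokenizer returns on a single example (a small sketch, assuming the tokenizer from section 2-2 has been defined):\n", + "```\n", + "enc = tokenizer(\"This movie was great!\", padding=\"max_length\", truncation=True)\n", + "print(len(enc[\"input_ids\"]))       # padded up to the model maximum length (512 for DistilBERT)\n", + "print(enc[\"input_ids\"][:8])        # token ids, starting with the [CLS] id\n", + "print(enc[\"attention_mask\"][:8])   # 1 for real tokens, 0 for padding positions\n", + "```"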
+ ], + "metadata": { + "id": "SbjUad2-tecl" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### ▶▶ Exercice Tokenisation :\n", + "\n", + "Regardez la doc pour vérifier que vous comprenez la fonction des paramètres utilisées : https://huggingface.co/docs/transformers/v4.25.1/en/main_classes/tokenizer#transformers.PreTrainedTokenizer.\n", + "\n", + "- à quoi sert le padding ?\n", + "- à quoi correspond le paramètre 'truncation' ?\n", + "\n", + "Note: pour plus de détails sur la fonction *Map()* https://huggingface.co/docs/datasets/process et aussi https://huggingface.co/docs/datasets/v2.7.1/en/package_reference/main_classes#datasets.Dataset.map" + ], + "metadata": { + "id": "HY-5WQapfCTV" + } + }, + { + "cell_type": "markdown", + "source": [ + "------------\n", + "SOLUTION\n" + ], + "metadata": { + "id": "2irkWDLEuSTp" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "-Kj0bW3_50et" + }, + "outputs": [], + "source": [ + "def tokenize_function(examples):\n", + " return tokenizer(examples[\"text\"], padding=\"max_length\", truncation=True)\n", + "\n", + "\n", + "tokenized_datasets = dataset.map(tokenize_function, batched=True)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "Notez que le tokenizer retourne deux éléments:\n", + "\n", + "- input_ids: the numbers representing the tokens in the text.\n", + "- attention_mask: indicates whether a token should be masked or not.\n", + "\n", + "Plus d'info sur les datasets: https://huggingface.co/docs/datasets/use_dataset" + ], + "metadata": { + "id": "ATFZVbiYwD34" + } + }, + { + "cell_type": "code", + "source": [ + "tokenized_datasets" + ], + "metadata": { + "id": "TKTi2eO8d-JJ" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "## 2-5 Entraînement / Fine-tuning\n", + "\n", + "Pour l'entraînement du modèle, on définit d'abord\n", + "- une configuration via la classe *TrainingArguments*.\n", + "- un niveau de 'verbosité'\n", + "- une métrique d'évaluation" + ], + "metadata": { + "id": "HYws35k8xCq0" + } + }, + { + "cell_type": "code", + "source": [ + "from transformers import TrainingArguments, Trainer\n", + "training_args = TrainingArguments(output_dir=\"test_trainer\",\n", + " no_cuda=False, # sur ordi perso sans bon GPU\n", + " per_device_train_batch_size=4,\n", + " #evaluation_strategy=\"steps\",\n", + " #eval_steps=100,\n", + " num_train_epochs=5,\n", + " do_eval=True )" + ], + "metadata": { + "id": "uLVIKxZcgOpb" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "JUtftrdy50ev" + }, + "outputs": [], + "source": [ + "from transformers.utils import logging\n", + "\n", + "logging.set_verbosity_error()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "F8O_Jmcx50ew" + }, + "outputs": [], + "source": [ + "metric = evaluate.load(\"accuracy\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UZk65ZKH50ew" + }, + "outputs": [], + "source": [ + "def compute_metrics(eval_pred):\n", + " logits, labels = eval_pred\n", + " predictions = np.argmax(logits, axis=-1)\n", + " return metric.compute(predictions=predictions, references=labels)" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Trainer\n", + "\n", + "Une instance de la classe *Trainer* correspond à une boucle d'entraînement classique, basée sur les éléments définis précédemment.\n", + "\n", + 
"https://huggingface.co/docs/transformers/main_classes/trainer" + ], + "metadata": { + "id": "8FEJYEhDxoCp" + } + }, + { + "cell_type": "markdown", + "source": [ + "On va sélectionner un sous-ensemble des données ici, pour que l'entraînement soit un peu moins long." + ], + "metadata": { + "id": "4QUvGEbOvRTH" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "Dgfoqbx950eu" + }, + "outputs": [], + "source": [ + "small_train_dataset = tokenized_datasets[\"train\"].shuffle(seed=42).select(range(1000))\n", + "small_eval_dataset = tokenized_datasets[\"test\"].shuffle(seed=42).select(range(100))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "uX2nBPnk50ew" + }, + "outputs": [], + "source": [ + "trainer = Trainer(\n", + " model=model,\n", + " args=training_args,\n", + " train_dataset=small_train_dataset,\n", + " eval_dataset=small_eval_dataset,\n", + " compute_metrics=compute_metrics,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "source": [ + "### Lancer l'entraînement\n", + "\n", + "Et on peut lancer l'entraînement en utilisant la méthode *train()*." + ], + "metadata": { + "id": "GhGLiCEVx-8v" + } + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "IN58_eaV50ex" + }, + "outputs": [], + "source": [ + "import os\n", + "os.environ[\"WANDB_DISABLED\"] = \"true\"\n", + "trainer.train( )" + ] + }, + { + "cell_type": "markdown", + "source": [ + "## 2-6 Evaluation" + ], + "metadata": { + "id": "MgJpr49WySMd" + } + }, + { + "cell_type": "markdown", + "source": [ + "#### Evaluation sur un exemple\n", + "\n", + "On teste le modèle sur un exemple de l'ensemble d'évaluation." + ], + "metadata": { + "id": "2bE7kBlEH4es" + } + }, + { + "cell_type": "code", + "source": [ + "ex_eval = small_eval_dataset[1][\"text\"]\n", + "input = tokenizer(ex_eval, return_tensors=\"pt\")\n", + "input_ids = input.input_ids.to(\"cuda\")\n", + "print(input_ids.shape)\n", + "output = model(input_ids)\n", + "\n", + "print(\"gold\", small_eval_dataset[1][\"label\"])\n", + "\n", + "print(output)" + ], + "metadata": { + "id": "uyky-X_bzGpS" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "output[\"logits\"]" + ], + "metadata": { + "id": "JDcpli3k2d_f" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "pred = np.argmax(output[\"logits\"].cpu().detach().numpy(), axis=-1)\n", + "print(\"Pred\", pred)" + ], + "metadata": { + "id": "3DeTwx2oz-Ek" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "print(tokenizer.tokenize(ex_eval))" + ], + "metadata": { + "id": "HsgQ6Ekd21IP" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "#### ▶▶ Exercice : Analyse d'erreurs\n", + "\n", + "Le code ci-dessous permet d'afficher les exemples sur lesquels le modèle a fait une erreur de prédiction.\n", + "Pour chaque exemple, affichez le label gold, le label prédit et le texte de l'exemple correspondant.\n", + "Inspecter les erreurs commises par le modèle.\n", + "\n", + "\n", + "Doc de Trainer https://huggingface.co/docs/transformers/main_classes/trainer#transformers.Trainer\n", + "\n" + ], + "metadata": { + "id": "A-cx4sdZGcz2" + } + }, + { + "cell_type": "code", + "source": [ + "# --- correction\n", + "if training_args.do_eval:\n", + " prob_labels,_,_ = trainer.predict( test_dataset=small_eval_dataset)\n", + " pred_labels = [ np.argmax(logits, axis=-1) for 
logits in prob_labels ]\n", + " #print( pred_labels)\n", + " gold_labels = [ inst[\"label\"] for inst in small_eval_dataset]\n", + "\n", + " for i in range( len( small_eval_dataset ) ):\n", + " ## -- Print pred, gold\n", + " #print(pred_labels[i], gold_labels[i])\n", + " if pred_labels[i] != gold_labels[i]:\n", + " print(i, gold_labels[i], pred_labels[i], small_eval_dataset[i][\"text\"] )" + ], + "metadata": { + "id": "L9phpmPnII-O" + }, + "execution_count": null, + "outputs": [] + }, + { + "cell_type": "markdown", + "source": [ + "On affiche finalement le score du modèle sur l'ensemble d'évaluation." + ], + "metadata": { + "id": "VaBD1-jaoR3w" + } + }, + { + "cell_type": "code", + "source": [ + "if training_args.do_eval:\n", + " metrics = trainer.evaluate(eval_dataset=small_eval_dataset)\n", + " print(metrics)" + ], + "metadata": { + "id": "3IdSk-1XHiVK" + }, + "execution_count": null, + "outputs": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "visual", + "language": "python", + "name": "visual" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.5" + }, + "colab": { + "provenance": [], + "collapsed_sections": [ + "-XRoTAe-50e1" + ], + "toc_visible": true + }, + "accelerator": "GPU", + "gpuClass": "standard" + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/slides/MasterLiTL_2324_Course5_160124.pdf b/slides/MasterLiTL_2324_Course5_160124.pdf new file mode 100644 index 0000000000000000000000000000000000000000..dff3a29710ed6d50f80e447c24cf94d4b67f774f Binary files /dev/null and b/slides/MasterLiTL_2324_Course5_160124.pdf differ