Commit 210c70ae authored by Julien Breton

camembert classifier

parent 3260d268
%% Cell type:markdown id:8ced2e3ca31fb46c tags:
# Dataset
%% Cell type:code id:757a8bf026156e77 tags:
``` python
tag2id = {'action': 1, 'actor': 2, 'artifact': 3, 'condition': 4, 'location': 5, 'modality': 6, 'reference': 7, 'time': 8}
id2tag = {v:k for k, v in tag2id.items()}
```
%% Cell type:code id:be3a4c320f9d4a5 tags:
``` python
label2id = {
    'O': 0,
    **{f'B-{k}': 2*v - 1 for k, v in tag2id.items()},
    **{f'I-{k}': 2*v for k, v in tag2id.items()}
}
id2label = {v:k for k, v in label2id.items()}
```
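With eight span types, this BIO-style scheme gives 17 token-level labels: `O` plus a `B-` and an `I-` entry per tag. A quick sanity check (sketch):

``` python
# Sanity check on the label space: 'O' plus B-/I- for each of the 8 tags = 17 labels.
assert len(label2id) == 2 * len(tag2id) + 1 == 17
print(sorted(label2id, key=label2id.get))
# Expected order: ['O', 'B-action', 'I-action', 'B-actor', 'I-actor', ...]
```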
%% Cell type:code id:2aa2fefac95e7f04 tags:
``` python
from datasets import Dataset
train_ds = Dataset.from_json("data/annotations.train.jsonlines")
val_ds = Dataset.from_json("data/annotations.eval.jsonlines")
```
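Each line of these JSON Lines files is expected to carry an `id`, the raw `text`, and character-level `tags`, since that is what the tokenization function below consumes. The record here is purely illustrative, not taken from the actual dataset:

``` python
# Hypothetical record layout for annotations.train.jsonlines / annotations.eval.jsonlines,
# matching the fields used by tokenize_and_adjust_labels below (illustrative values only):
example_record = {
    "id": "train_0",
    "text": "Le responsable du traitement informe la CNIL dans les meilleurs délais.",
    "tags": [
        {"start": 0, "end": 28, "tag": "actor"},    # "Le responsable du traitement"
        {"start": 29, "end": 36, "tag": "action"},  # "informe"
    ],
}
```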
%% Cell type:code id:9e0a21356e7701a1 tags:
``` python
modelId = '../../../models/CamemBERT-large'
```
%% Cell type:markdown id:66e00d5a79a66753 tags:
# Tokenization
%% Cell type:code id:e6459259f5ab2d98 tags:
``` python
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(modelId)
```
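The labeling step below relies on `return_offsets_mapping`, which gives every sub-word token its character span in the original text. A quick look on a made-up sentence (the exact split depends on the CamemBERT-large vocabulary):

``` python
# Inspect how the fast tokenizer maps sub-word tokens back to character offsets.
# Special tokens such as <s> and </s> receive the offset pair (0, 0).
enc = tokenizer("Le responsable informe la CNIL.", return_offsets_mapping=True)
for tok, (start, end) in zip(tokenizer.convert_ids_to_tokens(enc["input_ids"]), enc["offset_mapping"]):
    print(tok, (start, end))
```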
%% Cell type:code id:8c96680645f077fb tags:
``` python
def get_token_role_in_span(token_start: int, token_end: int, span_start: int, span_end: int):
    """
    Check if the token is inside a span.
    Args:
      - token_start, token_end: Start and end offset of the token
      - span_start, span_end: Start and end of the span
    Returns:
      - "B" if beginning
      - "I" if inner
      - "O" if outer
      - "N" if not valid token (like <SEP>, <CLS>, <UNK>)
    """
    if token_end <= token_start:
        return "N"
    if token_start < span_start or token_end > span_end:
        return "O"
    if token_start > span_start:
        return "I"
    else:
        return "B"


MAX_LENGTH = 256

def tokenize_and_adjust_labels(sample):
    """
    Args:
      - sample (dict): {"id": "...", "text": "...", "tags": [{"start": ..., "end": ..., "tag": ...}, ...]}
    Returns:
      - The tokenized version of `sample` and the labels of each token.
    """
    # Tokenize the text, keep the start and end positions of tokens with the `return_offsets_mapping` option
    # Use max_length and truncation to adjust the text length
    tokenized = tokenizer(sample["text"],
                          return_offsets_mapping=True,
                          padding="max_length",
                          max_length=MAX_LENGTH,
                          truncation=True)

    # We are doing a multi-label classification task at each token: create a vector of size len(label2id) = 17,
    # one entry per label, for every token position
    labels = [[0 for _ in label2id.keys()] for _ in range(MAX_LENGTH)]

    # Scan all the tokens and spans, assign 1 to the corresponding label if the token lies at the beginning
    # or inside a span
    for (token_start, token_end), token_labels in zip(tokenized["offset_mapping"], labels):
        for span in sample["tags"]:
            role = get_token_role_in_span(token_start, token_end, span["start"], span["end"])
            if role == "B":
                token_labels[label2id[f"B-{span['tag']}"]] = 1
            elif role == "I":
                token_labels[label2id[f"I-{span['tag']}"]] = 1

    return {**tokenized, "labels": labels}
```
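Before mapping the whole dataset, the alignment can be spot-checked on a single sample; this sketch reuses the hypothetical `example_record` from above:

``` python
# Spot-check: tokenize one sample and print the tokens that received at least one positive label.
# `example_record` is the hypothetical sample shown earlier, not a real dataset entry.
checked = tokenize_and_adjust_labels(example_record)
tokens = tokenizer.convert_ids_to_tokens(checked["input_ids"])
for token, token_labels in zip(tokens, checked["labels"]):
    active = [id2label[i] for i, flag in enumerate(token_labels) if flag]
    if active:
        print(token, active)
```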
%% Cell type:code id:53310845f13e9d70 tags:
``` python
tokenized_train_ds = train_ds.map(tokenize_and_adjust_labels, remove_columns=train_ds.column_names)
tokenized_val_ds = val_ds.map(tokenize_and_adjust_labels, remove_columns=val_ds.column_names)
```
%% Cell type:code id:6990d89800dbb440 tags:
``` python
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer, padding=True)
```
%% Cell type:markdown id:668dcf9750404d1c tags:
# Adapt the model
%% Cell type:code id:7bd0cddab7ddb448 tags:
``` python
import numpy as np
from sklearn.metrics import multilabel_confusion_matrix

n_labels = len(id2label)

def divide(a: int, b: int):
    return a / b if b > 0 else 0

def compute_metrics(p):
    """
    Customize the `compute_metrics` of `transformers`
    Args:
      - p (tuple): 2 numpy arrays: predictions and true_labels
    Returns:
      - metrics (dict): f1 score for each label (except "O") and the macro f1 over those labels
    """
    # (1)
    predictions, true_labels = p

    # (2)
    predicted_labels = np.where(predictions > 0, np.ones(predictions.shape), np.zeros(predictions.shape))
    metrics = {}

    # (3)
    cm = multilabel_confusion_matrix(true_labels.reshape(-1, n_labels), predicted_labels.reshape(-1, n_labels))

    # (4)
    for label_idx, matrix in enumerate(cm):
        if label_idx == 0:
            continue  # We don't care about the label "O"
        tp, fp, fn = matrix[1, 1], matrix[0, 1], matrix[1, 0]
        precision = divide(tp, tp + fp)
        recall = divide(tp, tp + fn)
        f1 = divide(2 * precision * recall, precision + recall)
        metrics[f"f1_{id2label[label_idx]}"] = f1

    # (5)
    macro_f1 = sum(list(metrics.values())) / (n_labels - 1)
    metrics["macro_f1"] = macro_f1
    return metrics
```
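Since the head is multi-label, predictions are thresholded at 0 on the logits, i.e. a sigmoid probability of 0.5. A synthetic call with random arrays shows the shape of the returned metrics dict (the scores themselves are meaningless here):

``` python
# Synthetic sanity check of compute_metrics: random logits and labels with the expected shapes
# (batch, seq_len, n_labels). Values are random, so the resulting scores carry no meaning.
rng = np.random.default_rng(0)
fake_logits = rng.normal(size=(2, MAX_LENGTH, n_labels))
fake_labels = rng.integers(0, 2, size=(2, MAX_LENGTH, n_labels))
print(compute_metrics((fake_logits, fake_labels)))
# -> {'f1_B-action': ..., 'f1_I-action': ..., ..., 'macro_f1': ...}
```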
%% Cell type:code id:ea5d16f59728e2b9 tags:
``` python
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import RobertaPreTrainedModel, RobertaModel
from transformers.utils import (
    add_code_sample_docstrings,
    add_start_docstrings,
    add_start_docstrings_to_model_forward,
    logging,
    replace_return_docstrings,
)
from transformers.models.roberta.modeling_roberta import (
    ROBERTA_INPUTS_DOCSTRING,
    ROBERTA_START_DOCSTRING,
    RobertaEmbeddings,
)
from typing import Optional, Union, Tuple
from transformers.modeling_outputs import TokenClassifierOutput
import torch
from torch import nn


class RobertaForSpanCategorization(RobertaPreTrainedModel):
    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    _keys_to_ignore_on_load_missing = [r"position_ids"]

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = config.num_labels
        self.roberta = RobertaModel(config, add_pooling_layer=False)
        classifier_dropout = (
            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
        )
        self.dropout = nn.Dropout(classifier_dropout)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
        # Initialize weights and apply final processing
        self.post_init()

    @add_start_docstrings_to_model_forward(ROBERTA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        token_type_ids: Optional[torch.LongTensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        head_mask: Optional[torch.FloatTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        output_attentions: Optional[bool] = None,
        output_hidden_states: Optional[bool] = None,
        return_dict: Optional[bool] = None,
    ) -> Union[Tuple[torch.Tensor], TokenClassifierOutput]:
        r"""
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length, num_labels)`, *optional*):
            Multi-hot labels for computing the token-level multi-label classification loss. Each entry should be 0 or 1.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        outputs = self.roberta(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            position_ids=position_ids,
            head_mask=head_mask,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        loss = None
        if labels is not None:
            loss_fct = nn.BCEWithLogitsLoss()
            loss = loss_fct(logits, labels.float())

        if not return_dict:
            output = (logits,) + outputs[2:]
            return ((loss,) + output) if loss is not None else output

        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
```
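Because `BCEWithLogitsLoss` scores each of the 17 labels independently at every token, the head emits one logit per label instead of a softmax over classes. A dummy forward pass makes the shapes explicit (sketch; it assumes the local CamemBERT-large path from above is available, and the freshly added classification head is of course untrained):

``` python
# Dummy forward pass to check output shapes. Loads the local CamemBERT-large weights into the
# custom class, so the classifier head is randomly initialised and the logits are untrained.
_model = RobertaForSpanCategorization.from_pretrained(modelId, id2label=id2label, label2id=label2id)
_batch = tokenizer(["Le responsable informe la CNIL."], return_tensors="pt")
with torch.no_grad():
    _out = _model(input_ids=_batch["input_ids"], attention_mask=_batch["attention_mask"])
print(_out.logits.shape)  # (batch_size, sequence_length, 17)
```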
%% Cell type:markdown id:77f4fc68394aa754 tags:
# Fine-tuning
%% Cell type:code id:79161ed938cad895 tags:
``` python
training_args = TrainingArguments(
    output_dir="./models/fine_tune_bert_output_span_cat",
    evaluation_strategy="epoch",
    learning_rate=2.5e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=100,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy='epoch',
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='macro_f1',
    log_level='critical',
    seed=12345
)
```
%% Cell type:code id:931792b554582a9f tags:
``` python
def model_init():
    # For reproducibility
    return RobertaForSpanCategorization.from_pretrained(modelId, id2label=id2label, label2id=label2id)

trainer = Trainer(
    model_init=model_init,
    args=training_args,
    train_dataset=tokenized_train_ds,
    eval_dataset=tokenized_val_ds,
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()
```
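With `load_best_model_at_end=True`, the trainer finishes holding the checkpoint with the highest `macro_f1`; it can be re-evaluated explicitly:

``` python
# Re-evaluate the best checkpoint (selected on macro_f1) on the validation split.
metrics = trainer.evaluate()
print(metrics["eval_macro_f1"])
```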
%% Cell type:code id:89c997e6a944bc70 tags:
``` python
trainer.model.save_pretrained("../../../models/Fine-tuned_CamemBERT-large")
```
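To use the saved model, the multi-hot token predictions still have to be mapped back to character spans. The sketch below reloads the weights saved above, thresholds the logits at 0 (sigmoid 0.5) and returns one character range per positive token label; merging consecutive B-/I- tokens into full spans is left out for brevity:

``` python
# Minimal inference sketch: reload the fine-tuned model and turn token-level multi-label
# predictions back into character ranges. Each positive token label yields its own range.
ft_model = RobertaForSpanCategorization.from_pretrained("../../../models/Fine-tuned_CamemBERT-large")
ft_model.eval()

def predict_spans(text):
    enc = tokenizer(text, return_offsets_mapping=True, return_tensors="pt",
                    truncation=True, max_length=MAX_LENGTH)
    offsets = enc.pop("offset_mapping")[0].tolist()
    with torch.no_grad():
        logits = ft_model(**enc).logits[0]       # (seq_len, n_labels)
    spans = []
    for label_id in range(1, len(id2label)):     # skip 'O'
        tag = id2label[label_id]
        for token_idx in (logits[:, label_id] > 0).nonzero().flatten().tolist():
            start, end = offsets[token_idx]
            if end > start:                      # ignore special tokens
                spans.append({"start": start, "end": end, "tag": tag})
    return spans

# Example call on a hypothetical sentence:
print(predict_spans("Le responsable du traitement informe la CNIL dans les meilleurs délais."))
```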
``` python
import json
import jsonlines
import pandas as pd

def find_all_occurrences(text, phrase):
    start = 0
    while True:
        start = text.find(phrase, start)
        if start == -1:
            return
        yield start, start + len(phrase)
        start += len(phrase)  # move start past this occurrence to find the next one

with open('config.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

global_output = []
for index, (sentence, annot) in enumerate(data.items()):
    sentence_output = {
        'id': f"train_{index}",
        'text': sentence,
        'tags': []
    }
    for tag, spans in annot.items():
        for span in spans:
            occurrences = find_all_occurrences(sentence, span)
            for (start_index, end_index) in occurrences:
                sentence_output['tags'].append({
                    "end": end_index,
                    "start": start_index,
                    "tag": tag
                })
    global_output.append(sentence_output)

with jsonlines.open('../../data/annotations.eval.jsonlines', mode='w') as writer:
    for item in global_output:
        writer.write(item)
```
``` python
import json
import pandas as pd
import importlib.util

spec = importlib.util.spec_from_file_location("utils", "../llm/utils.py")
utils = importlib.util.module_from_spec(spec)
spec.loader.exec_module(utils)

data = pd.read_csv('../../data/dataQS.csv', delimiter=',', low_memory=False, encoding='utf-8')
output = {}

# Converting the original CSV and grouping by sentence
for index, row in data.iterrows():
    if type(row['content']) == float:
        print(f"The row {index} has a float value")
        continue
    if row['sentence'] not in output.keys():
        output[row['sentence']] = {}
    if row['concept'] not in output[row['sentence']]:
        output[row['sentence']][row['concept']] = []
    output[row['sentence']][row['concept']].append(row['content'])
    if row['content'] not in row['sentence']:
        print(f"The row {index + 2} is not a part of the sentence: {row['content']}")

# Create the fine-tuning dataset
filter = ["action", "actor", "artifact", "condition", "location", "modality", "reference", "time"]
input_for_finetuned = []
output_for_finetuned = []
for sentence in output:
    temp_output = {}
    for concept in output[sentence]:
        if concept in filter:
            temp_output[concept] = output[sentence][concept]
    input_for_finetuned.append(f'<s> [INST] {utils.get_pre_prompt_zero_shot()} [/INST]\nUser: {sentence}\nAssistant: ')
    output_for_finetuned.append(f'{json.dumps(temp_output, ensure_ascii=False)} </s>')

finetuned_dataset = {'input': input_for_finetuned, 'output': output_for_finetuned}
df = pd.DataFrame(finetuned_dataset)
df.to_csv('../../data/finetuned_dataset.csv', index=False)
```
``` diff
@@ -21,8 +21,8 @@ class MarkerEnhancerLegalRoberta:
             self.sentenceStorage[element].add(sentence['sentence'])

     def exec(self):
-        tokenizer = AutoTokenizer.from_pretrained("joelito/legal-french-roberta-large")
-        model = AutoModelForMaskedLM.from_pretrained("joelito/legal-french-roberta-large")
+        tokenizer = AutoTokenizer.from_pretrained("joelito/legal-french-camembert-large")
+        model = AutoModelForMaskedLM.from_pretrained("joelito/legal-french-camembert-large")
         for marker, sentences in self.sentenceStorage.items():
             for sentence in sentences:
```
%% Cell type:code id:initial_id tags:
``` python
```