# eval_dataset_generator_camembert.py

import json
import jsonlines
import pandas as pd


def find_all_occurrences(text: str, phrase: str):
    """Yield the start index of each non-overlapping occurrence of *phrase*.

    Occurrences are searched left to right; after a match the search resumes
    immediately past it, so overlapping matches are not reported
    (``"aaaa"`` / ``"aa"`` yields ``0`` and ``2``).

    Args:
        text: The string to search in.
        phrase: The substring to look for.

    Yields:
        The 0-based offset in *text* where each occurrence begins.
        Nothing is yielded when *phrase* is empty or does not occur.
    """
    # An empty phrase matches at every position without advancing the cursor,
    # which would make the loop below yield forever.
    if not phrase:
        return

    step = len(phrase)
    start = text.find(phrase)
    while start != -1:
        yield start
        # Move past this occurrence to find the next one.
        start = text.find(phrase, start + step)


# Build one annotation record per sentence found in config.json and write the
# records to a JSON Lines file.
#
# The loops below treat config.json as a mapping
#     {sentence: {tag: [span_text, ...]}}
# and emit, for every occurrence of each span text in its sentence, a
# {"start", "end", "tag"} entry.
with open('config.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

global_output = []

for index, (sentence, annot) in enumerate(data.items()):
    # NOTE(review): ids are prefixed "train_" although the output file is the
    # eval split — confirm this is intended.
    sentence_output = {
        'id': f"train_{index}",
        'text': sentence,
        'tags': []
    }

    for tag, spans in annot.items():
        for span in spans:
            # An empty span text cannot be located meaningfully; skip it.
            if not span:
                continue
            # find_all_occurrences yields start offsets only, so the end
            # offset is derived from the span length (exclusive end, i.e.
            # sentence[start:end] == span).
            for start_index in find_all_occurrences(sentence, span):
                sentence_output['tags'].append({
                    "end": start_index + len(span),
                    "start": start_index,
                    "tag": tag
                })

    # Collect the finished record; without this nothing is written out.
    global_output.append(sentence_output)

with jsonlines.open('../../data/annotations.eval.jsonlines', mode='w') as writer:
    for item in global_output:
        writer.write(item)