# eval_dataset_generator_camembert.py (1.13 KiB)
# Authored by Julien Breton
import json
import jsonlines
import pandas as pd
def find_all_occurrences(text: str, phrase: str):
    """Yield the start index of every non-overlapping occurrence of *phrase*.

    The search runs left to right; after each match it resumes right after
    the matched text, so overlapping matches are not reported
    (e.g. ``"aa"`` in ``"aaaa"`` yields ``0`` and ``2``).

    Args:
        text: The string to search in.
        phrase: The substring to look for.

    Yields:
        The 0-based index in *text* at which each occurrence starts.
        Nothing is yielded when *phrase* is empty or does not occur.
    """
    # An empty phrase matches at every position without ever advancing the
    # cursor, which would make this generator loop forever.
    if not phrase:
        return

    step = len(phrase)
    position = text.find(phrase)
    while position != -1:
        yield position
        # Move past this occurrence to look for the next one.
        position = text.find(phrase, position + step)
# Load the annotations: a JSON object mapping each sentence to a dict of
# {tag: [annotated span text, ...]}.
with open('config.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# One record per sentence, each carrying the character offsets of its tags.
global_output = []
for index, (sentence, annot) in enumerate(data.items()):
    sentence_output = {
        # NOTE(review): ids are prefixed "train_" although the output file is
        # the eval split -- confirm this is intentional.
        'id': f"train_{index}",
        'text': sentence,
        'tags': []
    }
    for tag, spans in annot.items():
        for span in spans:
            # An empty span carries no position information (and would match
            # at every offset), so it is skipped.
            if not span:
                continue
            # find_all_occurrences yields only start offsets; the end offset
            # is derived from the span length (exclusive, so that
            # sentence[start:end] == span).
            for start_index in find_all_occurrences(sentence, span):
                sentence_output['tags'].append({
                    "end": start_index + len(span),
                    "start": start_index,
                    "tag": tag
                })
    # Keep the finished record; without this the output file would be empty.
    global_output.append(sentence_output)

# Write one JSON object per line.
with jsonlines.open('../../data/annotations.eval.jsonlines', mode='w') as writer:
    for item in global_output:
        writer.write(item)