From 2427614c10b3983e91d8d8da97900bc7aeacf5c4 Mon Sep 17 00:00:00 2001
From: "Julien B." <xm9q8f80@jlnbrtn.me>
Date: Mon, 19 Aug 2024 09:17:34 +0200
Subject: [PATCH] First commit

---
Review note: the sources of the new files were revised during review, so
the blob ids on their "index" lines are those of the original revision and
were not recomputed.

 .gitignore                                 |   3 +-
 api/__init__.py                            |   0
 api/internal_services/__init__.py          |   0
 api/internal_services/background_worker.py |  63 +++++++++++
 api/internal_services/database.py          |  23 ++++
 api/internal_services/logger.py            |   5 +
 api/internal_services/neo4j.py             | 117 +++++++++++++++++++++
 api/internal_services/spacy.py             | 109 +++++++++++++++++++
 api/main.py                                |  14 +++
 api/routers/__init__.py                    |   0
 api/routers/pipeline_endpoint.py           |  19 ++++
 api/setup.py                               |   0
 compose.yml                                |  12 +++
 13 files changed, 364 insertions(+), 1 deletion(-)
 create mode 100644 api/__init__.py
 create mode 100644 api/internal_services/__init__.py
 create mode 100644 api/internal_services/background_worker.py
 create mode 100644 api/internal_services/database.py
 create mode 100644 api/internal_services/logger.py
 create mode 100644 api/internal_services/neo4j.py
 create mode 100644 api/internal_services/spacy.py
 create mode 100644 api/main.py
 create mode 100644 api/routers/__init__.py
 create mode 100644 api/routers/pipeline_endpoint.py
 create mode 100644 api/setup.py
 create mode 100644 compose.yml

diff --git a/.gitignore b/.gitignore
index 4bca2f8..b92f26a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .env
-db.json
\ No newline at end of file
+db.json
+.idea
\ No newline at end of file
diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/internal_services/__init__.py b/api/internal_services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/internal_services/background_worker.py b/api/internal_services/background_worker.py
new file mode 100644
index 0000000..0f08b77
--- /dev/null
+++ b/api/internal_services/background_worker.py
@@ -0,0 +1,63 @@
+"""Background worker that feeds queued sentences to the spaCy pipeline."""
+from queue import Empty, Queue
+from threading import Lock, Thread
+
+from api.internal_services.logger import logger
+from api.internal_services.spacy import spacy_process
+
+sentence_queue = Queue()
+worker_thread = None
+# Serialises "is a worker running?" decisions between the producer and the
+# worker so that a sentence can never be left in the queue without a worker.
+_worker_lock = Lock()
+
+
+def process_queue():
+    """Drain ``sentence_queue``, running every sentence through spaCy.
+
+    The thread stops as soon as the queue is empty or a ``None`` sentinel is
+    dequeued. A sentence whose processing raises is logged and skipped so
+    that one bad input cannot kill the worker.
+    """
+    global worker_thread
+    while True:
+        # Dequeue and decide whether to exit under the lock: this makes
+        # "queue is empty, worker is leaving" atomic with respect to
+        # start_worker_thread().
+        with _worker_lock:
+            try:
+                sentence = sentence_queue.get_nowait()
+            except Empty:
+                worker_thread = None
+                break
+            if sentence is None:
+                sentence_queue.task_done()
+                worker_thread = None
+                break
+
+        logger.debug(f"Processing the sentence : {sentence}")
+        try:
+            spacy_process(sentence)
+        except Exception:
+            # Top-level boundary of the worker thread: log and carry on.
+            logger.exception(f"Failed to process the sentence : {sentence}")
+        finally:
+            sentence_queue.task_done()
+
+    logger.debug("Closing the worker thread")
+
+
+def start_worker_thread():
+    """Start the worker thread unless one is already running."""
+    global worker_thread
+    with _worker_lock:
+        if worker_thread is None or not worker_thread.is_alive():
+            logger.debug("Starting the worker thread to process the queue")
+            worker_thread = Thread(target=process_queue, daemon=True)
+            worker_thread.start()
+
+
+def add_sentence_to_queue(sentence):
+    """Queue ``sentence`` for processing and make sure a worker is running."""
+    sentence_queue.put(sentence)
+    start_worker_thread()
diff --git a/api/internal_services/database.py b/api/internal_services/database.py
new file mode 100644
index 0000000..9f65482
--- /dev/null
+++ b/api/internal_services/database.py
@@ -0,0 +1,23 @@
+"""TinyDB-backed storage of the ``last_index`` counter."""
+from tinydb import TinyDB, Query, where
+
+db = TinyDB('db.json')
+
+
+def getLastIndex():
+    """Return the stored ``last_index`` value, creating it as 0 if absent."""
+    result = db.search(where('key') == 'last_index')
+    if not result:
+        db.insert({'key': 'last_index', 'value': 0})
+        return 0
+    return result[0]['value']
+
+
+def updateLastIndex(value):
+    """Store ``value`` as the new ``last_index`` and return it.
+
+    ``upsert`` is used so the value is persisted even when the record has
+    not been created yet (a plain ``update`` would silently do nothing).
+    """
+    db.upsert({'key': 'last_index', 'value': value}, where('key') == 'last_index')
+    return value
diff --git a/api/internal_services/logger.py b/api/internal_services/logger.py
new file mode 100644
index 0000000..602fd02
--- /dev/null
+++ b/api/internal_services/logger.py
@@ -0,0 +1,5 @@
+"""Shared application logger, bound to the ``uvicorn.error`` logger."""
+import logging
+
+logger = logging.getLogger('uvicorn.error')
+logger.setLevel(logging.DEBUG)
diff --git a/api/internal_services/neo4j.py b/api/internal_services/neo4j.py
new file mode 100644
index 0000000..0290491
--- /dev/null
+++ b/api/internal_services/neo4j.py
@@ -0,0 +1,117 @@
+"""Neo4j driver and Cypher write helpers used to store parsed sentences."""
+import os
+
+from neo4j import GraphDatabase
+
+# Connection settings can be overridden through the environment; the
+# defaults are the values that used to be hard-coded here.
+uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
+username = os.environ.get("NEO4J_USERNAME", "neo4j")
+password = os.environ.get("NEO4J_PASSWORD", "password")
+
+# Connection to the Neo4j database
+driver = GraphDatabase.driver(uri, auth=(username, password))
+
+
+def createWordNode(tx, id, text, lemma, pos, root):
+    """Create a ``Word`` node carrying the given properties."""
+    tx.run('''
+        CREATE (
+            n:Word {
+                id: $id,
+                text: $text,
+                lemma: $lemma,
+                pos: $pos,
+                root: $root
+            }
+        )''',
+           id=id, text=text, lemma=lemma, pos=pos, root=root)
+
+
+def createConstituentNode(tx, id, type):
+    """Create a ``Constituent`` node with the given id and type."""
+    tx.run('''
+        CREATE (
+            n:Constituent {
+                id: $id,
+                type: $type
+            }
+        )''',
+           id=id, type=type)
+
+
+def createConceptNode(tx, concept, id):
+    """Create a ``Concept`` node whose ``type`` property is ``concept``."""
+    tx.run('''
+        CREATE (
+            n:Concept {
+                type: $concept,
+                id: $id
+            }
+        )''',
+           concept=concept, id=id)
+
+
+def createNextWordRelation(tx, idFrom, idTo):
+    """Create a ``NEXT`` relationship between two ``Word`` nodes."""
+    tx.run('''
+        MATCH
+            (a:Word),
+            (b:Word)
+        WHERE a.id = $idFrom AND b.id = $idTo
+        CREATE (a)-[r:NEXT]->(b)
+        ''',
+           idFrom=idFrom, idTo=idTo
+           )
+
+
+def createDeprelRelation(tx, idFrom, idTo, type):
+    """Create a typed ``DEPREL`` relationship between two ``Word`` nodes."""
+    tx.run('''
+        MATCH
+            (a:Word),
+            (b:Word)
+        WHERE a.id = $idFrom AND b.id = $idTo
+        CREATE (a)-[r:DEPREL {type: $type}]->(b)
+        ''',
+           idFrom=idFrom, idTo=idTo, type=type
+           )
+
+
+def createConceptRelation(tx, idFrom, idTo):
+    """Create a ``LINKED`` relationship from a ``Concept`` to a ``Word``."""
+    tx.run('''
+        MATCH
+            (a:Concept),
+            (b:Word)
+        WHERE a.id = $idFrom AND b.id = $idTo
+        CREATE (a)-[r:LINKED]->(b)
+        ''',
+           idFrom=idFrom, idTo=idTo
+           )
+
+
+def createConstituentRelation(tx, idFrom, idTo):
+    """Create a ``CONSREL`` from a ``Constituent`` to a word or constituent."""
+    tx.run('''
+        MATCH
+            (a:Constituent),
+            (b:Word|Constituent)
+        WHERE a.id = $idFrom AND b.id = $idTo
+        CREATE (a)-[r:CONSREL]->(b)
+        ''',
+           idFrom=idFrom, idTo=idTo
+           )
+
+
+def createRelation(tx, idFrom, idTo, id, type):
+    """Create a ``RELATION`` with an id and type between two ``Concept`` nodes."""
+    tx.run('''
+        MATCH
+            (a:Concept),
+            (b:Concept)
+        WHERE a.id = $idFrom AND b.id = $idTo
+        CREATE (a)-[r:RELATION {id: $id, type: $type}]->(b)
+        ''',
+           idFrom=idFrom, idTo=idTo, id=id, type=type
+           )
diff --git a/api/internal_services/spacy.py b/api/internal_services/spacy.py
new file mode 100644
index 0000000..5e3da85
--- /dev/null
+++ b/api/internal_services/spacy.py
@@ -0,0 +1,109 @@
+"""spaCy + benepar pipeline that stores parsed sentences in Neo4j."""
+import warnings
+
+import benepar
+import spacy
+
+warnings.filterwarnings("ignore")
+
+from api.internal_services.database import getLastIndex, updateLastIndex
+from api.internal_services.logger import logger
+from api.internal_services.neo4j import createConstituentNode, driver, createConstituentRelation, createWordNode, \
+    createNextWordRelation, createConceptRelation, createDeprelRelation
+
+benepar.download('benepar_fr2')
+nlp = spacy.load('fr_dep_news_trf')
+nlp.add_pipe('benepar', config={'model': 'benepar_fr2'})
+
+
+def _word_id(sentence_index, position, token):
+    """Return the id of the ``Word`` node created for ``token``."""
+    return f"{sentence_index}.{position}.{token.i}"
+
+
+def _constituent_id(sentence_index, position, span):
+    """Return the id of the ``Constituent`` node created for ``span``."""
+    return f"{sentence_index}.{position}.{span.start}-{span.end}"
+
+
+def _link_to_parent(session, sentence_index, position, constituent, child_id):
+    """Link ``child_id`` under the parent of ``constituent``, if it has one."""
+    parent = constituent._.parent
+    # Without this guard a constituent that has no parent (``None``) would
+    # raise AttributeError when its parent id is built.
+    if parent is not None:
+        session.execute_write(
+            createConstituentRelation,
+            _constituent_id(sentence_index, position, parent),
+            child_id
+        )
+
+
+def _create_word(session, word_id, constituent):
+    """Create the ``Word`` node of a single-token constituent."""
+    session.execute_write(
+        createWordNode,
+        word_id,
+        constituent.text,
+        getattr(constituent, 'lemma_', None),
+        constituent.root.pos_,
+        # spaCy spells the root dependency label "ROOT", so the comparison
+        # must not be case-sensitive.
+        constituent.root.dep_.lower() == "root"
+    )
+    logger.debug(f"Creating word : {constituent.text}")
+
+
+def spacy_process(sentence):
+    """Parse ``sentence`` and write its words and constituents to Neo4j.
+
+    Each sentence found by spaCy gets a new index from the local database;
+    that index prefixes the ids of every node created for it. For each
+    sentence the constituents (``Constituent`` and ``Word`` nodes linked by
+    ``CONSREL``) are written first, then the ``NEXT`` and ``DEPREL``
+    relations between words.
+    """
+    with driver.session() as session:
+        doc = nlp(sentence)
+        lastIndex = getLastIndex()
+
+        for i, sent in enumerate(doc.sents):
+            lastIndex = updateLastIndex(lastIndex + 1)
+            logger.debug(sent._.parse_string)
+
+            for constituent in sent._.constituents:
+                constituentId = _constituent_id(lastIndex, i, constituent)
+                labels = constituent._.labels
+                isMultiWord = constituent.root.text != constituent.text
+                logger.debug(f"Processing constituent : {constituentId} - {labels}")
+                logger.debug(f"{labels} and {isMultiWord}")
+
+                if labels and isMultiWord:
+                    # Multi-word constituent: create it and hang it under its parent
+                    session.execute_write(createConstituentNode, constituentId, labels[0])
+                    _link_to_parent(session, lastIndex, i, constituent, constituentId)
+                    continue
+
+                wordId = _word_id(lastIndex, i, constituent.root)
+                if labels:
+                    # Labelled single word: constituent node with the word below it
+                    session.execute_write(createConstituentNode, constituentId, labels[0])
+                    _create_word(session, wordId, constituent)
+                    session.execute_write(createConstituentRelation, constituentId, wordId)
+                    _link_to_parent(session, lastIndex, i, constituent, constituentId)
+                else:
+                    # Bare word: attach it directly to its parent constituent
+                    _create_word(session, wordId, constituent)
+                    _link_to_parent(session, lastIndex, i, constituent, wordId)
+
+            for token in sent:
+                # Succession link, except for the first token of the sentence
+                if token.i != sent.start:
+                    idFrom = f"{lastIndex}.{i}.{token.i - 1}"
+                    idTo = _word_id(lastIndex, i, token)
+                    session.execute_write(createNextWordRelation, idFrom, idTo)
+
+                # Syntactic dependency from the head to the token
+                idFrom = _word_id(lastIndex, i, token.head)
+                idTo = _word_id(lastIndex, i, token)
+                session.execute_write(createDeprelRelation, idFrom, idTo, token.dep_)
diff --git a/api/main.py b/api/main.py
new file mode 100644
index 0000000..0003258
--- /dev/null
+++ b/api/main.py
@@ -0,0 +1,14 @@
+# > fastapi dev main.py
+# > uvicorn api.main:app --reload
+from fastapi import FastAPI
+from .routers import pipeline_endpoint
+
+app = FastAPI()
+
+app.include_router(pipeline_endpoint.router)
+
+
+@app.get("/")
+async def root():
+    """Return a static message showing that the API is up."""
+    return {"message": "ALA plateform is running !"}
diff --git a/api/routers/__init__.py b/api/routers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/routers/pipeline_endpoint.py b/api/routers/pipeline_endpoint.py
new file mode 100644
index 0000000..c2310e4
--- /dev/null
+++ b/api/routers/pipeline_endpoint.py
@@ -0,0 +1,19 @@
+from pydantic import BaseModel
+from fastapi import APIRouter
+from api.internal_services.background_worker import add_sentence_to_queue
+from api.internal_services.logger import logger
+
+router = APIRouter()
+
+
+class Sentence(BaseModel):
+    """Request body carrying the text to process."""
+    sentence: str
+
+
+@router.post("/sentences")
+def add_sentence_to_process(sentence: Sentence):
+    """Queue the submitted sentence for background processing."""
+    logger.debug(f"New sentence added to queue : {sentence.sentence}")
+    add_sentence_to_queue(sentence.sentence)
+    return {"message": "Sentence added to the queue for processing."}
diff --git a/api/setup.py b/api/setup.py
new file mode 100644
index 0000000..e69de29
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 0000000..af53df7
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,12 @@
+name: ala-plateform
+services:
+  neo4j:
+    ports:
+      - 7474:7474
+      - 7687:7687
+    volumes:
+      - neo4j:/data
+    image: neo4j
+
+volumes:
+  neo4j:
\ No newline at end of file
-- 
GitLab