From 2427614c10b3983e91d8d8da97900bc7aeacf5c4 Mon Sep 17 00:00:00 2001
From: "Julien B." <xm9q8f80@jlnbrtn.me>
Date: Mon, 19 Aug 2024 09:17:34 +0200
Subject: [PATCH] First commit

---
 .gitignore                                 |   3 +-
 api/__init__.py                            |   0
 api/internal_services/__init__.py          |   0
 api/internal_services/background_worker.py |  36 +++++++
 api/internal_services/database.py          |  16 +++
 api/internal_services/logger.py            |   5 +
 api/internal_services/neo4j.py             | 102 +++++++++++++++++++
 api/internal_services/spacy.py             | 108 +++++++++++++++++++++
 api/main.py                                |  12 +++
 api/routers/__init__.py                    |   0
 api/routers/pipeline_endpoint.py           |  16 +++
 api/setup.py                               |   0
 compose.yml                                |  12 +++
 13 files changed, 309 insertions(+), 1 deletion(-)
 create mode 100644 api/__init__.py
 create mode 100644 api/internal_services/__init__.py
 create mode 100644 api/internal_services/background_worker.py
 create mode 100644 api/internal_services/database.py
 create mode 100644 api/internal_services/logger.py
 create mode 100644 api/internal_services/neo4j.py
 create mode 100644 api/internal_services/spacy.py
 create mode 100644 api/main.py
 create mode 100644 api/routers/__init__.py
 create mode 100644 api/routers/pipeline_endpoint.py
 create mode 100644 api/setup.py
 create mode 100644 compose.yml

diff --git a/.gitignore b/.gitignore
index 4bca2f8..b92f26a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,2 +1,3 @@
 .env
-db.json
\ No newline at end of file
+db.json
+.idea
\ No newline at end of file
diff --git a/api/__init__.py b/api/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/internal_services/__init__.py b/api/internal_services/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/internal_services/background_worker.py b/api/internal_services/background_worker.py
new file mode 100644
index 0000000..0f08b77
--- /dev/null
+++ b/api/internal_services/background_worker.py
@@ -0,0 +1,71 @@
+from queue import Empty, Queue
+from threading import Lock, Thread
+from api.internal_services.logger import logger
+from api.internal_services.spacy import spacy_process
+
+# Sentences waiting to be analysed, consumed in FIFO order by the worker.
+sentence_queue = Queue()
+# Thread currently draining the queue, or None when no worker is running.
+worker_thread = None
+# Serialises "is the queue empty?" with "is a worker running?" so a sentence
+# queued while the worker is shutting down always gets a fresh worker.
+_worker_lock = Lock()
+# Returned by _next_sentence() to tell the worker loop to stop.
+_STOP = object()
+
+
+def _next_sentence():
+    """Pop the next sentence, or release the worker slot and return _STOP.
+
+    The worker stops when the queue is empty or when it dequeues ``None``.
+    Both checks happen under ``_worker_lock`` so that start_worker_thread()
+    never sees a worker that has already decided to exit.
+    """
+    global worker_thread
+    with _worker_lock:
+        try:
+            sentence = sentence_queue.get_nowait()
+        except Empty:
+            worker_thread = None
+            return _STOP
+        if sentence is None:
+            # Acknowledge the sentinel so sentence_queue.join() can return.
+            sentence_queue.task_done()
+            worker_thread = None
+            return _STOP
+        return sentence
+
+
+def process_queue():
+    """Run the spaCy pipeline on queued sentences until the queue is drained.
+
+    A failure on one sentence is logged and does not stop the worker; every
+    dequeued item is acknowledged with task_done() whether or not it succeeded.
+    """
+    for sentence in iter(_next_sentence, _STOP):
+        logger.debug(f"Processing the sentence : {sentence}")
+        try:
+            spacy_process(sentence)
+        except Exception:
+            # Top-level boundary of the worker thread: keep draining the queue.
+            logger.exception(f"Failed to process the sentence : {sentence}")
+        finally:
+            sentence_queue.task_done()
+
+    logger.debug("Closing the worker thread")
+
+
+def start_worker_thread():
+    """Start a daemon worker thread unless one is already running."""
+    global worker_thread
+    with _worker_lock:
+        if worker_thread is None or not worker_thread.is_alive():
+            logger.debug("Starting the worker thread to process the queue")
+            worker_thread = Thread(target=process_queue, daemon=True)
+            worker_thread.start()
+
+
+def add_sentence_to_queue(sentence):
+    """Queue ``sentence`` for processing and make sure a worker is running."""
+    sentence_queue.put(sentence)
+    start_worker_thread()
\ No newline at end of file
diff --git a/api/internal_services/database.py b/api/internal_services/database.py
new file mode 100644
index 0000000..9f65482
--- /dev/null
+++ b/api/internal_services/database.py
@@ -0,0 +1,28 @@
+from tinydb import TinyDB, Query, where
+
+db = TinyDB('db.json')
+
+# Key of the single record that stores the last index handed out.
+LAST_INDEX_KEY = 'last_index'
+
+
+def getLastIndex():
+    """Return the stored last index, creating the record with 0 if absent."""
+    result = db.search(where('key') == LAST_INDEX_KEY)
+    if not result:
+        db.insert({'key': LAST_INDEX_KEY, 'value': 0})
+        return 0
+    return result[0]['value']
+
+
+def updateLastIndex(value):
+    """Store ``value`` as the last index and return it.
+
+    An upsert is used so the value is saved even when the record has not been
+    created yet; a plain update would match nothing and silently do nothing.
+    """
+    db.upsert(
+        {'key': LAST_INDEX_KEY, 'value': value},
+        where('key') == LAST_INDEX_KEY,
+    )
+    return value
\ No newline at end of file
diff --git a/api/internal_services/logger.py b/api/internal_services/logger.py
new file mode 100644
index 0000000..602fd02
--- /dev/null
+++ b/api/internal_services/logger.py
@@ -0,0 +1,5 @@
+# Shared application logger, imported by the internal services and the routers.
+import logging
+
+logger = logging.getLogger('uvicorn.error')  # reuse the logger named 'uvicorn.error' (presumably so output follows uvicorn's log setup)
+logger.setLevel(logging.DEBUG)  # let DEBUG-level messages through on this logger
\ No newline at end of file
diff --git a/api/internal_services/neo4j.py b/api/internal_services/neo4j.py
new file mode 100644
index 0000000..0290491
--- /dev/null
+++ b/api/internal_services/neo4j.py
@@ -0,0 +1,137 @@
+import os
+
+from neo4j import GraphDatabase
+
+# Connection settings. Each value can be overridden through the environment
+# so credentials need not live in the source; the defaults are the values
+# that were previously hard-coded.
+uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")
+username = os.environ.get("NEO4J_USERNAME", "neo4j")
+password = os.environ.get("NEO4J_PASSWORD", "password")
+
+# Connection to the Neo4j database, created once when the module is imported.
+driver = GraphDatabase.driver(uri, auth=(username, password))
+
+
+def createWordNode(tx, id, text, lemma, pos, root):
+    """Create a ``Word`` node.
+
+    ``tx`` is the transaction to run the query in; the other arguments are
+    stored as the node's ``id``, ``text``, ``lemma``, ``pos`` and ``root``
+    properties.
+    """
+    tx.run('''
+            CREATE (
+                n:Word {
+                    id: $id,
+                    text: $text,
+                    lemma: $lemma,
+                    pos: $pos,
+                    root: $root
+                }
+            )''',
+           id=id, text=text, lemma=lemma, pos=pos, root=root)
+
+
+def createConstituentNode(tx, id, type):
+    """Create a ``Constituent`` node with the given ``id`` and ``type``."""
+    tx.run('''
+            CREATE (
+                n:Constituent {
+                    id: $id,
+                    type: $type
+                }
+            )''',
+           id=id, type=type)
+
+
+def createConceptNode(tx, concept, id):
+    """Create a ``Concept`` node; ``concept`` is stored as its ``type``."""
+    tx.run('''
+            CREATE (
+                n:Concept {
+                    type: $concept,
+                    id: $id
+                }
+            )''',
+           concept=concept, id=id)
+
+
+def createNextWordRelation(tx, idFrom, idTo):
+    """Create a ``NEXT`` relationship from one ``Word`` node to another.
+
+    Nothing is created unless both ids match an existing ``Word`` node.
+    """
+    tx.run('''
+                MATCH
+                (a:Word),
+                (b:Word)
+                WHERE a.id = $idFrom AND b.id = $idTo
+                CREATE (a)-[r:NEXT]->(b)
+           ''',
+           idFrom=idFrom, idTo=idTo
+           )
+
+
+def createDeprelRelation(tx, idFrom, idTo, type):
+    """Create a ``DEPREL`` relationship of the given ``type`` between words.
+
+    Nothing is created unless both ids match an existing ``Word`` node.
+    """
+    tx.run('''
+                MATCH
+                (a:Word),
+                (b:Word)
+                WHERE a.id = $idFrom AND b.id = $idTo
+                CREATE (a)-[r:DEPREL {type: $type}]->(b)
+           ''',
+           idFrom=idFrom, idTo=idTo, type=type
+           )
+
+
+def createConceptRelation(tx, idFrom, idTo):
+    """Create a ``LINKED`` relationship from a ``Concept`` to a ``Word``.
+
+    Nothing is created unless both ids match an existing node.
+    """
+    tx.run('''
+                MATCH
+                (a:Concept),
+                (b:Word)
+                WHERE a.id = $idFrom AND b.id = $idTo
+                CREATE (a)-[r:LINKED]->(b)
+           ''',
+           idFrom=idFrom, idTo=idTo
+           )
+
+
+def createConstituentRelation(tx, idFrom, idTo):
+    """Create a ``CONSREL`` relationship from a ``Constituent`` to its child.
+
+    The child may be a ``Word`` or another ``Constituent`` node.
+    """
+    tx.run('''
+                MATCH
+                (a:Constituent),
+                (b:Word|Constituent)
+                WHERE a.id = $idFrom AND b.id = $idTo
+                CREATE (a)-[r:CONSREL]->(b)
+           ''',
+           idFrom=idFrom, idTo=idTo
+           )
+
+
+def createRelation(tx, idFrom, idTo, id, type):
+    """Create a ``RELATION`` relationship between two ``Concept`` nodes.
+
+    ``id`` and ``type`` are stored as properties of the relationship.
+    """
+    tx.run('''
+                MATCH
+                (a:Concept),
+                (b:Concept)
+                WHERE a.id = $idFrom AND b.id = $idTo
+                CREATE (a)-[r:RELATION {id: $id, type: $type}]->(b)
+           ''',
+           idFrom=idFrom, idTo=idTo, id=id, type=type
+           )
\ No newline at end of file
diff --git a/api/internal_services/spacy.py b/api/internal_services/spacy.py
new file mode 100644
index 0000000..5e3da85
--- /dev/null
+++ b/api/internal_services/spacy.py
@@ -0,0 +1,99 @@
+import benepar, spacy
+import warnings
+warnings.filterwarnings("ignore")
+
+from api.internal_services.database import getLastIndex, updateLastIndex
+from api.internal_services.logger import logger
+from api.internal_services.neo4j import createConstituentNode, driver, createConstituentRelation, createWordNode, \
+    createNextWordRelation, createConceptRelation, createDeprelRelation
+
+benepar.download('benepar_fr2')
+nlp = spacy.load('fr_dep_news_trf')
+nlp.add_pipe('benepar', config={'model': 'benepar_fr2'})
+
+
+def _span_id(prefix, span):
+    """Return the node id of a constituent: ``<prefix>.<start>-<end>``."""
+    return f"{prefix}.{span.start}-{span.end}"
+
+
+def _word_id(prefix, token_index):
+    """Return the node id of a word: ``<prefix>.<token index>``."""
+    return f"{prefix}.{token_index}"
+
+
+def _store_constituent(session, prefix, constituent):
+    """Write one parse-tree constituent, and its word if it is a leaf, to Neo4j.
+
+    A labelled multi-word constituent becomes a ``Constituent`` node. A
+    constituent whose text is just its root word becomes a ``Word`` node,
+    wrapped in a ``Constituent`` node when it also carries a label. The
+    resulting node is then attached to its parent constituent, if any.
+    """
+    constituent_id = _span_id(prefix, constituent)
+    labels = constituent._.labels
+    is_phrase = bool(labels) and constituent.root.text != constituent.text
+    logger.debug(f"Processing constituent : {constituent_id} - {labels}")
+    logger.debug(f"{labels} and {constituent.root.text != constituent.text}")
+
+    if labels:
+        session.execute_write(createConstituentNode, constituent_id, labels[0])
+
+    # Id of the node that hangs under the parent constituent.
+    child_id = constituent_id
+    if not is_phrase:
+        word_id = _word_id(prefix, constituent.root.i)
+        session.execute_write(
+            createWordNode,
+            word_id,
+            constituent.text,
+            getattr(constituent, 'lemma_', None),
+            constituent.root.pos_,
+            # spaCy spells the head label "ROOT", so compare case-insensitively.
+            constituent.root.dep_.lower() == "root",
+        )
+        logger.debug(f"Creating word : {constituent.text}")
+        if labels:
+            session.execute_write(createConstituentRelation, constituent_id, word_id)
+        else:
+            child_id = word_id
+
+    # The top constituent of a sentence has no parent to link to.
+    parent = constituent._.parent
+    if parent is not None:
+        session.execute_write(createConstituentRelation, _span_id(prefix, parent), child_id)
+
+
+def _store_dependencies(session, prefix, sent):
+    """Write the ``NEXT`` and ``DEPREL`` relations between a sentence's words."""
+    for token in sent:
+        word_id = _word_id(prefix, token.i)
+        # token.i counts from the start of the document, so the first word of
+        # this sentence is at sent.start (not 0) and has no predecessor here.
+        if token.i != sent.start:
+            session.execute_write(createNextWordRelation, _word_id(prefix, token.i - 1), word_id)
+        # NOTE(review): the sentence head presumably has itself as head, which
+        # would yield a self-referencing DEPREL - confirm this is intended.
+        session.execute_write(createDeprelRelation, _word_id(prefix, token.head.i), word_id, token.dep_)
+
+
+def spacy_process(sentence):
+    """Parse ``sentence`` and store its syntax as a graph in Neo4j.
+
+    The text is run through the spaCy pipeline (with the benepar constituency
+    parser). Each detected sentence gets a fresh index from the database,
+    then its constituents, words, word order and dependency relations are
+    written to Neo4j. Node ids are prefixed with ``<index>.<sentence number>``.
+    """
+    with driver.session() as session:
+        doc = nlp(sentence)
+        lastIndex = getLastIndex()
+
+        for i, sent in enumerate(doc.sents):
+            lastIndex = updateLastIndex(lastIndex + 1)
+            logger.debug(sent._.parse_string)
+            prefix = f"{lastIndex}.{i}"
+
+            for constituent in sent._.constituents:
+                _store_constituent(session, prefix, constituent)
+            _store_dependencies(session, prefix, sent)
diff --git a/api/main.py b/api/main.py
new file mode 100644
index 0000000..0003258
--- /dev/null
+++ b/api/main.py
@@ -0,0 +1,12 @@
+# Application entry point. Run with: fastapi dev main.py
+# or: uvicorn api.main:app --reload
+from fastapi import FastAPI
+from .routers import pipeline_endpoint
+
+app = FastAPI()  # the application object the commands above refer to
+
+app.include_router(pipeline_endpoint.router)  # adds the POST /sentences route
+
+@app.get("/")
+async def root():  # GET / answers with a fixed status message
+    return {"message": "ALA plateform is running !"}
\ No newline at end of file
diff --git a/api/routers/__init__.py b/api/routers/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/api/routers/pipeline_endpoint.py b/api/routers/pipeline_endpoint.py
new file mode 100644
index 0000000..c2310e4
--- /dev/null
+++ b/api/routers/pipeline_endpoint.py
@@ -0,0 +1,21 @@
+from pydantic import BaseModel
+from fastapi import APIRouter
+from api.internal_services.background_worker import add_sentence_to_queue
+from api.internal_services.logger import logger
+
+router = APIRouter()
+
+
+# Request body of POST /sentences: the text to analyse.
+class Sentence(BaseModel):
+    sentence: str
+
+
+# Logs the submitted text, hands it to the background worker queue and
+# answers right away; the processing itself happens in the worker thread.
+@router.post("/sentences")
+def add_sentence_to_process(sentence: Sentence):
+    submitted_text = sentence.sentence
+    logger.debug(f"New sentence added to queue : {submitted_text}")
+    add_sentence_to_queue(submitted_text)
+    return {"message": "Sentence added to the queue for processing."}
\ No newline at end of file
diff --git a/api/setup.py b/api/setup.py
new file mode 100644
index 0000000..e69de29
diff --git a/compose.yml b/compose.yml
new file mode 100644
index 0000000..af53df7
--- /dev/null
+++ b/compose.yml
@@ -0,0 +1,15 @@
+name: ala-plateform
+services:
+  neo4j:
+    image: neo4j
+    ports:
+      - "7474:7474"  # HTTP interface
+      - "7687:7687"  # Bolt, the protocol api/internal_services/neo4j.py connects with
+    environment:
+      # Must match the username/password the API uses to connect.
+      - NEO4J_AUTH=neo4j/password
+    volumes:
+      - neo4j:/data
+
+volumes:
+  neo4j:
\ No newline at end of file
-- 
GitLab