# Image for the PropMatch web service: uvicorn serving `main:app` on port 8080.
# NOTE(review): `latest` is a moving tag; pin a specific Ubuntu release once
# one has been validated against the deadsnakes Python 3.8 packages.
FROM ubuntu:latest

ARG DEBIAN_FRONTEND=noninteractive

# The package index is refreshed in the same layer as the installs that use
# it, so a stale cached `apt-get update` layer can never be combined with a
# newer install step. `-y` on add-apt-repository avoids the confirmation
# prompt in a non-interactive build; the index is refreshed again afterwards
# so the PPA packages are visible. Package lists are removed at the end to
# keep the layer small.
RUN apt-get update \
    && apt-get install -y software-properties-common \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y \
        git \
        python3-pip \
        python3.8 \
        python3.8-dev \
        python3.8-distutils \
    && rm -rf /var/lib/apt/lists/*

WORKDIR /app

# Dependencies are installed from requirements.txt before the sources are
# copied, so source-only changes reuse the cached dependency layer.
COPY requirements.txt .
RUN python3.8 -m pip install -r requirements.txt

COPY . .

EXPOSE 8080

CMD ["python3.8", "-m", "uvicorn", "--workers", "1", "--port", "8080", "--host", "0.0.0.0", "main:app"]
+ +## Download + +A packaged version of PropMatch is available for download [here](https://drive.google.com/file/d/1UShYKSO8fle-VWC4o1YZ2xxsgVVyELZ4/view?usp=drive_link). It is packaged as a Docker container that implements the MELT Web API protocol. + +## Development + +PropMatch was tested on Python 3.9. To run PropMatch you also need to download the Finnish word embeddings from +http://dl.turkunlp.org/finnish-embeddings/finnish_4B_parsebank_skgram.bin and save the file as `fin.bin` in the project root, where `main.py` loads it. + +To install the required dependencies, run: + +```bash +pip install -r requirements.txt +``` + +## Contributing + +Contributions to PropMatch are welcome! If you encounter issues or have suggestions for improvements, please feel free to open an issue or submit a pull request in the [PropMatch GitHub repository](https://github.com/guihcs/propalign). + +## License + +PropMatch is released under the [MIT License](https://opensource.org/licenses/MIT). + +--- + +For inquiries and support, contact us at Guilherme.Santos-Sousa@irit.fr. \ No newline at end of file diff --git a/main.py b/main.py index 9c84b8e6de7bee36272c96239ebac37a78255897..8c48c885c1bdf66f4996223531e8b9b68b041b68 100644 --- a/main.py +++ b/main.py @@ -1,28 +1,37 @@ +import sys + from sentence_transformers import SentenceTransformer from models import Finbank -import random -import torch -import numpy as np from property_matching import PropertyMatcher -from tqdm.auto import tqdm -from property_matching import most_common_pair -import matplotlib.pyplot as plt +import os +import requests import argparse import rdflib import tempfile from urllib import parse, request from om.ont import get_namespace +import json +from typing import Union +import re +from fastapi.middleware.cors import CORSMiddleware +from fastapi import FastAPI, Form, Response, UploadFile, File +from fastapi.responses import PlainTextResponse, Response +from typing_extensions import Annotated -def parse_arguments(): - arg_parser = argparse.ArgumentParser(description='LD similarity.') +app =
def _rdf_format(content_type):
    """Derive an rdflib parser name from an upload's MIME type.

    The last word of the media type is used, e.g. ``application/rdf+xml``
    gives ``xml`` and ``text/turtle`` gives ``turtle``. Media type parameters
    such as ``; charset=utf-8`` are dropped first so they are not mistaken
    for the format. Returns ``None`` when no usable content type was sent,
    which leaves the choice of parser to rdflib.
    """
    if not content_type:
        return None
    media_type = content_type.split(';', 1)[0].strip()
    return re.split(r'\W', media_type)[-1] or None


def _load_ontology(ontology):
    """Parse one ontology form field and return ``(graph, location)``.

    A string is handed to rdflib as a location and is also the reported
    location. An uploaded file is parsed from its file object and reported
    under its filename.
    """
    if isinstance(ontology, str):
        return rdflib.Graph().parse(ontology), ontology
    graph = rdflib.Graph().parse(ontology.file, format=_rdf_format(ontology.content_type))
    return graph, ontology.filename


def _load_parameters(parameters):
    """Read the optional JSON parameters file into a dict.

    ``parameters`` is a local path or a ``file:`` URL; ``None`` or a JSON
    document that is not an object yields an empty dict.
    """
    if parameters is None:
        return {}
    path = parameters
    if parameters.startswith('file:'):
        # open() does not understand URLs, so convert to a filesystem path.
        path = request.url2pathname(parse.urlparse(parameters).path)
    with open(path) as f:
        loaded = json.load(f)
    return loaded if isinstance(loaded, dict) else {}


@app.post('/match')
async def match(source: Union[str, UploadFile] = Form(...),
                target: Union[str, UploadFile] = Form(...),
                inputAlignment: Annotated[Union[str, None], Form()] = None,
                parameters: Annotated[Union[str, None], Form()] = None):
    """Match the properties of two ontologies and return the alignment.

    Args:
        source: Source ontology, either a location string or an uploaded file.
        target: Target ontology, either a location string or an uploaded file.
        inputAlignment: Accepted as a form field but not used by this matcher.
        parameters: Optional path (or ``file:`` URL) of a JSON file. The keys
            read are ``sim_weights`` (forwarded to the matcher) and ``format``
            (``'sssom'`` selects TSV output; anything else selects the RDF
            alignment format).

    Returns:
        When ``source`` was uploaded, a response whose body is the alignment
        itself. Otherwise the alignment is written to a temporary file that
        is kept on disk, and a plain-text response carries that file's path.
    """
    # The response mode follows the source field, as before; each ontology is
    # now loaded according to its own type, so mixed inputs no longer crash.
    uploaded = not isinstance(source, str)

    o1, location1 = _load_ontology(source)
    o2, location2 = _load_ontology(target)
    params = _load_parameters(parameters)

    # NOTE(review): this call runs synchronously inside an async handler.
    p, _ = property_matcher.match_ontologies(o1, o2, 0.65,
                                             sim_weights=params.get('sim_weights'))

    if params.get('format') == 'sssom':
        result = ssom(p)
        suffix = '.tsv'
        media_type = 'text/tab-separated-values'
    else:
        result = toAlignFormat(p, get_namespace(o1), get_namespace(o2), location1, location2)
        suffix = '.rdf'
        media_type = 'application/rdf+xml'

    if uploaded:
        # Label the body with the format actually produced; SSSOM output used
        # to be sent as application/rdf+xml.
        return Response(result, media_type=media_type)

    # delete=False: the caller reads the file after this request has finished.
    with tempfile.NamedTemporaryFile('w', prefix='alignment_', suffix=suffix, delete=False) as out_file:
        out_file.write(result)

    # NOTE(review): the path is returned as-is, not as a file: URL - confirm
    # this is what the calling client expects.
    return PlainTextResponse(out_file.name)
Building...') os.mkdir('./fin_cache') with open('./fin_cache/fbk.txt', 'w') as fbk: diff --git a/nlp.py b/nlp.py index 50b95a78b8b589be761ae6d9fa2f2a14a5fbfec0..e118e6939c6f758fb8babf2affc4ccd071be48ea 100644 --- a/nlp.py +++ b/nlp.py @@ -1,5 +1,6 @@ import nltk +nltk.download('averaged_perceptron_tagger') def get_core_concept(e1): t1 = nltk.pos_tag(e1) diff --git a/requirements.txt b/requirements.txt index a5a27dea0f0c189d0052e7faa87e6d11d678f900..c7bbb5d062ec9fb69d551e2d2bebde5e118ffdf4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -11,4 +11,7 @@ rdflib termcolor py-stringmatching scikit-learn -jupyter \ No newline at end of file +jupyter +fastapi +python-multipart +uvicorn[standard]