diff --git a/main.ipynb b/main.ipynb index 18aaee7883f49cddfd1767223ed9274064b2880f..9f77ed8fa13d24cc4678dc2f20cb320cbed7dfc6 100644 --- a/main.ipynb +++ b/main.ipynb @@ -5,8 +5,8 @@ "execution_count": 1, "metadata": { "ExecuteTime": { - "end_time": "2023-05-05T16:58:30.285996286Z", - "start_time": "2023-05-05T16:58:27.520729058Z" + "end_time": "2023-07-19T08:57:37.490880538Z", + "start_time": "2023-07-19T08:57:35.005710556Z" } }, "outputs": [], @@ -38,8 +38,8 @@ "execution_count": 2, "metadata": { "ExecuteTime": { - "end_time": "2023-05-05T16:58:33.643656486Z", - "start_time": "2023-05-05T16:58:30.289546804Z" + "end_time": "2023-07-19T08:57:46.777266756Z", + "start_time": "2023-07-19T08:57:37.493551556Z" } }, "outputs": [], @@ -51,24 +51,22 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 3, "metadata": { "ExecuteTime": { - "end_time": "2023-05-04T23:11:23.647059226Z", - "start_time": "2023-05-04T23:08:45.535290656Z" + "end_time": "2023-07-19T09:00:31.330783432Z", + "start_time": "2023-07-19T08:57:46.780973332Z" } }, "outputs": [ { "data": { + "text/plain": " 0%| | 0/21 [00:00<?, ?it/s]", "application/vnd.jupyter.widget-view+json": { - "model_id": "dc8966ab2d924576890dd6f0598da481", "version_major": 2, - "version_minor": 0 - }, - "text/plain": [ - " 0%| | 0/21 [00:00<?, ?it/s]" - ] + "version_minor": 0, + "model_id": "4d5d3d0f0d60477d840af6d66d63615a" + } }, "metadata": {}, "output_type": "display_data" @@ -88,13 +86,13 @@ "Loading o1\n", "Loading o2\n", "0\n", - "ontology iterations: 1804, (0.0, 0, 0.0), aligns: 0, po1: 22, po2: 41\n", + "ontology iterations: 1804, (0, 0, 0.0), aligns: 0, po1: 22, po2: 41\n", "----------------------------------------------------------------------------------------------------\n", "cmt.owl Conference.owl\n", "Loading o1\n", "Loading o2\n", "3\n", - "ontology iterations: 7552, (0.25, 0.3333333333333333, 0.28571428571428575), aligns: 3, po1: 59, po2: 64\n", + "ontology iterations: 7552, (0.3333333333333333, 
0.3333333333333333, 0.3333333333333333), aligns: 3, po1: 59, po2: 64\n", "----------------------------------------------------------------------------------------------------\n", "cmt.owl iasted.owl\n", "Loading o1\n", @@ -166,13 +164,13 @@ "Loading o1\n", "Loading o2\n", "2\n", - "ontology iterations: 3068, (0.6666666666666666, 1.0, 0.8), aligns: 2, po1: 59, po2: 26\n", + "ontology iterations: 3068, (1.0, 1.0, 1.0), aligns: 2, po1: 59, po2: 26\n", "----------------------------------------------------------------------------------------------------\n", "Conference.owl edas.owl\n", "Loading o1\n", "Loading o2\n", "3\n", - "ontology iterations: 6400, (0.6666666666666666, 0.6666666666666666, 0.6666666666666666), aligns: 3, po1: 64, po2: 50\n", + "ontology iterations: 6400, (1.0, 0.6666666666666666, 0.8), aligns: 3, po1: 64, po2: 50\n", "----------------------------------------------------------------------------------------------------\n", "cmt.owl edas.owl\n", "Loading o1\n", @@ -190,7 +188,7 @@ "Loading o1\n", "Loading o2\n", "3\n", - "ontology iterations: 3328, (0.5, 0.3333333333333333, 0.4), aligns: 3, po1: 64, po2: 26\n", + "ontology iterations: 3328, (1.0, 0.3333333333333333, 0.5), aligns: 3, po1: 64, po2: 26\n", "----------------------------------------------------------------------------------------------------\n", "confOf.owl edas.owl\n", "Loading o1\n", @@ -203,7 +201,7 @@ "Loading o2\n", "4\n", "ontology iterations: 4608, (1.0, 0.5, 0.6666666666666666), aligns: 4, po1: 64, po2: 36\n", - "iterations: 74590, (0.6857142857142857, 0.5217391304347826, 0.5925925925925927)\n" + "iterations: 74590, (0.8275862068965517, 0.5217391304347826, 0.64)\n" ] } ], @@ -213,23 +211,23 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 4, "metadata": { "ExecuteTime": { - "end_time": "2023-05-04T23:13:14.075532740Z", - "start_time": "2023-05-04T23:13:13.788604059Z" + "end_time": "2023-06-01T13:30:47.366350949Z", + "start_time": 
"2023-06-01T13:30:46.430444165Z" } }, "outputs": [ { - "ename": "NameError", - "evalue": "name 'results' is not defined", + "ename": "TypeError", + "evalue": "'float' object is not iterable", "output_type": "error", "traceback": [ "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", - "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)", - "Cell \u001B[0;32mIn[1], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m p, r, f \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mzip\u001B[39m(\u001B[38;5;241m*\u001B[39m\u001B[43mresults\u001B[49m)\n\u001B[1;32m 3\u001B[0m x \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39marange(\u001B[38;5;241m0.0\u001B[39m, \u001B[38;5;241m1\u001B[39m, \u001B[38;5;241m0.01\u001B[39m)\n\u001B[1;32m 5\u001B[0m plt\u001B[38;5;241m.\u001B[39mplot(x, p, label\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mprecision\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", - "\u001B[0;31mNameError\u001B[0m: name 'results' is not defined" + "\u001B[0;31mTypeError\u001B[0m Traceback (most recent call last)", + "Cell \u001B[0;32mIn[4], line 1\u001B[0m\n\u001B[0;32m----> 1\u001B[0m p, r, f \u001B[38;5;241m=\u001B[39m \u001B[38;5;28;43mzip\u001B[39;49m\u001B[43m(\u001B[49m\u001B[38;5;241;43m*\u001B[39;49m\u001B[43mresults\u001B[49m\u001B[43m)\u001B[49m\n\u001B[1;32m 3\u001B[0m x \u001B[38;5;241m=\u001B[39m np\u001B[38;5;241m.\u001B[39marange(\u001B[38;5;241m0.0\u001B[39m, \u001B[38;5;241m1\u001B[39m, \u001B[38;5;241m0.01\u001B[39m)\n\u001B[1;32m 5\u001B[0m plt\u001B[38;5;241m.\u001B[39mplot(x, p, label\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mprecision\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", + "\u001B[0;31mTypeError\u001B[0m: 'float' object is not iterable" ] } ], diff --git a/main.py b/main.py new file mode 100644 index 0000000000000000000000000000000000000000..9c84b8e6de7bee36272c96239ebac37a78255897 --- /dev/null +++ b/main.py @@ -0,0 +1,98 @@ +from 
import argparse
from xml.sax.saxutils import escape


def parse_arguments(argv=None):
    """Parse command-line options for the LD-similarity alignment tool.

    Parameters
    ----------
    argv : list[str] | None
        Argument vector to parse. ``None`` (the default) falls back to
        ``sys.argv[1:]``, which preserves the original call-site behaviour;
        passing an explicit list makes the function testable.

    Returns
    -------
    argparse.Namespace
        With attributes ``source``, ``target``, ``output`` and ``format``.
    """
    arg_parser = argparse.ArgumentParser(description='LD similarity.')

    arg_parser.add_argument('source', help='Source ontology path.')
    arg_parser.add_argument('target', help='Target ontology path.')
    arg_parser.add_argument('--output', dest='output', default='./output',
                            help='Folder to save the results.')
    arg_parser.add_argument('--format', dest='format', default='align',
                            choices=['align', 'sssom'], help='Output format.')

    return arg_parser.parse_args(argv)


def _xml_attr(value):
    """Escape *value* for safe use inside a double-quoted XML attribute."""
    return escape(str(value), {'"': '&quot;'})


def toAlignFormat(aligns, onto1, onto2, location1, location2):
    """Serialise *aligns* in the Alignment API RDF/XML format.

    Parameters
    ----------
    aligns : dict[tuple[str, str], float]
        Maps ``(entity1, entity2)`` IRI pairs to a confidence score.
    onto1, onto2 : str
        Namespace IRIs of the source and target ontologies.
    location1, location2 : str
        Paths/URLs of the two ontology documents.

    Returns
    -------
    str
        The complete RDF/XML alignment document.
    """
    data = ["""<?xml version='1.0' encoding='utf-8' standalone='no'?>
<rdf:RDF xmlns='http://knowledgeweb.semanticweb.org/heterogeneity/alignment#'
         xmlns:rdf='http://www.w3.org/1999/02/22-rdf-syntax-ns#'
         xmlns:xsd='http://www.w3.org/2001/XMLSchema#'
         xmlns:align='http://knowledgeweb.semanticweb.org/heterogeneity/alignment#'>"""]

    # BUG FIX: values are XML-escaped before interpolation; the previous
    # version emitted raw IRIs/paths, so any '&', '<' or '"' in an entity
    # IRI produced an ill-formed document.
    data.append(f"""    <Alignment>
        <xml>yes</xml>
        <level>0</level>
        <type>**</type>
        <onto1>
            <Ontology rdf:about="{_xml_attr(onto1)}">
                <location>{escape(str(location1))}</location>
            </Ontology>
        </onto1>
        <onto2>
            <Ontology rdf:about="{_xml_attr(onto2)}">
                <location>{escape(str(location2))}</location>
            </Ontology>
        </onto2>""")

    for (entity1, entity2), confidence in aligns.items():
        data.append(f"""        <map>
            <Cell>
                <entity1 rdf:resource="{_xml_attr(entity1)}"/>
                <entity2 rdf:resource="{_xml_attr(entity2)}"/>
                <relation>=</relation>
                <measure rdf:datatype="http://www.w3.org/2001/XMLSchema#float">{confidence}</measure>
            </Cell>
        </map>""")

    data.append("""    </Alignment>
</rdf:RDF>""")

    return '\n'.join(data)
def ssom(aligns):
    """Serialise *aligns* as an SSSOM TSV document.

    Parameters
    ----------
    aligns : dict[tuple[str, str], float]
        Maps ``(subject, object)`` IRI pairs to a confidence score.

    Returns
    -------
    str
        A header row followed by one ``skos:exactMatch`` row per alignment.
    """
    header = 'subject_id\tpredicate_id\tobject_id\tmapping_justification\tconfidence'
    rows = [
        f"{subject}\tskos:exactMatch\t{obj}\tsemapv:LexicalMatching\t{confidence}"
        for (subject, obj), confidence in aligns.items()
    ]
    return "\n".join([header, *rows])


if __name__ == '__main__':
    cli_args = parse_arguments()

    # Word embeddings + sentence encoder backing the property matcher.
    # NOTE(review): hard-coded embeddings path — consider exposing as a CLI option.
    word_model = Finbank('/home/guilherme/Documents/kg/fin.bin')
    sentence_model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    property_matcher = PropertyMatcher(word_model, sentence_model)

    source_graph = rdflib.Graph().parse(cli_args.source)
    target_graph = rdflib.Graph().parse(cli_args.target)

    # Match properties between the two ontologies at a 0.65 similarity threshold.
    matches, _iterations = property_matcher.match_ontologies(source_graph, target_graph, 0.65)

    if cli_args.format == 'sssom':
        serialised = ssom(matches)
        suffix = '.tsv'
    else:
        serialised = toAlignFormat(matches, get_namespace(source_graph), get_namespace(target_graph),
                                   cli_args.source, cli_args.target)
        suffix = '.rdf'

    # Persist the alignment and print a file:// URL to it.
    with tempfile.NamedTemporaryFile('w', prefix='alignment_', suffix=suffix, delete=False) as out_file:
        out_file.write(serialised)

    print(parse.urljoin("file:", request.pathname2url(out_file.name)))
len(pa.intersection(cp)) trm[i][1] += len(cp) - print(f'ontology iterations: {oi}, {metrics(len(pa.intersection(cp)), len(cp), current_total)}, aligns: {current_total}, po1: {len(a_entities)}, po2: {len(b_entities)}') + print( + f'ontology iterations: {oi}, {metrics(len(pa.intersection(cp)), len(cp), current_total)}, aligns: {current_total}, po1: {len(a_entities)}, po2: {len(b_entities)}') # for a1, a2 in pa.intersection(p): # print(colored('✓', 'green'), get_n(a1, o1), get_n(a2, o2)) @@ -471,8 +468,9 @@ class PropertyMatcher: # print(colored('X', 'red'), get_n(d1, o1), get_n(a1, o1), get_n(r1, o1), colored('<>', 'green'), # get_n(d2, o2), get_n(a2, o2), get_n(r2, o2)) - # print( - # f'ontology iterations: {oi}, {metrics(current_correct, current_pred, current_total)}, aligns: {current_total}, po1: {len(a_entities)}, po2: {len(b_entities)}') + print( + f'ontology iterations: {oi}, {metrics(current_correct, current_pred, current_total)}, aligns: {current_total}, po1: {len(a_entities)}, po2: {len(b_entities)}') + print(f'iterations: {iterations}, {metrics(correct, pred, total)}') if tr is not None: res = [] @@ -540,6 +538,4 @@ class PropertyMatcher: pm[iv1] = (iv2, sim) pm[iv2] = (iv1, sim) - return p, iterations -