From 25f2031091e151be32028b0810a82aafdf9e124a Mon Sep 17 00:00:00 2001 From: Guilherme Henrique <guihss.cs@gmail.com> Date: Tue, 22 Nov 2022 16:18:32 +0100 Subject: [PATCH] changed similarity metric --- .../complex/ComplexAlignmentGeneration.java | 8 +++--- .../irit/complex/answer/SingleAnswer.java | 3 ++- src/main/java/irit/resource/IRI.java | 1 - .../irit/similarity/EmbeddingManager.java | 26 ++++++++++++++----- 4 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/main/java/irit/complex/ComplexAlignmentGeneration.java b/src/main/java/irit/complex/ComplexAlignmentGeneration.java index a993346..89ca501 100755 --- a/src/main/java/irit/complex/ComplexAlignmentGeneration.java +++ b/src/main/java/irit/complex/ComplexAlignmentGeneration.java @@ -23,6 +23,7 @@ import org.apache.jena.rdf.model.RDFNode; import java.io.IOException; import java.nio.file.Paths; +import java.time.Instant; import java.util.*; import java.util.concurrent.*; @@ -32,7 +33,6 @@ public class ComplexAlignmentGeneration { public static void main(String[] args) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException, ExecutionException, InterruptedException, IncompleteSubstitutionException, IOException { - ArgumentParser parser = buildArgumentParser(); @@ -67,6 +67,7 @@ public class ComplexAlignmentGeneration { parser.handleError(e); } + } @@ -162,7 +163,6 @@ public class ComplexAlignmentGeneration { public static void align(SparqlSelect sq, String sourceEndpoint, String targetEndpoint, int maxMatches, boolean reassess, List<Float> th, OutputManager outputManager) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException, ExecutionException, InterruptedException, IncompleteSubstitutionException { Set<Answer> matchedAnswers = getMatchedAnswers(sq, sourceEndpoint, targetEndpoint, maxMatches); - for (float threshold : th) { List<SubgraphForOutput> subgraphForOutputs = buildSingleOutput(matchedAnswers, sq, sourceEndpoint, targetEndpoint, threshold, reassess); @@ -252,7 +252,6 @@ public class ComplexAlignmentGeneration { private static List<SubgraphForOutput> buildSingleOutput(Set<Answer> matchedAnswers, SparqlSelect sq, String sourceEndpoint, String targetEndpoint, float threshold, boolean reassess) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException { HashSet<InstantiatedSubgraph> goodSubgraphs = new HashSet<>(); for (Answer ans : matchedAnswers) { - HashSet<InstantiatedSubgraph> localSubgraphs = ans.findCorrespondingSubGraph(sq, targetEndpoint, threshold); goodSubgraphs.addAll(localSubgraphs); } @@ -279,13 +278,13 @@ public class ComplexAlignmentGeneration { } } - if (reassess) { for (SubgraphForOutput s : output) { s.reassessSimilarityWithCounterExamples(sourceEndpoint, targetEndpoint, sq); } } + Collections.sort(output); ArrayList<SubgraphForOutput> singleOutput = new ArrayList<>(); if (output.size() > 0 && output.get(output.size() - 1).getSimilarity() < 0.6 && output.get(output.size() - 1).getSimilarity() > 0.01) { @@ -297,6 +296,7 @@ public class ComplexAlignmentGeneration { if (output.get(i).getSimilarity() == sim) { singleOutput.add(output.get(i)); + } else { moreCorrespondences = false; } diff --git a/src/main/java/irit/complex/answer/SingleAnswer.java b/src/main/java/irit/complex/answer/SingleAnswer.java index b37e5dc..4be44ba 100755 --- a/src/main/java/irit/complex/answer/SingleAnswer.java +++ b/src/main/java/irit/complex/answer/SingleAnswer.java @@ -62,6 +62,7 @@ public class SingleAnswer extends Answer { HashSet<InstantiatedSubgraph> goodTriples = new HashSet<>(); int count = 0; + for (IRI iri : res.getSimilarIRIs()) { if (count < numberMaxOfExploredAnswers) { @@ -150,7 +151,7 @@ public class SingleAnswer extends Answer { iri.getValue() + " ?predicate ?object." + "MINUS{ " + iri.getValue() + " <http://www.w3.org/2002/07/owl#sameAs> ?object.}" - + "}LIMIT 500"; + + "} LIMIT 500"; List<Map<String, RDFNode>> result = SparqlProxy.query(targetEndpoint, query); diff --git a/src/main/java/irit/resource/IRI.java b/src/main/java/irit/resource/IRI.java index 170e8ed..6eadbc5 100755 --- a/src/main/java/irit/resource/IRI.java +++ b/src/main/java/irit/resource/IRI.java @@ -124,7 +124,6 @@ public class IRI extends Resource { } } - /*Check if a match is in the target dataset*/ for (IRI match : allMatches) { if ( DatasetManager.getInstance().labelMaps.get(targetEndpoint).exists(match.toString())) { similarIRIs.add(match); diff --git a/src/main/java/irit/similarity/EmbeddingManager.java b/src/main/java/irit/similarity/EmbeddingManager.java index aa627ef..468595f 100644 --- a/src/main/java/irit/similarity/EmbeddingManager.java +++ b/src/main/java/irit/similarity/EmbeddingManager.java @@ -13,12 +13,15 @@ import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.regex.Matcher; +import java.util.regex.Pattern; public class EmbeddingManager { private static Map<String, INDArray> embs1 = new HashMap<>(); public static long[] embshape; + private static final Pattern pattern = Pattern.compile("([^>]+)[#/]([A-Za-z0-9_-]+)"); public static void load(String n1, String e1) throws IOException { @@ -31,9 +34,20 @@ public class EmbeddingManager { } - public static double getSim(String s1, String s2){ + public static double getSim(String s1, String s2) { + s1 = getSuffix(s1).toLowerCase(); + s2 = getSuffix(s2).toLowerCase(); + return 1 - LevenshteinDistance.getDefaultInstance().apply(s1, s2) / (float) Math.max(s1.length(), s2.length()); + } + + private static String getSuffix(String value) { - return LevenshteinDistance.getDefaultInstance().apply(s1, s2) / (float) Math.max(s1.length(), s2.length()); + Matcher matcher = pattern.matcher(value); + if (matcher.find()) { + return matcher.group(2); + } else { + return value; + } } private static Map<String, INDArray> loadEmbs(String n1, String e1) throws IOException { @@ -54,17 +68,17 @@ public class EmbeddingManager { return embsMap; } - public static INDArray get(String e1){ + public static INDArray get(String e1) { if (!embs1.containsKey(e1)) return Nd4j.zeros(DataType.DOUBLE, embshape); return embs1.get(e1); } - private static String processLabel(String line){ + private static String processLabel(String line) { line = line.replaceAll("\\\\n", "\\n").trim(); - if (line.startsWith("http://") && line.contains("#")){ + if (line.startsWith("http://") && line.contains("#")) { String[] split = line.split("#"); - if (split.length > 1){ + if (split.length > 1) { line = split[1]; } else { line = split[0]; -- GitLab