diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..7e511a4f71a14e36b90af9746b481a88cde92475 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +.idea +lib +out +/output/ +target \ No newline at end of file diff --git a/pom.xml b/pom.xml new file mode 100644 index 0000000000000000000000000000000000000000..4349fcd61e9617101af90e526d344bab2aebe391 --- /dev/null +++ b/pom.xml @@ -0,0 +1,107 @@ +<?xml version="1.0" encoding="UTF-8"?> +<project xmlns="http://maven.apache.org/POM/4.0.0" + xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <groupId>org.example</groupId> + <artifactId>CanardE</artifactId> + <version>1.0-SNAPSHOT</version> + + <properties> + <maven.compiler.source>18</maven.compiler.source> + <maven.compiler.target>18</maven.compiler.target> + </properties> + + <dependencies> + <dependency> + <groupId>org.deeplearning4j</groupId> + <artifactId>deeplearning4j-core</artifactId> + <version>1.0.0-M1.1</version> + </dependency> + <dependency> + <groupId>org.nd4j</groupId> + <artifactId>nd4j-native-platform</artifactId> + <version>1.0.0-M1.1</version> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-core</artifactId> + <version>2.12.1</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-annotations</artifactId> + <version>2.12.1</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>com.fasterxml.jackson.core</groupId> + <artifactId>jackson-databind</artifactId> + <version>2.12.1</version> + <scope>compile</scope> + </dependency> + <dependency> + <groupId>fr.inrialpes.exmo</groupId> + <artifactId>procalign</artifactId> + <version>4.9</version> + <exclusions> + <exclusion> + <artifactId>log4j-core</artifactId> + <groupId>org.apache.logging.log4j</groupId> + </exclusion> + </exclusions> + <scope>system</scope> + <systemPath>${project.basedir}/lib/procalign.jar</systemPath> + </dependency> + <dependency> + <groupId>fr.inrialpes.exmo.</groupId> + <artifactId>ontowrap</artifactId> + <version>4.9</version> + <scope>system</scope> + <systemPath>${project.basedir}/lib/ontowrap.jar</systemPath> + </dependency> + <dependency> + <groupId>org.semanticweb.owl.align</groupId> + <artifactId>align</artifactId> + <version>4.9</version> + <scope>system</scope> + <systemPath>${project.basedir}/lib/align.jar</systemPath> + </dependency> + + + <dependency> + <groupId>commons-codec</groupId> + <artifactId>commons-codec</artifactId> + <version>1.15</version> + </dependency> + + + <!-- https://mvnrepository.com/artifact/org.slf4j/slf4j-nop --> + <dependency> + <groupId>org.slf4j</groupId> + <artifactId>slf4j-nop</artifactId> + <version>1.7.36</version> + </dependency> + + + <dependency> + <groupId>org.apache.jena</groupId> + <artifactId>apache-jena-libs</artifactId> + <type>pom</type> + <version>4.4.0</version> + </dependency> + + + <!-- https://mvnrepository.com/artifact/org.apache.commons/commons-text --> + <dependency> + <groupId>org.apache.commons</groupId> + <artifactId>commons-text</artifactId> + <version>1.9</version> + </dependency> + + + </dependencies> + +</project> \ No newline at end of file diff --git a/src/main/java/irit/complex/ComplexAlignmentGeneration.java b/src/main/java/irit/complex/ComplexAlignmentGeneration.java new file mode 100755 index 
0000000000000000000000000000000000000000..b3ec9f1e0ea7e0be6df5c5c7948be0644d5a4af5 --- /dev/null +++ b/src/main/java/irit/complex/ComplexAlignmentGeneration.java @@ -0,0 +1,372 @@ +package irit.complex; + +import irit.complex.answer.Answer; +import irit.complex.answer.PairAnswer; +import irit.complex.answer.SingleAnswer; +import irit.complex.subgraphs.*; +import irit.dataset.DatasetManager; +import irit.misc.Progress; +import irit.output.OutputManager; +import irit.resource.IRI; +import irit.resource.Resource; +import irit.similarity.EmbeddingManager; +import irit.sparql.exceptions.IncompleteSubstitutionException; +import irit.sparql.SparqlProxy; +import irit.sparql.query.exception.SparqlEndpointUnreachableException; +import irit.sparql.query.exception.SparqlQueryMalFormedException; +import irit.sparql.query.select.SparqlSelect; +import org.apache.jena.rdf.model.RDFNode; + +import java.io.FileNotFoundException; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.*; +import java.util.concurrent.*; + + +public class ComplexAlignmentGeneration { + + + public static void main(String[] args) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException, ExecutionException, InterruptedException, IncompleteSubstitutionException { + + System.out.println("==============================================================================="); + System.out.println("CanardE"); + System.out.println("==============================================================================="); + + String datasets = "/home/guilherme/IdeaProjects/conference-dataset-population-elodie/populated_datasets/data_100"; + String source = "cmt_100.ttl"; + String target = "conference_100.ttl"; + + Set<String> stringSet = Set.of(source, target); + + Map<String, String> ds = new HashMap<>(); + + try { + Files.walk(Paths.get(datasets), 1).forEach(path -> { + if (!path.toString().endsWith(".ttl") && !stringSet.contains(path.getFileName().toString())) return; + ds.put(path.getFileName().toString().split("_")[0], path.toString()); + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + + System.out.println("Found " + ds.size() + " datasets."); + + String needs = "/home/guilherme/IdeaProjects/ComplexAlignmentGenerator/needs"; + + Map<String, String> nd = new HashMap<>(); + Map<String, List<SparqlSelect>> cqas = new HashMap<>(); + + try { + Files.walk(Paths.get(needs), 1).forEach(path -> { + String ont = path.getFileName().toString(); + if (!ds.containsKey(ont)) return; + nd.put(ont, path.toString()); + + + try { + Files.walk(path, 1).forEach(path1 -> { + if (Files.isDirectory(path1)) return; + Scanner squery = null; + try { + squery = new Scanner(path1); + } catch (IOException e) { + throw new RuntimeException(e); + } + + String query = squery.useDelimiter("\\Z").next(); + SparqlSelect sq = new SparqlSelect(query); + cqas.computeIfAbsent(ont, s -> new ArrayList<>()).add(sq); + squery.close(); + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + + + + + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + + for (String s : ds.keySet()) { + if (nd.containsKey(s)) continue; + System.out.println("⚠️ Not found CQAs for " + s + "."); + } + + String embeddings = "/home/guilherme/Documents/canard/run/glove"; + Map<String, String[]> embs = new HashMap<>(); + try { + Files.walk(Paths.get(embeddings), 1).forEach(path -> { + if (Files.isDirectory(path)) return; + String f = path.getFileName().toString(); + String[] split = 
f.split("[_.]"); + + if (split[1].equals("n")) embs.computeIfAbsent(split[0], s -> new String[2])[0] = path.toString(); + else if (split[1].equals("e")) embs.computeIfAbsent(split[0], s -> new String[2])[1] = path.toString(); + + }); + } catch (IOException e) { + throw new RuntimeException(e); + } + + String range = "0:1.1:0.1"; + + String[] split = range.split(":"); + + List<Float> ths = new ArrayList<>(); + + for (float th = Float.parseFloat(split[0]); th <= Float.parseFloat(split[1]); th += Float.parseFloat(split[2])) { + ths.add(th); + } + + embs.forEach((name, paths) -> { + try { + EmbeddingManager.load(paths[0], paths[1]); + } catch (IOException e) { + throw new RuntimeException(e); + } + }); + + + ds.forEach((name, path) -> { + DatasetManager.getInstance().load(name, path); + }); + + + List<String[]> datasetArgs = new ArrayList<>(); + + + ds.forEach((s, s2) -> { + if (source != null && !source.startsWith(s)) return; + ds.forEach((s1, s21) -> { + if (s.equals(s1)) return; + if (target != null && !target.startsWith(s1)) return; + datasetArgs.add(new String[]{s, s1}); + }); + }); + + + + int tc = 1; +// ExecutorService executorService = Executors.newFixedThreadPool(tc); + + String output = "output"; + + for (String[] datasetArg : datasetArgs) { + run(datasetArg[0], datasetArg[1], cqas.get(datasetArg[0]), ths, 10, false, output); +// progress.step(); + } + + + } + + + public static void run(String sourceEndpoint, String targetEndpoint, List<SparqlSelect> queries, List<Float> th, int maxMatches, boolean reassess, String outputPath) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException, ExecutionException, InterruptedException, IncompleteSubstitutionException { + OutputManager outputManager = new OutputManager(); + outputManager.initOutputEdoal(sourceEndpoint, targetEndpoint, th, outputPath); + + + + + for (SparqlSelect sq : queries) { + align(sq, sourceEndpoint, targetEndpoint, maxMatches, reassess, th, outputManager); + } + + + outputManager.endOutput(); + } + + + public static void align(SparqlSelect sq, String sourceEndpoint, String targetEndpoint, int maxMatches, boolean reassess, List<Float> th, OutputManager outputManager) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException, ExecutionException, InterruptedException, IncompleteSubstitutionException { + Set<Answer> matchedAnswers = getMatchedAnswers(sq, sourceEndpoint, targetEndpoint, maxMatches); + + + for (float threshold : th) { + + List<SubgraphForOutput> subgraphForOutputs = buildSingleOutput(matchedAnswers, sq, sourceEndpoint, targetEndpoint, threshold, reassess); + + if (!subgraphForOutputs.isEmpty()) { + outputManager.addToOutput(threshold, sq, subgraphForOutputs); + } + + } + + + } + + + public static Set<Answer> getMatchedAnswers(SparqlSelect sq, String sourceEndpoint, String targetEndpoint, int maxMatches) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException, IncompleteSubstitutionException { + HashMap<String, IRI> iriList = sq.getIRIList(); + for (Map.Entry<String, IRI> m : iriList.entrySet()) { + m.getValue().retrieveLabels(sourceEndpoint); + } + + ArrayList<Answer> answers = new ArrayList<>(); + HashSet<Answer> matchedAnswers = new HashSet<>(); + int offsetMatch = 0; + + boolean noMoreSourceAnswers = false; + int offset = 0; + int limit = 2000; + while (!noMoreSourceAnswers && matchedAnswers.size() < maxMatches) { + + String queryLimit = " LIMIT " + limit + "\n OFFSET " + offset; + + if (sq.getFocusLength() == 1) { + List<Map<String, RDFNode>> ret = 
loadUnary(sourceEndpoint, sq, answers, queryLimit); + if (ret.size() < limit) { + noMoreSourceAnswers = true; + } + } else if (sq.getFocusLength() == 2) { + List<Map<String, RDFNode>> ret = loadBinary(sourceEndpoint, sq, answers, queryLimit); + if (ret.size() < limit) { + noMoreSourceAnswers = true; + } + } else { + System.out.println("ERROR for query : " + sq.toUnchangedString()); + System.err.println("Problem detected: too many variables in SELECT: can only deal with 1 or 2"); + noMoreSourceAnswers = true; + } + + + if (!noMoreSourceAnswers) { + offset += limit; + } + + while (matchedAnswers.size() < maxMatches && offsetMatch < answers.size()) { + + Answer ans = answers.get(offsetMatch); + ans.getExistingMatches(sourceEndpoint, targetEndpoint); + if (ans.hasMatch()) { + matchedAnswers.add(ans); + } + offsetMatch++; + } + + + } + + + if (matchedAnswers.isEmpty()) { + + Iterator<Answer> ansIt = answers.iterator(); + while (matchedAnswers.size() < maxMatches && ansIt.hasNext()) { + Answer ans = ansIt.next(); + ans.retrieveIRILabels(sourceEndpoint); + ans.getSimilarIRIs(targetEndpoint); + if (ans.hasMatch()) { + matchedAnswers.add(ans); + } + } + + + } + + + return matchedAnswers; + } + + + private static List<SubgraphForOutput> buildSingleOutput(Set<Answer> matchedAnswers, SparqlSelect sq, String sourceEndpoint, String targetEndpoint, float threshold, boolean reassess) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException { + HashSet<InstantiatedSubgraph> goodSubgraphs = new HashSet<>(); + for (Answer ans : matchedAnswers) { + + HashSet<InstantiatedSubgraph> localSubgraphs = ans.findCorrespondingSubGraph(sq, targetEndpoint, threshold); + goodSubgraphs.addAll(localSubgraphs); + } + ArrayList<SubgraphForOutput> output = new ArrayList<>(); + for (InstantiatedSubgraph t : goodSubgraphs) { + boolean added = false; + Iterator<SubgraphForOutput> it = output.iterator(); + while (it.hasNext() && !added) { + SubgraphForOutput subG = it.next(); + if (t instanceof Triple && subG instanceof TripleSubgraph) { + added = ((TripleSubgraph) subG).addSubgraph((Triple) t); + } + if (t instanceof Path && subG instanceof PathSubgraph) { + added = ((PathSubgraph) subG).addSubgraph((Path) t); + } + } + if (!added) { + if (t instanceof Triple) { + output.add(new TripleSubgraph((Triple) t)); + } + if (t instanceof Path) { + output.add(new PathSubgraph((Path) t)); + } + } + } + +// System.out.println("Number of correspondences found (" + threshold + "): " + output.size()); + + if (reassess) { + System.out.println("Reassessing similarity"); + for (SubgraphForOutput s : output) { + s.reassessSimilarityWithCounterExamples(sourceEndpoint, targetEndpoint, sq); + } + } + + Collections.sort(output); + ArrayList<SubgraphForOutput> singleOutput = new ArrayList<>(); + if (output.size() > 0 && output.get(output.size() - 1).getSimilarity() < 0.6 && output.get(output.size() - 1).getSimilarity() > 0.01) { + double sim = output.get(output.size() - 1).getSimilarity(); + boolean moreCorrespondences = true; + int i = output.size() - 1; + while (i >= 0 && moreCorrespondences) { + + if (output.get(i).getSimilarity() == sim) { + singleOutput.add(output.get(i)); + + } else { + moreCorrespondences = false; + } + i--; + } + } else { + for (SubgraphForOutput s : output) { + if (s.getSimilarity() >= 0.6) { + singleOutput.add(s); + } + } + } + + + return singleOutput; + } + + private static List<Map<String, RDFNode>> loadBinary(String sourceEndpoint, SparqlSelect sq, ArrayList<Answer> answers, String queryLimit) { + 
List<Map<String, RDFNode>> result = SparqlProxy.query(sourceEndpoint, sq.toUnchangedString() + queryLimit); + for (Map<String, RDFNode> response : result) { + String s1 = response.get(sq.getSelectFocus().get(0).replaceFirst("\\?", "")).toString(); + String s2 = response.get(sq.getSelectFocus().get(1).replaceFirst("\\?", "")).toString(); + boolean type1 = response.get(sq.getSelectFocus().get(0).replaceFirst("\\?", "")).isAnon(); + boolean type2 = response.get(sq.getSelectFocus().get(1).replaceFirst("\\?", "")).isAnon(); + if (!type1 && !type2) { + if (!s1.equals("") && !s2.equals("")) { + PairAnswer pair = new PairAnswer(new Resource(s1), new Resource(s2)); + answers.add(pair); + } + } + } + return result; + } + + private static List<Map<String, RDFNode>> loadUnary(String sourceEndpoint, SparqlSelect sq, ArrayList<Answer> answers, String queryLimit) { + List<Map<String, RDFNode>> result = SparqlProxy.query(sourceEndpoint, sq.toUnchangedString() + queryLimit); + for (Map<String, RDFNode> response : result) { + String s = response.get(sq.getSelectFocus().get(0).replaceFirst("\\?", "")).toString(); + boolean type = response.get(sq.getSelectFocus().get(0).replaceFirst("\\?", "")).isAnon(); + if (!type) { + SingleAnswer singleton = new SingleAnswer(new Resource(s)); + answers.add(singleton); + } + + } + return result; + } +} + diff --git a/src/main/java/irit/complex/answer/Answer.java b/src/main/java/irit/complex/answer/Answer.java new file mode 100755 index 0000000000000000000000000000000000000000..79fbf7b4421056287ce16381627a22f38bcb5fb9 --- /dev/null +++ b/src/main/java/irit/complex/answer/Answer.java @@ -0,0 +1,36 @@ +package irit.complex.answer; + +import irit.complex.subgraphs.InstantiatedSubgraph; +import irit.sparql.query.exception.SparqlEndpointUnreachableException; +import irit.sparql.query.exception.SparqlQueryMalFormedException; +import irit.sparql.query.select.SparqlSelect; + +import java.util.HashSet; + +public abstract class Answer { + final HashSet<String> goodTriples ; + + public Answer(){ + goodTriples = new HashSet<>(); + + } + + public void getSimilarIRIs(String targetEndpoint) {} + + public void getExistingMatches(String sourceEndpoint, String targetEndpoint) {} + + public void retrieveIRILabels(String endpointURL) {} + + public HashSet<InstantiatedSubgraph> findCorrespondingSubGraph(SparqlSelect query, String targetEndpoint, double similarityThreshold) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException { + return new HashSet<>(); + } + + public boolean hasMatch(){ + return false; + } + + public String printMatchedEquivalents() { + return ""; + } + +} diff --git a/src/main/java/irit/complex/answer/PairAnswer.java b/src/main/java/irit/complex/answer/PairAnswer.java new file mode 100755 index 0000000000000000000000000000000000000000..2943f78745700bd862f7eba7784c0a54345e8f5b --- /dev/null +++ b/src/main/java/irit/complex/answer/PairAnswer.java @@ -0,0 +1,234 @@ +package irit.complex.answer; + +import irit.complex.subgraphs.InstantiatedSubgraph; +import irit.complex.subgraphs.Path; +import irit.resource.IRI; +import irit.resource.Resource; +import irit.sparql.query.select.SparqlSelect; + +import java.util.ArrayList; +import java.util.HashSet; + +public class PairAnswer extends Answer{ + final Resource r1; + final Resource r2; + boolean similarlooked; + + public PairAnswer(Resource r1, Resource r2) { + if(r1.isIRI()){ + this.r1 = new IRI("<"+ r1 +">"); + } + else{ + this.r1= r1; + } + if(r2.isIRI()){ + this.r2 = new IRI("<"+ r2 +">"); + } + else{ + this.r2= 
r2; + } + similarlooked=false; + } + + public void retrieveIRILabels(String endpointURL) { + if (r1 instanceof IRI){ + ((IRI) r1).retrieveLabels(endpointURL); + } + if (r2 instanceof IRI){ + ((IRI) r2).retrieveLabels(endpointURL); + } + + } + + + public void getSimilarIRIs(String targetEndpoint) { + if(!similarlooked) { + + if (r1 instanceof IRI iri){ + iri.findSimilarResource(targetEndpoint); + + } + if (r2 instanceof IRI iri){ + iri.findSimilarResource(targetEndpoint); + + } + + similarlooked=true; + } + } + + public void getExistingMatches(String sourceEndpoint, String targetEndpoint) { + + if (r1 instanceof IRI iri){ + iri.findExistingMatches(sourceEndpoint, targetEndpoint); + } + if (r2 instanceof IRI iri){ + iri.findExistingMatches(sourceEndpoint, targetEndpoint); + } + + } + + + private HashSet<InstantiatedSubgraph> findCorresponding(SparqlSelect query, String targetEndpoint, double similarityThreshold, int currentLen, int maxLen) { + + HashSet<InstantiatedSubgraph> paths = new HashSet<>(); + + if (currentLen > maxLen) return paths; + + HashSet<String> queryLabels = query.getLabels(); + + if (hasTotalMatch()) { + + for (Resource x: r1.getSimilarIRIs()) { + + for (Resource y: r2.getSimilarIRIs()) { + int length=1; + boolean found = false; + + while (length< 4 && !found) { + ArrayList<ArrayList<Boolean>> allInv= allInversePossibilities(length); + for(ArrayList<Boolean> invArray : allInv) { + Path p = new Path(x,y,targetEndpoint,length,invArray); + if (p.pathFound()) { + p.getMostSimilarTypes(targetEndpoint, queryLabels, 0.0); + paths.add(p); + found = true; + } + } + length ++; + } + } + + } + + } + + + if (paths.isEmpty() && hasR1Match()) { + if(!r2.isIRI()) { + for (IRI x: r1.getSimilarIRIs()) { + int length=1; + boolean found = false; + + while (length< 4 && !found) { + ArrayList<ArrayList<Boolean>> allInv= allInversePossibilities(length); + for(ArrayList<Boolean> invArray : allInv) { + Path p = new Path(x,r2,targetEndpoint,length,invArray); + if (p.pathFound()) { + p.getMostSimilarTypes(targetEndpoint, queryLabels, 0.0); + paths.add(p); + found = true; + } + } + length ++; + } + } + + } + + } + + + if (paths.isEmpty() && hasR2Match()) { + if(!r1.isIRI()) { + for (IRI y: r2.getSimilarIRIs()) { + int length=1; + boolean found = false; + + while (length< 4 && !found) { + ArrayList<ArrayList<Boolean>> allInv= allInversePossibilities(length); + for(ArrayList<Boolean> invArray : allInv) { + Path p = new Path(r1,y,targetEndpoint,length,invArray); + if (p.pathFound()) { + p.getMostSimilarTypes(targetEndpoint, queryLabels, 0.0); + paths.add(p); + found = true; + } + } + length ++; + } + } + + } + + } + + for (InstantiatedSubgraph p: paths) { + if (p instanceof Path) { + + ((Path) p).compareLabel(queryLabels, similarityThreshold, targetEndpoint,0.5); + + } + else { + System.err.println("problem in Pair answer: instantiated subgraph is not a path..."); + } + } + if (paths.isEmpty() && !similarlooked) { + getSimilarIRIs(targetEndpoint); + System.out.println("No path found, similar answers : "+ printMatchedEquivalents()); + paths = findCorresponding(query,targetEndpoint,similarityThreshold, currentLen + 1, maxLen); + } + + return paths; + } + + public HashSet<InstantiatedSubgraph> findCorrespondingSubGraph(SparqlSelect query, String targetEndpoint, double similarityThreshold ) { + return findCorresponding(query, targetEndpoint, similarityThreshold, 0, 5); + + } + + // has at least one match (r1 or r2) + public boolean hasMatch(){ + boolean match = !r1.isIRI() || hasR1Match(); + if(r2.isIRI() 
&& !hasR2Match()) { + match= false; + } + return match && hasR1Match() && hasR2Match(); + } + + private boolean hasR1Match() { + return !r1.getSimilarIRIs().isEmpty(); + } + + private boolean hasR2Match() { + return !r2.getSimilarIRIs().isEmpty(); + } + + private boolean hasTotalMatch() { + return hasR1Match() && hasR2Match(); + } + + public String toString() { + return r1.toString() + " " + r2.toString(); + } + + private ArrayList<ArrayList<Boolean>> allInversePossibilities(int length){ + ArrayList<ArrayList<Boolean>> result = new ArrayList<>(); + for(int i =0; i< Math.pow(2, length); i++) { + ArrayList<Boolean>invArray = new ArrayList<>(); + StringBuilder invStr = new StringBuilder(Integer.toBinaryString(i)); + while(invStr.length()<length) { + invStr.insert(0, "0"); + } + for(char invCh: invStr.toString().toCharArray()) { + if(invCh == '0') { + invArray.add(false); + } + else if (invCh == '1') { + invArray.add(true); + } + } + //.println(invArray); + result.add(invArray); + } + + + return result; + + } + + public String printMatchedEquivalents() { + return r1.getSimilarIRIs().toString() +" <--> "+ r2.getSimilarIRIs().toString(); + } + +} diff --git a/src/main/java/irit/complex/answer/SingleAnswer.java b/src/main/java/irit/complex/answer/SingleAnswer.java new file mode 100755 index 0000000000000000000000000000000000000000..e1ad15174033685d9e034b99e13ad3a90f4a3c05 --- /dev/null +++ b/src/main/java/irit/complex/answer/SingleAnswer.java @@ -0,0 +1,203 @@ +package irit.complex.answer; + +import irit.complex.subgraphs.InstantiatedSubgraph; +import irit.complex.subgraphs.Triple; +import irit.complex.subgraphs.TripleType; +import irit.resource.IRI; +import irit.resource.Resource; +import irit.similarity.EmbeddingManager; +import irit.sparql.SparqlProxy; +import irit.sparql.query.exception.SparqlEndpointUnreachableException; +import irit.sparql.query.exception.SparqlQueryMalFormedException; +import irit.sparql.query.select.SparqlSelect; +import org.apache.jena.rdf.model.RDFNode; +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; + +import java.util.HashSet; +import java.util.List; +import java.util.Map; + +public class SingleAnswer extends Answer { + final Resource res; + final int numberMaxOfExploredAnswers; + + + public SingleAnswer(Resource r) { + super(); + if (r.isIRI()) { + res = new IRI("<" + r + ">"); + } else { + res = r; + } + numberMaxOfExploredAnswers = 20; + } + + public void retrieveIRILabels(String endpointURL) { + if (res instanceof IRI) { + ((IRI) res).retrieveLabels(endpointURL); + } + } + + public void getSimilarIRIs(String targetEndpoint) { + if (res.getSimilarIRIs().isEmpty()) { + res.findSimilarResource(targetEndpoint); + } + } + + public void getExistingMatches(String sourceEndpoint, String targetEndpoint) { + if (res instanceof IRI) { + ((IRI) res).findExistingMatches(sourceEndpoint, targetEndpoint); + } + } + + public HashSet<InstantiatedSubgraph> findCorrespondingSubGraph(SparqlSelect query, String targetEndpoint, double similarityThreshold) throws SparqlEndpointUnreachableException, SparqlQueryMalFormedException { + + HashSet<String> queryLabels = query.getLabels(); + + INDArray zeros = Nd4j.zeros(DataType.DOUBLE, EmbeddingManager.embshape); + + for (String queryLabel : queryLabels) { + zeros = zeros.add(EmbeddingManager.get(queryLabel)); + } + + zeros = zeros.div(queryLabels.size()); + + + + double maxSim = -1; + Triple bestTriple = new Triple(); + HashSet<InstantiatedSubgraph> goodTriples = 
new HashSet<>(); + + int count = 0; + for (IRI iri : res.getSimilarIRIs()) { + if (count < numberMaxOfExploredAnswers) { + + count++; + double localMaxSim = -1; + retrieveAllTriples(iri, targetEndpoint); + + for (Triple t : iri.getTriples()) { + + double similarity = 0; +// t.retrieveIRILabels(targetEndpoint); +// t.retrieveTypes(targetEndpoint); +// similarity += t.compareLabel(queryLabels, similarityThreshold, targetEndpoint); + similarity += t.compareSim(zeros, similarityThreshold); + + if (similarity > maxSim) { + maxSim = similarity; + bestTriple = t; + } + + if (similarity > localMaxSim) { + localMaxSim = similarity; + } + + if (similarity >= 0.6) { + goodTriples.add(t); + } + } + + } + } + + if (goodTriples.isEmpty() && !bestTriple.isNullTriple()) { + goodTriples.add(bestTriple); + } + + return goodTriples; + } + + public void retrieveAllTriples(IRI iri, String targetEndpoint) { + if (!iri.isTriplesRetrieved()) { + getSubjectTriples(iri, targetEndpoint); + getObjectTriples(iri, targetEndpoint); + getPredicateTriples(iri, targetEndpoint); + iri.setTriplesRetrieved(true); + } + } + + + private void getPredicateTriples(IRI iri, String targetEndpoint) { + String query = "SELECT ?subject ?object WHERE {" + + "?subject " + iri.getValue() + " ?object." + + "} LIMIT 500"; + + List<Map<String, RDFNode>> result = SparqlProxy.query(targetEndpoint, query); + + + for (Map<String, RDFNode> response : result) { + String sub = response.get("subject").toString(); + String obj = response.get("object").toString(); + iri.getTriples().add(new Triple("<" + sub + ">", iri.getValue(), obj, TripleType.PREDICATE)); + } + + } + + private void getObjectTriples(IRI iri, String targetEndpoint) { + String query = "SELECT ?subject ?predicate WHERE {" + + "?subject ?predicate " + iri.getValue() + "." + + "MINUS{ ?subject <http://www.w3.org/2002/07/owl#sameAs> " + iri.getValue() + ".}" + + "MINUS{ ?subject <http://www.w3.org/2004/02/skos/core#closeMatch> " + iri.getValue() + ".}" + + "MINUS{ ?subject <http://www.w3.org/2004/02/skos/core#exactMatch> " + iri.getValue() + ".}" + + "}LIMIT 500"; + + List<Map<String, RDFNode>> result = SparqlProxy.query(targetEndpoint, query); + + + for (Map<String, RDFNode> response : result) { + String sub = response.get("subject").toString(); + String pred = response.get("predicate").toString(); + iri.getTriples().add(new Triple("<" + sub + ">", "<" + pred + ">", iri.getValue(), TripleType.OBJECT)); + } + + } + + private void getSubjectTriples(IRI iri, String targetEndpoint) { + String query = "SELECT ?predicate ?object WHERE {" + + iri.getValue() + " ?predicate ?object." 
+ + "MINUS{ " + iri.getValue() + " <http://www.w3.org/2002/07/owl#sameAs> ?object.}" + + + "}LIMIT 500"; + + List<Map<String, RDFNode>> result = SparqlProxy.query(targetEndpoint, query); + + for (Map<String, RDFNode> response : result) { + if (!response.get("object").toString().matches("\"b\\d+\"")) { + String pred = response.get("predicate").toString(); + String obj = response.get("object").toString(); + iri.getTriples().add(new Triple(iri.getValue(), "<" + pred + ">", obj, TripleType.SUBJECT)); + } + + } + + } + + + public String toString() { + return res.toValueString(); + } + + public int hashCode() { + return res.toValueString().hashCode(); + } + + public boolean equals(Object obj) { + if (obj instanceof SingleAnswer) { + return res.toValueString().equals(((SingleAnswer) obj).res.toValueString()); + } else { + return false; + } + } + + public boolean hasMatch() { + return !res.getSimilarIRIs().isEmpty(); + } + + public String printMatchedEquivalents() { + return res.getSimilarIRIs().toString(); + } + +} diff --git a/src/main/java/irit/complex/subgraphs/InstantiatedSubgraph.java b/src/main/java/irit/complex/subgraphs/InstantiatedSubgraph.java new file mode 100644 index 0000000000000000000000000000000000000000..bf6f8e012f34f1607020df2221a3890567dca037 --- /dev/null +++ b/src/main/java/irit/complex/subgraphs/InstantiatedSubgraph.java @@ -0,0 +1,18 @@ +package irit.complex.subgraphs; + +public class InstantiatedSubgraph implements Comparable<InstantiatedSubgraph>{ + double similarity; + + public double getSimilarity() { + return similarity; + } + + @Override + public int compareTo(InstantiatedSubgraph s) { + + return Double.compare(s.getSimilarity(), getSimilarity()); + } + +} + + diff --git a/src/main/java/irit/complex/subgraphs/Path.java b/src/main/java/irit/complex/subgraphs/Path.java new file mode 100644 index 0000000000000000000000000000000000000000..49b98128737cdabec76b0d8e15ff6d41412b79d8 --- /dev/null +++ b/src/main/java/irit/complex/subgraphs/Path.java @@ -0,0 +1,234 @@ +package irit.complex.subgraphs; + +import irit.resource.IRI; +import irit.resource.Resource; +import irit.similarity.EmbeddingManager; +import irit.sparql.SparqlProxy; +import org.apache.jena.rdf.model.RDFNode; + +import java.util.*; + +public class Path extends InstantiatedSubgraph { + ArrayList<IRI> properties; + final ArrayList<Resource> entities; + final ArrayList<IRI> types; + double similarity; + double typeSimilarity; + final ArrayList<Boolean> inverse; + + public Path(Resource x, Resource y, String sparqlEndpoint, int length, ArrayList<Boolean> inverse) { + properties = new ArrayList<>(); + entities = new ArrayList<>(); + types = new ArrayList<>(); + similarity = 0; + this.inverse = inverse; + + + findPathWithLength(x, y, sparqlEndpoint, length); + } + + private void findPathWithLength(Resource x, Resource y, String sparqlEndpoint, int length) { + + + String query; + StringBuilder queryBody = new StringBuilder(); + ArrayList<String> variables = new ArrayList<>(); + + if (!x.isIRI()) { + variables.add("?x"); + } else { + variables.add(x.toString()); + } + + + for (int i = 1; i <= length - 1; i++) { + variables.add("?v" + i); + } + + if (!y.isIRI()) { + variables.add("?y"); + } else { + variables.add(y.toString()); + } + + for (int i = 1; i <= length; i++) { + if (inverse.get(i - 1)) { + queryBody.append(variables.get(i)).append(" ?p").append(i).append(" ").append(variables.get(i - 1)).append(". 
\n"); + } else { + queryBody.append(variables.get(i - 1)).append(" ?p").append(i).append(" ").append(variables.get(i)).append(". \n"); + } + + } + + if (!x.isIRI()) { + queryBody.append(" filter (regex(?x, \"^").append(x).append("$\",\"i\"))\n"); + } + + if (!y.isIRI()) { + queryBody.append(" filter (regex(?y, \"^").append(y).append("$\",\"i\"))\n"); + } + + query = "SELECT DISTINCT * WHERE { " + queryBody + " } LIMIT 1"; + + + List<Map<String, RDFNode>> result = SparqlProxy.query(sparqlEndpoint, query); + Iterator<Map<String, RDFNode>> retIteratorTarg = result.iterator(); + + if (retIteratorTarg.hasNext()) { + Map<String, RDFNode> next = retIteratorTarg.next(); + if (next.containsKey("x")) { + entities.add(new Resource(next.get("x").toString())); + } else { + entities.add(x); + } + int i = 1; + boolean stop = false; + while (i <= length && !stop) { + String p = next.get("p" + i).toString(); + Resource res = new Resource(p); + switch (p) { + case "http://www.w3.org/1999/02/22-rdf-syntax-ns#type", + "http://www.w3.org/2002/07/owl#sameAs", + "http://www.w3.org/2004/02/skos/core#exactMatch", + "http://www.w3.org/2004/02/skos/core#closeMatch", + "http://dbpedia.org/ontology/wikiPageWikiLink" -> stop = true; + } + + if (res.isIRI()) { + properties.add(new IRI("<" + p + ">")); + } + i++; + } + if (stop) { + properties = new ArrayList<>(); + } + if (length >= 2 && !stop) { + for (int j = 1; j <= length - 1; j++) { + String v = next.get("v" + j).toString(); + Resource res = new Resource(v); + if (res.isIRI()) { + entities.add(new IRI("<" + v + ">")); + } else { + entities.add(res); + } + } + } + if (next.containsKey("y")) { + entities.add(new Resource(next.get("y").toString())); + } else { + entities.add(y); + } + + } + + + } + + public void compareLabel(HashSet<String> targetLabels, double threshold, String targetEndpoint, double typeThreshold) { + similarity = 0; + for (IRI prop : properties) { + prop.retrieveLabels(targetEndpoint); + similarity += similarity(prop.getLabels(), targetLabels, threshold); + } + + for (int i = 0; i < entities.size(); i++) { + Resource ent = entities.get(i); + if (ent instanceof IRI) { + IRI type = types.get(i); + if (type != null) { + double scoreType = similarity(type.getLabels(), targetLabels, threshold); + if (scoreType > typeThreshold) { + typeSimilarity += scoreType; + } else { + types.set(i, null); + } + } + } + } + if (pathFound()) { + similarity += 0.5;//if path + } + + getSimilarity(); + } + + public double similarity(Set<String> labels1, HashSet<String> labels2, double threshold){ + double score = 0; + for(String l1 : labels1){ + for(String l2: labels2){ + double sim = EmbeddingManager.getSim(l1, l2); + sim = sim < threshold ? 0 : sim; + score += sim; + } + } + return score; + } + + public boolean pathFound() { + return !properties.isEmpty(); + } + + //version all entities + public String toString() { + StringBuilder ret = new StringBuilder(); + for (int i = 0; i < properties.size(); i++) { + ret.append(entities.get(i)).append(" ").append(properties.get(i)).append(" ").append(entities.get(i + 1)).append(". 
"); + } + return getSimilarity() + " <-> " + ret; + } + + public String toSubGraphString() { + StringBuilder ret = new StringBuilder(); + ArrayList<String> variables = new ArrayList<>(); + variables.add("?answer0"); + for (int i = 1; i <= properties.size() - 1; i++) { + variables.add("?v" + i); + } + variables.add("?answer1"); + + for (int i = 0; i < properties.size(); i++) { + String xStr = variables.get(i); + String yStr = variables.get(i + 1); + + if (types.get(i) != null) { + ret.append(xStr).append(" a ").append(types.get(i)).append(". "); + } + if (inverse.get(i)) { + ret.append(yStr).append(" ").append(properties.get(i)).append(" ").append(xStr).append(". "); + } else { + ret.append(xStr).append(" ").append(properties.get(i)).append(" ").append(yStr).append(". "); + } + } + if (types.get(properties.size()) != null) { + ret.append(variables.get(properties.size())).append(" a ").append(types.get(properties.size())).append(". "); + } + return ret.toString(); + } + + public double getSimilarity() { + return similarity + typeSimilarity; + } + + public void getMostSimilarTypes(String endpointUrl, HashSet<String> targetLabels, double threshold) { + for (Resource r : entities) { + if (r instanceof IRI) { + IRI type = ((IRI) r).findMostSimilarType(endpointUrl, targetLabels, threshold); + types.add(type); + } else { + types.add(null); + } + } + } + + public ArrayList<IRI> getProperties() { + return properties; + } + + public ArrayList<IRI> getTypes() { + return types; + } + + public ArrayList<Boolean> getInverse() { + return inverse; + } +} diff --git a/src/main/java/irit/complex/subgraphs/PathSubgraph.java b/src/main/java/irit/complex/subgraphs/PathSubgraph.java new file mode 100644 index 0000000000000000000000000000000000000000..b994a2977d8a4d0ee2ab49cc138d8bfa3b7f00b9 --- /dev/null +++ b/src/main/java/irit/complex/subgraphs/PathSubgraph.java @@ -0,0 +1,50 @@ +package irit.complex.subgraphs; + +import java.util.ArrayList; + +public class PathSubgraph extends SubgraphForOutput { + final ArrayList<Path> paths; + + public PathSubgraph(Path p) { + paths= new ArrayList<>(); + paths.add(p); + similarity = p.getSimilarity(); + } + + public double getAverageSimilarity() { + return similarity; + } + + public boolean addSubgraph(Path p) { + boolean added = false; + if(p.toSubGraphString().equals(paths.get(0).toSubGraphString())) { + addSimilarity(p); + paths.add(p); + added = true; + } + return added; + } + + public void addSimilarity(Path p){ + similarity = ((similarity*paths.size())+p.getSimilarity())/(paths.size()+1); + } + + public String toExtensionString() { + return paths.get(0).toSubGraphString(); + } + + public String toSPARQLForm() { + return "SELECT distinct ?answer0 ?answer1 WHERE {\n"+ + paths.get(0).toSubGraphString() +"}"; + } + + public String toIntensionString() { + return paths.get(0).toSubGraphString(); + } + + public Path getMainPath() { + return paths.get(0); + } + + +} diff --git a/src/main/java/irit/complex/subgraphs/SubgraphForOutput.java b/src/main/java/irit/complex/subgraphs/SubgraphForOutput.java new file mode 100644 index 0000000000000000000000000000000000000000..684805acd27b2a518904381844d5ac159926650b --- /dev/null +++ b/src/main/java/irit/complex/subgraphs/SubgraphForOutput.java @@ -0,0 +1,138 @@ +package irit.complex.subgraphs; + +import irit.resource.IRI; +import irit.resource.Resource; +import irit.sparql.SparqlProxy; +import irit.sparql.query.select.SparqlSelect; +import org.apache.jena.rdf.model.RDFNode; + +import java.util.ArrayList; +import java.util.Iterator; 
+import java.util.List; +import java.util.Map; + +public class SubgraphForOutput implements Comparable<SubgraphForOutput>{ + double similarity; + + public String toExtensionString() {return "";} + + public String toIntensionString() {return "";} + + public String toString() { + return getAverageSimilarity()+" <-> "+ toIntensionString(); + } + public double getSimilarity() { + return similarity; + } + public double getAverageSimilarity() {return similarity;} + + // Reassess the similarity of this subgraph with counter-examples: retrieve the instances of the subgraph from the target dataset, map them back to the source dataset through existing equivalence links, and check with ASK queries whether they also satisfy the source CQA; the similarity is then weighted by the proportion of confirmed examples. + public void reassessSimilarityWithCounterExamples(String sourceEndpoint, String targetEndpoint, SparqlSelect sq) { + double nbTrueExamples =0; + double nbCounterExamples = 0; + + List<Map<String, RDFNode>> resultS = SparqlProxy.query(sourceEndpoint, sq.toString()); + // the counter-example budget is proportional to the number of source answers + double nbRetSource = resultS.size(); + int offset = 0; + int limit = 10000; + boolean end = false; + while (!end) { + String newQuery = toSPARQLForm(); + newQuery += "\n LIMIT "+limit; + newQuery += "\n OFFSET "+offset; + List<Map<String, RDFNode>> resultT = SparqlProxy.query(targetEndpoint, newQuery); + Iterator<Map<String, RDFNode>> retIterator = resultT.iterator(); + + while (retIterator.hasNext() && nbCounterExamples <= 10 * nbRetSource) { + Map<String, RDFNode> response = retIterator.next(); + if(response.containsKey("answer")) { + IRI iriResponse = new IRI("<"+response.get("answer").toString()+">"); + iriResponse.findExistingMatches(targetEndpoint, sourceEndpoint); + for(IRI sourceRes: iriResponse.getSimilarIRIs()) { + if(SparqlProxy.sendAskQuery(sourceEndpoint, "ASK{"+sq.toSubgraphForm().replaceAll("\\?answer", sourceRes.toString())+"}")) { + nbTrueExamples+=1; + } + else { + nbCounterExamples+=1; + } + } + + } + if(response.containsKey("answer1")) { + Resource r1 = new Resource(response.get("answer0").toString()); + Resource r2 = new Resource(response.get("answer1").toString()); + ArrayList<Resource> valuesr1Source = new ArrayList<>(); + ArrayList<Resource> valuesr2Source = new ArrayList<>(); + + if(r1.isIRI()){ + r1 = new IRI("<"+ r1 +">"); + ((IRI)r1).findExistingMatches(targetEndpoint, sourceEndpoint); + valuesr1Source.addAll(r1.getSimilarIRIs()); + } + else { + valuesr1Source.add(r1); + } + if(r2.isIRI()){ + r2 = new IRI("<"+ r2 +">"); + ((IRI)r2).findExistingMatches(targetEndpoint, sourceEndpoint); + valuesr2Source.addAll(r2.getSimilarIRIs()); + } + else { + valuesr2Source.add(r2); + } + for(Resource sourceRes1: valuesr1Source) { + for(Resource sourceRes2: valuesr2Source) { + String query = sq.toSubgraphForm(); + if(sourceRes1.isIRI()) { + query=query.replaceAll("\\?answer0", sourceRes1.toString()); + }else { + query+=" Filter(str(?answer0)="+sourceRes1.toValueString()+")"; + } + if(sourceRes2.isIRI()) { + query=query.replaceAll("\\?answer1", sourceRes2.toString()); + }else { + query+=" Filter(str(?answer1)="+sourceRes2.toValueString()+")"; + } + query="ASK{"+query+"}"; + if(SparqlProxy.sendAskQuery(sourceEndpoint, query)) { + nbTrueExamples+=1; + } + else { + nbCounterExamples+=1; + } + } + } + } + }// get target answers + if(resultT.size() < limit) { + end = true; + } + else { + offset+=limit; + } + if(nbCounterExamples >= 10*nbRetSource) { + end = true; + } + if (offset > 600000) { + end = true; + } + } + + if(nbTrueExamples+nbCounterExamples == 0) { + similarity = 0; + }else { + double percentageCommonOK = nbTrueExamples/(nbTrueExamples+nbCounterExamples); + similarity *= percentageCommonOK; + } + + } + + public String toSPARQLForm() { + return ""; + } + + @Override + public int compareTo(SubgraphForOutput o) { + return Double.compare(getSimilarity(), 
o.getSimilarity()); + } + +} diff --git a/src/main/java/irit/complex/subgraphs/Triple.java b/src/main/java/irit/complex/subgraphs/Triple.java new file mode 100755 index 0000000000000000000000000000000000000000..8ea82e85ba65d24f05055f8269d6ca346f078f42 --- /dev/null +++ b/src/main/java/irit/complex/subgraphs/Triple.java @@ -0,0 +1,267 @@ +package irit.complex.subgraphs; + +import irit.resource.IRI; +import irit.resource.Resource; +import irit.similarity.EmbeddingManager; +import irit.sparql.query.exception.SparqlEndpointUnreachableException; +import irit.sparql.query.exception.SparqlQueryMalFormedException; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.ops.transforms.Transforms; + +import java.util.HashSet; +import java.util.Set; + +public class Triple extends InstantiatedSubgraph { + private final IRI subject; + private final IRI predicate; + private Resource object; + private TripleType tripleType; + private boolean visited; + public boolean keepObjectType; + public boolean keepSubjectType; + private IRI subjectType; + private IRI objectType; + private double objectSimilarity; + private double subjectSimilarity; + private double predicateSimilarity; + + public Triple() { + subject = new IRI(""); + object = new Resource(""); + predicate = new IRI(""); + } + + public Triple(String sub, String pred, String obj, TripleType type) { + subject = new IRI(sub); + predicate = new IRI(pred); + Resource r = new Resource(obj); + if (r.isIRI()) { + object = new IRI("<" + obj.replaceAll("[<>]", "") + ">"); + } else { + object = r; + } + tripleType = type; + visited = false; + keepObjectType = false; + keepSubjectType = false; + objectSimilarity = 0; + subjectSimilarity = 0; + predicateSimilarity = 0; + } + + public void retrieveIRILabels(String targetEndpoint) throws SparqlQueryMalFormedException, SparqlEndpointUnreachableException { + if (tripleType != TripleType.SUBJECT) { + subject.retrieveLabels(targetEndpoint); + } + if (tripleType != TripleType.PREDICATE) { + predicate.retrieveLabels(targetEndpoint); + } + if (tripleType != TripleType.OBJECT && object instanceof IRI) { + ((IRI) object).retrieveLabels(targetEndpoint); + } + } + + public void retrieveTypes(String targetEndpoint) throws SparqlQueryMalFormedException, SparqlEndpointUnreachableException { + if (tripleType != TripleType.SUBJECT) { + subject.retrieveTypes(targetEndpoint); + } + if (tripleType != TripleType.PREDICATE) { + predicate.retrieveTypes(targetEndpoint); + } + if (tripleType != TripleType.OBJECT && object instanceof IRI) { + ((IRI) object).retrieveTypes(targetEndpoint); + } + } + + + public double compareSim(INDArray label, double threshold){ + if (tripleType != TripleType.SUBJECT) { + subjectSimilarity = Transforms.cosineSim(EmbeddingManager.get(subject.toString().replaceAll("[<>]", "")), label); + subjectSimilarity = subjectSimilarity >= threshold ? subjectSimilarity : 0; + } + if (tripleType != TripleType.PREDICATE && !predicate.toString().equals("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>")) { + predicateSimilarity = Transforms.cosineSim(EmbeddingManager.get(predicate.toString().replaceAll("[<>]", "")), label); + predicateSimilarity = predicateSimilarity >= threshold ? predicateSimilarity : 0; + } + if (tripleType != TripleType.OBJECT) { + objectSimilarity = Transforms.cosineSim(EmbeddingManager.get(object.toString().replaceAll("[<>]", "")), label); + objectSimilarity = objectSimilarity >= threshold ? 
objectSimilarity : 0; + } + + return subjectSimilarity + predicateSimilarity + objectSimilarity; + } + public double similarity(Set<String> labels1, HashSet<String> labels2, double threshold){ + double score = 0; + for(String l1 : labels1){ + for(String l2: labels2){ + double sim = EmbeddingManager.getSim(l1, l2); + sim = sim < threshold ? 0 : sim; + score += sim; + } + } + return score; + } + + public double compareLabel(HashSet<String> targetLabels, double threshold, String targetEndpoint) { + if (tripleType != TripleType.SUBJECT) { + subjectType = subject.findMostSimilarType(targetEndpoint, targetLabels, threshold); + double scoreTypeSubMax = 0; + if (subjectType != null) { + scoreTypeSubMax = similarity(subjectType.getLabels(), targetLabels, threshold); + } + subjectSimilarity = similarity(subject.getLabels(), targetLabels, threshold); + if (scoreTypeSubMax > subjectSimilarity) { + keepSubjectType = true; + subjectSimilarity = scoreTypeSubMax; + } + } + if (tripleType != TripleType.PREDICATE && !predicate.toString().equals("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>")) { + predicateSimilarity = similarity(predicate.getLabels(), targetLabels, threshold); + } + if (tripleType != TripleType.OBJECT && object instanceof IRI) { + objectType = ((IRI) object).findMostSimilarType(targetEndpoint, targetLabels, threshold); + if (objectType != null) { + double scoreTypeObMax = similarity(objectType.getLabels(), targetLabels, threshold); + objectSimilarity = similarity(((IRI) object).getLabels(), targetLabels, threshold); + if (scoreTypeObMax > objectSimilarity) { + keepObjectType = true; + objectSimilarity = scoreTypeObMax; + } + } + + } else if (tripleType != TripleType.OBJECT) { + HashSet<String> hashObj = new HashSet<>(); + hashObj.add(object.toString()); + objectSimilarity = similarity(hashObj, targetLabels, threshold); + } + + + return subjectSimilarity + predicateSimilarity + objectSimilarity; + } + + public IRI getSubject() { + return subject; + } + + public IRI getPredicate() { + return predicate; + } + + public Resource getObject() { + return object; + } + public boolean isSubjectTriple() { + return tripleType == TripleType.SUBJECT; + } + + public boolean isPredicateTriple() { + return tripleType == TripleType.PREDICATE; + } + + public boolean isObjectTriple() { + return tripleType == TripleType.OBJECT; + } + + public String toString() { + String subjStr = subject.toValueString(); + String predStr = predicate.toValueString(); + String objStr = object.toValueString(); + + if (isSubjectTriple()) { + subjStr = "?answer"; + } else if (isPredicateTriple()) { + predStr = "?answer"; + } else if (isObjectTriple()) { + objStr = "?answer"; + } + + String result = subjStr + " " + predStr + " " + objStr + ". "; + if (keepSubjectType && !keepObjectType) { + result = "?x " + predStr + " " + objStr + ". " + + "?x a " + subjectType + ". "; + } else if (keepObjectType && !keepSubjectType) { + result = subjStr + " " + predStr + " ?y. " + + "?y a " + objectType + ". "; + } else if (keepObjectType && keepSubjectType) { + result = "?x " + predStr + " ?y. " + + "?y a " + objectType + ". " + + "?x a " + subjectType + ". 
"; + } + return result; + } + + + public TripleType commonPartValue(Triple t) { + TripleType res = TripleType.NONE; + if (getType() == t.getType()) { + if (getPredicate().equals(t.getPredicate()) && !isPredicateTriple()) { + res = TripleType.PREDICATE; + } + if (getObject().equals(t.getObject()) && !isObjectTriple() && !keepObjectType) { + res = TripleType.OBJECT; + } + if (getSubject().equals(t.getSubject()) && !isSubjectTriple() && !keepSubjectType) { + res = TripleType.SUBJECT; + } + } + return res; + } + + public boolean hasCommonPart(Triple t) { + boolean res = false; + if (getType() == t.getType()) { + if (!isSubjectTriple()) { + res = res || getSubject().equals(t.getSubject()); + } + if (!isPredicateTriple()) { + res = res || getPredicate().equals(t.getPredicate()); + } + if (!isObjectTriple()) { + res = res || getObject().equals(t.getObject()); + } + } + return res; + } + + + public TripleType getType() { + return tripleType; + } + + public int hashCode() { + return (subject.toString() + predicate.toString() + object.toString()).hashCode(); + } + + public boolean equals(Object obj) { + if (obj instanceof Triple) { + return (subject.toString() + predicate.toString() + object.toString()) + .equals(((Triple) obj).subject.toString() + ((Triple) obj).predicate.toString() + ((Triple) obj).object.toString()); + } else { + return false; + } + + } + + public boolean isNullTriple() { + return subject.toString().equals("") && predicate.toString().equals("") && object.toString().equals(""); + } + + + public double getSimilarity() { + return subjectSimilarity + predicateSimilarity + objectSimilarity; + } + + + public TripleType getPartGivingMaxSimilarity() { + TripleType res = TripleType.NONE; + if (subjectSimilarity > objectSimilarity && subjectSimilarity > predicateSimilarity) { + res = TripleType.SUBJECT; + } else if (objectSimilarity > subjectSimilarity && objectSimilarity > predicateSimilarity) { + res = TripleType.OBJECT; + } else if (predicateSimilarity > subjectSimilarity && predicateSimilarity > objectSimilarity) { + res = TripleType.PREDICATE; + } + return res; + + } +} diff --git a/src/main/java/irit/complex/subgraphs/TripleSubgraph.java b/src/main/java/irit/complex/subgraphs/TripleSubgraph.java new file mode 100755 index 0000000000000000000000000000000000000000..e25123eabd8cec39ded4c52c6b6b8f1f1f79965c --- /dev/null +++ b/src/main/java/irit/complex/subgraphs/TripleSubgraph.java @@ -0,0 +1,216 @@ +package irit.complex.subgraphs; + +import java.util.ArrayList; +import java.util.HashSet; + +public class TripleSubgraph extends SubgraphForOutput { + + final ArrayList <Triple> triples; + TripleType commonPart; + double maxSimilarity; + boolean formsCalculated; + String intension; + String extension; + final TripleType partWithMaxSim; + + + public TripleSubgraph(Triple t){ + triples = new ArrayList<>(); + triples.add(t); + commonPart = TripleType.NONE; + maxSimilarity = t.getSimilarity(); + similarity = t.getSimilarity(); + formsCalculated = false; + partWithMaxSim=t.getPartGivingMaxSimilarity(); + } + + public boolean addSubgraph(Triple t){ + boolean added=false; + if(triples.get(0).toString().equals(t.toString())) { + triples.add(t); + added=true; + } + else if (triples.get(0).hasCommonPart(t) && commonPart == TripleType.NONE){ + if(!triples.get(0).getPredicate().toString().equals("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>")|| triples.get(0).commonPartValue(t) !=TripleType.PREDICATE) { + addSimilarity(t); + triples.add(t); + commonPart = triples.get(0).commonPartValue(t); + 
added=true; + } + + } + else if (triples.get(0).hasCommonPart(t) && triples.get(0).commonPartValue(t) == commonPart ){ + if(!triples.get(0).getPredicate().toString().equals("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>")|| commonPart !=TripleType.PREDICATE) { + addSimilarity(t); + triples.add(t); + added=true; + } + } + return added; + } + + public void addSimilarity(Triple t){ + maxSimilarity = Math.max(maxSimilarity, t.getSimilarity()); + similarity = ((similarity*triples.size())+t.getSimilarity())/(triples.size()+1); + } + + public void calculateIntensionString(){ + String res = triples.get(0).toString(); + Triple t = triples.get(0); + HashSet<String> concatSub = new HashSet<>(); + HashSet<String> concatPred = new HashSet<>(); + HashSet<String> concatObj = new HashSet<>(); + for (Triple t1 : triples){ + concatSub.add(t1.getSubject().toString()); + concatPred.add(t1.getPredicate().toString()); + concatObj.add(t1.getObject().toString()); + } + if (t.isSubjectTriple() && !t.getPredicate().toString().equals("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>")){ + if (commonPart == TripleType.PREDICATE && concatObj.size() > 1){ + res = res.replaceFirst(stringToRegex(t.getObject().toValueString()), "?someObject"); + } + else if (commonPart == TripleType.OBJECT && concatPred.size() > 1){ + res = res.replaceFirst(t.getPredicate().toValueString(), "?somePredicate"); + } + else if (commonPart==TripleType.NONE && predicateHasMaxSim()) { + res = res.replaceFirst(stringToRegex(t.getObject().toValueString()), "?someObject"); + } + } + else if (t.isPredicateTriple()){ + if (commonPart == TripleType.SUBJECT && concatObj.size() > 1){ + res = res.replaceFirst(stringToRegex(t.getObject().toValueString()), "?someObject"); + } + else if (commonPart == TripleType.OBJECT && concatSub.size() > 1){ + res = res.replaceFirst(t.getSubject().toValueString(), "?someSubject"); + } + } + else if (t.isObjectTriple() && !t.getPredicate().toString().equals("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>")){ + if (commonPart == TripleType.SUBJECT && concatPred.size() > 1){ + res = res.replaceFirst(t.getPredicate().toValueString(), "?somePredicate"); + } + else if (commonPart == TripleType.PREDICATE && concatSub.size() > 1){ + res = res.replaceFirst(t.getSubject().toValueString(), "?someSubject"); + } + else if (commonPart==TripleType.NONE && predicateHasMaxSim()) { + res = res.replaceFirst(t.getSubject().toValueString(), "?someSubject"); + } + } + + intension = res; + } + + public void calculateExtensionString(){ + String res = intension; + HashSet<String> concatSub = new HashSet<>(); + HashSet<String> concatPred = new HashSet<>(); + HashSet<String> concatObj = new HashSet<>(); + for (Triple t : triples){ + concatSub.add(t.getSubject().toString()); + concatPred.add(t.getPredicate().toString()); + concatObj.add(t.getObject().toString()); + } + + + res = res.replaceAll("\\?someSubject", concatSub.toString()); + + res = res.replaceAll("\\?somePredicate", concatPred.toString()); + + res = res.replaceAll("\\?someObject", concatObj.toString()); + + res = res.replaceAll("\\[", "\\{").replaceAll("]", "\\}"); + extension = res; + + } + + public String toIntensionString(){ + if(!formsCalculated){ + calculateIntensionString(); + calculateExtensionString(); + formsCalculated = true; + } + return intension; + } + + public String toExtensionString(){ + if(!formsCalculated){ + calculateIntensionString(); + calculateExtensionString(); + formsCalculated = true; + } + return extension; + } + + public String toSPARQLForm() { + 
String res ="SELECT DISTINCT ?answer WHERE {"; + if (toIntensionString().contains("somePredicate")){ + res+= toSPARQLExtension(); + } + // If common part is the predicate + else if (commonPart == TripleType.PREDICATE || commonPart == TripleType.NONE){ + // and if the predicate similarity is higher than the object/subject similarity --> Intension + if(predicateHasMaxSim()) { + res+= intension; + } + // else --> extension + else { + res+= toSPARQLExtension(); + } + + } + else { + res+= toSPARQLExtension(); + } + + res +="}"; + return res; + } + + public String toSPARQLExtension() { + HashSet<String> concatTriple = new HashSet<>(); + for (Triple t1 : triples){ + concatTriple.add(t1.toString()); + } + ArrayList<String> unionMembers = new ArrayList<>(concatTriple); + StringBuilder res = new StringBuilder(); + + if(toIntensionString().equals(extension)) { + res = new StringBuilder(extension); + } + else if (unionMembers.size() > 1){ + res.append("{").append(unionMembers.get(0)).append("}\n"); + for(int i =1 ; i < unionMembers.size();i++) { + res.append("UNION {").append(unionMembers.get(i)).append("}\n"); + } + } + return res.toString(); + } + + public ArrayList<Triple> getTriples(){ + return triples; + } + + public double getAverageSimilarity(){ + return similarity; + } + + public boolean predicateHasMaxSim() { + return partWithMaxSim == TripleType.PREDICATE; + } + + private String stringToRegex(String s) { + s= s.replaceAll("\\{", "\\\\{"); + s = s.replaceAll("}", "\\\\}"); + s=s.replaceAll("\\[", "\\\\["); + s=s.replaceAll("]", "\\\\]"); + s=s.replaceAll("\\.", "\\\\."); + s=s.replaceAll("\\?", "\\\\?"); + s=s.replaceAll("\\+", "\\\\+"); + s=s.replaceAll("\\*", "\\\\*"); + s=s.replaceAll("\\|", "\\\\|"); + s=s.replaceAll("\\^", "\\\\^"); + s=s.replaceAll("\\$", "\\\\$"); + return s; + } + + +} \ No newline at end of file diff --git a/src/main/java/irit/complex/subgraphs/TripleType.java b/src/main/java/irit/complex/subgraphs/TripleType.java new file mode 100644 index 0000000000000000000000000000000000000000..215929e9db6610c0438e51df5d16973b68d527ab --- /dev/null +++ b/src/main/java/irit/complex/subgraphs/TripleType.java @@ -0,0 +1,5 @@ +package irit.complex.subgraphs; + +public enum TripleType { + NONE, SUBJECT, PREDICATE, OBJECT +} diff --git a/src/main/java/irit/complex/utils/CQAGenerator.java b/src/main/java/irit/complex/utils/CQAGenerator.java new file mode 100755 index 0000000000000000000000000000000000000000..0da484e388b613056c59ab0a47c8c29fadac1d27 --- /dev/null +++ b/src/main/java/irit/complex/utils/CQAGenerator.java @@ -0,0 +1,174 @@ +package irit.complex.utils; + +import irit.sparql.SparqlProxy; +import org.apache.jena.rdf.model.RDFNode; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.List; +import java.util.Map; + +public class CQAGenerator { + + private final String endpoint; + private final String CQAFolder; + private int count; + private final double ratio; + private final int maxCAV; + + public CQAGenerator(String endpoint, String CQAFolder){ + this.endpoint = endpoint; + this.CQAFolder = CQAFolder; + count=0; + ratio = 30; + maxCAV = 20; + } + + public void createCQAs() throws IOException { + createClasses(); + createCAV(); + createProperties(); + } + + public void cleanCQARepository() { + Path cqaPath = Paths.get(CQAFolder); + try { + //If the folder does not exist, create it + if (Files.notExists(cqaPath)){ + Files.createDirectory(cqaPath); + } + //Else 
empty the folder + else{ + File dir = new File(CQAFolder); + for(File file: dir.listFiles()){ + file.delete(); + } + } + } catch (IOException e) { + e.printStackTrace(); + } + } + + public void createClasses() throws IOException { + String query = """ + PREFIX owl: <http://www.w3.org/2002/07/owl#> \s + SELECT distinct ?x WHERE{ \s + ?x a owl:Class.\s + ?y a ?x. filter(isIRI(?x))}"""; + + List<Map<String, RDFNode>> result = SparqlProxy.query(endpoint, query); + for (Map<String, RDFNode> jsonNode : result) { + String owlClass = jsonNode.get("x").toString(); + if (interestingIRI(owlClass)) { + //System.out.println(owlClass); + //Create new file in designated folder with the new CQA + PrintWriter writer = new PrintWriter(CQAFolder + "/CQA" + count + ".sparql", StandardCharsets.UTF_8); + String CQA = "SELECT DISTINCT ?x WHERE { \n" + + "?x a <" + owlClass + ">.} "; + writer.append(CQA); + writer.flush(); + writer.close(); + count++; + } + } + } + + public void createProperties() throws IOException { + String query = """ + PREFIX owl: <http://www.w3.org/2002/07/owl#> \s + SELECT distinct ?x WHERE{ \s + ?y ?x ?z. {?x a owl:ObjectProperty.} + union{ + ?x a owl:DatatypeProperty.} + }"""; + List<Map<String, RDFNode>> result = SparqlProxy.query(endpoint, query); + for (Map<String, RDFNode> jsonNode : result) { + String owlProp = jsonNode.get("x").toString(); + if (interestingIRI(owlProp)) { + PrintWriter writer = new PrintWriter(CQAFolder + "/CQA" + count + ".sparql", StandardCharsets.UTF_8); + String CQA = "SELECT DISTINCT ?x ?y WHERE { \n" + + "?x <" + owlProp + "> ?y.} "; + writer.append(CQA); + writer.flush(); + writer.close(); + count++; + } + } + } + + public void createCAV() throws IOException { + //Get all "interesting" properties + String query = """ + PREFIX owl: <http://www.w3.org/2002/07/owl#> \s + SELECT distinct ?x WHERE { \s + ?x a owl:ObjectProperty.}"""; + List<Map<String, RDFNode>> result = SparqlProxy.query(endpoint, query); + for (Map<String, RDFNode> node : result) { + String property = node.get("x").toString(); + if (interestingIRI(property)) { + String queryNb = "SELECT (count(distinct ?x) as ?sub) (count(distinct ?y) as ?ob) where {\n" + + "?x <" + property + "> ?y.}"; + List<Map<String, RDFNode>> retNb = SparqlProxy.query(endpoint, queryNb); + for (Map<String, RDFNode> nodeNb : retNb) { + int nbSub = Integer.parseInt(nodeNb.get("sub").toString()); + int nbOb = Integer.parseInt(nodeNb.get("ob").toString()); + + if (nbSub != 0 && nbOb != 0) { + // If n_subj >> n_obj and n_obj < maxThreshold + if ((double) nbSub / (double) nbOb > ratio && nbOb < maxCAV) { + // get all the objects + String queryOb = "SELECT distinct ?y where {\n" + + "?x <" + property + "> ?y.}"; + List<Map<String, RDFNode>> retOb = SparqlProxy.query(endpoint, queryOb); + // create n_obj CAV: ?x P oi + for (Map<String, RDFNode> jsonNode : retOb) { + String object = jsonNode.get("y").toString(); + PrintWriter writer = new PrintWriter(CQAFolder + "/CQA" + count + ".sparql", StandardCharsets.UTF_8); + String CQA = "SELECT DISTINCT ?x WHERE {\n" + + "?x <" + property + "> <" + object + ">.} "; + writer.append(CQA); + writer.flush(); + writer.close(); + count++; + } + + } + + // ELIF n_obj >> n_subj and n_subj < threshold + else if ((double) nbSub / (double) nbOb > ratio && nbOb < maxCAV) { + String querySub = "SELECT distinct ?x where {\n" + + "?x <" + property + "> ?y.}"; + List<Map<String, RDFNode>> retSub = SparqlProxy.query(endpoint, querySub); + for (Map<String, RDFNode> jsonNode : retSub) { + String subject = 
jsonNode.get("x").toString(); + PrintWriter writer = new PrintWriter(CQAFolder + "/CQA" + count + ".sparql", StandardCharsets.UTF_8); + String CQA = "SELECT DISTINCT ?x WHERE {\n" + + "<" + subject + "> <" + property + "> ?x.} "; + writer.append(CQA); + writer.flush(); + writer.close(); + count++; + } + } + } + } + } + } + + } + + public boolean interestingIRI(String iri){ + return !(iri.contains("http://www.w3.org/2000/01/rdf-schema#") || + iri.contains("http://www.w3.org/1999/02/22-rdf-syntax-ns#") || + iri.contains("http://www.w3.org/2001/XMLSchema#") || + iri.contains("http://www.w3.org/2004/02/skos/core#") || + iri.contains("http://www.w3.org/2008/05/skos-xl#") || + iri.contains("http://www.w3.org/2002/07/owl#") || + iri.contains("http://xmlns.com/foaf/") || + iri.contains("http://purl.org/dc/terms/") || + iri.contains("http://purl.org/dc/elements/1.1/")); + } +} diff --git a/src/main/java/irit/complex/utils/SPARQLNode.java b/src/main/java/irit/complex/utils/SPARQLNode.java new file mode 100644 index 0000000000000000000000000000000000000000..2a81d06f9a925f668c546d8b5f0e1f30c26a777c --- /dev/null +++ b/src/main/java/irit/complex/utils/SPARQLNode.java @@ -0,0 +1,68 @@ +package irit.complex.utils; + +import java.util.HashMap; + +public class SPARQLNode { + + final String name; + final HashMap<String,String> triples; + final HashMap<String,SPARQLNode> neighbors; + boolean explored; + SPARQLNode predecessor ; + + public SPARQLNode(String n) { + name = n; + triples = new HashMap<>(); + neighbors = new HashMap<>(); + explored = false; + } + + public void addNeighbour(SPARQLNode neighbor, String triple) { + if(neighbors.containsKey(neighbor.getName())) { + //triples.put(neighbor.getName(), triples.get(neighbor.getName())+ " "+triple); + System.out.println("more than one prop: "+triples.get(neighbor.getName())+ " "+triple ); + } + else { + neighbors.put(neighbor.getName(), neighbor); + triples.put(neighbor.getName(), triple); + } + } + + public HashMap<String,SPARQLNode> getNeighbors(){ + return neighbors; + } + + public boolean hasNeighbor(String n) { + return neighbors.containsKey(n); + } + + public void setPredecessor(SPARQLNode pred) { + predecessor = pred; + } + + public SPARQLNode getPredecessor() { + return predecessor; + } + + public boolean isExplored() { + return explored; + } + + public void explore() { + explored=true; + } + + public String getName() { + return name; + } + + public String getTriple(String n) { + return triples.get(n); + } + + public String toString() { + return name+" : "+ neighbors.keySet(); + } + + +} diff --git a/src/main/java/irit/dataset/DatasetManager.java b/src/main/java/irit/dataset/DatasetManager.java new file mode 100644 index 0000000000000000000000000000000000000000..b3510220b8d2136d7a31032f949467c00ea9b66c --- /dev/null +++ b/src/main/java/irit/dataset/DatasetManager.java @@ -0,0 +1,47 @@ +package irit.dataset; + +import irit.labelmap.LabelMap; +import org.apache.jena.query.Dataset; +import org.apache.jena.query.DatasetFactory; +import org.apache.jena.rdf.model.Model; +import org.apache.jena.rdf.model.ModelFactory; + +import java.util.HashMap; +import java.util.Map; + +public class DatasetManager { + + private final Map<String, Dataset> datasetMap; + public final Map<String, LabelMap> labelMaps; + private DatasetManager(){ + datasetMap = new HashMap<>(); + labelMaps = new HashMap<>(); + } + + private static DatasetManager instance; + + public static DatasetManager getInstance(){ + if(instance == null) instance = new DatasetManager(); + return instance; 
+ } + + public void load(String name, String path) { + Model m = ModelFactory.createDefaultModel(); + m.read(path); + Dataset dataset = DatasetFactory.create(m); + datasetMap.put(name, dataset); + labelMaps.put(name, new LabelMap(path)); + } + + public void close(){ + for (Dataset value : datasetMap.values()) { + value.close(); + } + } + + public Dataset get(String name){ + return datasetMap.get(name); + } + + +} diff --git a/src/main/java/irit/labelmap/LabelMap.java b/src/main/java/irit/labelmap/LabelMap.java new file mode 100644 index 0000000000000000000000000000000000000000..952f0001d884cbaf5aa8911f3b5f97c3042ba204 --- /dev/null +++ b/src/main/java/irit/labelmap/LabelMap.java @@ -0,0 +1,171 @@ +package irit.labelmap; + +import org.apache.jena.rdf.model.*; +import org.apache.jena.riot.RDFDataMgr; + +import java.util.*; +import java.util.stream.Collectors; + +public class LabelMap { + + private final Map<String, Map<String, Set<String>>> spm = new HashMap<>(); + private final Map<String, Map<String, Set<String>>> spmi = new HashMap<>(); + private final Map<String, Map<String, Set<String>>> pom = new HashMap<>(); + private final Map<String, Map<String, Set<String>>> som = new HashMap<>(); + private final Map<String, String> typeMap = new HashMap<>(); + + + public LabelMap(String path) { + load(path); + } + + public void load(String path) { + Model defaultModel = RDFDataMgr.loadModel(path); + + + StmtIterator stmtIterator = defaultModel.listStatements(); + + while (stmtIterator.hasNext()) { + Statement statement = stmtIterator.nextStatement(); + String s = statement.getSubject().toString(); + String p = statement.getPredicate().toString(); + String o = statement.getObject().toString(); + + typeMap.put(s, getType(statement.getSubject())); + typeMap.put(p, getType(statement.getPredicate())); + typeMap.put(o, getType(statement.getObject())); + + + String si = s.toLowerCase(); + String pi = p.toLowerCase(); + String oi = o.toLowerCase(); + + + if (!spm.containsKey(o)) spm.put(o, new HashMap<>()); + if (!spm.get(o).containsKey(p)) spm.get(o).put(p, new HashSet<>()); + + spm.get(o).get(p).add(s); + + if (!pom.containsKey(s)) pom.put(s, new HashMap<>()); + if (!pom.get(s).containsKey(p)) pom.get(s).put(p, new HashSet<>()); + + pom.get(s).get(p).add(o); + + + if (!spmi.containsKey(oi)) spmi.put(oi, new HashMap<>()); + if (!spmi.get(oi).containsKey(pi)) spmi.get(oi).put(pi, new HashSet<>()); + + spmi.get(oi).get(pi).add(si); + + if (!som.containsKey(p)) som.put(p, new HashMap<>()); + if (!som.get(p).containsKey(s)) som.get(p).put(s, new HashSet<>()); + + som.get(p).get(s).add(o); + + } + + + } + + public Set<String> getSimilar(String v) { + + Set<String> result = new HashSet<>(); + + if (spmi.containsKey(v)) { + Map<String, Set<String>> stringSetMap = spmi.get(v); + + for (Set<String> value : stringSetMap.values()) { + result.addAll(value); + } + + + if (stringSetMap.containsKey("http://www.w3.org/2008/05/skos-xl#literalForm")) { + Set<String> strings = stringSetMap.get("http://www.w3.org/2008/05/skos-xl#literalForm"); + for (String string : strings) { + Map<String, Set<String>> stringSetMap1 = spmi.get(string); + if (stringSetMap1.containsKey("http://www.w3.org/2008/05/skos-xl#prefLabel")) { + Set<String> strings1 = stringSetMap1.get("http://www.w3.org/2008/05/skos-xl#prefLabel"); + result.addAll(strings1); + } + } + } + } + + return result; + } + + + public Set<String> getMatched(String v) { + + Set<String> result = new HashSet<>(); + + result.addAll(pom.getOrDefault(v,
Map.of()).getOrDefault("http://www.w3.org/2000/01/rdf-schema#seeAlso", Set.of())); + result.addAll(pom.getOrDefault(v, Map.of()).getOrDefault("http://www.w3.org/2002/07/owl#sameAs", Set.of())); + result.addAll(pom.getOrDefault(v, Map.of()).getOrDefault("http://www.w3.org/2004/02/skos/core#closeMatch", Set.of())); + result.addAll(pom.getOrDefault(v, Map.of()).getOrDefault("http://www.w3.org/2004/02/skos/core#exactMatch", Set.of())); + result.addAll(spm.getOrDefault(v, Map.of()).getOrDefault("http://www.w3.org/2000/01/rdf-schema#seeAlso", Set.of())); + result.addAll(spm.getOrDefault(v, Map.of()).getOrDefault("http://www.w3.org/2002/07/owl#sameAs", Set.of())); + result.addAll(spm.getOrDefault(v, Map.of()).getOrDefault("http://www.w3.org/2004/02/skos/core#closeMatch", Set.of())); + result.addAll(spm.getOrDefault(v, Map.of()).getOrDefault("http://www.w3.org/2004/02/skos/core#exactMatch", Set.of())); + + return result; + } + + + public boolean exists(String v) { + Set<String> excludedProperties = Set.of( + "http://www.w3.org/2002/07/owl#sameAs", + "http://www.w3.org/2004/02/skos/core#closeMatch", + "http://www.w3.org/2004/02/skos/core#exactMatch" + ); + int sc = 0; + + for (String s : pom.getOrDefault(v, Map.of()).keySet()) { + if (!excludedProperties.contains(s)) sc++; + } + + int oc = 0; + for (String s : spm.getOrDefault(v, Map.of()).keySet()) { + if (!excludedProperties.contains(s)) oc++; + } + return sc + oc > 0; + } + + + public Set<String> labels(String v) { + Set<String> result = new HashSet<>(); + for (Set<String> value : pom.getOrDefault(v, Map.of()).values()) { + result.addAll(value); + } + + Set<String> orDefault = pom.getOrDefault(v, Map.of()).getOrDefault("http://www.w3.org/2008/05/skos-xl#prefLabel", Set.of()); + for (String s : orDefault) { + Set<String> orDefault1 = pom.getOrDefault(s, Map.of()).getOrDefault("http://www.w3.org/2008/05/skos-xl#literalForm", Set.of()); + result.addAll(orDefault1); + } + + return result.stream().filter(s -> typeMap.get(s).equals("Literal")).collect(Collectors.toSet()); + } + + + public String getType(RDFNode r){ + if (r.isLiteral()) return "Literal"; + if (r.isAnon()) return "Anon"; + if (r.isURIResource()) return "URIResource"; + if (r.isResource()) return "Resource"; + return "None"; + } + + + public Set<String> types(String value) { +// SELECT DISTINCT ?type WHERE {" + +// value + " a ?type."
+// + "filter(isIRI(?type))} + + Set<String> orDefault = pom.getOrDefault(value, Map.of()).getOrDefault("http://www.w3.org/1999/02/22-rdf-syntax-ns#type", Set.of()); + + return orDefault.stream().filter(s -> typeMap.get(s).equals("URIResource") || typeMap.get(s).equals("Resource")).collect(Collectors.toSet()); + } + + +} diff --git a/src/main/java/irit/misc/Progress.java b/src/main/java/irit/misc/Progress.java new file mode 100644 index 0000000000000000000000000000000000000000..9869ffe0c8712a2c98d119a74e1c714e2828a648 --- /dev/null +++ b/src/main/java/irit/misc/Progress.java @@ -0,0 +1,102 @@ +package irit.misc; + +import java.time.Duration; +import java.util.concurrent.atomic.AtomicInteger; + +public class Progress { + + int width = 20; + AtomicInteger progress = new AtomicInteger(0); + int max; + long time; + long start; + + public Progress(int max) { + this.max = max; + start(); + } + + public Progress(int max, int width){ + this.max = max; + this.width = width; + start(); + } + + private void start(){ + start = System.nanoTime(); + tic(); + render(); + } + + private void render(){ + + + clear(); + renderPercent(); + renderBar(); + renderCount(); + renderTime(); + + if (progress.get() == max) System.out.println(); + } + + private void tic(){ + time = System.nanoTime(); + } + + private long toc(){ + return System.nanoTime() - time; + } + + private void renderPercent(){ + float p = progress.get() / (float)max * 100; + System.out.printf("%3.0f%% ", p); + System.out.print("|"); + } + + private void renderTime(){ + Duration duration = Duration.ofNanos(System.nanoTime() - start); + Duration rem = Duration.ofNanos(toc() * (max - progress.get())); + tic(); + if (progress.get() != max) { + System.out.printf("(%d:%d:%d / %d:%d:%d)", duration.toHoursPart(), duration.toMinutesPart(), duration.toSecondsPart(), rem.toHoursPart(), rem.toMinutesPart(), rem.toSecondsPart()); + + } else { + System.out.printf("(%d:%d:%d)", duration.toHoursPart(), duration.toMinutesPart(), duration.toSecondsPart()); + + } + } + + private void clear(){ + System.out.print("\r"); + } + + private void renderBar(){ + if (progress.get() != max){ + System.out.print("\u001B[34m"); + } else { + System.out.print("\u001B[32m"); + } + + + for (int i = 0; i < width; i++) { + if (i / (float)width < progress.get() / (float)max){ + System.out.print("█"); + } else { + System.out.print(" "); + } + } + System.out.print("\u001B[0m"); + System.out.print("| "); + } + + private void renderCount(){ + int ds = (int)Math.floor(Math.log10(max)) + 1; + System.out.printf(" %" + ds + "d/" + max + " ", progress.get()); + } + public void step(){ + progress.incrementAndGet(); + render(); + } + +} diff --git a/src/main/java/irit/output/EDOALOutput.java b/src/main/java/irit/output/EDOALOutput.java new file mode 100755 index 0000000000000000000000000000000000000000..8eb362f1498fad0498d3b48ef32f85f0b53272f0 --- /dev/null +++ b/src/main/java/irit/output/EDOALOutput.java @@ -0,0 +1,724 @@ +package irit.output; + +import fr.inrialpes.exmo.align.impl.edoal.*; +import fr.inrialpes.exmo.align.impl.renderer.RDFRendererVisitor; +import fr.inrialpes.exmo.align.parser.SyntaxElement.Constructor; +import fr.inrialpes.exmo.ontowrap.BasicOntology; +import irit.complex.subgraphs.Path; +import irit.complex.subgraphs.PathSubgraph; +import irit.complex.subgraphs.SubgraphForOutput; +import irit.complex.subgraphs.TripleSubgraph; +import irit.complex.utils.SPARQLNode; +import irit.output.Output; +import irit.sparql.query.select.SparqlSelect; +import 
org.semanticweb.owl.align.AlignmentException; + +import java.io.IOException; +import java.io.PrintWriter; +import java.net.URI; +import java.net.URISyntaxException; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +public class EDOALOutput extends Output { + private EDOALAlignment alignment; + private final String outputEDOALfile; + + public EDOALOutput(String source, String target, String file){ + super(source, target); + outputEDOALfile = file; + } + + /***EDOAL ALIGNMENTS**/ + public void init(){ + alignment = new EDOALAlignment(); + BasicOntology o1= new BasicOntology(); + try { + o1.setURI(new URI(sourceEndpoint)); + o1.setFormalism("owl"); + o1.setFormURI(new URI("http://www.w3.org/TR/owl-guide/")); + } catch (URISyntaxException e) { + e.printStackTrace(); + } + BasicOntology o2= new BasicOntology(); + try { + o2.setURI(new URI(targetEndpoint)); + o2.setFormalism("owl"); + o2.setFormURI(new URI("http://www.w3.org/TR/owl-guide/")); + } catch (URISyntaxException e) { + e.printStackTrace(); + } + + try { + alignment.init(o1,o2); + } catch (AlignmentException e) { + e.printStackTrace(); + } + } + + @Override + public void addToOutput(List<SubgraphForOutput> output, SparqlSelect sq){ + //Source sub-graph + Expression sourceExpr = subgraphFormToEDOALEntity(sq.toSubgraphForm(),sq.getSelectFocus()); + + //2-transform source and target query subgraphs into edoal entities + for (SubgraphForOutput s: output){ + if (s.getSimilarity() >= 0){ + //System.out.println(s.toIntensionString()); + Expression targetExpr = null; + double score = s.getSimilarity(); + if(score>1.0){ + score=1.0; + } + if(s instanceof TripleSubgraph) { + // If common part is the subject or object, always express the extension + if (s.toIntensionString().contains("somePredicate")){ + targetExpr= subgraphFormToEDOALEntity(s.toExtensionString(),sq.getSelectFocus()); + } + // If common part is the predicate + else if (s.toIntensionString().contains("someObject")||s.toIntensionString().contains("someSubject")){ + // and if the predicate similarity is higher than the object/subject similarity --> Intension + if(((TripleSubgraph)s).predicateHasMaxSim()) { + targetExpr= subgraphFormToEDOALEntity(s.toIntensionString(),sq.getSelectFocus()); + } + // else --> extension + else { + targetExpr= subgraphFormToEDOALEntity(s.toExtensionString(),sq.getSelectFocus()); + } + + } + else { + targetExpr= subgraphFormToEDOALEntity(s.toExtensionString(),sq.getSelectFocus()); + } + } + + // same for PathSubgraph + else if (s instanceof PathSubgraph) { + targetExpr= subgraphFormToEDOALProperty(s); + } + + + + try { + //System.out.println(sourceExpr+" "+targetExpr); + if(sourceExpr != null & targetExpr !=null) { + alignment.addAlignCell(sourceExpr, targetExpr,"Equivalence", score); + } + } catch (AlignmentException e) { + e.printStackTrace(); + } + } + } + + } + + public void end(){ + try { + Files.createDirectories(Paths.get(outputEDOALfile).getParent()); + PrintWriter writer = new PrintWriter(outputEDOALfile, StandardCharsets.UTF_8); + RDFRendererVisitor renderer = new RDFRendererVisitor(writer); + alignment.render(renderer); + writer.flush(); + writer.close(); + } catch (AlignmentException | IOException e) { + e.printStackTrace(); + } + } + + public Expression subgraphFormToEDOALEntity(String s, ArrayList<String> focus){ + Expression expr = null; + 
ArrayList<String> subgraphs = setOfUNIONSubgraphs(s); + ArrayList<String> minusSubgraphs = setOfMINUSSubgraphs(s); + //CLASS expressions + if (focus.size()==1){ + //System.out.println("it's a class"); + ClassExpression theclassExpr=null; + ClassExpression classExprM = null; + ArrayList<ClassExpression> classExpr = new ArrayList<>(); + for(String sub: subgraphs) { + classExpr.add(subgraphFormToEDOALClass(sub, "?answer")); + } + if(classExpr.size()==1) { + theclassExpr=classExpr.get(0); + } + else { + theclassExpr = new ClassConstruction(Constructor.OR,classExpr); + } + + ArrayList<ClassExpression> classExprMSet = new ArrayList<>(); + for(String mSub : minusSubgraphs) { + classExprMSet.add(subgraphFormToEDOALClass(mSub, "?answer")); + } + if(classExprMSet.size()==1) { + classExprM=classExprMSet.get(0); + } + else { + classExprM = new ClassConstruction(Constructor.OR,classExprMSet); + } + + if(classExprMSet.size()>0) { + ArrayList<ClassExpression> classExprMNotfinal = new ArrayList<>(); + classExprMNotfinal.add(classExprM); + ClassExpression minusExpr = new ClassConstruction(Constructor.NOT,classExprMNotfinal); + ArrayList<ClassExpression> classExprFinal = new ArrayList<>(); + classExprFinal.add(theclassExpr); + classExprFinal.add(minusExpr); + theclassExpr = new ClassConstruction(Constructor.AND, classExprFinal); + } + + expr=theclassExpr; + + } + //PROPERTY expression + else if (focus.size()==2){ + RelationExpression theRelExpr = null; + RelationExpression relM =null; + ArrayList<RelationExpression> relExprSet = new ArrayList<>(); + for(String sub: subgraphs) { + relExprSet.add(subgraphFormToEDOALProperty(sub, "?answer0", "?answer1")); + } + if(relExprSet.size()==1) { + theRelExpr=relExprSet.get(0); + } + else { + theRelExpr = new RelationConstruction(Constructor.OR,relExprSet); + } + ArrayList<RelationExpression> relExprMSet = new ArrayList<>(); + for(String mSub : minusSubgraphs) { + relExprMSet.add(subgraphFormToEDOALProperty(mSub, "?answer0", "?answer1")); + } + if(relExprMSet.size()==1) { + relM=relExprMSet.get(0); + } + else { + relM = new RelationConstruction(Constructor.OR,relExprMSet); + } + + if(relExprMSet.size()>0) { + ArrayList<RelationExpression> relExprMNotfinal = new ArrayList<>(); + relExprMNotfinal.add(relM); + RelationExpression minusExpr = new RelationConstruction(Constructor.NOT,relExprMNotfinal); + ArrayList<RelationExpression> relExprFinalSet = new ArrayList<>(); + relExprFinalSet.add(theRelExpr); + relExprFinalSet.add(minusExpr); + theRelExpr = new RelationConstruction(Constructor.AND, relExprFinalSet); + } + + expr=theRelExpr; + } + else{ + System.err.println("Trying to find a class expression with more than 1 variable as select"); + } + // + + return expr; + } + + public ArrayList<String> setOfUNIONSubgraphs(String s) { + s=s.replaceAll("[\n\t ]+", " "); + s=s.replaceAll("[\n\t ]+\\.", "\\."); + s=s.replaceAll("\\\\*\\{", "\\\\\\{"); + s=s.replaceAll("\\\\*}", "\\\\\\}"); + s=s.replaceAll("minus", "MINUS"); + s=s.replaceAll("MINUS *\\\\\\{([^\\\\}]+)\\\\}",""); + s=s.replaceAll("union", "UNION"); + ArrayList<String> res = new ArrayList<>(); + Pattern pattern1 = Pattern.compile("\\\\\\{([^\\\\}]+)\\\\} *UNION"); + Matcher matcher1 = pattern1.matcher(s); + if (matcher1.find()){ + res.add(matcher1.group(1).trim()); + s=s.replaceFirst("\\\\\\{"+matcher1.group(1).replaceAll("\\?","\\\\\\?")+"\\\\}", ""); + Pattern pattern2 = Pattern.compile("UNION *\\\\\\{([^\\\\}]+)\\\\}"); + Matcher matcher2 = pattern2.matcher(s); + while (matcher2.find()){ + 
res.add(matcher2.group(1)); + //System.out.println(matcher2.group()); + String mgroup = matcher2.group(); + mgroup=mgroup.replaceAll("\\\\*\\{", "\\\\\\{"); + mgroup=mgroup.replaceAll("\\\\*}", "\\\\\\}"); + mgroup = mgroup.replaceAll("\\?","\\\\\\?"); + mgroup=mgroup.replaceAll("\\.", "\\\\\\."); + + s=s.replaceAll("\\\\*\\{", "\\{"); + s=s.replaceAll("\\\\*}", "\\}"); + s=s.replaceAll(mgroup, ""); + } + if(!s.trim().equals("")) { + //System.out.println(s); + for (int i =0; i< res.size();i++) { + res.set(i, res.get(i)+s); + //System.out.println(i+" "+res.get(i)); + } + } + } + else { + res.add(s); + } + return res; + } + + public ArrayList<String> setOfMINUSSubgraphs(String s) { + s=s.replaceAll("[\n\t ]+", " "); + s=s.replaceAll("[\n\t ]+\\.", "\\."); + s=s.replaceAll("\\\\*\\{", "\\\\\\{"); + s=s.replaceAll("\\\\*}", "\\\\\\}"); + s=s.replaceAll("minus", "MINUS"); + ArrayList<String> res = new ArrayList<>(); + Pattern pattern1 = Pattern.compile("MINUS *\\\\\\{([^\\\\}]+)\\\\}"); + Matcher matcher1 = pattern1.matcher(s); + while (matcher1.find()){ + res.add(matcher1.group(1).trim()); + } + return res; + } + + + //OLD VERSION + public ClassExpression subgraphFormToEDOALClass(String s, String focus){ + ClassExpression expr = null; + s=s.replaceAll("[\n\t ]+", " "); + s=s.replaceAll("[\n\t ]+\\.", "\\."); + s=s.replaceAll("\\\\*\\{", "\\\\\\{"); + s=s.replaceAll("\\\\*}", "\\\\\\}"); + focus = focus.replaceAll("\\?", "\\\\\\?"); + try { + ArrayList<ClassExpression> expressions = new ArrayList<>(); + + //focus predicate ?y. + Pattern pattern1 = Pattern.compile(focus+" <([^ ]+)>\\+? (\\?[A-Za-z\\d_-]+)\\."); + Matcher matcher1 = pattern1.matcher(s); + while (matcher1.find()){ + s=s.replaceAll(matcher1.group().replaceAll("\\?","\\\\\\?"), ""); + RelationId pred = new RelationId(new URI(matcher1.group(1).trim())); + ClassExpression newExpression = subgraphFormToEDOALClass(s,matcher1.group(2).trim()); + + if(newExpression == null){ //It's a CAE + expressions.add(new ClassOccurenceRestriction(pred, Comparator.GREATER, 0)); + } + else{ //It's a CAT + expressions.add(new ClassDomainRestriction(pred,false,newExpression)); + } + } + //?y predicate focus + Pattern pattern2 = Pattern.compile("(\\?[A-Za-z\\d_-]+) <([^>]+)>\\+? "+focus+"\\."); + Matcher matcher2 = pattern2.matcher(s); + while (matcher2.find()){ + s=s.replaceAll(matcher2.group().replaceAll("\\?","\\\\\\?"), ""); + RelationId predId = new RelationId(new URI(matcher2.group(2).trim())); + ArrayList<RelationExpression> setPredId= new ArrayList<>(); + setPredId.add(predId); + RelationConstruction pred = new RelationConstruction(Constructor.INVERSE,setPredId); + ClassExpression newExpression = subgraphFormToEDOALClass(s,matcher2.group(1).trim()); + + if(newExpression == null){ //It's a CIAE + //System.out.println("CIAE: pred="+predId); + expressions.add(new ClassOccurenceRestriction(pred, Comparator.GREATER, 0)); + } + else{ //It's a CIAT + //System.out.println("CIAT: pred="+predId+" class="+newExpression.toString()); + expressions.add(new ClassDomainRestriction(pred,false,newExpression)); + } + } + //focus a Class + Pattern pattern3 = Pattern.compile(focus+" (a)?(<http://www\\.w3\\.org/1999/02/22-rdf-syntax-ns#type>)? 
<([^>]+)>\\."); + Matcher matcher3 = pattern3.matcher(s); + while (matcher3.find()){ //It's a Class URI + s=s.replaceAll(matcher3.group().replaceAll("\\?","\\\\\\?"), ""); + ClassId classId = new ClassId(new URI(matcher3.group(3).trim())); + expressions.add(classId); + } + + //focus predicate Instance + Pattern pattern4 = Pattern.compile(focus+" <([^>]+)>\\+? <([^>]+)>\\."); + Matcher matcher4 = pattern4.matcher(s); + while (matcher4.find()){ //It's a CAV (instance) + s=s.replaceAll(matcher4.group().replaceAll("\\?","\\\\\\?"), ""); + RelationId pred = new RelationId(new URI(matcher4.group(1).trim())); + InstanceId inst = new InstanceId(new URI(matcher4.group(2).trim())); + expressions.add(new ClassValueRestriction(pred,Comparator.EQUAL, inst)); + } + + //Instance predicate focus + Pattern pattern5 = Pattern.compile("<([^>]+)> <([^>]+)>\\+? "+focus+"\\."); + Matcher matcher5 = pattern5.matcher(s); + while (matcher5.find()){ //It's a CIAV (instance) + s=s.replaceAll(matcher5.group().replaceAll("\\?","\\\\\\?"), ""); + RelationId predId = new RelationId(new URI(matcher5.group(2).trim())); + ArrayList<RelationExpression> setPredId= new ArrayList<>(); + setPredId.add(predId); + RelationConstruction pred = new RelationConstruction(Constructor.INVERSE,setPredId); + InstanceId inst = new InstanceId(new URI(matcher5.group(1).trim())); + expressions.add(new ClassValueRestriction(pred,Comparator.EQUAL, inst)); + } + + //focus predicate LitteralValue + Pattern pattern6 = Pattern.compile(focus+" <([^>]+)>\\+? \"([^\"]+)\"\\."); + Matcher matcher6 = pattern6.matcher(s); + while (matcher6.find()){ //It's a CAV (value) + s=s.replaceAll(matcher6.group().replaceAll("\\?","\\\\\\?"), ""); + RelationId pred = new RelationId(new URI(matcher6.group(1).trim())); + Value value = new Value(matcher6.group(2).trim()); + expressions.add(new ClassValueRestriction(pred,Comparator.EQUAL, value)); + } + + //focus predicate SetOfValues + Pattern pattern7 = Pattern.compile(focus+" <([^>]+)>\\+? \\\\\\{([^\\\\}]+)\\\\}\\."); + Matcher matcher7 = pattern7.matcher(s); + while (matcher7.find()){ //It's a U(CAV) + s=s.replaceAll(matcher7.group().replaceAll("\\?","\\\\\\?"), ""); + ArrayList<ClassExpression> setOfCAV = new ArrayList<>(); + RelationId pred = new RelationId(new URI(matcher7.group(1).trim())); + String[] values = matcher7.group(2).trim().split(","); + for(String v: values){ + if(v.contains(">") && v.contains(">")){ + InstanceId value = new InstanceId(new URI(v.replaceAll("[<> ]", ""))); + setOfCAV.add(new ClassValueRestriction(pred,Comparator.EQUAL, value)); + } + else{ + Value value = new Value(v); + setOfCAV.add(new ClassValueRestriction(pred,Comparator.EQUAL, value)); + } + } + expressions.add(new ClassConstruction(Constructor.OR,setOfCAV)); + } + + //SetOfValues predicate focus + Pattern pattern8 = Pattern.compile("\\\\\\{([^\\\\}]+)\\\\} <([^>]+)>\\+? 
"+focus+"\\."); + Matcher matcher8 = pattern8.matcher(s); + while (matcher8.find()){ //It's a U(CIAV) + s=s.replaceAll(matcher8.group().replaceAll("\\?","\\\\\\?"), ""); + ArrayList<ClassExpression> setOfCAV = new ArrayList<>(); + RelationId predId = new RelationId(new URI(matcher8.group(2).trim())); + ArrayList<RelationExpression> setPredId= new ArrayList<>(); + setPredId.add(predId); + RelationConstruction pred = new RelationConstruction(Constructor.INVERSE,setPredId); + String[] values = matcher8.group(1).trim().split(","); + for(String v: values){ + InstanceId value = new InstanceId(new URI(v.replaceAll("[<> ]", ""))); + setOfCAV.add(new ClassValueRestriction(pred,Comparator.EQUAL, value)); + } + expressions.add(new ClassConstruction(Constructor.OR,setOfCAV)); + } + + //focus setOfpredicate ?y + Pattern pattern9 = Pattern.compile(focus+" \\\\\\{([^\\\\}]+)\\\\} (\\?[A-Za-z\\d_-]+)\\."); + Matcher matcher9 = pattern9.matcher(s); + while (matcher9.find()){ //It's a U(CAE/T) + s=s.replace(matcher9.group(), ""); + ArrayList<RelationExpression> setPredId= new ArrayList<>(); + String[] preds = matcher9.group(1).trim().split(","); + for(String predUri: preds){ + RelationId pred = new RelationId(new URI(predUri.replaceAll("[<> ]", ""))); + setPredId.add(pred); + } + ClassExpression newExpression = subgraphFormToEDOALClass(s,matcher9.group(2).trim()); + RelationConstruction relConstr = new RelationConstruction(Constructor.OR,setPredId); + if(newExpression == null){ //It's a U(CAE) + expressions.add(new ClassOccurenceRestriction(relConstr, Comparator.GREATER, 0)); + } + else{ //It's a U(CAT) + expressions.add(new ClassDomainRestriction(relConstr,false,newExpression)); + } + } + + //?y setOfpredicate focus + Pattern pattern10 = Pattern.compile("(\\?[A-Za-z\\d_-]+) \\\\\\{([^\\\\}]+)\\\\} "+focus+"\\."); + Matcher matcher10 = pattern10.matcher(s); + while (matcher10.find()){ //It's a U(CAE/T) + s=s.replace(matcher10.group(), ""); + ArrayList<RelationExpression> setPredId= new ArrayList<>(); + String[] preds = matcher10.group(2).trim().split(","); + for(String predUri: preds){ + RelationId predInv = new RelationId(new URI(predUri.replaceAll("[<> ]", ""))); + ArrayList<RelationExpression> setPredIdInv= new ArrayList<>(); + setPredIdInv.add(predInv); + RelationConstruction pred = new RelationConstruction(Constructor.INVERSE,setPredIdInv); + setPredId.add(pred); + } + ClassExpression newExpression = subgraphFormToEDOALClass(s,matcher10.group(1).trim()); + RelationConstruction relConstr = new RelationConstruction(Constructor.OR,setPredId); + if(newExpression == null){ //It's a U(CIAE) + expressions.add(new ClassOccurenceRestriction(relConstr, Comparator.GREATER, 0)); + } + else{ //It's a U(CIAT) + expressions.add(new ClassDomainRestriction(relConstr,false,newExpression)); + } + } + + //focus setOfpredicate Value(instance) + Pattern pattern11 = Pattern.compile(focus+" \\\\\\{([^\\\\}]+)\\\\} <([^>]+)>\\."); + Matcher matcher11 = pattern11.matcher(s); + while (matcher11.find()){ //It's a C(UA)V + s=s.replace(matcher11.group(), ""); + ArrayList<RelationExpression> setPredId= new ArrayList<>(); + String[] preds = matcher11.group(1).trim().split(","); + for(String predUri: preds){ + RelationId pred = new RelationId(new URI(predUri.replaceAll("[<> ]", ""))); + setPredId.add(pred); + } + RelationConstruction relConstr = new RelationConstruction(Constructor.OR,setPredId); + InstanceId inst = new InstanceId(new URI(matcher11.group(2).trim())); + expressions.add(new 
ClassValueRestriction(relConstr,Comparator.EQUAL, inst)); + } + + + //Value setOfpredicate focus + Pattern pattern12 = Pattern.compile("<([^>]+)> \\\\\\{([^\\\\}]+)\\\\} "+focus+"\\."); + Matcher matcher12= pattern12.matcher(s); + while (matcher12.find()){ //It's a C(UIA)V + s=s.replace(matcher12.group(), ""); + ArrayList<RelationExpression> setPredId= new ArrayList<>(); + String[] preds = matcher12.group(2).trim().split(","); + for(String predUri: preds){ + RelationId predInv = new RelationId(new URI(predUri.replaceAll("[<> ]", ""))); + ArrayList<RelationExpression> setPredIdInv= new ArrayList<>(); + setPredIdInv.add(predInv); + RelationConstruction pred = new RelationConstruction(Constructor.INVERSE,setPredIdInv); + setPredId.add(pred); + } + RelationConstruction relConstr = new RelationConstruction(Constructor.OR,setPredId); + InstanceId inst = new InstanceId(new URI(matcher12.group(1).trim())); + expressions.add(new ClassValueRestriction(relConstr,Comparator.EQUAL, inst)); + } + + //focus setOfpredicate Value(literal) + Pattern pattern13 = Pattern.compile(focus+" \\\\\\{([^\\\\}]+)\\\\} \"([^\"]+)\"\\."); + Matcher matcher13 = pattern13.matcher(s); + while (matcher13.find()){ //It's a C(UA)V + s=s.replaceAll(matcher13.group().replaceAll("\\?","\\\\\\?"), ""); + ArrayList<RelationExpression> setPredId= new ArrayList<>(); + String[] preds = matcher13.group(1).trim().split(","); + for(String predUri: preds){ + RelationId pred = new RelationId(new URI(predUri.replaceAll("[<> ]", ""))); + setPredId.add(pred); + } + RelationConstruction relConstr = new RelationConstruction(Constructor.OR,setPredId); + Value val = new Value(matcher13.group(2).trim()); + expressions.add(new ClassValueRestriction(relConstr,Comparator.EQUAL, val)); + } + + + if (expressions.size() == 1){ + expr=expressions.get(0); + } + else if (expressions.size() > 1){ + expr = new ClassConstruction(Constructor.AND,expressions); + } + } catch (URISyntaxException e) { + e.printStackTrace(); + } + return expr; + } + + + public RelationExpression subgraphFormToEDOALProperty(String s, String focus1, String focus2){ + s=s.replaceAll("[\n\t ]+", " "); + s=s.replaceAll("[\n\t ]+\\.", "\\."); + s=s.replaceAll("\\\\*\\{", "\\\\\\{"); + s=s.replaceAll("\\\\*}", "\\\\\\}"); + s=s.replaceAll("\\+", ""); + String sCopy = s; + HashMap<String, SPARQLNode> nodes = new HashMap<>(); + + + Pattern pattern1 = Pattern.compile("(\\?[A-Za-z\\d_-]+) <[^>]+> (\\?[A-Za-z\\d_-]+)\\."); + Matcher matcher1 = pattern1.matcher(s); + while (matcher1.find()){ + s=s.replaceAll(matcher1.group().replaceAll("\\?","\\\\\\?"), ""); + String n1= matcher1.group(1); + String n2 = matcher1.group(2); + String triple = matcher1.group(); + + SPARQLNode node1 = new SPARQLNode(n1); + SPARQLNode node2 = new SPARQLNode(n2); + if(nodes.containsKey(n1)) { + node1 = nodes.get(n1); + } + else { + nodes.put(n1, node1); + } + if(nodes.containsKey(n2)) { + node2 = nodes.get(n2); + } + else { + nodes.put(n2, node2); + } + node1.addNeighbour(node2, triple); + node2.addNeighbour(node1, triple); + } + + //Parcours de graphe + ArrayList<SPARQLNode> nodesToVisit = new ArrayList<>(); + nodesToVisit.add(nodes.get(focus1)); + boolean pathFound = false; + + while(!nodesToVisit.isEmpty() && !pathFound && nodesToVisit.get(0) != null) { + SPARQLNode currNode = nodesToVisit.get(0); + currNode.explore(); + if(currNode.hasNeighbor(focus2)) { + nodes.get(focus2).setPredecessor(currNode); + pathFound = true; + nodesToVisit.remove(0); + } + else { + nodesToVisit.remove(0); + for(SPARQLNode newNode: 
currNode.getNeighbors().values()) { + if(!newNode.isExplored()) { + newNode.setPredecessor(currNode); + nodesToVisit.add(newNode); + } + } + } + } + + + ArrayList <String> properties = new ArrayList<>() ; + ArrayList<Boolean> inverse= new ArrayList<>(); + ArrayList<ClassExpression> types = new ArrayList<>(); + if (pathFound) { + //start from nodes.get(focus2) + SPARQLNode currNode = nodes.get(focus2); + String predName = currNode.getPredecessor().getName(); + boolean endOfPath = false; + while (!endOfPath) { + String triple = currNode.getTriple(predName); + //get property and inverse + Pattern pattern2 = Pattern.compile("(\\?[A-Za-z\\d_-]+) <([^>]+)> (\\?[A-Za-z\\d_-]+)\\."); + Matcher matcher2 = pattern2.matcher(triple); + if (matcher2.find()) { + sCopy = sCopy.replaceAll(matcher2.group().replaceAll("\\?","\\\\\\?"), ""); + properties.add(0,matcher2.group(2)); + + if(matcher2.group(1).equals(currNode.getName())) { + inverse.add(0,true); + } + else { + inverse.add(0,false); + } + } + //Get ClassExpressions + types.add(0, subgraphFormToEDOALClass(sCopy,currNode.getName())); + if(predName.equals(focus1)) { + endOfPath = true; + } + else { + //Go to following + currNode = nodes.get(predName); + predName = currNode.getPredecessor().getName(); + } + } + types.add(0, subgraphFormToEDOALClass(sCopy,focus1)); + } + else { // IF path not found (e.g. in a minus) + types.add(subgraphFormToEDOALClass(s,focus1)); + types.add(subgraphFormToEDOALClass(s,focus2)); + } + + return subgraphFormToEDOALProperty(properties, inverse, types); + } + + + public RelationExpression subgraphFormToEDOALProperty(ArrayList<String> properties, ArrayList<Boolean> inverse, ArrayList<ClassExpression> types) { + RelationExpression expr= null; + ArrayList<RelationExpression> setRelComp = new ArrayList<>(); + try { + for (int i = 0; i < inverse.size(); i++) { + RelationExpression rel = null; + ArrayList<RelationExpression> setRelAnd = new ArrayList<>(); + if(types.get(i) != null) { + setRelAnd.add(new RelationDomainRestriction(types.get(i))); + } + if(i+1 == properties.size() && types.get(i+1) != null ) { + setRelAnd.add(new RelationCoDomainRestriction(types.get(i+1))); + } + if(inverse.get(i)) { + ArrayList<RelationExpression> setRelInv = new ArrayList<>(); + setRelInv.add(new RelationId(new URI(properties.get(i)))); + rel = new RelationConstruction(Constructor.INVERSE,setRelInv); + } + else { + rel = new RelationId(new URI(properties.get(i))); + } + setRelAnd.add(rel); + if(setRelAnd.size() > 1) { + setRelComp.add(new RelationConstruction(Constructor.AND,setRelAnd)); + } + else { + setRelComp.add(rel); + } + } + + //only for a domain/range restriction + if(properties.isEmpty() && types.size()==2) { + ArrayList<RelationExpression> setRelAnd = new ArrayList<>(); + if(types.get(0)!=null) { + setRelAnd.add(new RelationDomainRestriction(types.get(0))); + } + if(types.get(1)!=null) { + setRelAnd.add(new RelationCoDomainRestriction(types.get(1))); + } + if(setRelAnd.size() > 1) { + setRelComp.add(new RelationConstruction(Constructor.AND,setRelAnd)); + } + else if (setRelAnd.size() ==1) { + setRelComp.add(setRelAnd.get(0)); + } + + + } + + + if(setRelComp.size()==1) { + expr=setRelComp.get(0); + } + else{ + expr = new RelationConstruction(Constructor.COMP,setRelComp); + } + + } catch (URISyntaxException e) { + e.printStackTrace(); + } + return expr; + + } + public RelationExpression subgraphFormToEDOALProperty(SubgraphForOutput s){ + ArrayList <String> properties = new ArrayList<>() ; + ArrayList<Boolean> inverse= new ArrayList<>(); 
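+ // For a PathSubgraph, unfold the main path into its property chain, inverse flags and intermediate class types, then delegate to the list-based overload above to build the EDOAL relation expression.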
+ ArrayList<ClassExpression> types = new ArrayList<>(); + if (s instanceof PathSubgraph) { + try { + + Path p = ((PathSubgraph)s).getMainPath(); + inverse=p.getInverse(); + + for (int i = 0; i < p.getProperties().size(); i++) { + properties.add(p.getProperties().get(i).toStrippedString()); + if(p.getTypes().get(i) != null) { + types.add(new ClassId(new URI(p.getTypes().get(i).toStrippedString()))); + } + else { + types.add(null); + } + } + + if(p.getTypes().get(p.getProperties().size()) != null) { + types.add(new ClassId(new URI(p.getTypes().get(p.getProperties().size()).toStrippedString()))); + } + else { + types.add(null); + } + + } catch (URISyntaxException e) { + e.printStackTrace(); + } + } + return subgraphFormToEDOALProperty(properties,inverse,types); + } +} diff --git a/src/main/java/irit/output/Output.java b/src/main/java/irit/output/Output.java new file mode 100755 index 0000000000000000000000000000000000000000..00d21835f755d8296cc1b4d6ad491502b3ab7ebd --- /dev/null +++ b/src/main/java/irit/output/Output.java @@ -0,0 +1,27 @@ +package irit.output; + +import irit.complex.subgraphs.SubgraphForOutput; +import irit.sparql.query.select.SparqlSelect; + +import java.util.List; + +public abstract class Output { + + protected final String sourceEndpoint; + protected final String targetEndpoint; + + public Output(String sourceEndpoint, String targetEndpoint){ + this.sourceEndpoint = sourceEndpoint; + this.targetEndpoint = targetEndpoint; + + } + + public void init(){ + } + + public void addToOutput(List<SubgraphForOutput> output, SparqlSelect sq){ + } + + public void end() {} + +} diff --git a/src/main/java/irit/output/OutputManager.java b/src/main/java/irit/output/OutputManager.java new file mode 100644 index 0000000000000000000000000000000000000000..091e44b01109515fe6f96ee0769814fd77914eef --- /dev/null +++ b/src/main/java/irit/output/OutputManager.java @@ -0,0 +1,52 @@ +package irit.output; + +import irit.complex.subgraphs.SubgraphForOutput; +import irit.sparql.query.select.SparqlSelect; + +import java.nio.file.Paths; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class OutputManager { + + private Map<Float, ArrayList<Output>> outputs; + public List<Float> th; + + + public void initOutputEdoal(String sourceEndpoint, String targetEndpoint, List<Float> ths, String outputEdoal){ + th = ths; + outputs = new HashMap<>(); + + for (float th : ths) { + outputs.put(th, new ArrayList<>()); + String filePath = String.format("%s/%s_%s/th_%s.edoal", outputEdoal, sourceEndpoint, targetEndpoint, String.format("%.1f", th).replaceAll(",", "_")); + outputs.get(th).add(new EDOALOutput(sourceEndpoint, targetEndpoint, filePath)); + + + for (Output o: outputs.get(th)){ + o.init(); + } + } + } + + public void addToOutput(float th, SparqlSelect sq, List<SubgraphForOutput> subGraph){ + for(Output o: outputs.get(th)){ + o.addToOutput(subGraph, sq); + } + + } + + + public void endOutput(){ + for (Float th : outputs.keySet()) { + for(Output o: outputs.get(th)){ + o.end(); + } + } + + } + + +} diff --git a/src/main/java/irit/output/QueryOutput.java b/src/main/java/irit/output/QueryOutput.java new file mode 100644 index 0000000000000000000000000000000000000000..c079598b9e0578755e7b23aad41ff537bd3e6c67 --- /dev/null +++ b/src/main/java/irit/output/QueryOutput.java @@ -0,0 +1,39 @@ +package irit.output; + +import irit.output.Output; + +import java.util.ArrayList; +import java.util.Map; + +public class QueryOutput extends Output { + + private final 
String outputFolder; + private final Map<String,String> CQANames; + + public QueryOutput(String source, String target, String outputQueryFolder, Map<String, String> cqaNames) { + super(source, target); + outputFolder = outputQueryFolder; + CQANames = cqaNames; + } + + public String toSubgraphForm(String queryContent, ArrayList<String> selectFocus){ + + String ret = queryContent; + if (selectFocus.size()>1){ + int i=0; + for(String sf: selectFocus){ + ret=ret.replaceAll( "\\?answer"+i+" ",sf+" "); + ret=ret.replaceAll( "\\?answer"+i+"\\.",sf+"."); + ret=ret.replaceAll( "\\?answer"+i+"}",sf+"}"); + i++; + } + } + else{ + ret=ret.replaceAll( "\\?answer ",selectFocus.get(0)+" "); + ret=ret.replaceAll( "\\?answer\\.",selectFocus.get(0)+"."); + ret=ret.replaceAll( "\\?answer}",selectFocus.get(0)+"}"); + } + return ret; + } + +} diff --git a/src/main/java/irit/resource/IRI.java b/src/main/java/irit/resource/IRI.java new file mode 100755 index 0000000000000000000000000000000000000000..170e8ed0384eae2a6a4a7eb28ac186443d282c10 --- /dev/null +++ b/src/main/java/irit/resource/IRI.java @@ -0,0 +1,183 @@ +package irit.resource; + +import irit.complex.subgraphs.Triple; +import irit.dataset.DatasetManager; +import irit.similarity.EmbeddingManager; +import irit.sparql.query.exception.SparqlEndpointUnreachableException; +import irit.sparql.query.exception.SparqlQueryMalFormedException; + +import java.util.*; +import java.util.concurrent.ConcurrentHashMap; +import java.util.regex.Pattern; + +public class IRI extends Resource { + private final Set<String> labels; + private final Set<Triple> triples; + private final Set<IRI> types; + private boolean labelsGot; + private boolean triplesRetrieved; + private final Pattern pattern = Pattern.compile("[+{}.?*^]"); + + public IRI(String iri) { + super(iri); + labels = ConcurrentHashMap.newKeySet(); + triples = ConcurrentHashMap.newKeySet(); + types = ConcurrentHashMap.newKeySet(); + labelsGot = false; + triplesRetrieved = false; + } + + public void retrieveLabels(String endpointUrl) { + if (!labelsGot) { + addLabel(value.replaceAll("[<>]", "")); + + + String substring = value.substring(1); + substring = substring.substring(0, substring.length()-1); + Set<String> labels = DatasetManager.getInstance().labelMaps.get(endpointUrl).labels(substring); + + + for (String s : labels) { + Resource res = new Resource(s); + if (!res.isIRI()) { + addLabel(s); + } + + } + labelsGot = true; + } + + } + + public void retrieveTypes(String endpointUrl) throws SparqlQueryMalFormedException, SparqlEndpointUnreachableException { + + Set<String> lmTypes = DatasetManager.getInstance().labelMaps.get(endpointUrl).types(value.replaceAll("[<>]", "")); + + for (String s : lmTypes) { + types.add(new IRI("<" + s + ">")); + } + for (IRI type : types) { + type.retrieveLabels(endpointUrl); + } + } + + public IRI findMostSimilarType(String endpointUrl, HashSet<String> targetLabels, double threshold) { + if (getTypes().isEmpty()) { + try { + retrieveTypes(endpointUrl); + } catch (SparqlQueryMalFormedException | SparqlEndpointUnreachableException e) { + e.printStackTrace(); + } + } + double scoreTypeMax = -1; + IRI finalType = null; + for (IRI type : getTypes()) { + double scoreType; + type.retrieveLabels(endpointUrl); + scoreType = similarity(type.getLabels(), targetLabels, threshold); + if (scoreTypeMax < scoreType) { + scoreTypeMax = scoreType; + finalType = type; + } + } + return finalType; + } + + public double similarity(Set<String> labels1, HashSet<String> labels2, double threshold){ + double 
score = 0; + for(String l1 : labels1){ + for(String l2: labels2){ + double sim = EmbeddingManager.getSim(l1, l2); + sim = sim < threshold ? 0 : sim; + score += sim; + } + } + return score; + } + + public void addLabel(String label) { + labels.add(label.trim()); + } + + + public void findExistingMatches(final String sourceEndpoint, final String targetEndpoint) { + + ArrayList<IRI> allMatches = new ArrayList<>(); + + allMatches.add(this); + + String s1 = value.replaceAll("\\$", ""); + Set<String> matched1 = DatasetManager.getInstance().labelMaps.get(sourceEndpoint).getMatched(s1); + Set<String> matched2 = DatasetManager.getInstance().labelMaps.get(targetEndpoint).getMatched(s1); + + for (String s : matched1) { + Resource res = new Resource(s); + if (res.isIRI()) { + allMatches.add(new IRI("<" + s + ">")); + } + } + + for (String s : matched2) { + Resource res = new Resource(s); + if (res.isIRI()) { + allMatches.add(new IRI("<" + s + ">")); + } + } + + /*Check if a match is in the target dataset*/ + for (IRI match : allMatches) { + if ( DatasetManager.getInstance().labelMaps.get(targetEndpoint).exists(match.toString())) { + similarIRIs.add(match); + } + } + + } + + public void findSimilarResource(String targetEndpoint) { + if (labels.isEmpty()) { + retrieveLabels(targetEndpoint); + } + Set<String> nml = new HashSet<>(); + + for (String rawLab : labels) { + + String label = pattern.matcher(rawLab).replaceAll(""); + Set<String> similar = DatasetManager.getInstance().labelMaps.get(targetEndpoint).getSimilar(label.toLowerCase()); + nml.addAll(similar); + + } + + for (String s : nml) { + similarIRIs.add(new IRI("<" + s + ">")); + } + + } + + + public Set<String> getLabels() { + return labels; + } + + public Set<Triple> getTriples() { + return triples; + } + + public Set<IRI> getTypes() { + return types; + } + + public String toStrippedString() { + return value.replaceAll("<", "").replaceAll(">", ""); + } + + + public boolean isTriplesRetrieved() { + return triplesRetrieved; + } + + public void setTriplesRetrieved(boolean triplesRetrieved) { + this.triplesRetrieved = triplesRetrieved; + } + + +} diff --git a/src/main/java/irit/resource/Resource.java b/src/main/java/irit/resource/Resource.java new file mode 100755 index 0000000000000000000000000000000000000000..eac9d0cf334a3f8f7d17fc5a02cadea96d8c9c54 --- /dev/null +++ b/src/main/java/irit/resource/Resource.java @@ -0,0 +1,93 @@ +package irit.resource; + +import irit.dataset.DatasetManager; + +import java.util.*; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + +public class Resource { + + protected final String value; + protected final HashSet<IRI> similarIRIs; + private final Pattern pattern = Pattern.compile("[a-z][/:#]"); + + public Resource(String val) { + Pattern inp = Pattern.compile("\\\\"); + value = inp.matcher(val).replaceAll(""); + similarIRIs = new HashSet<>(); + } + + public boolean isIRI() { + Matcher matcher = pattern.matcher(value); + return !value.contains(" ") && matcher.find(); + } + + + public static String join(String del, Iterable<String> data) { + + Iterator<String> iterator = data.iterator(); + if (!iterator.hasNext()) return ""; + + StringBuilder pref = new StringBuilder(iterator.next()); + + while (iterator.hasNext()) { + pref.append(del).append(iterator.next()); + } + + return pref.toString(); + } + + public void findSimilarResource(String targetEndpoint) { + Set<String> values = new HashSet<>(); + values.add(value); + values.add("\"" + value.substring(0, 1).toUpperCase() + value.substring(1) + 
"\"@en"); + values.add("\"" + value.substring(0, 1).toUpperCase() + value.substring(1) + "\""); + + + Set<String> nml = new HashSet<>(); + + for (String label : values) { + nml.addAll(DatasetManager.getInstance().labelMaps.get(targetEndpoint).getSimilar(label.toLowerCase())); + } + + + for (String s : nml) { + similarIRIs.add(new IRI("<" + s + ">")); + } + + } + + public String toValueString() { + if (!(isIRI())) { + return "\"" + value + "\""; + } else { + return toString(); + } + } + + public String toString() { + return value; + } + + public int hashCode() { + return value.hashCode(); + } + + public boolean equals(Object obj) { + if (obj instanceof Resource) { + return value.equals(((Resource) obj).value); + } else { + return false; + } + } + + public HashSet<IRI> getSimilarIRIs() { + return similarIRIs; + } + + + public String getValue() { + return value; + } +} diff --git a/src/main/java/irit/similarity/EmbeddingManager.java b/src/main/java/irit/similarity/EmbeddingManager.java new file mode 100644 index 0000000000000000000000000000000000000000..9fd0e8417c760693c332653bc5da24df6627af8c --- /dev/null +++ b/src/main/java/irit/similarity/EmbeddingManager.java @@ -0,0 +1,86 @@ +package irit.similarity; + +import org.nd4j.linalg.api.buffer.DataType; +import org.nd4j.linalg.api.ndarray.INDArray; +import org.nd4j.linalg.factory.Nd4j; +import org.nd4j.linalg.ops.transforms.Transforms; + +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Optional; + +public class EmbeddingManager { + + + private static Map<String, INDArray> embs1 = new HashMap<>(); + public static long[] embshape; + + public static void load(String n1, String e1) throws IOException { + + Map<String, INDArray> embs = loadEmbs(n1, e1); + + Optional<INDArray> first = embs.values().stream().findFirst(); + embshape = first.get().shape(); + + embs1.putAll(embs); + } + + + public static double getSim(String s1, String s2){ + + INDArray n1 = embs1.get(s1); + INDArray n2 = embs1.get(s2); + + if (n1 == null){ + n1 = Nd4j.zeros(DataType.DOUBLE, embshape); + } + + if (n2 == null){ + n2 = Nd4j.zeros(DataType.DOUBLE, embshape); + } + + return Transforms.cosineSim(n1, n2); + } + + private static Map<String, INDArray> loadEmbs(String n1, String e1) throws IOException { + List<String> ents = Files.readAllLines(Paths.get(n1)); + List<String> embs = Files.readAllLines(Paths.get(e1)); + + Map<String, INDArray> embsMap = new HashMap<>(); + + for (int i = 0; i < ents.size(); i++) { + String[] split = embs.get(i).split(", "); + double[] de = new double[split.length]; + for (int j = 0; j < split.length; j++) { + de[j] = Double.parseDouble(split[j]); + } + INDArray indArray = Nd4j.create(de); + embsMap.put(ents.get(i), indArray); + } + return embsMap; + } + + public static INDArray get(String e1){ + if (!embs1.containsKey(e1)) return Nd4j.zeros(DataType.DOUBLE, embshape); + return embs1.get(e1); + } + + + private static String processLabel(String line){ + line = line.replaceAll("\\\\n", "\\n").trim(); + if (line.startsWith("http://") && line.contains("#")){ + String[] split = line.split("#"); + if (split.length > 1){ + line = split[1]; + } else { + line = split[0]; + } + } + return line; + } + +} diff --git a/src/main/java/irit/sparql/SparqlProxy.java b/src/main/java/irit/sparql/SparqlProxy.java new file mode 100755 index 0000000000000000000000000000000000000000..9bb8345080bf83b822ef7c4a6d82c30dba0d2630 --- /dev/null +++ 
b/src/main/java/irit/sparql/SparqlProxy.java @@ -0,0 +1,42 @@ +package irit.sparql; + + +import irit.dataset.DatasetManager; +import org.apache.jena.query.QueryExecution; +import org.apache.jena.query.ResultSet; +import org.apache.jena.rdf.model.RDFNode; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + + +public class SparqlProxy { + + public static List<Map<String, RDFNode>> query(String dataset, String query) { + List<Map<String, RDFNode>> result = new ArrayList<>(); + + try (QueryExecution queryExecution = QueryExecution.create(query, DatasetManager.getInstance().get(dataset))) { + ResultSet resultSet = queryExecution.execSelect(); + resultSet.forEachRemaining(querySolution -> { + Map<String, RDFNode> stringMap = new HashMap<>(); + for (String resultVar : resultSet.getResultVars()) { + stringMap.put(resultVar, querySolution.get(resultVar)); + } + result.add(stringMap); + }); + + } + + return result; + } + + + public static boolean sendAskQuery(String dataset, String query) { + try (QueryExecution queryExecution = QueryExecution.create(query, DatasetManager.getInstance().get(dataset))) { + return queryExecution.execAsk(); + } + } + +} diff --git a/src/main/java/irit/sparql/exceptions/IncompleteSubstitutionException.java b/src/main/java/irit/sparql/exceptions/IncompleteSubstitutionException.java new file mode 100755 index 0000000000000000000000000000000000000000..8fb2b418808fedcb0b4fcd4b2eff928be6b607f9 --- /dev/null +++ b/src/main/java/irit/sparql/exceptions/IncompleteSubstitutionException.java @@ -0,0 +1,13 @@ +package irit.sparql.exceptions; + +import java.io.Serial; + +public class IncompleteSubstitutionException extends Exception { + + @Serial + private static final long serialVersionUID = -3430981554411926194L; + + public IncompleteSubstitutionException(String m){ + super(m); + } +} diff --git a/src/main/java/irit/sparql/exceptions/NotAFolderException.java b/src/main/java/irit/sparql/exceptions/NotAFolderException.java new file mode 100755 index 0000000000000000000000000000000000000000..2da59b4ceb260251cd4fef2da09ef7b4d1f139e7 --- /dev/null +++ b/src/main/java/irit/sparql/exceptions/NotAFolderException.java @@ -0,0 +1,13 @@ +package irit.sparql.exceptions; + +import java.io.Serial; + +public class NotAFolderException extends Exception { + + @Serial + private static final long serialVersionUID = 2763344757163908098L; + + public NotAFolderException(String m){ + super(m); + } +} diff --git a/src/main/java/irit/sparql/files/FolderManager.java b/src/main/java/irit/sparql/files/FolderManager.java new file mode 100755 index 0000000000000000000000000000000000000000..7a13733ac7f0e3aa881ef3609a1932b2ba179672 --- /dev/null +++ b/src/main/java/irit/sparql/files/FolderManager.java @@ -0,0 +1,46 @@ +package irit.sparql.files; + +import irit.sparql.exceptions.NotAFolderException; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.Map; + +public class FolderManager { + final File folder; + Map<String, String> queries; + Map<String, QueryTemplate> queryTemplates; + + public FolderManager(String path) throws NotAFolderException{ + folder = new File(path); + if(!folder.isDirectory()){ + throw new NotAFolderException(path+" is not a folder"); + } + queries = new HashMap<>(); + queryTemplates = new HashMap<>(); + } + + public void loadQueries(){ + for (File fileEntry : folder.listFiles()) { + if (!fileEntry.isDirectory() && 
fileEntry.getName().endsWith(".sparql")) { + try { + String query = new String(Files.readAllBytes(Paths.get(fileEntry.getPath()))); + if(query.replaceAll("\n", " ").matches("^.*\\{\\{ ?([A-Za-z\\d]+) ?}}.*$")){ + queryTemplates.put(fileEntry.getName().split("\\.")[0], new QueryTemplate(query)); + } else { + queries.put(fileEntry.getName().split("\\.")[0], query); + } + } catch (IOException e) { + e.printStackTrace(); + } + } + } + } + + public Map<String, QueryTemplate> getTemplateQueries(){ + return queryTemplates; + } +} diff --git a/src/main/java/irit/sparql/files/QueryTemplate.java b/src/main/java/irit/sparql/files/QueryTemplate.java new file mode 100755 index 0000000000000000000000000000000000000000..1d679c12d6c61cc57f9b8ff3900c1c3d9518d35f --- /dev/null +++ b/src/main/java/irit/sparql/files/QueryTemplate.java @@ -0,0 +1,40 @@ +package irit.sparql.files; + +import irit.sparql.exceptions.IncompleteSubstitutionException; + +import java.util.HashSet; +import java.util.Map; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +public class QueryTemplate { + private final String query; + private final Set<String> toSubstitute; + + public QueryTemplate(String query){ + this.query = query; + toSubstitute = new HashSet<>(); + Pattern p = Pattern.compile("\\{\\{ ?([A-Za-z\\d]+) ?}}"); + Matcher m = p.matcher(query); + while(m.find()){ + toSubstitute.add(m.group(1)); + } + } + + + public String substitute(Map<String, String> substitution) throws IncompleteSubstitutionException{ + String query = this.query; + if(substitution.keySet().containsAll(toSubstitute)){ + for(String key : toSubstitute){ + query = query.replaceAll("\\{\\{ ?"+key+" ?}}", substitution.get(key)); + } + } else { + throw new IncompleteSubstitutionException("Some elements of the substitution "+ toSubstitute+"are not resolved by "+substitution); + } + return query; + } + + +} diff --git a/src/main/java/irit/sparql/query/SparqlQuery.java b/src/main/java/irit/sparql/query/SparqlQuery.java new file mode 100755 index 0000000000000000000000000000000000000000..2a60163cc682bcdbb30eb0ffd72bd5274285fb73 --- /dev/null +++ b/src/main/java/irit/sparql/query/SparqlQuery.java @@ -0,0 +1,113 @@ +package irit.sparql.query; + +import irit.resource.IRI; + +import java.util.AbstractMap; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map.Entry; +import java.util.Set; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + + +public abstract class SparqlQuery { + private final Set<Entry<String, String>> prefix; + protected String from; + protected String where; + protected String mainQuery; + protected final HashMap<String, IRI> iriList; + + public SparqlQuery(Set<Entry<String, String>> prefix, String from, String where) { + this.prefix = new HashSet<>(); + this.prefix.addAll(prefix); + addDefaultPrefixes(); + this.from = from; + this.where = where; + iriList = new HashMap<>(); + } + + public SparqlQuery(String query) { + prefix = new HashSet<>(); + iriList = new HashMap<>(); + addDefaultPrefixes(); + retrievePrefixes(query); + from = ""; + where = ""; + retrieveIRIs(); + } + + public void addDefaultPrefixes() { + prefix.add(new AbstractMap.SimpleEntry<>("rdf", "http://www.w3.org/1999/02/22-rdf-syntax-ns#")); + prefix.add(new AbstractMap.SimpleEntry<>("rdfs", "http://www.w3.org/2000/01/rdf-schema#")); + prefix.add(new AbstractMap.SimpleEntry<>("owl", "http://www.w3.org/2002/07/owl#")); + prefix.add(new AbstractMap.SimpleEntry<>("xsd", 
"http://www.w3.org/2001/XMLSchema#")); + prefix.add(new AbstractMap.SimpleEntry<>("skos", "http://www.w3.org/2004/02/skos/core#")); + prefix.add(new AbstractMap.SimpleEntry<>("skos-xl", "http://www.w3.org/2008/05/skos-xl#")); + } + + public void retrievePrefixes(String aQuery) { + aQuery = aQuery.trim().replaceAll("PREFIX", "prefix"); + mainQuery = ""; + + if (aQuery.contains("prefix")) { + String[] pref = aQuery.split("prefix"); + for (int j = 0; j < pref.length; j++) { + String str; + if (!pref[0].equals("")) + str = pref[0]; + else + str = pref[pref.length - 1]; + mainQuery = str.substring(str.indexOf('>') + 1); + } + + for (String s : pref) { + String currPrefix = s.trim(); + if (!currPrefix.equals("") && currPrefix.indexOf('<') != -1 && currPrefix.indexOf('>') != -1) { + int begin = currPrefix.indexOf('<'); + int end = currPrefix.indexOf('>'); + String ns = currPrefix.substring(0, currPrefix.indexOf(':')).trim(); + String iri = currPrefix.substring(begin + 1, end).trim(); + prefix.add(new AbstractMap.SimpleEntry<>(ns, iri)); + mainQuery = Pattern.compile(ns + ":([A-Za-z\\d_-]+)").matcher(mainQuery).replaceAll("<" + iri + "$1>"); + } + } + } else { + mainQuery = aQuery; + } + } + + public void retrieveIRIs() { + Pattern patternIRI = Pattern.compile("<[^>]+>"); + Matcher matcherIRI = patternIRI.matcher(mainQuery); + while (matcherIRI.find()) { + if (!matcherIRI.group().equals("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>")) { + IRI iri = new IRI(matcherIRI.group()); + iriList.put(matcherIRI.group(), iri); + } + } + + for (Entry<String, String> m : prefix) { + if (m.getKey() != null) { + Pattern patt = Pattern.compile("<" + m.getValue() + "([^>]+)>"); + Matcher match = patt.matcher(mainQuery); + while (match.find()) { + if (!match.group().equals("<http://www.w3.org/1999/02/22-rdf-syntax-ns#type>")) { + iriList.get(match.group()).addLabel(match.group(1)); + } + } + + } + } + + } + + public HashMap<String, IRI> getIRIList() { + return iriList; + } + + public String toUnchangedString() { + return mainQuery; + } +} diff --git a/src/main/java/irit/sparql/query/exception/SparqlEndpointUnreachableException.java b/src/main/java/irit/sparql/query/exception/SparqlEndpointUnreachableException.java new file mode 100755 index 0000000000000000000000000000000000000000..6f258a120f6742b1d8037d5f1a3d973816a79fe0 --- /dev/null +++ b/src/main/java/irit/sparql/query/exception/SparqlEndpointUnreachableException.java @@ -0,0 +1,24 @@ +package irit.sparql.query.exception; + +import java.io.Serial; + +public class SparqlEndpointUnreachableException extends Exception +{ + /** + * + */ + @Serial + private static final long serialVersionUID = -6591977685812151888L; + + private final String message; + + public SparqlEndpointUnreachableException(Exception e) + { + message = e.getLocalizedMessage(); + } + + public String toString() + { + return "The endpoint you specified is unreachable : "+ message; + } +} diff --git a/src/main/java/irit/sparql/query/exception/SparqlQueryMalFormedException.java b/src/main/java/irit/sparql/query/exception/SparqlQueryMalFormedException.java new file mode 100755 index 0000000000000000000000000000000000000000..5503267084b0eff4fe32c950bcfe38d3c851179a --- /dev/null +++ b/src/main/java/irit/sparql/query/exception/SparqlQueryMalFormedException.java @@ -0,0 +1,25 @@ +/* + * To change this license header, choose License Headers in Project Properties. + * To change this template file, choose Tools | Templates + * and open the template in the editor. 
+ */ +package irit.sparql.query.exception; + +/** + * + * @author murloc + */ +public class SparqlQueryMalFormedException extends Exception +{ + final String message; + public SparqlQueryMalFormedException(String message) + { + this.message = message; + } + + public String toString() + { + return "The query is malformed : "+message; + } + +} diff --git a/src/main/java/irit/sparql/query/select/SparqlSelect.java b/src/main/java/irit/sparql/query/select/SparqlSelect.java new file mode 100755 index 0000000000000000000000000000000000000000..74e6c2fe7d19b3425b5c792ebc095ee96c53f154 --- /dev/null +++ b/src/main/java/irit/sparql/query/select/SparqlSelect.java @@ -0,0 +1,105 @@ +package irit.sparql.query.select; + +import irit.resource.IRI; +import irit.sparql.query.SparqlQuery; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + + +public class SparqlSelect extends SparqlQuery { + private String select; + private final ArrayList<String> selectFocus; + + public SparqlSelect(String query) { + super(query); + mainQuery = mainQuery.trim().replaceAll("SELECT", "select").replaceAll("WHERE", "where").replaceAll("\n", " "); + selectFocus = new ArrayList<>(); + Pattern pattern = Pattern.compile(""" + select[ \t + distncDISTNC]+(\\?[A-Za-z\\d_-]+)[ \t + ]+(\\?*[A-Za-z\\d_-]*[ \t + ]*)where[ \t + ]*\\{(.+)}[ \t + ]*$"""); + Matcher matcher = pattern.matcher(mainQuery); + while (matcher.find()) { + selectFocus.add(matcher.group(1).trim()); + if (!matcher.group(2).trim().isEmpty()) { + selectFocus.add(matcher.group(2).trim()); + } + where = matcher.group(3).trim(); + setAggregate(); + } + Pattern pattern2 = Pattern.compile(""" + select([ \t + distncDISTNC]+\\?[A-Za-z\\d_-]+[ \t + ]+\\?*[A-Za-z\\d_-]*[ \t + ]*)where"""); + Matcher matcher2 = pattern2.matcher(mainQuery); + if (matcher2.find()) { + select = matcher2.group(1); + } + + + } + + public String getSelect() { + return select; + } + + public void setSelect(String select) { + this.select = select; + } + + public void setAggregate() { + } + + public String toString() { + return mainQuery; + } + + public String toSubgraphForm() { + + String ret = where; + if (selectFocus.size() > 1) { + int i = 0; + for (String sf : selectFocus) { + ret = ret.replaceAll(sf.replaceAll("\\?", "\\\\?") + " ", "\\?answer" + i + " "); + ret = ret.replaceAll(sf.replaceAll("\\?", "\\\\?") + "\\.", "\\?answer" + i + "."); + ret = ret.replaceAll(sf.replaceAll("\\?", "\\\\?") + "}", "\\?answer" + i + "}"); + ret = ret.replaceAll(sf.replaceAll("\\?", "\\\\?") + "\\)", "\\?answer" + i + ")"); + i++; + } + } else { + ret = ret.replaceAll(selectFocus.get(0).replaceAll("\\?", "\\\\?") + " ", "\\?answer "); + ret = ret.replaceAll(selectFocus.get(0).replaceAll("\\?", "\\\\?") + "\\.", "\\?answer."); + ret = ret.replaceAll(selectFocus.get(0).replaceAll("\\?", "\\\\?") + "}", "\\?answer}"); + ret = ret.replaceAll(selectFocus.get(0).replaceAll("\\?", "\\\\?") + "\\)", "\\?answer)"); + } + return ret.replaceAll("\n", " ").replaceAll("\"", "\\\""); + } + + public ArrayList<String> getSelectFocus() { + return selectFocus; + } + + public int getFocusLength() { + return selectFocus.size(); + } + + + + public HashSet<String> getLabels(){ + HashSet<String> queryLabels = new HashSet<>(); + + for (Map.Entry<String, IRI> iri : getIRIList().entrySet()){ + queryLabels.addAll(iri.getValue().getLabels()); + } + return queryLabels; + } +} diff --git a/src/main/resources/META-INF/MANIFEST.MF 
b/src/main/resources/META-INF/MANIFEST.MF new file mode 100644 index 0000000000000000000000000000000000000000..55bd542282e497c15223448770a9107e4be146e6 --- /dev/null +++ b/src/main/resources/META-INF/MANIFEST.MF @@ -0,0 +1,3 @@ +Manifest-Version: 1.0 +Main-Class: irit.complex.ComplexAlignmentGeneration +