Merge pull request #8 from pietercolpaert/main

constraintAutomaton · web-flow · commit ddb0d92f32f1 · 2024-08-30T10:37:58.000+02:00
Feedback before CR
diff --git a/code/Q1.ttl b/code/Q1.ttl
@@ -1,12 +1,12 @@
 PREFIX sosa: <http://www.w3.org/ns/sosa/> 
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 
 PREFIX wgs: <http://www.w3.org/2003/01/geo/wgs84_pos#>
-PREFIX etsi: <https://saref.etsi.org/core/>
+PREFIX saref: <https://saref.etsi.org/core/>
     
 SELECT * WHERE {
-    ?s etsi:hasTimestamp ?t.
-?s etsi:hasValue ?result.
-?s etsi:measurementMadeBy ?sensor.
+    ?s saref:hasTimestamp ?t.
+?s saref:hasValue ?result.
+?s saref:measurementMadeBy ?sensor.
 ?sensor <https://dahcc.idlab.ugent.be/Ontology/Sensors/analyseStateOf> ?stateOf.
 ?sensor <https://saref.etsi.org/core/measuresProperty> <https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/energy.consumption>
 
diff --git a/code/Q2.ttl b/code/Q2.ttl
@@ -1,12 +1,12 @@
 PREFIX sosa: <http://www.w3.org/ns/sosa/> 
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 
 PREFIX wgs: <http://www.w3.org/2003/01/geo/wgs84_pos#>
-PREFIX etsi: <https://saref.etsi.org/core/>
+PREFIX saref: <https://saref.etsi.org/core/>
     
 SELECT * WHERE {
-    ?s etsi:hasTimestamp ?t.
-?s etsi:hasValue ?result.
-?s etsi:measurementMadeBy ?sensor.
+    ?s saref:hasTimestamp ?t.
+?s saref:hasValue ?result.
+?s saref:measurementMadeBy ?sensor.
 ?sensor <https://dahcc.idlab.ugent.be/Ontology/Sensors/analyseStateOf> ?stateOf.
 ?sensor <https://saref.etsi.org/core/measuresProperty> <https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/environment.light>
 
diff --git a/code/Q3.ttl b/code/Q3.ttl
@@ -1,12 +1,12 @@
 PREFIX sosa: <http://www.w3.org/ns/sosa/> 
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 
 PREFIX wgs: <http://www.w3.org/2003/01/geo/wgs84_pos#>
-PREFIX etsi: <https://saref.etsi.org/core/>
+PREFIX saref: <https://saref.etsi.org/core/>
     
 SELECT * WHERE {
-    ?s etsi:hasTimestamp ?t.
-?s etsi:hasValue ?result.
-?s etsi:measurementMadeBy ?sensor.
+    ?s saref:hasTimestamp ?t.
+?s saref:hasValue ?result.
+?s saref:measurementMadeBy ?sensor.
 ?sensor <https://dahcc.idlab.ugent.be/Ontology/Sensors/analyseStateOf> ?stateOf.
 ?sensor <https://saref.etsi.org/core/measuresProperty> <https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/energy.consumption>
 
diff --git a/code/Q4.ttl b/code/Q4.ttl
@@ -1,12 +1,12 @@
 PREFIX sosa: <http://www.w3.org/ns/sosa/> 
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 
 PREFIX wgs: <http://www.w3.org/2003/01/geo/wgs84_pos#>
-PREFIX etsi: <https://saref.etsi.org/core/>
+PREFIX saref: <https://saref.etsi.org/core/>
     
 SELECT * WHERE {
-    ?s etsi:hasTimestamp ?t.
-?s etsi:hasValue ?result.
-?s etsi:measurementMadeBy ?sensor.
+    ?s saref:hasTimestamp ?t.
+?s saref:hasValue ?result.
+?s saref:measurementMadeBy ?sensor.
 ?sensor <https://dahcc.idlab.ugent.be/Ontology/Sensors/analyseStateOf> ?stateOf.
 ?sensor <https://saref.etsi.org/core/measuresProperty> <https://dahcc.idlab.ugent.be/Homelab/SensorsAndActuators/energy.consumption>
 
diff --git a/code/example_sparql_query.ttl b/code/example_sparql_query.ttl
@@ -1,12 +1,11 @@
 PREFIX xsd: <http://www.w3.org/2001/XMLSchema#> 
-PREFIX etsi: <https://saref.etsi.org/core/>
-PREFIX dahcc: <https://dahcc.idlab.ugent.be/Ontology/Sensors/>
 PREFIX saref: <https://saref.etsi.org/core/>
+PREFIX dahcc: <https://dahcc.idlab.ugent.be/Ontology/Sensors/>
 
 SELECT * WHERE {
-  ?s etsi:hasTimestamp ?t;
-    etsi:hasValue ?result;
-    etsi:measurementMadeBy ?sensor.
+  ?s saref:hasTimestamp ?t;
+    saref:hasValue ?result;
+    saref:measurementMadeBy ?sensor.
   ?sensor dahcc:analyseStateOf ?stateOf;
     saref:measuresProperty {:property}.
   FILTER(?t="2022-01-03T10:57:54.000000"^^xsd:dateTime)
diff --git a/code/example_tree_relation.ttl b/code/example_tree_relation.ttl
@@ -1,11 +1,11 @@
 @prefix tree: <https://w3id.org/tree#> .
 @prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
 @prefix ex: <https://example.be/> .
-@prefix etsi: <https://saref.etsi.org/core/>
+@prefix saref: <https://saref.etsi.org/core/> .
 
-ex:node tree:relation [
+<> tree:relation [
     a  tree:GreaterThanOrEqualToRelation ;
-    tree:node ex:nextNode ;
-    tree:value "2022-01-03T09:47:59.000000"^^xsd:dateTime ;
-    tree:path etsi:hasTimestamp 
+    tree:node <nextNode> ;
+    tree:value "2022-01-03T09:47:59"^^xsd:dateTime ;
+    tree:path saref:hasTimestamp 
 ] .
diff --git a/section/conclusion.tex b/section/conclusion.tex
@@ -1,9 +1,9 @@
 \section{Conclusion}
 
-The publication of linked data in SPARQL endpoints is not always a sustainable approach due to unavailability and cost problems.
-Our work is centered around decentralized alternatives for linked data publication.
+This paper reports on preliminary tests of adding guided link traversal support to the Comunica query engine using a rule-based reachability approach.
 Our preliminary results show that our rule-based reachability criterion can significantly reduce the execution time of queries aligned with hypermedia description constraints compared to predicate-based reachability
 opening the possibility for faster and more versatile traversal-based query execution over fragmented RDF documents.
 Our experiment also highlights that the size of the internal data store might have more impact on performance than noted in previous studies.
 In future work, we will perform more exhaustive evaluations of other types of domain-oriented fragmentation strategies such as string evaluation and geospatial,
 and investigate how to generalize our approach to support more expressive online reasoning for online source selection during traversal queries.
+Furthermore, we showed that there is still room for optimization by researching ways to prune useless quads from the internal quad store while the link traversal is happening.
diff --git a/section/guided_link_traversal.tex b/section/guided_link_traversal.tex
@@ -21,7 +21,7 @@ \section{A Rule-Based Reachability Criterion}
 
 We define our approach as a rule-based reachability criterion.
 Our approach builds upon the concept of structural assumptions~\cite{taelman2023} to exploit the structural properties of TREE annotated datasets.
-Concretely, we interpret the hypermedia descriptions of constraints in TREE fragments as boolean expressions $E$ ($?t>= \text{2022-01-03T09:47:59.000000}$ in Figure~\ref{lst:system}).
+We therefore interpret the hypermedia descriptions of constraints in TREE fragments as boolean expressions $E$ ($?t>= \text{2022-01-03T09:47:59.000000}$ in Figure~\ref{lst:system}).
 Upon discovery of a document, the query engine gathers the relevant triples to form the boolean expression of the constraint on the data of reachable fragments.
 After the parsing of the expression, the filter expression $F$ of the SPARQL query is \textit{pushed down} into the engine's source selection component.
 The source selection component can be formalized as a reachability criterion~\sepfootnote{sf:reachabilityCriterion} 
@@ -36,16 +36,16 @@ \section{A Rule-Based Reachability Criterion}
 \end{equation}
 hold true given $x$ is the variable targeted by $E_i$ and $i$ is the link towards the next fragment (\texttt{ex:nextNode} from \texttt{ex:node tree:node ex:nextNode} in Figure~\ref{lst:system}).
 A variable targetted by $E$ is defined by an RDF object where the predicate as a value \texttt{?target} from the triple
-defining the fragmentation path in the form \texttt{?s tree:path ?target} (\texttt{etsi:hasTimestamp} in Figure~\ref{lst:system}).
+defining the fragmentation path in the form \texttt{?s tree:path ?target} (\texttt{saref:hasTimestamp} in Figure~\ref{lst:system}).
 Upon satisfaction the IRI targeting the next fragment is added to the link queue otherwise the IRI is pruned.
 The process is schematized in Figure~\ref{fig:process}.
 
 \begin{figure}[htbp]
     \centering
     \includegraphics[width=\linewidth]{image/running_example.drawio.pdf}
     \caption{A schematization of our rule-based reachability criteria with a TREE document.
-     First a TREE node is dereferenced, then the TREE relations are transformed into boolean expressions $E$,
-      followed by the construction of $F$ from the filter expression related to the path of $E$ (the variable $t$ related to \texttt{sosa:resultTime}),
+      First a TREE node is dereferenced, then the TREE relations are transformed into boolean expressions $E$,
+      followed by the construction of $F$ from the filter expression related to the path of $E$ (the variable $t$ related to \texttt{saref:hasTimestamp}),
        then the satisfiability $E \land F$ is determined and finally links to non-query relevant data are pruned.}
     \label{fig:process}
   \end{figure}
@@ -92,6 +92,6 @@ \subsection{Preliminary Results}
 With Q3 we see that the percentage of reduction is 33\%, this lowering of performance gain might be caused by the increase by a factor of 6 in HTTP requests.
 This raises an interesting observation because we do not observe a reduction in execution time with a reduction in HTTP requests.
 Previous research has proposed that inefficient query plans might be the bottleneck of some queries in structured environments~\cite{taelman2023,eschauzier_quweda_2023}.
-However, our results seem to show that the size of the internal data source might have a bigger impact on performance than noted in previous studies.
-This observation might have significant consequences because large-scale web querying might result in the acquisition of a large number of triples.
-The query Q4 was not able to be answered, with any setup, because the query requires a larger number of fragments than the other to be processed.
+However, our results seem to show that the size of the internal quad store might have a bigger impact on performance than noted in previous studies.
+As large-scale guided link traversal over the web will result in the acquisition of a large number of triples, an interesting future research direction would be to find ways to also remove from the internal quad store the quads that are certain to no longer lead to a query result.
+Query Q4 could not be answered with any setup, because it requires a larger number of fragments to be processed than the other queries.
diff --git a/section/introduction.tex b/section/introduction.tex
@@ -14,8 +14,8 @@ \section{Introduction}
 For example, in the case of periodic measurements of sensor data, a fragmentation can be made on the publication date of each data entity.
 A fragment can be considered an RDF document published in a server.
 TREE aims to describes dataset fragmentation in ways that enable clients to easily fetch query-relevant subsets.
-The data inside a fragment are bounded with constraints expressed using hypermedia descriptions~\cite{thomasFieldingPhdThesis}.
-More precisely, each fragment describes the constraints of the data of every reachable fragment.
+The data within a fragment are bound by constraints expressed through hypermedia descriptions~\cite{thomasFieldingPhdThesis}.
+Each fragment contains relations to other pages, and those relations contain the constraints of the data of every reachable fragment.
 In this paper, we refer to those constraints as domain-specific expressions.
 They can be expressions such as $?t > \text{2022-01-09T00:00:00.000000} \implies \text{ex:afterFirstSeptember}$ 
 given that $?t$ is the date of publication of sensor data and the implication pertains to the location of the data respecting the constraint.
@@ -36,7 +36,7 @@ \section{Introduction}
 to define a mechanism of traversal centered around rules.
 
 In this paper, we propose to use a boolean solver as the main link pruning mechanism for a reachability criterion to traverse TREE documents.
-The logical operators are defined by the \href{https://treecg.github.io/specification/}{TREE specification}.~\sepfootnote{sf:treeSpec}
+The logical operators are defined by the \href{https://w3id.org/tree/specification/}{TREE specification}.~\sepfootnote{sf:treeSpec}
 As a concrete use case, we consider the publication of (historical) sensor data.
 An example query is presented in Figure~\ref{lst:system} along with the triples representing the link between two documents expressed using the TREE specification.
 
@@ -55,4 +55,4 @@ \section{Introduction}
     The constraint describes publication times ($?t$) where $?t>= \text{2022-01-03T09:47:59.000000}$.}
         \label{lst:system}
     \vspace*{-0.90cm}
-\end{figure}
+\end{figure}