diff --git a/MANIFEST.in b/MANIFEST.in index ab3e9b5e3c..8c9e058fa7 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -56,6 +56,7 @@ recursive-include renku *.html recursive-include renku *.sh recursive-include renku *.txt recursive-include renku *.yml +recursive-include renku *.json recursive-include renku Dockerfile -recursive-include tests *.py *.gz *.yml +recursive-include tests *.py *.gz *.yml *.json prune .github diff --git a/renku/cli/log.py b/renku/cli/log.py index fa4fc83dfe..1873314fe2 100644 --- a/renku/cli/log.py +++ b/renku/cli/log.py @@ -52,6 +52,15 @@ * `ascii` * `dot` +* `dot-full` +* `dot-landscape` +* `dot-full-landscape` +* `dot-debug` +* `json-ld` +* `json-ld-graph` +* `Makefile` +* `nt` +* `rdf` You can generate a PNG of the full history of all files in the repository using the :program:`dot` program. @@ -62,6 +71,15 @@ $ renku log --format dot $FILES | dot -Tpng > /tmp/graph.png $ open /tmp/graph.png +Output validation +~~~~~~~~~~~~~~~~~ + +The ``--strict`` option forces the output to be validated against the Renku +SHACL schema, causing the command to fail if the generated output is not +valid, as well as printing detailed information on all the issues found. +The ``--strict`` option is only supported for the ``jsonld``, ``rdf`` and +``nt`` output formats. + """ import click @@ -86,9 +104,15 @@ default=False, help='Display commands without output files.' ) +@click.option( + '--strict', + is_flag=True, + default=False, + help='Validate triples before output.' +) @click.argument('paths', type=click.Path(exists=True), nargs=-1) @pass_local_client -def log(client, revision, format, no_output, paths): +def log(client, revision, format, no_output, strict, paths): """Show logs for a file.""" graph = Graph(client) if not paths: @@ -108,4 +132,4 @@ def log(client, revision, format, no_output, paths): # NOTE shall we warn when "not no_output and not paths"? graph.build(paths=paths, revision=revision, can_be_cwl=no_output) - FORMATS[format](graph) + FORMATS[format](graph, strict=strict) diff --git a/renku/core/commands/checks/__init__.py b/renku/core/commands/checks/__init__.py index 6025f1fd4c..48ab9788b8 100644 --- a/renku/core/commands/checks/__init__.py +++ b/renku/core/commands/checks/__init__.py @@ -19,6 +19,7 @@ from .migration import check_dataset_metadata, check_missing_files from .references import check_missing_references +from .validate_shacl import check_project_structure, check_datasets_structure # Checks will be executed in the order as they are listed in __all__. # They are mostly used in ``doctor`` command to inspect broken things. @@ -26,4 +27,6 @@ 'check_dataset_metadata', 'check_missing_files', 'check_missing_references', + 'check_project_structure', + 'check_datasets_structure', ) diff --git a/renku/core/commands/checks/validate_shacl.py b/renku/core/commands/checks/validate_shacl.py new file mode 100644 index 0000000000..ae7c212a17 --- /dev/null +++ b/renku/core/commands/checks/validate_shacl.py @@ -0,0 +1,114 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2019 - Swiss Data Science Center (SDSC) +# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and +# Eidgenössische Technische Hochschule Zürich (ETHZ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Check KG structure using SHACL."""
+import yaml
+from rdflib.namespace import Namespace
+from rdflib.term import BNode
+
+from renku.core.commands.echo import WARNING
+from renku.core.compat import pyld
+from renku.core.models.jsonld import NoDatesSafeLoader
+from renku.core.utils.shacl import validate_graph
+
+
+def _shacl_graph_to_string(graph):
+    """Convert a SHACL validation graph into a human-readable string."""
+    sh = Namespace('http://www.w3.org/ns/shacl#')
+
+    problems = []
+
+    for _, result in graph.subject_objects(sh.result):
+        path = graph.value(result, sh.resultPath)
+        res = graph.value(result, sh.resultMessage)
+
+        if res:
+            message = '{0}: {1}'.format(path, res)
+        else:
+            kind = graph.value(result, sh.sourceConstraintComponent)
+            focusNode = graph.value(result, sh.focusNode)
+
+            if isinstance(focusNode, BNode):
+                focusNode = ''
+
+            message = '{0}: Type: {1}, Node ID: {2}'.format(
+                path, kind, focusNode
+            )
+
+        problems.append(message)
+
+    return '\n\t'.join(problems)
+
+
+def check_project_structure(client):
+    """Validate project metadata against SHACL."""
+    project_path = client.renku_metadata_path
+
+    conform, graph, t = check_shacl_structure(project_path)
+
+    if conform:
+        return True, None
+
+    problems = '{0}Invalid structure of project metadata\n\t{1}'.format(
+        WARNING, _shacl_graph_to_string(graph)
+    )
+
+    return False, problems
+
+
+def check_datasets_structure(client):
+    """Validate dataset metadata against SHACL."""
+    ok = True
+
+    problems = ['{0}Invalid structure of dataset metadata'.format(WARNING)]
+
+    for path in client.renku_datasets_path.rglob(client.METADATA):
+        try:
+            conform, graph, t = check_shacl_structure(path)
+        except (Exception, BaseException) as e:
+            problems.append('Couldn\'t validate {0}: {1}\n\n'.format(path, e))
+            continue
+
+        if conform:
+            continue
+
+        ok = False
+
+        problems.append(
+            '{0}\n\t{1}\n'.format(path, _shacl_graph_to_string(graph))
+        )
+
+    if ok:
+        return True, None
+
+    return False, '\n'.join(problems)
+
+
+def check_shacl_structure(path):
+    """Validate all metadata against the SHACL schema."""
+    with path.open(mode='r') as fp:
+        source = yaml.load(fp, Loader=NoDatesSafeLoader) or {}
+
+    rdf = pyld.jsonld.to_rdf(
+        source,
+        options={
+            'format': 'application/n-quads',
+            'produceGeneralizedRdf': True
+        }
+    )
+
+    return validate_graph(rdf)
diff --git a/renku/core/commands/dataset.py b/renku/core/commands/dataset.py
index 288a8983bf..6604b2838a 100644
--- a/renku/core/commands/dataset.py
+++ b/renku/core/commands/dataset.py
@@ -567,7 +567,7 @@ def update_datasets(
             file_.dataset = dataset
             possible_updates.append(file_)
-            unique_remotes.add(file_.based_on['url'])
+            unique_remotes.add(file_.based_on.url)
 
     if ref and len(unique_remotes) > 1:
         raise ParameterError(
diff --git a/renku/core/commands/format/graph.py b/renku/core/commands/format/graph.py
index 4dc3eec0f1..ef4c19f1d5 100644
--- a/renku/core/commands/format/graph.py
+++ b/renku/core/commands/format/graph.py
@@ -21,12 +21,18 @@
 import click
 
+from renku.core.errors import SHACLValidationError
+from renku.core.utils.shacl import validate_graph
 
-def ascii(graph):
+
+def ascii(graph, strict=False):
     """Format graph as an ASCII art."""
     from ..ascii import DAG
     from ..echo import echo_via_pager
 
+    if strict:
+        raise SHACLValidationError('--strict not supported for ascii')
+
     echo_via_pager(str(DAG(graph)))
 
 
@@ -34,30 +40,39 @@ def _jsonld(graph, format, *args, **kwargs):
     """Return formatted graph in JSON-LD ``format`` function."""
     import json
 
-    from pyld import jsonld
+    from renku.core.compat import pyld
     from renku.core.models.jsonld import asjsonld
 
-    output = getattr(jsonld, format)([
+    output = getattr(pyld.jsonld, format)([
         asjsonld(action) for action in graph.activities.values()
     ])
     return json.dumps(output, indent=2)
 
 
-def dot(graph, simple=True, debug=False, landscape=False):
-    """Format graph as a dot file."""
-    import sys
-
+def _conjunctive_graph(graph):
+    """Convert a renku ``Graph`` to an rdflib ``ConjunctiveGraph``."""
     from rdflib import ConjunctiveGraph
     from rdflib.plugin import register, Parser
-    from rdflib.tools.rdf2dot import rdf2dot
 
     register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser')
 
-    g = ConjunctiveGraph().parse(
+    return ConjunctiveGraph().parse(
         data=_jsonld(graph, 'expand'),
         format='json-ld',
     )
 
+
+def dot(graph, simple=True, debug=False, landscape=False, strict=False):
+    """Format graph as a dot file."""
+    import sys
+
+    from rdflib.tools.rdf2dot import rdf2dot
+
+    if strict:
+        raise SHACLValidationError('--strict not supported for dot')
+
+    g = _conjunctive_graph(graph)
+
     g.bind('prov', 'http://www.w3.org/ns/prov#')
     g.bind('foaf', 'http://xmlns.com/foaf/0.1/')
     g.bind('wfdesc', 'http://purl.org/wf4ever/wfdesc#')
@@ -92,7 +107,7 @@ def _rdf2dot_simple(g, stream):
     import re
 
     path_re = re.compile(
-        r'file:///(?P<type>[a-zA-Z]+)/'
+        r'(?P<prefix>file://|https://\w+/\w+/){0,1}(?P<type>[a-zA-Z]+)/'
         r'(?P<commit>\w+)'
         r'(?P<path>.+)?'
) @@ -293,10 +308,13 @@ def color(p): stream.write('}\n') -def makefile(graph): +def makefile(graph, strict=False): """Format graph as Makefile.""" from renku.core.models.provenance.activities import ProcessRun, WorkflowRun + if strict: + raise SHACLValidationError('--strict not supported for json-ld-graph') + for activity in graph.activities.values(): if not isinstance(activity, ProcessRun): continue @@ -316,44 +334,53 @@ def makefile(graph): ) -def jsonld(graph): +def jsonld(graph, strict=False): """Format graph as JSON-LD file.""" - click.echo(_jsonld(graph, 'expand')) + ld = _jsonld(graph, 'expand') + + if strict: + r, _, t = validate_graph(ld, format='json-ld') + + if not r: + raise SHACLValidationError( + "{}\nCouldn't get log: Invalid Knowledge Graph data".format(t) + ) + click.echo(ld) -def jsonld_graph(graph): +def jsonld_graph(graph, strict=False): """Format graph as JSON-LD graph file.""" + if strict: + raise SHACLValidationError('--strict not supported for json-ld-graph') click.echo(_jsonld(graph, 'flatten')) -def nt(graph): +def nt(graph, strict=False): """Format graph as n-tuples.""" - from rdflib import ConjunctiveGraph - from rdflib.plugin import register, Parser + nt = _conjunctive_graph(graph).serialize(format='nt') + if strict: + r, _, t = validate_graph(nt, format='nt') - register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser') + if not r: + raise SHACLValidationError( + "{}\nCouldn't get log: Invalid Knowledge Graph data".format(t) + ) - click.echo( - ConjunctiveGraph().parse( - data=_jsonld(graph, 'expand'), - format='json-ld', - ).serialize(format='nt') - ) + click.echo(nt) -def rdf(graph): +def rdf(graph, strict=False): """Output the graph as RDF.""" - from rdflib import ConjunctiveGraph - from rdflib.plugin import register, Parser + xml = _conjunctive_graph(graph).serialize(format='application/rdf+xml') + if strict: + r, _, t = validate_graph(xml, format='xml') - register('json-ld', Parser, 'rdflib_jsonld.parser', 'JsonLDParser') + if not r: + raise SHACLValidationError( + "{}\nCouldn't get log: Invalid Knowledge Graph data".format(t) + ) - click.echo( - ConjunctiveGraph().parse( - data=_jsonld(graph, 'expand'), - format='json-ld', - ).serialize(format='application/rdf+xml') - ) + click.echo(xml) FORMATS = { diff --git a/renku/core/compat.py b/renku/core/compat.py index cfbbda9161..8bc6ed24d4 100644 --- a/renku/core/compat.py +++ b/renku/core/compat.py @@ -18,10 +18,13 @@ """Compatibility layer for different Python versions.""" import contextlib +import json import os import sys from pathlib import Path +import pyld + if sys.version_info < (3, 6): original_resolve = Path.resolve @@ -63,4 +66,25 @@ def __exit__(self, *excinfo): except NameError: # pragma: no cover FileNotFoundError = IOError -__all__ = ('FileNotFoundError', 'Path', 'contextlib') + +class PatchedActiveContextCache(pyld.jsonld.ActiveContextCache): + """Pyld context cache without issue of missing contexts.""" + + def set(self, active_ctx, local_ctx, result): + if len(self.order) == self.size: + entry = self.order.popleft() + if sum( + e['activeCtx'] == entry['activeCtx'] and + e['localCtx'] == entry['localCtx'] for e in self.order + ) == 0: + # only delete from cache if it doesn't exist in context deque + del self.cache[entry['activeCtx']][entry['localCtx']] + key1 = json.dumps(active_ctx) + key2 = json.dumps(local_ctx) + self.order.append({'activeCtx': key1, 'localCtx': key2}) + self.cache.setdefault(key1, {})[key2] = json.loads(json.dumps(result)) + + +pyld.jsonld._cache = {'activeCtx': 
PatchedActiveContextCache()} + +__all__ = ('FileNotFoundError', 'Path', 'contextlib', 'pyld') diff --git a/renku/core/errors.py b/renku/core/errors.py index 473ab9701c..92cfe46b36 100644 --- a/renku/core/errors.py +++ b/renku/core/errors.py @@ -373,3 +373,7 @@ class UrlSchemeNotSupported(RenkuException): class OperationError(RenkuException): """Raised when an operation at runtime raises an error.""" + + +class SHACLValidationError(RenkuException): + """Raises when SHACL validation of the graph fails.""" diff --git a/renku/core/models/datasets.py b/renku/core/models/datasets.py index e178440f3d..22295b5e6c 100644 --- a/renku/core/models/datasets.py +++ b/renku/core/models/datasets.py @@ -127,7 +127,7 @@ def _now(self): @_id.default def default_id(self): """Define default value for id field.""" - return '{0}@{1}'.format(self.name, self.commit) + return '_:{0}@{1}'.format(self.name, self.commit) @jsonld.s( @@ -150,6 +150,12 @@ def convert_filename_path(p): return Path(p).name +def convert_based_on(v): + """Convert based_on to DatasetFile.""" + if v: + return DatasetFile.from_jsonld(v) + + @jsonld.s( type='schema:DigitalDocument', slots=True, @@ -179,7 +185,10 @@ class DatasetFile(Entity, CreatorMixin): url = jsonld.ib(default=None, context='schema:url', kw_only=True) based_on = jsonld.ib( - default=None, context='schema:isBasedOn', kw_only=True + default=None, + context='schema:isBasedOn', + kw_only=True, + converter=convert_based_on ) @added.default @@ -213,6 +222,11 @@ def __attrs_post_init__(self): if not self.name: self.name = self.filename + parsed_id = urllib.parse.urlparse(self._id) + + if not parsed_id.scheme: + self._id = 'file://{}'.format(self._id) + def _convert_dataset_files(value): """Convert dataset files.""" diff --git a/renku/core/models/entities.py b/renku/core/models/entities.py index 76fc9c8ffb..ebff9c4407 100644 --- a/renku/core/models/entities.py +++ b/renku/core/models/entities.py @@ -62,7 +62,9 @@ def default_id(self): hexsha = self.commit.hexsha else: hexsha = 'UNCOMMITTED' - return 'blob/{hexsha}/{self.path}'.format(hexsha=hexsha, self=self) + return 'file://blob/{hexsha}/{self.path}'.format( + hexsha=hexsha, self=self + ) @_label.default def default_label(self): diff --git a/renku/core/models/jsonld.py b/renku/core/models/jsonld.py index 8f2ac38413..5e2a9adbd8 100644 --- a/renku/core/models/jsonld.py +++ b/renku/core/models/jsonld.py @@ -30,8 +30,8 @@ from attr._compat import iteritems from attr._funcs import has from attr._make import Factory, fields -from pyld import jsonld as ld +from renku.core.compat import pyld from renku.core.models.locals import ReferenceMixin, with_reference from renku.core.models.migrations import JSONLD_MIGRATIONS @@ -149,7 +149,7 @@ def wrap(cls): # Register class for given JSON-LD @type try: - type_ = ld.expand({ + type_ = pyld.jsonld.expand({ '@type': jsonld_cls._jsonld_type, '@context': context })[0]['@type'] @@ -473,10 +473,10 @@ def from_jsonld( if cls._jsonld_translate: # perform the translation - data = ld.compact(data, cls._jsonld_translate) + data = pyld.jsonld.compact(data, cls._jsonld_translate) # compact using the class json-ld context data.pop('@context', None) - data = ld.compact(data, cls._jsonld_context) + data = pyld.jsonld.compact(data, cls._jsonld_context) data.setdefault('@context', cls._jsonld_context) @@ -504,7 +504,7 @@ def from_jsonld( data['@context'] = {'@base': data['@context']} data['@context'].update(cls._jsonld_context) try: - compacted = ld.compact(data, cls._jsonld_context) + compacted = 
pyld.jsonld.compact(data, cls._jsonld_context) except Exception: compacted = data else: diff --git a/renku/core/models/provenance/activities.py b/renku/core/models/provenance/activities.py index 1ac7969a41..b658adc0a6 100644 --- a/renku/core/models/provenance/activities.py +++ b/renku/core/models/provenance/activities.py @@ -18,9 +18,10 @@ """Represent a Git commit.""" import os +import urllib import uuid from collections import OrderedDict -from pathlib import Path +from pathlib import Path, posixpath import attr from git import NULL_TREE @@ -217,7 +218,15 @@ def paths(self): @classmethod def generate_id(cls, commit): """Calculate action ID.""" - return 'commit/{commit.hexsha}'.format(commit=commit) + host = os.environ.get('RENKU_DOMAIN') or 'localhost' + + # always set the id by the identifier + return urllib.parse.urljoin( + 'https://{host}'.format(host=host), + posixpath.join( + '/activities', 'commit/{commit.hexsha}'.format(commit=commit) + ) + ) @_id.default def default_id(self): diff --git a/renku/core/models/provenance/agents.py b/renku/core/models/provenance/agents.py index 67f429aef9..7e731b1c5a 100644 --- a/renku/core/models/provenance/agents.py +++ b/renku/core/models/provenance/agents.py @@ -58,9 +58,14 @@ class Person: @_id.default def default_id(self): """Set the default id.""" + import string if self.email: return 'mailto:{email}'.format(email=self.email) - return '_:{}'.format(''.join(self.name.lower().split())) + + # prep name to be a valid ntuple string + name = self.name.translate(str.maketrans('', '', string.punctuation)) + name = ''.join(filter(lambda x: x in string.printable, name)) + return '_:{}'.format(''.join(name.lower().split())) @email.validator def check_email(self, attribute, value): diff --git a/renku/core/utils/shacl.py b/renku/core/utils/shacl.py new file mode 100644 index 0000000000..71e2a15eba --- /dev/null +++ b/renku/core/utils/shacl.py @@ -0,0 +1,44 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2018-2019- Swiss Data Science Center (SDSC) +# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and +# Eidgenössische Technische Hochschule Zürich (ETHZ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""JSON-LD SHACL validations.""" + +from pkg_resources import resource_string +from pyshacl import validate + + +def validate_graph(graph, shacl_path=None, format='nquads'): + """Validate the current graph with a SHACL schema. + + Uses default schema if not supplied. 
+ """ + if shacl_path: + with open(shacl_path, 'r', encoding='utf-8') as f: + shacl = f.read() + else: + shacl = resource_string('renku', 'data/shacl_shape.json') + + return validate( + graph, + shacl_graph=shacl, + inference='rdfs', + meta_shacl=True, + debug=False, + data_graph_format=format, + shacl_graph_format='json-ld', + advanced=True + ) diff --git a/renku/data/shacl_shape.json b/renku/data/shacl_shape.json new file mode 100644 index 0000000000..31d53f2277 --- /dev/null +++ b/renku/data/shacl_shape.json @@ -0,0 +1,780 @@ +{ + "@context": { + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "rdfs": "http://www.w3.org/2000/01/rdf-schema#", + "sh": "http://www.w3.org/ns/shacl#", + "xsd": "http://www.w3.org/2001/XMLSchema#", + "schema": "http://schema.org/", + "foaf": "http://xmlns.com/foaf/0.1/", + "prov": "http://www.w3.org/ns/prov#", + "wfprov": "http://purl.org/wf4ever/wfprov#", + "closed": { + "@id": "sh:closed", + "@type": "http://www.w3.org/2001/XMLSchema#boolean" + }, + "datatype": { + "@id": "sh:datatype", + "@type": "@id" + }, + "ignoredProperties": { + "@id": "sh:ignoredProperties", + "@container": "@list" + }, + "or": { + "@id": "sh:or", + "@container": "@list" + }, + "minCount": "sh:minCount", + "maxCount": "sh:maxCount", + "nodeKind": { + "@id": "sh:nodeKind", + "@type": "@id" + }, + "property": "sh:property", + "path": { + "@id": "sh:path", + "@type": "@id" + }, + "targetClass": { + "@id": "sh:targetClass", + "@type": "@id" + }, + "target": { + "@id": "sh:target", + "@type": "@id" + } + }, + "@graph": [ + { + "@id": "schema:", + "sh:declare": [ + { + "sh:prefix": [ + { + "@value": "schema" + } + ], + "sh:namespace": [ + { + "@value": "http://schema.org/", + "@type": "xsd:anyURI" + } + ] + } + ] + }, + { + "@id": "prov:", + "sh:declare": [ + { + "sh:prefix": [ + { + "@value": "prov" + } + ], + "sh:namespace": [ + { + "@value": "http://www.w3.org/ns/prov#", + "@type": "xsd:anyURI" + } + ] + } + ] + }, + { + "@id": "_:oldProjecShape", + "@type": "sh:NodeShape", + "targetClass": "foaf:Project", + "property": [ + { + "nodeKind": "sh:Literal", + "path": "ex:CheckOldProjectMetadata", + "minCount": 99999, + "maxCount": 99999, + "sh:message": "Project should be schema:Project, not foaf:Project" + } + ] + }, + { + "@id": "_:projectShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "schema:Project", + "property": [ + { + "nodeKind": "sh:Literal", + "path": "schema:dateCreated", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1, + "sh:lessThanOrEquals": { + "@id": "schema:dateUpdated" + } + }, + { + "nodeKind": "sh:Literal", + "path": "schema:dateUpdated", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:schemaVersion", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:name", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "path": "schema:creator", + "sh:class":{ + "@id": "schema:Person" + }, + "minCount": 1 + } + ] + }, + { + "@id": "_:creatorShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "target": [ + { + "@type": "sh:SPARQLTarget", + "sh:prefixes": [ + { + "@id": "schema:" + }, + { + "@id": "prov:" + } + ], + "sh:select": [ + { + "@value": "SELECT ?this\nWHERE {\n ?this a schema:Person .\n MINUS { ?this a 
prov:Person . }\n}\n" + } + ] + } + ], + "property": [ + { + "nodeKind": "sh:Literal", + "path": "schema:name", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:email", + "datatype": { + "@id": "xsd:string" + }, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:alternateName", + "datatype": { + "@id": "xsd:string" + } + }, + { + "nodeKind": "sh:Literal", + "path": "schema:affiliation", + "datatype": { + "@id": "xsd:string" + } + } + ] + }, + { + "@id": "_:datasetShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + }, + { + "@id": "schema:license" + } + ], + "closed": true, + "target": [ + { + "@type": "sh:SPARQLTarget", + "sh:prefixes": [ + { + "@id": "schema:" + } + ], + "sh:select": [ + { + "@value": "SELECT ?this\nWHERE {\n ?this a schema:Dataset .\n MINUS { ?x schema:license ?this .}\n}\n" + } + ] + } + ], + "property": [ + { + "nodeKind": "sh:Literal", + "path": "schema:isBasedOn", + "datatype": { + "@id": "xsd:string" + }, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:dateCreated", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1, + "sh:lessThanOrEquals": { + "@id": "schema:datePublished" + } + }, + { + "path": "schema:creator", + "sh:class": { + "@id": "schema:Person" + }, + "minCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:datePublished", + "datatype": { + "@id": "xsd:string" + }, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:description", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:identifier", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:keywords", + "datatype": { + "@id": "xsd:string" + } + }, + { + "nodeKind": "sh:Literal", + "path": "schema:name", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "prov:atLocation", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:sameAs", + "datatype": { + "@id": "xsd:string" + } + }, + { + "nodeKind": "sh:Literal", + "path": "schema:url", + "datatype": { + "@id": "xsd:string" + } + }, + { + "nodeKind": "sh:Literal", + "path": "schema:version", + "datatype": { + "@id": "xsd:string" + } + }, + { + "path": "schema:isPartOf", + "sh:class": { + "@id": "schema:Project" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "path": "schema:subjectOf", + "sh:class": { + "@id": "schema:PublicationEvent" + } + }, + { + "path": "schema:hasPart", + "sh:class": { + "@id": "schema:DigitalDocument" + } + }, + { + "path": "schema:inLanguage", + "sh:class": { + "@id": "schema:Language" + } + }, + { + "nodeKind": "sh:Literal", + "path": "rdfs:label", + "datatype": { + "@id": "xsd:string" + } + }, + { + "path": "prov:qualifiedGeneration", + "sh:class": { + "@id": "prov:Generation" + } + } + ] + }, + { + "@id": "_:inLanguageShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "schema:Language", + "property": [ + { + "nodeKind": "sh:Literal", + "path": "schema:name", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:alternateName", + "datatype": { + "@id": 
"xsd:string" + } + } + ] + }, + { + "@id": "_:datasetFileShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "schema:DigitalDocument", + "property": [ + { + "nodeKind": "sh:Literal", + "path": "schema:name", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:dateCreated", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:url", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "prov:atLocation", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "path": "schema:isPartOf", + "or": [ + { + "sh:class": { + "@id": "schema:Project" + } + }, + { + "nodeKind": "sh:Literal", + "datatype": { + "@id": "xsd:string" + } + } + ] + }, + { + "path": "schema:creator", + "sh:class": { + "@id": "schema:Person" + }, + "minCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "rdfs:label", + "datatype": { + "@id": "xsd:string" + } + } + ] + }, + { + "@id": "_:datasetTagShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "schema:PublicationEvent", + "property": [ + { + "nodeKind": "sh:Literal", + "path": "schema:name", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:description", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:startDate", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:location", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "schema:about", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + } + ] + }, + { + "@id": "_:activityShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "prov:Activity", + "property": [ + { + "path": "schema:isPartOf", + "sh:class": { + "@id": "schema:Project" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "rdfs:comment", + "datatype": { + "@id": "xsd:string" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "rdfs:label", + "datatype": { + "@id": "xsd:string" + } + }, + { + "nodeKind": "sh:IRI", + "path": "prov:wasInformedBy", + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "prov:influenced" + }, + { + "nodeKind": "sh:Literal", + "path": "prov:startedAtTime", + "datatype": { + "@id": "xsd:dateTime" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "prov:endedAtTime", + "datatype": { + "@id": "xsd:dateTime" + }, + "minCount": 1, + "maxCount": 1 + }, + { + "path": "prov:agent", + "or": [ + { + "sh:class": { + "@id": "prov:SoftwareAgent" + } + }, + { + "sh:class": { + "@id": "schema:Person" + } + }, + { + "nodeKind": "sh:IRI" + } + ], + "minCount": 2, + "maxCount": 2 + }, + { + "nodeKind": "sh:Literal", + "path": "prov:atLocation", + "datatype": { + "@id": "xsd:string" + } + }, + { + "path": "prov:qualifiedUsage", + "sh:class": { + "@id": "prov:Usage" + } + }, + { + "path": 
"prov:qualifiedAssociation", + "sh:class": { + "@id": "prov:Association" + } + }, + { + "path": "wfprov:wasPartOfWorkflowRun", + "sh:class": { + "@id": "wfprov:WorkflowRun" + } + } + ] + }, + { + "@id": "_:associationShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "prov:Association", + "property": [ + { + "path": "prov:hadPlan", + "minCount": 1 + }, + { + "path": "prov:agent", + "sh:class": { + "@id": "prov:SoftwareAgent" + }, + "minCount": 1, + "maxCount": 1 + } + ] + }, + { + "@id": "_:usageShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "prov:Usage", + "property": [ + { + "path": "prov:entity", + "minCount": 1 + }, + { + "nodeKind": "sh:Literal", + "path": "prov:hadRole", + "datatype": { + "@id": "xsd:string" + } + } + ] + }, + { + "@id": "_:softwareAgentShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "prov:SoftwareAgent", + "property": [ + { + "nodeKind": "sh:Literal", + "path": "rdfs:label", + "datatype": { + "@id": "xsd:string" + } + }, + { + "path": "prov:wasStartedBy", + "or": [ + { + "nodeKind": "sh:IRI" + }, + { + "sh:class": { + "@id": "prov:Person" + } + }], + "maxCount": 1 + } + ] + }, + { + "@id": "_:generationShape", + "@type": "sh:NodeShape", + "ignoredProperties": [ + { + "@id": "rdf:type" + } + ], + "closed": true, + "targetClass": "prov:Generation", + "property": [ + { + "path": { + "sh:inversePath": { + "@id": "prov:qualifiedGeneration" + } + }, + "nodeKind": "sh:BlankNodeOrIRI" + }, + { + "nodeKind": "sh:Literal", + "path": "prov:hadRole", + "datatype": { + "@id": "xsd:string" + } + }, + { + "sh:class": { + "@id": "prov:Activity" + }, + "path": "prov:activity", + "minCount": 1 + } + ] + } + ] +} diff --git a/setup.py b/setup.py index 304b36d25a..ae0d86b42e 100644 --- a/setup.py +++ b/setup.py @@ -87,6 +87,7 @@ 'PyYAML>=3.12', 'pyld>=1.0.3', 'pyOpenSSL>=19.0.0', + 'pyshacl>=0.11.3.post1', 'python-dateutil>=2.6.1', 'python-editor>=1.0.4', 'rdflib-jsonld>=0.4.0', diff --git a/tests/cli/test_integration_datasets.py b/tests/cli/test_integration_datasets.py index d9b83e8eaa..5fe9e2ff05 100644 --- a/tests/cli/test_integration_datasets.py +++ b/tests/cli/test_integration_datasets.py @@ -22,7 +22,6 @@ import git import pytest -import yaml from renku.cli import cli @@ -596,13 +595,11 @@ def test_usage_error_in_add_from_git(runner, client, params, n_urls, message): def read_dataset_file_metadata(client, dataset_name, filename): """Return metadata from dataset's YAML file.""" - path = client.dataset_path(dataset_name) - assert path.exists() + with client.with_dataset(dataset_name) as dataset: + assert client.dataset_path(dataset.name).exists() - with path.open(mode='r') as fp: - metadata = yaml.safe_load(fp) - for file_ in metadata['files']: - if file_['path'].endswith(filename): + for file_ in dataset.files: + if file_.path.endswith(filename): return file_ @@ -631,14 +628,14 @@ def test_dataset_update(client, runner, params): assert 0 == result.exit_code after = read_dataset_file_metadata(client, 'remote', 'CHANGES.rst') - assert after['_id'] == before['_id'] - assert after['_label'] != before['_label'] - assert after['added'] == before['added'] - assert after['url'] == before['url'] - assert after['based_on']['_id'] == before['based_on']['_id'] - assert after['based_on']['_label'] != before['based_on']['_label'] - assert 
after['based_on']['path'] == before['based_on']['path'] - assert after['based_on']['based_on'] is None + assert after._id == before._id + assert after._label != before._label + assert after.added == before.added + assert after.url == before.url + assert after.based_on._id == before.based_on._id + assert after.based_on._label != before.based_on._label + assert after.based_on.path == before.based_on.path + assert after.based_on.based_on is None @pytest.mark.integration @@ -792,12 +789,12 @@ def test_import_from_renku_project(tmpdir, client, runner): assert 0 == result.exit_code metadata = read_dataset_file_metadata(client, 'remote-dataset', 'file') - assert metadata['creator'][0]['name'] == remote['creator'][0]['name'] - assert metadata['based_on']['_id'] == remote['_id'] - assert metadata['based_on']['_label'] == remote['_label'] - assert metadata['based_on']['path'] == remote['path'] - assert metadata['based_on']['based_on'] is None - assert metadata['based_on']['url'] == REMOTE + assert metadata.creator[0].name == remote.creator[0].name + assert metadata.based_on._id == remote._id + assert metadata.based_on._label == remote._label + assert metadata.based_on.path == remote.path + assert metadata.based_on.based_on is None + assert metadata.based_on.url == REMOTE @pytest.mark.integration diff --git a/tests/cli/test_log.py b/tests/cli/test_log.py new file mode 100644 index 0000000000..a561c4d353 --- /dev/null +++ b/tests/cli/test_log.py @@ -0,0 +1,69 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017-2019 - Swiss Data Science Center (SDSC) +# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and +# Eidgenössische Technische Hochschule Zürich (ETHZ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Test ``log`` command.""" + +from __future__ import absolute_import, print_function + +import pytest + +from renku.cli import cli + + +@pytest.mark.shelled +@pytest.mark.parametrize('format', ['json-ld', 'nt', 'rdf']) +def test_run_log_strict(runner, project, run_shell, format): + """Test log output of run command.""" + # Run a shell command with pipe. + result = run_shell('renku run echo "a" > output') + + # Assert created output file. 
+ result = runner.invoke( + cli, ['log', '--strict', '--format={}'.format(format)] + ) + assert 0 == result.exit_code, result.output + assert '.renku/workflow/' in result.output + + +@pytest.mark.shelled +@pytest.mark.parametrize('format', ['json-ld', 'nt', 'rdf']) +def test_dataset_log_strict(tmpdir, runner, project, client, format): + """Test output of log for dataset add.""" + result = runner.invoke(cli, ['dataset', 'create', 'my-dataset']) + assert 0 == result.exit_code + + paths = [] + test_paths = [] + for i in range(3): + new_file = tmpdir.join('file_{0}'.format(i)) + new_file.write(str(i)) + paths.append(str(new_file)) + test_paths.append(str(new_file.relto(tmpdir.join('..')))) + + # add data + result = runner.invoke( + cli, + ['dataset', 'add', 'my-dataset'] + paths, + ) + assert 0 == result.exit_code + + result = runner.invoke( + cli, ['log', '--strict', '--format={}'.format(format)] + ) + + assert 0 == result.exit_code, result.output + assert all(p in result.output for p in test_paths) diff --git a/tests/cli/test_update.py b/tests/cli/test_update.py index 59593dcc38..b0f5080b3c 100644 --- a/tests/cli/test_update.py +++ b/tests/cli/test_update.py @@ -35,6 +35,8 @@ def update_and_commit(data, file_, repo): def test_update(runner, project, run): """Test automatic file update.""" + from renku.core.utils.shacl import validate_graph + cwd = Path(project) data = cwd / 'data' data.mkdir() @@ -91,9 +93,13 @@ def test_update(runner, project, run): ['log', '--format', output_format], catch_exceptions=False, ) - assert 0 == result.exit_code, output_format + assert 0 == result.exit_code, result.output assert source.name in result.output, output_format + if output_format == 'nt': + r, _, t = validate_graph(result.output, format='nt') + assert r is True, t + def test_workflow_without_outputs(runner, project, run): """Test workflow without outputs.""" diff --git a/tests/core/models/test_shacl_schema.py b/tests/core/models/test_shacl_schema.py new file mode 100644 index 0000000000..fd6d6f1e01 --- /dev/null +++ b/tests/core/models/test_shacl_schema.py @@ -0,0 +1,106 @@ +# -*- coding: utf-8 -*- +# +# Copyright 2017-2019- Swiss Data Science Center (SDSC) +# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and +# Eidgenössische Technische Hochschule Zürich (ETHZ). +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""test KG against SHACL shape.""" + +from renku.cli import cli +from renku.core.compat import Path, pyld +from renku.core.utils.shacl import validate_graph + + +def test_dataset_shacl(tmpdir, runner, project, client): + """Test dataset metadata structure.""" + force_dataset_path = Path( + __file__ + ).parent.parent.parent / 'fixtures' / 'force_dataset_shacl.json' + + force_datasetfile_path = Path( + __file__ + ).parent.parent.parent / 'fixtures' / 'force_datasetfile_shacl.json' + + force_datasettag_path = Path( + __file__ + ).parent.parent.parent / 'fixtures' / 'force_datasettag_shacl.json' + + runner.invoke(cli, ['dataset', 'create', 'dataset']) + + paths = [] + for i in range(3): + new_file = tmpdir.join('file_{0}'.format(i)) + new_file.write(str(i)) + paths.append(str(new_file)) + + # add data + runner.invoke( + cli, + ['dataset', 'add', 'dataset'] + paths, + catch_exceptions=False, + ) + + runner.invoke( + cli, + ['dataset', 'tag', 'dataset', '1.0'], + catch_exceptions=False, + ) + + with client.with_dataset('dataset') as dataset: + g = dataset.asjsonld() + rdf = pyld.jsonld.to_rdf( + g, + options={ + 'format': 'application/n-quads', + 'produceGeneralizedRdf': True + } + ) + + r, _, t = validate_graph(rdf, shacl_path=str(force_dataset_path)) + assert r is True, t + + r, _, t = validate_graph(rdf, shacl_path=str(force_datasetfile_path)) + assert r is True, t + + r, _, t = validate_graph(rdf, shacl_path=str(force_datasettag_path)) + assert r is True, t + + r, _, t = validate_graph(rdf) + assert r is True, t + + +def test_project_shacl(project, client): + """Test project metadata structure.""" + from renku.core.models.provenance.agents import Person + + path = Path( + __file__ + ).parent.parent.parent / 'fixtures' / 'force_project_shacl.json' + + project = client.project + project.creator = Person(email='johndoe@example.com', name='Johnny Doe') + + g = project.asjsonld() + rdf = pyld.jsonld.to_rdf( + g, + options={ + 'format': 'application/n-quads', + 'produceGeneralizedRdf': False + } + ) + r, _, t = validate_graph(rdf, shacl_path=str(path)) + assert r is True, t + + r, _, t = validate_graph(rdf) + assert r is True, t diff --git a/tests/fixtures/force_dataset_shacl.json b/tests/fixtures/force_dataset_shacl.json new file mode 100644 index 0000000000..3c201f0869 --- /dev/null +++ b/tests/fixtures/force_dataset_shacl.json @@ -0,0 +1,32 @@ +{ + "@context": { + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "sh": "http://www.w3.org/ns/shacl#", + "schema": "http://schema.org/" + }, + "@graph": [ + { + "@id": "_:forceDatasetShape", + "@type": "sh:NodeShape", + "sh:targetNode": { + "@id": "schema:Dataset", + "@type": "@id" + }, + "sh:property": [ + { + "sh:path": [ + { + "sh:inversePath": [ + { + "@id": "rdf:type", + "@type": "@id" + } + ] + } + ], + "sh:minCount": 1 + } + ] + } + ] +} diff --git a/tests/fixtures/force_datasetfile_shacl.json b/tests/fixtures/force_datasetfile_shacl.json new file mode 100644 index 0000000000..45470e3740 --- /dev/null +++ b/tests/fixtures/force_datasetfile_shacl.json @@ -0,0 +1,32 @@ +{ + "@context": { + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "sh": "http://www.w3.org/ns/shacl#", + "schema": "http://schema.org/" + }, + "@graph": [ + { + "@id": "_:forceDatasetShape", + "@type": "sh:NodeShape", + "sh:targetNode": { + "@id": "schema:DigitalDocument", + "@type": "@id" + }, + "sh:property": [ + { + "sh:path": [ + { + "sh:inversePath": [ + { + "@id": "rdf:type", + "@type": "@id" + } + ] + } + ], + "sh:minCount": 1 + } + ] + } + ] +} diff --git 
a/tests/fixtures/force_datasettag_shacl.json b/tests/fixtures/force_datasettag_shacl.json new file mode 100644 index 0000000000..106f5e0e41 --- /dev/null +++ b/tests/fixtures/force_datasettag_shacl.json @@ -0,0 +1,32 @@ +{ + "@context": { + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "sh": "http://www.w3.org/ns/shacl#", + "schema": "http://schema.org/" + }, + "@graph": [ + { + "@id": "_:forceDatasetShape", + "@type": "sh:NodeShape", + "sh:targetNode": { + "@id": "schema:PublicationEvent", + "@type": "@id" + }, + "sh:property": [ + { + "sh:path": [ + { + "sh:inversePath": [ + { + "@id": "rdf:type", + "@type": "@id" + } + ] + } + ], + "sh:minCount": 1 + } + ] + } + ] +} diff --git a/tests/fixtures/force_project_shacl.json b/tests/fixtures/force_project_shacl.json new file mode 100644 index 0000000000..b7fd526983 --- /dev/null +++ b/tests/fixtures/force_project_shacl.json @@ -0,0 +1,32 @@ +{ + "@context": { + "rdf": "http://www.w3.org/1999/02/22-rdf-syntax-ns#", + "sh": "http://www.w3.org/ns/shacl#", + "schema": "http://schema.org/" + }, + "@graph": [ + { + "@id": "_:forceProjectShape", + "@type": "sh:NodeShape", + "sh:targetNode": { + "@id": "schema:Project", + "@type": "@id" + }, + "sh:property": [ + { + "sh:path": [ + { + "sh:inversePath": [ + { + "@id": "rdf:type", + "@type": "@id" + } + ] + } + ], + "sh:minCount": 1 + } + ] + } + ] +}
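
Usage note (not part of the diff above): with this change, ``renku log --strict --format json-ld`` (and the ``nt``/``rdf`` formats) raises ``SHACLValidationError`` when the exported graph does not conform to ``renku/data/shacl_shape.json``. The minimal sketch below shows how the same check could be driven directly from Python through the new ``renku.core.utils.shacl.validate_graph`` helper, mirroring what ``check_shacl_structure`` does in this diff; the input document is a made-up example, and a project this bare will typically report violations, which is exactly what the returned report surfaces.

# Sketch, assuming the package layout introduced by this diff is installed.
from pyld import jsonld

from renku.core.utils.shacl import validate_graph

# Hypothetical JSON-LD metadata to check; not taken from a real project.
doc = {
    '@context': {'schema': 'http://schema.org/'},
    '@type': 'schema:Project',
    'schema:name': 'demo',
}

# validate_graph() expects N-Quads by default, so convert the JSON-LD first,
# just as check_shacl_structure() does.
rdf = jsonld.to_rdf(
    doc,
    options={
        'format': 'application/n-quads',
        'produceGeneralizedRdf': True,
    },
)

# Returns pyshacl's (conforms, results_graph, results_text) tuple.
conforms, _, report = validate_graph(rdf)
if not conforms:
    print(report)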