Skip to content

Commit b357ee7

Browse files
feat: metadata on dataset creation (#850)
* refactor: dataset creation
* refactor: rename methods
* refactor: consistent use of short_name
* refactor: dataset name validation
* feat: allow short_name for datasets
1 parent d334cae commit b357ee7

File tree

15 files changed

+346
-104
lines changed

15 files changed

+346
-104
lines changed

renku/cli/dataset.py

Lines changed: 34 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -385,9 +385,34 @@ def dataset(ctx, revision, datadir, format):
385385

386386
@dataset.command()
387387
@click.argument('name')
388-
def create(name):
388+
@click.option(
389+
'--short-name', default='', help='A convenient name for dataset.'
390+
)
391+
@click.option(
392+
'-d', '--description', default='', help='Dataset\'s description.'
393+
)
394+
@click.option(
395+
'-c',
396+
'--creator',
397+
default=None,
398+
multiple=True,
399+
help='Creator\'s name and email ("Name <email>").'
400+
)
401+
def create(name, short_name, description, creator):
389402
"""Create an empty dataset in the current repo."""
390-
create_dataset(name)
403+
creators = creator or ()
404+
405+
dataset = create_dataset(
406+
name=name,
407+
short_name=short_name,
408+
description=description,
409+
creators=creators
410+
)
411+
click.echo(
412+
'Use the name "{}" to refer to this dataset.'.format(
413+
dataset.short_name
414+
)
415+
)
391416
click.secho('OK', fg='green')
392417

393418

@@ -606,14 +631,16 @@ def export_(id, provider, publish, tag):
606631

607632
@dataset.command('import')
608633
@click.argument('uri')
609-
@click.option('-n', '--name', help='Dataset name.')
634+
@click.option(
635+
'--short-name', default='', help='A convenient name for dataset.'
636+
)
610637
@click.option(
611638
'-x',
612639
'--extract',
613640
is_flag=True,
614641
help='Extract files before importing to dataset.'
615642
)
616-
def import_(uri, name, extract):
643+
def import_(uri, short_name, extract):
617644
"""Import data from a 3rd party provider.
618645
619646
Supported providers: [Zenodo, Dataverse]
@@ -638,9 +665,9 @@ def _init(lock, id_queue):
638665
tqdm.set_lock(lock)
639666

640667
import_dataset(
641-
uri,
642-
name,
643-
extract,
668+
uri=uri,
669+
short_name=short_name,
670+
extract=extract,
644671
with_prompt=True,
645672
pool_init_fn=_init,
646673
pool_init_args=(mp.RLock(), id_queue),

renku/core/commands/checks/migration.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -162,7 +162,7 @@ def migrate_broken_dataset_paths(client):
162162
# migrate the refs
163163
ref = LinkReference.create(
164164
client=client,
165-
name='datasets/{0}'.format(dataset.display_name),
165+
name='datasets/{0}'.format(dataset.short_name),
166166
force=True,
167167
)
168168
ref.set_reference(expected_path / client.METADATA)

renku/core/commands/dataset.py

Lines changed: 33 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@
4242
MigrationRequired, ParameterError, UsageError
4343
from renku.core.management.datasets import DATASET_METADATA_PATHS
4444
from renku.core.management.git import COMMIT_DIFF_STRATEGY
45-
from renku.core.models.datasets import Dataset
45+
from renku.core.models.datasets import Dataset, generate_default_short_name
4646
from renku.core.models.provenance.agents import Person
4747
from renku.core.models.refs import LinkReference
4848
from renku.core.models.tabulate import tabulate
@@ -101,15 +101,26 @@ def dataset_parent(client, revision, datadir, format, ctx=None):
101101
@pass_local_client(
102102
clean=False, commit=True, commit_only=DATASET_METADATA_PATHS
103103
)
104-
def create_dataset(client, name, commit_message=None):
104+
def create_dataset(
105+
client, name, short_name, description, creators, commit_message=None
106+
):
105107
"""Create an empty dataset in the current repo.
106108
107109
:raises: ``renku.core.errors.ParameterError``
108110
"""
109-
with client.with_dataset(name=name, create=True) as dataset:
110-
creator = Person.from_git(client.repo)
111-
if creator not in dataset.creator:
112-
dataset.creator.append(creator)
111+
if not creators:
112+
creators = [Person.from_git(client.repo)]
113+
else:
114+
creators = [Person.from_string(c) for c in creators]
115+
116+
dataset, _, __ = client.create_dataset(
117+
name=name,
118+
short_name=short_name,
119+
description=description,
120+
creators=creators
121+
)
122+
123+
return dataset
113124

114125

115126
@pass_local_client(
@@ -284,7 +295,7 @@ def dataset_remove(
284295
commit_message=None
285296
):
286297
"""Delete a dataset."""
287-
datasets = {name: client.dataset_path(name) for name in names}
298+
datasets = {name: client.get_dataset_path(name) for name in names}
288299

289300
if not datasets:
290301
raise ParameterError(
@@ -422,8 +433,8 @@ def export_dataset(
422433
def import_dataset(
423434
client,
424435
uri,
425-
name,
426-
extract,
436+
short_name='',
437+
extract=False,
427438
with_prompt=False,
428439
pool_init_fn=None,
429440
pool_init_args=None,
@@ -474,6 +485,15 @@ def import_dataset(
474485
)
475486

476487
if files:
488+
if not short_name:
489+
short_name = generate_default_short_name(
490+
dataset.name, dataset.version
491+
)
492+
493+
dataset.short_name = short_name
494+
495+
client.create_dataset(name=dataset.name, short_name=short_name)
496+
477497
data_folder = tempfile.mkdtemp()
478498

479499
pool_size = min(
@@ -511,20 +531,18 @@ def import_dataset(
511531
))
512532
pool.close()
513533

514-
dataset_name = name or dataset.display_name
515534
dataset.url = remove_credentials(dataset.url)
516535
add_to_dataset(
517536
client,
518537
urls=[str(p) for p in Path(data_folder).glob('*')],
519-
name=dataset_name,
520-
with_metadata=dataset,
521-
create=True
538+
name=short_name,
539+
with_metadata=dataset
522540
)
523541

524542
if dataset.version:
525543
tag_name = re.sub('[^a-zA-Z0-9.-_]', '_', dataset.version)
526544
tag_dataset(
527-
client, dataset_name, tag_name,
545+
client, short_name, tag_name,
528546
'Tag {} created by renku import'.format(dataset.version)
529547
)
530548

@@ -633,7 +651,7 @@ def _filter(client, names=None, creators=None, include=None, exclude=None):
633651

634652
records = []
635653
for path_, dataset in client.datasets.items():
636-
if not names or dataset.name in names:
654+
if not names or dataset.short_name in names:
637655
for file_ in dataset.files:
638656
file_.dataset = dataset.name
639657
path_ = file_.full_path.relative_to(client.path)

renku/core/commands/format/datasets.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ def tabular(client, datasets):
3131
datasets,
3232
headers=OrderedDict((
3333
('uid', 'id'),
34-
('display_name', None),
34+
('short_name', None),
3535
('version', None),
3636
('created', None),
3737
('creators_csv', 'creators'),

renku/core/errors.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -128,7 +128,7 @@ def __init__(self, message=None):
128128
'Please use the "git config" command to configure it.\n\n'
129129
'\tgit config --set user.email "john.doe@example.com"\n'
130130
)
131-
super(MissingUsername, self).__init__(message)
131+
super().__init__(message)
132132

133133

134134
class AuthenticationError(RenkuException):

renku/core/management/datasets.py

Lines changed: 68 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,8 @@
3636
from renku.core import errors
3737
from renku.core.management.clone import clone
3838
from renku.core.management.config import RENKU_HOME
39-
from renku.core.models.datasets import Dataset, DatasetFile, DatasetTag
39+
from renku.core.models.datasets import Dataset, DatasetFile, DatasetTag, \
40+
generate_default_short_name, is_dataset_name_valid
4041
from renku.core.models.git import GitURL
4142
from renku.core.models.locals import with_reference
4243
from renku.core.models.provenance.agents import Person
@@ -85,31 +86,35 @@ def datasets(self):
8586
result = {}
8687
paths = (self.path / self.renku_datasets_path).rglob(self.METADATA)
8788
for path in paths:
88-
result[path] = self.get_dataset(path)
89+
result[path] = self.load_dataset_from_path(path)
8990
return result
9091

91-
def get_dataset(self, path, commit=None):
92+
def load_dataset_from_path(self, path, commit=None):
9293
"""Return a dataset from a given path."""
94+
path = Path(path)
9395
if not path.is_absolute():
9496
path = self.path / path
9597
return Dataset.from_yaml(path, client=self, commit=commit)
9698

97-
def dataset_path(self, name):
99+
def get_dataset_path(self, name):
98100
"""Get dataset path from name."""
99101
path = self.renku_datasets_path / name / self.METADATA
100102
if not path.exists():
101-
path = LinkReference(
102-
client=self, name='datasets/' + name
103-
).reference
103+
try:
104+
path = LinkReference(
105+
client=self, name='datasets/' + name
106+
).reference
107+
except errors.ParameterError:
108+
return None
104109

105110
return path
106111

107112
def load_dataset(self, name=None):
108113
"""Load dataset reference file."""
109114
if name:
110-
path = self.dataset_path(name)
111-
if path.exists():
112-
return self.get_dataset(path)
115+
path = self.get_dataset_path(name)
116+
if path and path.exists():
117+
return self.load_dataset_from_path(path)
113118

114119
@contextmanager
115120
def with_dataset(self, name=None, identifier=None, create=False):
@@ -118,50 +123,25 @@ def with_dataset(self, name=None, identifier=None, create=False):
118123
clean_up_required = False
119124

120125
if dataset is None:
121-
# Avoid nested datasets: name mustn't have '/' in it
122-
if len(Path(name).parts) > 1:
123-
raise errors.ParameterError(
124-
'Dataset name {} is not valid.'.format(name)
125-
)
126-
127126
if not create:
128127
raise errors.DatasetNotFound
129-
clean_up_required = True
130-
dataset_ref = None
131-
identifier = str(uuid.uuid4())
132-
path = (self.renku_datasets_path / identifier / self.METADATA)
133-
try:
134-
path.parent.mkdir(parents=True, exist_ok=False)
135-
except FileExistsError:
136-
raise errors.DatasetExistsError(
137-
'Dataset with reference {} exists'.format(path.parent)
138-
)
139-
140-
with with_reference(path):
141-
dataset = Dataset(
142-
identifier=identifier, name=name, client=self
143-
)
144-
145-
if name:
146-
dataset_ref = LinkReference.create(
147-
client=self, name='datasets/' + name
148-
)
149-
dataset_ref.set_reference(path)
150128

129+
clean_up_required = True
130+
dataset, path, dataset_ref = self.create_dataset(name)
151131
elif create:
152132
raise errors.DatasetExistsError(
153133
'Dataset exists: "{}".'.format(name)
154134
)
155135

156-
dataset_path = self.path / self.datadir / dataset.name
136+
dataset_path = self.path / self.datadir / dataset.short_name
157137
dataset_path.mkdir(parents=True, exist_ok=True)
158138

159139
try:
160140
yield dataset
161141
except Exception:
162142
# TODO use a general clean-up strategy
163143
# https://github.com/SwissDataScienceCenter/renku-python/issues/736
164-
if clean_up_required and dataset_ref:
144+
if clean_up_required:
165145
dataset_ref.delete()
166146
shutil.rmtree(path.parent, ignore_errors=True)
167147
raise
@@ -174,6 +154,54 @@ def with_dataset(self, name=None, identifier=None, create=False):
174154

175155
dataset.to_yaml()
176156

157+
def create_dataset(
158+
self, name, short_name=None, description='', creators=()
159+
):
160+
"""Create a dataset."""
161+
if not name:
162+
raise errors.ParameterError('Dataset name must be provided.')
163+
164+
if not short_name:
165+
short_name = generate_default_short_name(name, None)
166+
167+
if not is_dataset_name_valid(short_name):
168+
raise errors.ParameterError(
169+
'Dataset name "{}" is not valid.'.format(short_name)
170+
)
171+
172+
if self.load_dataset(name=short_name):
173+
raise errors.DatasetExistsError(
174+
'Dataset exists: "{}".'.format(short_name)
175+
)
176+
177+
identifier = str(uuid.uuid4())
178+
path = (self.renku_datasets_path / identifier / self.METADATA)
179+
try:
180+
path.parent.mkdir(parents=True, exist_ok=False)
181+
except FileExistsError:
182+
raise errors.DatasetExistsError(
183+
'Dataset with reference {} exists'.format(path.parent)
184+
)
185+
186+
with with_reference(path):
187+
dataset = Dataset(
188+
client=self,
189+
identifier=identifier,
190+
name=name,
191+
short_name=short_name,
192+
description=description,
193+
creator=creators
194+
)
195+
196+
dataset_ref = LinkReference.create(
197+
client=self, name='datasets/' + short_name
198+
)
199+
dataset_ref.set_reference(path)
200+
201+
dataset.to_yaml()
202+
203+
return dataset, path, dataset_ref
204+
177205
def add_data_to_dataset(
178206
self,
179207
dataset,
@@ -186,7 +214,7 @@ def add_data_to_dataset(
186214
):
187215
"""Import the data into the data directory."""
188216
warning_message = ''
189-
dataset_path = self.path / self.datadir / dataset.name
217+
dataset_path = self.path / self.datadir / dataset.short_name
190218

191219
destination = destination or Path('.')
192220
destination = self._resolve_path(dataset_path, destination)

0 commit comments

Comments
 (0)