Skip to content

Commit 8cc2834

Browse files
feat: local git repo not treated as remote
1 parent 5559985 commit 8cc2834

File tree

3 files changed

+97
-56
lines changed

3 files changed

+97
-56
lines changed

renku/core/commands/dataset.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,7 @@ def add_to_dataset(
183183
name=name, identifier=identifier, create=create
184184
) as dataset:
185185
with urlscontext(urls) as bar:
186-
client.add_data_to_dataset(
186+
message = client.add_data_to_dataset(
187187
dataset,
188188
bar,
189189
link=link,
@@ -193,6 +193,9 @@ def add_to_dataset(
193193
ref=ref
194194
)
195195

196+
if message:
197+
click.echo(message)
198+
196199
if with_metadata:
197200
for file_ in with_metadata.files:
198201
for added_ in dataset.files:

renku/core/management/datasets.py

Lines changed: 60 additions & 55 deletions
Original file line numberDiff line numberDiff line change
@@ -184,12 +184,13 @@ def add_data_to_dataset(
184184
link=False
185185
):
186186
"""Import the data into the data directory."""
187+
return_message = ''
187188
dataset_path = self.path / self.datadir / dataset.name
188189

189190
files = []
190191

191192
for url in urls:
192-
git = git or check_for_git_repo(url)
193+
git = git or _is_remote_git_repo(url)
193194

194195
if git:
195196
sources = sources or ()
@@ -199,6 +200,14 @@ def add_data_to_dataset(
199200
)
200201
)
201202
else:
203+
if sources:
204+
raise errors.UsageError(
205+
'Cannot use "--source" with URLs or local files.'
206+
)
207+
if _is_local_git_repo(url):
208+
return_message = 'Adding data from local Git repository.' \
209+
' Use remote\'s Git URL instead to ' \
210+
'enable lineage information and updates.'
202211
files.extend(
203212
self._add_from_url(
204213
dataset, dataset_path, url, link, destination
@@ -236,6 +245,8 @@ def add_data_to_dataset(
236245
dataset_files.append(datasetfile)
237246
dataset.update_files(dataset_files)
238247

248+
return return_message
249+
239250
def _add_from_url(self, dataset, dataset_path, url, link, destination):
240251
"""Process an add from url and return the location on disk."""
241252
u = parse.urlparse(url)
@@ -257,10 +268,32 @@ def _add_from_url(self, dataset, dataset_path, url, link, destination):
257268
if u.scheme in ('', 'file'):
258269
src = Path(u.path).absolute()
259270

271+
if not src.exists():
272+
raise errors.ParameterError(
273+
'Cannot find file/directory: {}'.format(url)
274+
)
275+
276+
try:
277+
Path(u.path).resolve().relative_to(self.path)
278+
except ValueError:
279+
pass
280+
else:
281+
if not destination:
282+
return [{
283+
'path': url,
284+
'url': url,
285+
'creator': dataset.creator,
286+
'dataset': dataset.name,
287+
'parent': self
288+
}]
289+
260290
# if we have a directory, recurse
261291
if src.is_dir():
292+
if src.name == '.git':
293+
return []
294+
262295
if dst.exists() and not dst.is_dir():
263-
raise errors.InvalidFileOperation(
296+
raise errors.ParameterError(
264297
'Cannot copy directory to a file'
265298
)
266299

@@ -325,40 +358,10 @@ def _add_from_git(
325358

326359
u = parse.urlparse(url)
327360

328-
is_local_repo = u.scheme in ('', 'file') and not url.startswith('git@')
329-
330-
if is_local_repo:
331-
try:
332-
Path(u.path).resolve().relative_to(self.path)
333-
except ValueError:
334-
pass
335-
else:
336-
if not destination:
337-
return [{
338-
'path': url,
339-
'url': url,
340-
'creator': dataset.creator,
341-
'dataset': dataset.name,
342-
'parent': self
343-
}]
344-
345-
# determine where is the base repo path
346-
repo = Repo(u.path, search_parent_directories=True)
347-
repo_path = Path(repo.git_dir).parent.resolve()
348-
349-
# if repo path is a parent of the url, treat the url as a source
350-
if repo_path != Path(u.path).resolve():
351-
if sources:
352-
raise errors.UsageError(
353-
'Cannot use --source within local repo subdirectories'
354-
)
355-
source = Path(u.path).resolve().relative_to(repo_path)
356-
sources = (source, )
357-
url = repo_path.as_posix()
358-
elif u.scheme not in {'http', 'https', 'git+https', 'git+ssh'} and \
361+
if u.scheme not in {'http', 'https', 'git+https', 'git+ssh'} and \
359362
not url.startswith('git@'):
360363
raise errors.UrlSchemeNotSupported(
361-
'Scheme {} not supported'.format(u.scheme)
364+
'Scheme "{}" not supported'.format(u.scheme)
362365
)
363366

364367
repo, repo_path = self._prepare_git_repo(url, ref)
@@ -410,11 +413,6 @@ def _add_from_git(
410413

411414
for path, src, dst, _ in files:
412415
if not src.is_dir():
413-
if is_local_repo:
414-
dst_url = None
415-
else:
416-
dst_url = url
417-
418416
# Use original metadata if it exists
419417
based_on = metadata.get(path)
420418
if based_on:
@@ -437,7 +435,7 @@ def _add_from_git(
437435

438436
results.append({
439437
'path': path_in_dst_repo,
440-
'url': dst_url,
438+
'url': url,
441439
'creator': creators,
442440
'dataset': dataset.name,
443441
'parent': self,
@@ -460,7 +458,7 @@ def _resolve_paths(self, root_path, paths):
460458
root_path = Path(root_path).resolve()
461459
path = (root_path / p).resolve().relative_to(root_path)
462460
except ValueError:
463-
raise errors.InvalidFileOperation(
461+
raise errors.ParameterError(
464462
'File {} is not within path {}'.format(p, root_path)
465463
)
466464
else:
@@ -500,9 +498,9 @@ def _get_src_and_dst(self, path, repo_path, sources, dst_root):
500498
if len(sources) == 1 and not src.is_dir():
501499
dst = dst_root
502500
elif not sources:
503-
raise errors.InvalidFileOperation('Cannot copy repo to file')
501+
raise errors.ParameterError('Cannot copy repo to file')
504502
else:
505-
raise errors.InvalidFileOperation(
503+
raise errors.ParameterError(
506504
'Cannot copy multiple files or directories to a file'
507505
)
508506

@@ -678,7 +676,7 @@ def update_dataset_files(self, files, ref, delete=False):
678676
added=based_on.added
679677
)
680678
except KeyError:
681-
raise errors.InvalidFileOperation(
679+
raise errors.ParameterError(
682680
'Cannot find file {} in the repo {}'.format(
683681
based_on.url, url
684682
)
@@ -835,22 +833,29 @@ def _get_commit_sha_from_label(dataset_file):
835833
return label
836834

837835

838-
def check_for_git_repo(url):
836+
def _is_remote_git_repo(url):
839837
"""Check if a url points to a git repository."""
840838
u = parse.urlparse(url)
841-
is_git = False
839+
is_remote = u.scheme not in ('', 'file') or url.startswith('git@')
840+
841+
return is_remote and os.path.splitext(u.path)[1] == '.git'
842842

843-
if os.path.splitext(u.path)[1] == '.git':
844-
is_git = True
845-
elif u.scheme in ('', 'file'):
846-
from git import InvalidGitRepositoryError, Repo
847843

844+
def _is_local_git_repo(url):
845+
"""Check if a path is within a git repository."""
846+
from git import InvalidGitRepositoryError, NoSuchPathError, Repo
847+
848+
u = parse.urlparse(url)
849+
850+
if u.scheme in ('', 'file'):
848851
try:
849-
Repo(u.path, search_parent_directories=True)
850-
is_git = True
851-
except InvalidGitRepositoryError:
852-
is_git = False
853-
return is_git
852+
Repo(url, search_parent_directories=True)
853+
except (InvalidGitRepositoryError, NoSuchPathError):
854+
pass
855+
else:
856+
return True
857+
858+
return False
854859

855860

856861
DATASET_METADATA_PATHS = [

tests/cli/test_datasets.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -335,6 +335,39 @@ def test_relative_import_to_dataset(tmpdir, runner, client):
335335
)
336336

337337

338+
@pytest.mark.parametrize(
339+
'params,message', [
340+
(['-s', 'file', 'https://example.com'
341+
], 'Cannot use "--source" with URLs or local files.'),
342+
(['-s', 'file', '/some/local/path'
343+
], 'Cannot use "--source" with URLs or local files.'),
344+
]
345+
)
346+
def test_usage_error_in_add_from_url(runner, client, params, message):
347+
"""Test user's errors when adding URL/local file to a dataset."""
348+
result = runner.invoke(
349+
cli,
350+
['dataset', 'add', 'remote', '--create'] + params,
351+
catch_exceptions=False,
352+
)
353+
assert result.exit_code == 2
354+
assert message in result.output
355+
356+
357+
def test_add_from_local_repo_warning(
358+
runner, client, data_repository, directory_tree
359+
):
360+
"""Test a warning is printed when adding from a local git repo."""
361+
result = runner.invoke(
362+
cli,
363+
['dataset', 'add', 'dataset', '--create',
364+
str(directory_tree)],
365+
catch_exceptions=False,
366+
)
367+
assert result.exit_code == 0
368+
assert 'Use remote\'s Git URL instead to enable lineage ' in result.output
369+
370+
338371
def test_dataset_add_with_link(tmpdir, runner, project, client):
339372
"""Test adding data to dataset with --link flag."""
340373
import stat

0 commit comments

Comments
 (0)