@@ -184,12 +184,13 @@ def add_data_to_dataset(
184184 link = False
185185 ):
186186 """Import the data into the data directory."""
187+ return_message = ''
187188 dataset_path = self .path / self .datadir / dataset .name
188189
189190 files = []
190191
191192 for url in urls :
192- git = git or check_for_git_repo (url )
193+ git = git or _is_remote_git_repo (url )
193194
194195 if git :
195196 sources = sources or ()
@@ -199,6 +200,14 @@ def add_data_to_dataset(
199200 )
200201 )
201202 else :
203+ if sources :
204+ raise errors .UsageError (
205+ 'Cannot use "--source" with URLs or local files.'
206+ )
207+ if _is_local_git_repo (url ):
208+ return_message = 'Adding data from local Git repository.' \
209+ ' Use remote\' s Git URL instead to ' \
210+ 'enable lineage information and updates.'
202211 files .extend (
203212 self ._add_from_url (
204213 dataset , dataset_path , url , link , destination
@@ -236,6 +245,8 @@ def add_data_to_dataset(
236245 dataset_files .append (datasetfile )
237246 dataset .update_files (dataset_files )
238247
248+ return return_message
249+
239250 def _add_from_url (self , dataset , dataset_path , url , link , destination ):
240251 """Process an add from url and return the location on disk."""
241252 u = parse .urlparse (url )
@@ -257,10 +268,32 @@ def _add_from_url(self, dataset, dataset_path, url, link, destination):
257268 if u .scheme in ('' , 'file' ):
258269 src = Path (u .path ).absolute ()
259270
271+ if not src .exists ():
272+ raise errors .ParameterError (
273+ 'Cannot find file/directory: {}' .format (url )
274+ )
275+
276+ try :
277+ Path (u .path ).resolve ().relative_to (self .path )
278+ except ValueError :
279+ pass
280+ else :
281+ if not destination :
282+ return [{
283+ 'path' : url ,
284+ 'url' : url ,
285+ 'creator' : dataset .creator ,
286+ 'dataset' : dataset .name ,
287+ 'parent' : self
288+ }]
289+
260290 # if we have a directory, recurse
261291 if src .is_dir ():
292+ if src .name == '.git' :
293+ return []
294+
262295 if dst .exists () and not dst .is_dir ():
263- raise errors .InvalidFileOperation (
296+ raise errors .ParameterError (
264297 'Cannot copy directory to a file'
265298 )
266299
@@ -325,40 +358,10 @@ def _add_from_git(
325358
326359 u = parse .urlparse (url )
327360
328- is_local_repo = u .scheme in ('' , 'file' ) and not url .startswith ('git@' )
329-
330- if is_local_repo :
331- try :
332- Path (u .path ).resolve ().relative_to (self .path )
333- except ValueError :
334- pass
335- else :
336- if not destination :
337- return [{
338- 'path' : url ,
339- 'url' : url ,
340- 'creator' : dataset .creator ,
341- 'dataset' : dataset .name ,
342- 'parent' : self
343- }]
344-
345- # determine where is the base repo path
346- repo = Repo (u .path , search_parent_directories = True )
347- repo_path = Path (repo .git_dir ).parent .resolve ()
348-
349- # if repo path is a parent of the url, treat the url as a source
350- if repo_path != Path (u .path ).resolve ():
351- if sources :
352- raise errors .UsageError (
353- 'Cannot use --source within local repo subdirectories'
354- )
355- source = Path (u .path ).resolve ().relative_to (repo_path )
356- sources = (source , )
357- url = repo_path .as_posix ()
358- elif u .scheme not in {'http' , 'https' , 'git+https' , 'git+ssh' } and \
361+ if u .scheme not in {'http' , 'https' , 'git+https' , 'git+ssh' } and \
359362 not url .startswith ('git@' ):
360363 raise errors .UrlSchemeNotSupported (
361- 'Scheme {} not supported' .format (u .scheme )
364+ 'Scheme "{}" not supported' .format (u .scheme )
362365 )
363366
364367 repo , repo_path = self ._prepare_git_repo (url , ref )
@@ -410,11 +413,6 @@ def _add_from_git(
410413
411414 for path , src , dst , _ in files :
412415 if not src .is_dir ():
413- if is_local_repo :
414- dst_url = None
415- else :
416- dst_url = url
417-
418416 # Use original metadata if it exists
419417 based_on = metadata .get (path )
420418 if based_on :
@@ -437,7 +435,7 @@ def _add_from_git(
437435
438436 results .append ({
439437 'path' : path_in_dst_repo ,
440- 'url' : dst_url ,
438+ 'url' : url ,
441439 'creator' : creators ,
442440 'dataset' : dataset .name ,
443441 'parent' : self ,
@@ -460,7 +458,7 @@ def _resolve_paths(self, root_path, paths):
460458 root_path = Path (root_path ).resolve ()
461459 path = (root_path / p ).resolve ().relative_to (root_path )
462460 except ValueError :
463- raise errors .InvalidFileOperation (
461+ raise errors .ParameterError (
464462 'File {} is not within path {}' .format (p , root_path )
465463 )
466464 else :
@@ -500,9 +498,9 @@ def _get_src_and_dst(self, path, repo_path, sources, dst_root):
500498 if len (sources ) == 1 and not src .is_dir ():
501499 dst = dst_root
502500 elif not sources :
503- raise errors .InvalidFileOperation ('Cannot copy repo to file' )
501+ raise errors .ParameterError ('Cannot copy repo to file' )
504502 else :
505- raise errors .InvalidFileOperation (
503+ raise errors .ParameterError (
506504 'Cannot copy multiple files or directories to a file'
507505 )
508506
@@ -678,7 +676,7 @@ def update_dataset_files(self, files, ref, delete=False):
678676 added = based_on .added
679677 )
680678 except KeyError :
681- raise errors .InvalidFileOperation (
679+ raise errors .ParameterError (
682680 'Cannot find file {} in the repo {}' .format (
683681 based_on .url , url
684682 )
@@ -835,22 +833,29 @@ def _get_commit_sha_from_label(dataset_file):
835833 return label
836834
837835
838- def check_for_git_repo (url ):
836+ def _is_remote_git_repo (url ):
839837 """Check if a url points to a git repository."""
840838 u = parse .urlparse (url )
841- is_git = False
839+ is_remote = u .scheme not in ('' , 'file' ) or url .startswith ('git@' )
840+
841+ return is_remote and os .path .splitext (u .path )[1 ] == '.git'
842842
843- if os .path .splitext (u .path )[1 ] == '.git' :
844- is_git = True
845- elif u .scheme in ('' , 'file' ):
846- from git import InvalidGitRepositoryError , Repo
847843
844+ def _is_local_git_repo (url ):
845+ """Check if a path is within a git repository."""
846+ from git import InvalidGitRepositoryError , NoSuchPathError , Repo
847+
848+ u = parse .urlparse (url )
849+
850+ if u .scheme in ('' , 'file' ):
848851 try :
849- Repo (u .path , search_parent_directories = True )
850- is_git = True
851- except InvalidGitRepositoryError :
852- is_git = False
853- return is_git
852+ Repo (url , search_parent_directories = True )
853+ except (InvalidGitRepositoryError , NoSuchPathError ):
854+ pass
855+ else :
856+ return True
857+
858+ return False
854859
855860
856861DATASET_METADATA_PATHS = [
0 commit comments