From f9f091dd7ea7d9a9112926b694495ae530fa5778 Mon Sep 17 00:00:00 2001 From: zhzhang Date: Thu, 30 Oct 2025 17:41:56 +0800 Subject: [PATCH 1/7] Enhance repo_type_and_id_from_hf_id of hf_api --- src/huggingface_hub/hf_api.py | 10 ++++++++-- tests/test_hf_api.py | 3 +++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 7e78889372..e2c00f1842 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -238,8 +238,14 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu """ input_hf_id = hf_id - hub_url = re.sub(r"https?://", "", hub_url if hub_url is not None else constants.ENDPOINT) - is_hf_url = hub_url in hf_id and "@" not in hf_id + hub_url = hub_url if hub_url is not None else constants.ENDPOINT + hub_url = hub_url.rstrip("/") + if hf_id.startswith(hub_url): + hf_id = hf_id[len(hub_url):].lstrip("/") + elif hf_id.startswith(hub_url.replace("https://", "").replace("http://", "")): + # Handle urls like "localhost:8080/hf/model/xxx" + # https://github.com/huggingface/huggingface_hub/issues/3494 + hf_id = hf_id[len(hub_url.replace("https://", "").replace("http://", "")):].lstrip("/") HFFS_PREFIX = "hf://" if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index bbf00090ea..bf3916aeb4 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -2758,6 +2758,9 @@ def test_git_push_end_to_end(self): class ParseHFUrlTest(unittest.TestCase): def test_repo_type_and_id_from_hf_id_on_correct_values(self): possible_values = { + "http://localhost:8080/hf/user/id": [None, "user", "id"], + "http://localhost:8080/hf/datasets/user/id": ["dataset", "user", "id"], + "http://localhost:8080/hf/models/user/id": ["model", "user", "id"], "https://huggingface.co/id": [None, None, "id"], "https://huggingface.co/user/id": [None, "user", "id"], "https://huggingface.co/datasets/user/id": ["dataset", "user", "id"], From d67b270a12eecd16c49c3bc1acb6c8c9d01df574 Mon Sep 17 00:00:00 2001 From: zhzhang Date: Thu, 30 Oct 2025 18:07:21 +0800 Subject: [PATCH 2/7] Add missing line --- src/huggingface_hub/hf_api.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index e2c00f1842..877157c865 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -239,6 +239,8 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu input_hf_id = hf_id hub_url = hub_url if hub_url is not None else constants.ENDPOINT + is_hf_url = hub_url in hf_id and "@" not in hf_id + hub_url = hub_url.rstrip("/") if hf_id.startswith(hub_url): hf_id = hf_id[len(hub_url):].lstrip("/") From 30489735b7b8d93d9a0d33d5fafcaaeec9183416 Mon Sep 17 00:00:00 2001 From: zhzhang Date: Tue, 18 Nov 2025 15:15:27 +0800 Subject: [PATCH 3/7] Fix failed tests --- src/huggingface_hub/hf_api.py | 74 +++++++++++++++++++---------------- tests/test_hf_api.py | 44 ++++++++++++--------- 2 files changed, 66 insertions(+), 52 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index 877157c865..c0bc13c208 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -238,64 +238,72 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu """ input_hf_id = hf_id - hub_url = hub_url if hub_url is not None else constants.ENDPOINT - is_hf_url = hub_url in hf_id and "@" not in hf_id - - hub_url = hub_url.rstrip("/") - if hf_id.startswith(hub_url): - hf_id = hf_id[len(hub_url):].lstrip("/") - elif hf_id.startswith(hub_url.replace("https://", "").replace("http://", "")): - # Handle urls like "localhost:8080/hf/model/xxx" - # https://github.com/huggingface/huggingface_hub/issues/3494 - hf_id = hf_id[len(hub_url.replace("https://", "").replace("http://", "")):].lstrip("/") + hub_url = hub_url or constants.ENDPOINT + hub_url_no_proto = re.sub(r"^https?://", "", hub_url).rstrip("/") + + hf_id_no_proto = re.sub(r"^https?://", "", hf_id) + + is_hf_url = hf_id_no_proto.startswith(hub_url_no_proto) and "@" not in hf_id + + if is_hf_url: + hf_id = hf_id_no_proto[len(hub_url_no_proto):].lstrip("/") HFFS_PREFIX = "hf://" if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists - hf_id = hf_id[len(HFFS_PREFIX) :] + hf_id = hf_id[len(HFFS_PREFIX):] - url_segments = hf_id.split("/") - is_hf_id = len(url_segments) <= 3 + url_segments = [s for s in hf_id.split("/") if s] + seg_len = len(url_segments) + + repo_type: Optional[str] = None + namespace: Optional[str] = None + repo_id: str - namespace: Optional[str] if is_hf_url: - namespace, repo_id = url_segments[-2:] - if namespace == hub_url: - namespace = None - if len(url_segments) > 2 and hub_url not in url_segments[-3]: - repo_type = url_segments[-3] - elif namespace in constants.REPO_TYPES_MAPPING: - # Mean canonical dataset or model - repo_type = constants.REPO_TYPES_MAPPING[namespace] + if seg_len == 1: + repo_id = url_segments[0] namespace = None - else: repo_type = None - elif is_hf_id: - if len(url_segments) == 3: + elif seg_len == 2: + namespace, repo_id = url_segments + repo_type = None + else: + namespace, repo_id = url_segments[-2:] + repo_type = url_segments[-3] if seg_len >= 3 else None + if namespace in constants.REPO_TYPES_MAPPING: + # canonical dataset/model + repo_type = constants.REPO_TYPES_MAPPING[namespace] + namespace = None + + elif seg_len <= 3: + if seg_len == 3: # Passed // or // - repo_type, namespace, repo_id = url_segments[-3:] - elif len(url_segments) == 2: + repo_type, namespace, repo_id = url_segments + elif seg_len == 2: if url_segments[0] in constants.REPO_TYPES_MAPPING: # Passed '' or 'datasets/' for a canonical model or dataset repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] namespace = None - repo_id = hf_id.split("/")[-1] + repo_id = url_segments[1] else: # Passed / or / - namespace, repo_id = hf_id.split("/")[-2:] + namespace, repo_id = url_segments repo_type = None else: - # Passed repo_id = url_segments[0] - namespace, repo_type = None, None + namespace = None + repo_type = None else: - raise ValueError(f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}") + raise ValueError( + f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}" + ) # Check if repo type is known (mapping "spaces" => "space" + empty value => `None`) if repo_type in constants.REPO_TYPES_MAPPING: repo_type = constants.REPO_TYPES_MAPPING[repo_type] if repo_type == "": repo_type = None - if repo_type not in constants.REPO_TYPES: + if repo_type not in constants.REPO_TYPES and repo_type is not None: raise ValueError(f"Unknown `repo_type`: '{repo_type}' ('{input_hf_id}')") return repo_type, namespace, repo_id diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index bf3916aeb4..693437f9b6 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -2758,28 +2758,34 @@ def test_git_push_end_to_end(self): class ParseHFUrlTest(unittest.TestCase): def test_repo_type_and_id_from_hf_id_on_correct_values(self): possible_values = { - "http://localhost:8080/hf/user/id": [None, "user", "id"], - "http://localhost:8080/hf/datasets/user/id": ["dataset", "user", "id"], - "http://localhost:8080/hf/models/user/id": ["model", "user", "id"], - "https://huggingface.co/id": [None, None, "id"], - "https://huggingface.co/user/id": [None, "user", "id"], - "https://huggingface.co/datasets/user/id": ["dataset", "user", "id"], - "https://huggingface.co/spaces/user/id": ["space", "user", "id"], - "user/id": [None, "user", "id"], - "dataset/user/id": ["dataset", "user", "id"], - "space/user/id": ["space", "user", "id"], - "id": [None, None, "id"], - "hf://id": [None, None, "id"], - "hf://user/id": [None, "user", "id"], - "hf://model/user/name": ["model", "user", "name"], # 's' is optional - "hf://models/user/name": ["model", "user", "name"], + "saas": { + "https://huggingface.co/id": [None, None, "id"], + "https://huggingface.co/user/id": [None, "user", "id"], + "https://huggingface.co/datasets/user/id": ["dataset", "user", "id"], + "https://huggingface.co/spaces/user/id": ["space", "user", "id"], + "user/id": [None, "user", "id"], + "dataset/user/id": ["dataset", "user", "id"], + "space/user/id": ["space", "user", "id"], + "id": [None, None, "id"], + "hf://id": [None, None, "id"], + "hf://user/id": [None, "user", "id"], + "hf://model/user/name": ["model", "user", "name"], # 's' is optional + "hf://models/user/name": ["model", "user", "name"] + }, + "self-hosted": { + "http://localhost:8080/hf/user/id": [None, "user", "id"], + "http://localhost:8080/hf/datasets/user/id": ["dataset", "user", "id"], + "http://localhost:8080/hf/models/user/id": ["model", "user", "id"], + }, } for key, value in possible_values.items(): - self.assertEqual( - repo_type_and_id_from_hf_id(key, hub_url=ENDPOINT_PRODUCTION), - tuple(value), - ) + hub_url = ENDPOINT_PRODUCTION if key == "saas" else "http://localhost:8080/hf" + for key, value in value.items(): + self.assertEqual( + repo_type_and_id_from_hf_id(key, hub_url=hub_url), + tuple(value), + ) def test_repo_type_and_id_from_hf_id_on_wrong_values(self): for hub_id in [ From c3656efb57d9b33ae72ac2198c91fafc515bce94 Mon Sep 17 00:00:00 2001 From: zhzhang Date: Wed, 19 Nov 2025 10:58:27 +0800 Subject: [PATCH 4/7] Format code --- src/huggingface_hub/hf_api.py | 8 +++----- tests/test_hf_api.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index c0bc13c208..b17c8caab3 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -246,11 +246,11 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu is_hf_url = hf_id_no_proto.startswith(hub_url_no_proto) and "@" not in hf_id if is_hf_url: - hf_id = hf_id_no_proto[len(hub_url_no_proto):].lstrip("/") + hf_id = hf_id_no_proto[len(hub_url_no_proto) :].lstrip("/") HFFS_PREFIX = "hf://" if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists - hf_id = hf_id[len(HFFS_PREFIX):] + hf_id = hf_id[len(HFFS_PREFIX) :] url_segments = [s for s in hf_id.split("/") if s] seg_len = len(url_segments) @@ -294,9 +294,7 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu namespace = None repo_type = None else: - raise ValueError( - f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}" - ) + raise ValueError(f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}") # Check if repo type is known (mapping "spaces" => "space" + empty value => `None`) if repo_type in constants.REPO_TYPES_MAPPING: diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index 693437f9b6..8207d580ef 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -2770,7 +2770,7 @@ def test_repo_type_and_id_from_hf_id_on_correct_values(self): "hf://id": [None, None, "id"], "hf://user/id": [None, "user", "id"], "hf://model/user/name": ["model", "user", "name"], # 's' is optional - "hf://models/user/name": ["model", "user", "name"] + "hf://models/user/name": ["model", "user", "name"], }, "self-hosted": { "http://localhost:8080/hf/user/id": [None, "user", "id"], From 136c1fcafdcc741e54ded7fa0c2a71bd701abefe Mon Sep 17 00:00:00 2001 From: zhzhang Date: Wed, 19 Nov 2025 13:25:36 +0800 Subject: [PATCH 5/7] Fix tests and format code --- src/huggingface_hub/hf_api.py | 88 +++++++++++++++++++++-------------- 1 file changed, 54 insertions(+), 34 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index b17c8caab3..85b7fd54c3 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -238,61 +238,81 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu """ input_hf_id = hf_id - hub_url = hub_url or constants.ENDPOINT - hub_url_no_proto = re.sub(r"^https?://", "", hub_url).rstrip("/") + # Get the hub_url (with or without protocol) + full_hub_url = hub_url if hub_url is not None else constants.ENDPOINT + hub_url_without_protocol = re.sub(r"https?://", "", full_hub_url) - hf_id_no_proto = re.sub(r"^https?://", "", hf_id) - - is_hf_url = hf_id_no_proto.startswith(hub_url_no_proto) and "@" not in hf_id - - if is_hf_url: - hf_id = hf_id_no_proto[len(hub_url_no_proto) :].lstrip("/") + # Check if hf_id is a URL containing the hub_url (check both with and without protocol) + hf_id_without_protocol = re.sub(r"https?://", "", hf_id) + is_hf_url = hub_url_without_protocol in hf_id_without_protocol and "@" not in hf_id HFFS_PREFIX = "hf://" if hf_id.startswith(HFFS_PREFIX): # Remove "hf://" prefix if exists hf_id = hf_id[len(HFFS_PREFIX) :] - url_segments = [s for s in hf_id.split("/") if s] - seg_len = len(url_segments) + # If it's a URL, strip the endpoint prefix to get the path + if is_hf_url: + # Remove protocol if present + hf_id_normalized = re.sub(r"https?://", "", hf_id) - repo_type: Optional[str] = None - namespace: Optional[str] = None - repo_id: str + # Remove the hub_url prefix to get the relative path + if hf_id_normalized.startswith(hub_url_without_protocol): + # Strip the hub URL and any leading slashes + hf_id = hf_id_normalized[len(hub_url_without_protocol) :].lstrip("/") + url_segments = hf_id.split("/") + is_hf_id = len(url_segments) <= 3 + + namespace: Optional[str] if is_hf_url: - if seg_len == 1: - repo_id = url_segments[0] - namespace = None - repo_type = None - elif seg_len == 2: - namespace, repo_id = url_segments - repo_type = None - else: - namespace, repo_id = url_segments[-2:] - repo_type = url_segments[-3] if seg_len >= 3 else None + # For URLs, we need to extract repo_type, namespace, repo_id + # Expected format after stripping endpoint: [repo_type]/namespace/repo_id or namespace/repo_id + + if len(url_segments) >= 3: + # Check if first segment is a repo type + if url_segments[0] in constants.REPO_TYPES_MAPPING: + repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] + namespace = url_segments[1] + repo_id = url_segments[2] + else: + # First segment is namespace + namespace = url_segments[0] + repo_id = url_segments[1] + repo_type = None + elif len(url_segments) == 2: + namespace = url_segments[0] + repo_id = url_segments[1] + + # Check if namespace is actually a repo type mapping if namespace in constants.REPO_TYPES_MAPPING: - # canonical dataset/model + # Mean canonical dataset or model repo_type = constants.REPO_TYPES_MAPPING[namespace] namespace = None - - elif seg_len <= 3: - if seg_len == 3: + else: + repo_type = None + else: + # Single segment + repo_id = url_segments[0] + namespace = None + repo_type = None + elif is_hf_id: + if len(url_segments) == 3: # Passed // or // - repo_type, namespace, repo_id = url_segments - elif seg_len == 2: + repo_type, namespace, repo_id = url_segments[-3:] + elif len(url_segments) == 2: if url_segments[0] in constants.REPO_TYPES_MAPPING: # Passed '' or 'datasets/' for a canonical model or dataset repo_type = constants.REPO_TYPES_MAPPING[url_segments[0]] namespace = None - repo_id = url_segments[1] + repo_id = hf_id.split("/")[-1] else: # Passed / or / - namespace, repo_id = url_segments + namespace, repo_id = hf_id.split("/")[-2:] repo_type = None else: + # Passed repo_id = url_segments[0] - namespace = None - repo_type = None + namespace, repo_type = None, None else: raise ValueError(f"Unable to retrieve user and repo ID from the passed HF ID: {hf_id}") @@ -301,7 +321,7 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu repo_type = constants.REPO_TYPES_MAPPING[repo_type] if repo_type == "": repo_type = None - if repo_type not in constants.REPO_TYPES and repo_type is not None: + if repo_type not in constants.REPO_TYPES: raise ValueError(f"Unknown `repo_type`: '{repo_type}' ('{input_hf_id}')") return repo_type, namespace, repo_id From 800a09117d7922f68f4454dfc522de9ad614cb8e Mon Sep 17 00:00:00 2001 From: Lucain Date: Mon, 24 Nov 2025 13:30:57 +0100 Subject: [PATCH 6/7] Apply suggestions from code review --- tests/test_hf_api.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/test_hf_api.py b/tests/test_hf_api.py index 8207d580ef..ec54c95958 100644 --- a/tests/test_hf_api.py +++ b/tests/test_hf_api.py @@ -2758,7 +2758,7 @@ def test_git_push_end_to_end(self): class ParseHFUrlTest(unittest.TestCase): def test_repo_type_and_id_from_hf_id_on_correct_values(self): possible_values = { - "saas": { + "hub": { "https://huggingface.co/id": [None, None, "id"], "https://huggingface.co/user/id": [None, "user", "id"], "https://huggingface.co/datasets/user/id": ["dataset", "user", "id"], @@ -2780,12 +2780,9 @@ def test_repo_type_and_id_from_hf_id_on_correct_values(self): } for key, value in possible_values.items(): - hub_url = ENDPOINT_PRODUCTION if key == "saas" else "http://localhost:8080/hf" + hub_url = ENDPOINT_PRODUCTION if key == "hub" else "http://localhost:8080/hf" for key, value in value.items(): - self.assertEqual( - repo_type_and_id_from_hf_id(key, hub_url=hub_url), - tuple(value), - ) + assert repo_type_and_id_from_hf_id(key, hub_url=hub_url) == tuple(value) def test_repo_type_and_id_from_hf_id_on_wrong_values(self): for hub_id in [ From c7052380a26f513a902c51778390add8d1c96e35 Mon Sep 17 00:00:00 2001 From: zhzhang Date: Wed, 26 Nov 2025 14:23:37 +0800 Subject: [PATCH 7/7] Optimize code --- src/huggingface_hub/hf_api.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/huggingface_hub/hf_api.py b/src/huggingface_hub/hf_api.py index f5131a635f..00e7f914b7 100644 --- a/src/huggingface_hub/hf_api.py +++ b/src/huggingface_hub/hf_api.py @@ -194,6 +194,7 @@ USERNAME_PLACEHOLDER = "hf_user" _REGEX_DISCUSSION_URL = re.compile(r".*/discussions/(\d+)$") +_REGEX_HTTP_PROTOCOL = re.compile(r"https?://") _CREATE_COMMIT_NO_REPO_ERROR_MESSAGE = ( "\nNote: Creating a commit assumes that the repo already exists on the" @@ -240,10 +241,10 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu # Get the hub_url (with or without protocol) full_hub_url = hub_url if hub_url is not None else constants.ENDPOINT - hub_url_without_protocol = re.sub(r"https?://", "", full_hub_url) + hub_url_without_protocol = _REGEX_HTTP_PROTOCOL.sub("", full_hub_url) # Check if hf_id is a URL containing the hub_url (check both with and without protocol) - hf_id_without_protocol = re.sub(r"https?://", "", hf_id) + hf_id_without_protocol = _REGEX_HTTP_PROTOCOL.sub("", hf_id) is_hf_url = hub_url_without_protocol in hf_id_without_protocol and "@" not in hf_id HFFS_PREFIX = "hf://" @@ -253,7 +254,7 @@ def repo_type_and_id_from_hf_id(hf_id: str, hub_url: Optional[str] = None) -> tu # If it's a URL, strip the endpoint prefix to get the path if is_hf_url: # Remove protocol if present - hf_id_normalized = re.sub(r"https?://", "", hf_id) + hf_id_normalized = _REGEX_HTTP_PROTOCOL.sub("", hf_id) # Remove the hub_url prefix to get the relative path if hf_id_normalized.startswith(hub_url_without_protocol):