Skip to content

Commit d5448c3

Browse files
committed
Adds multi-part namespace support to catalogs
Enables catalogs to support hierarchical organization of tables through multi-part namespaces. Introduces changes to the Catalog class and its implementations (DynamoDB, REST, SQL) to handle multi-part namespaces. A new document is added to explain the usage of the multi-part namespace feature. Enable multipart namespace: a multipart namespace refers to a namespace that is structured with multiple levels or parts, separated by default by a dot (`.`) delimiter.
1 parent 36906ed commit d5448c3

File tree

6 files changed

+98
-22
lines changed

6 files changed

+98
-22
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Multi-Part Namespace Support
2+
3+
Some catalog implementations support multi-part namespaces, which allows for hierarchical organization of tables. The following table summarizes the support for multi-part namespaces across different catalog implementations in Iceberg Python.
4+
5+
| Catalog Implementation | Multi-Part Namespace Support | Notes |
6+
|------------------------|------------------------------|-------|
7+
| REST Catalog | ✅ Yes | Fully supports multi-part namespace as defined by the REST catalog specification. |
8+
| Hive Catalog | ❌ No | Spark does not support multi-part namespace. |
9+
| DynamoDB Catalog | ✅ Yes | Namespace is represented as a composite key in DynamoDB. |
10+
| Glue Catalog | ❌ No | Uses AWS Glue databases which don't support multi-part namespace. |
11+
| File Catalog | ✅ Yes | Namespace parts are represented as directory hierarchies in the file system. |
12+
| In-Memory Catalog | ✅ Yes | Supports multi-part namespace for testing purposes. |
13+
14+
## Usage Example
15+
16+
```python
17+
from pyiceberg.catalog import load_catalog
18+
19+
# Using a catalog with multi-part namespace support
20+
catalog = load_catalog("my_catalog")
21+
22+
# Creating a table with a multi-part namespace
23+
catalog.create_table("default.multi.table_name", schema=schema, partition_spec=spec)
24+
25+
# Listing tables in a multi-part namespace
26+
tables = catalog.list_tables("default.multi")
27+
```
28+
29+
## Configuration
30+
31+
When using catalogs that support multi-part namespaces, make sure to use the appropriate delimiter (typically `.`) when referencing namespaces in your code.

pyiceberg/catalog/__init__.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,7 @@ class Catalog(ABC):
347347

348348
name: str
349349
properties: Properties
350+
_support_namespaces: bool = False
350351

351352
def __init__(self, name: str, **properties: str):
352353
self.name = name
@@ -723,25 +724,44 @@ def namespace_to_string(identifier: str | Identifier, err: type[ValueError] | ty
723724
return ".".join(segment.strip() for segment in tuple_identifier)
724725

725726
@staticmethod
727+
def namespace_level(identifier: str | Identifier) -> int:
    """Return the number of parts in a namespace identifier, plus one.

    An empty identifier (``()`` or ``""``) yields 1; an identifier with
    ``n`` parts yields ``n + 1``.

    Args:
        identifier (str | Identifier): a namespace identifier; may be empty.

    Returns:
        int: The number of parts in ``identifier`` plus one.
    """
    # Handle the empty identifier up front rather than passing it to
    # identifier_to_tuple.
    if not identifier:
        return 1
    return len(Catalog.identifier_to_tuple(identifier)) + 1
740+
741+
@classmethod
726742
def identifier_to_database(
727-
identifier: str | Identifier, err: type[ValueError] | type[NoSuchNamespaceError] = ValueError
743+
cls, identifier: str | Identifier, err: type[ValueError] | type[NoSuchNamespaceError] = ValueError
728744
) -> str:
729745
tuple_identifier = Catalog.identifier_to_tuple(identifier)
730-
if len(tuple_identifier) != 1:
731-
raise err(f"Invalid database, hierarchical namespaces are not supported: {identifier}")
732746

733-
return tuple_identifier[0]
747+
if not cls._support_namespaces:
748+
if len(tuple_identifier) != 1:
749+
raise err(f"Invalid database, hierarchical namespaces are not supported: {identifier}")
750+
else:
751+
return tuple_identifier[0]
752+
753+
return ".".join(tuple_identifier)
734754

735-
@staticmethod
755+
@classmethod
736756
def identifier_to_database_and_table(
757+
cls,
737758
identifier: str | Identifier,
738759
err: type[ValueError] | type[NoSuchTableError] | type[NoSuchNamespaceError] = ValueError,
739760
) -> tuple[str, str]:
740761
tuple_identifier = Catalog.identifier_to_tuple(identifier)
741-
if len(tuple_identifier) != 2:
762+
if not cls._support_namespaces and len(tuple_identifier) != 2:
742763
raise err(f"Invalid path, hierarchical namespaces are not supported: {identifier}")
743-
744-
return tuple_identifier[0], tuple_identifier[1]
764+
return ".".join(tuple_identifier[:-1]), tuple_identifier[-1]
745765

746766
def _load_file_io(self, properties: Properties = EMPTY_DICT, location: str | None = None) -> FileIO:
747767
return load_file_io({**self.properties, **properties}, location)

pyiceberg/catalog/dynamodb.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,8 @@
9292

9393

9494
class DynamoDbCatalog(MetastoreCatalog):
95+
_support_namespaces: bool = True
96+
9597
def __init__(self, name: str, client: Optional["DynamoDBClient"] = None, **properties: str):
9698
"""Dynamodb catalog.
9799
@@ -441,29 +443,32 @@ def list_tables(self, namespace: str | Identifier) -> list[Identifier]:
441443
return table_identifiers
442444

443445
def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]:
444-
"""List top-level namespaces from the catalog.
445-
446-
We do not support hierarchical namespace.
446+
"""List namespaces from the catalog.
447447
448448
Returns:
449449
List[Identifier]: a List of namespace identifiers.
450450
"""
451-
# Hierarchical namespace is not supported. Return an empty list
451+
level = self.namespace_level(namespace)
452+
conditions = f"{DYNAMODB_COL_IDENTIFIER} = :identifier"
453+
expression_attribute_values = {
454+
":identifier": {
455+
"S": DYNAMODB_NAMESPACE,
456+
}
457+
}
452458
if namespace:
453-
return []
459+
conditions += f" AND begins_with({DYNAMODB_COL_NAMESPACE},:ns)"
460+
expression_attribute_values[":ns"] = {
461+
"S": self.namespace_to_string(namespace) + ".",
462+
}
454463

455464
paginator = self.dynamodb.get_paginator("query")
456465

457466
try:
458467
page_iterator = paginator.paginate(
459468
TableName=self.dynamodb_table_name,
460469
ConsistentRead=True,
461-
KeyConditionExpression=f"{DYNAMODB_COL_IDENTIFIER} = :identifier",
462-
ExpressionAttributeValues={
463-
":identifier": {
464-
"S": DYNAMODB_NAMESPACE,
465-
}
466-
},
470+
KeyConditionExpression=conditions,
471+
ExpressionAttributeValues=expression_attribute_values,
467472
)
468473
except (
469474
self.dynamodb.exceptions.ProvisionedThroughputExceededException,
@@ -473,14 +478,14 @@ def list_namespaces(self, namespace: str | Identifier = ()) -> list[Identifier]:
473478
) as e:
474479
raise GenericDynamoDbError(e.message) from e
475480

476-
database_identifiers = []
481+
database_identifiers = set()
477482
for page in page_iterator:
478483
for item in page["Items"]:
479484
_dict = _convert_dynamo_item_to_regular_dict(item)
480485
namespace_col = _dict[DYNAMODB_COL_NAMESPACE]
481-
database_identifiers.append(self.identifier_to_tuple(namespace_col))
486+
database_identifiers.add(self.identifier_to_tuple(namespace_col)[:level])
482487

483-
return database_identifiers
488+
return list(database_identifiers)
484489

485490
def load_namespace_properties(self, namespace: str | Identifier) -> Properties:
486491
"""

pyiceberg/catalog/rest/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -214,6 +214,7 @@ class ListViewsResponse(IcebergBaseModel):
214214
class RestCatalog(Catalog):
215215
uri: str
216216
_session: Session
217+
_support_namespaces: bool = True
217218

218219
def __init__(self, name: str, **properties: str):
219220
"""Rest Catalog.

pyiceberg/catalog/sql.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -113,6 +113,8 @@ class SqlCatalog(MetastoreCatalog):
113113
The `SqlCatalog` has a different convention where a `TableIdentifier` requires a `Namespace`.
114114
"""
115115

116+
_support_namespaces: bool = True
117+
116118
def __init__(self, name: str, **properties: str):
117119
super().__init__(name, **properties)
118120

tests/catalog/test_rest.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,23 @@ def test_list_namespaces_200(rest_mock: Mocker) -> None:
544544
]
545545

546546

547+
def test_list_multipart_namespaces_200(rest_mock: Mocker) -> None:
    # Top-level listing: no parent is supplied, only single-part namespaces come back.
    rest_mock.get(
        f"{TEST_URI}v1/namespaces",
        json={"namespaces": [["default"], ["multipart"]]},
        status_code=200,
        request_headers=TEST_HEADERS,
    )
    assert RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN).list_namespaces() == [("default",), ("multipart",)]

    # Child listing: the parent is sent as a query parameter and the
    # multi-part identifiers in the response must be returned as tuples.
    rest_mock.get(
        f"{TEST_URI}v1/namespaces?parent=multipart",
        json={"namespaces": [["multipart", "namespace1"], ["multipart", "namespace2"]]},
        status_code=200,
        request_headers=TEST_HEADERS,
    )
    # Without this assertion the mock above is registered but never exercised.
    assert RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN).list_namespaces(("multipart",)) == [
        ("multipart", "namespace1"),
        ("multipart", "namespace2"),
    ]
562+
563+
547564
def test_list_namespace_with_parent_200(rest_mock: Mocker) -> None:
548565
rest_mock.get(
549566
f"{TEST_URI}v1/namespaces?parent=accounting",

0 commit comments

Comments
 (0)