Skip to content

Commit f83fb5e

Browse files
committed
Adds multi-part namespace support to catalogs
Enables catalogs to support hierarchical organization of tables through multi-part namespaces. Introduces changes to the Catalog class and its implementations (DynamoDB, REST, SQL) to handle multi-part namespaces. A new document is added to explain the usage of the multi-part namespace feature. Enable multi-part namespaces: a multi-part namespace refers to a namespace that is structured with multiple levels or parts, separated by default by a dot (`.`) delimiter.
1 parent 2a9f2ea commit f83fb5e

File tree

6 files changed

+98
-22
lines changed

6 files changed

+98
-22
lines changed
Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,31 @@
1+
# Multi-Part Namespace Support
2+
3+
Some catalog implementations support multi-part namespaces, which allow for hierarchical organization of tables. The following table summarizes multi-part namespace support across the different catalog implementations in Iceberg Python.
4+
5+
| Catalog Implementation | Multi-Part Namespace Support | Notes |
6+
|------------------------|------------------------------|-------|
7+
| REST Catalog | ✅ Yes | Fully supports multi-part namespace as defined by the REST catalog specification. |
8+
| Hive Catalog | ❌ No | Spark does not support multi-part namespace. |
9+
| DynamoDB Catalog | ✅ Yes | Namespace is represented as a composite key in DynamoDB. |
10+
| Glue Catalog | ❌ No | Uses AWS Glue databases which don't support multi-part namespace. |
11+
| File Catalog | ✅ Yes | Namespace parts are represented as directory hierarchies in the file system. |
12+
| In-Memory Catalog | ✅ Yes | Supports multi-part namespace for testing purposes. |
13+
14+
## Usage Example
15+
16+
```python
17+
from pyiceberg.catalog import load_catalog
18+
19+
# Using a catalog with multi-part namespace support
20+
catalog = load_catalog("my_catalog")
21+
22+
# Creating a table with a multi-part namespace
23+
catalog.create_table("default.multi.table_name", schema, spec)
24+
25+
# Listing tables in a multi-part namespace
26+
tables = catalog.list_tables("default.multi")
27+
```
28+
29+
## Configuration
30+
31+
When using catalogs that support multi-part namespaces, make sure to use the appropriate delimiter (typically `.`) when referencing namespaces in your code.

pyiceberg/catalog/__init__.py

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -354,6 +354,7 @@ class Catalog(ABC):
354354

355355
name: str
356356
properties: Properties
357+
_support_namespaces: bool = False
357358

358359
def __init__(self, name: str, **properties: str):
359360
self.name = name
@@ -732,25 +733,44 @@ def namespace_to_string(
732733
return ".".join(segment.strip() for segment in tuple_identifier)
733734

734735
@staticmethod
736+
def namespace_level(identifier: Union[str, Identifier]) -> int:
737+
"""Get the level of a namespace identifier.
738+
739+
Args:
740+
identifier (Union[str, Identifier]): a namespace identifier.
741+
742+
Returns:
743+
int: The level of the namespace.
744+
"""
745+
if not identifier:
746+
return 1
747+
tuple_identifier = Catalog.identifier_to_tuple(identifier)
748+
return len(tuple_identifier) + 1
749+
750+
@classmethod
735751
def identifier_to_database(
736-
identifier: Union[str, Identifier], err: Union[Type[ValueError], Type[NoSuchNamespaceError]] = ValueError
752+
cls, identifier: Union[str, Identifier], err: Union[Type[ValueError], Type[NoSuchNamespaceError]] = ValueError
737753
) -> str:
738754
tuple_identifier = Catalog.identifier_to_tuple(identifier)
739-
if len(tuple_identifier) != 1:
740-
raise err(f"Invalid database, hierarchical namespaces are not supported: {identifier}")
741755

742-
return tuple_identifier[0]
756+
if not cls._support_namespaces:
757+
if len(tuple_identifier) != 1:
758+
raise err(f"Invalid database, hierarchical namespaces are not supported: {identifier}")
759+
else:
760+
return tuple_identifier[0]
761+
762+
return ".".join(tuple_identifier)
743763

744-
@staticmethod
764+
@classmethod
745765
def identifier_to_database_and_table(
766+
cls,
746767
identifier: Union[str, Identifier],
747768
err: Union[Type[ValueError], Type[NoSuchTableError], Type[NoSuchNamespaceError]] = ValueError,
748769
) -> Tuple[str, str]:
749770
tuple_identifier = Catalog.identifier_to_tuple(identifier)
750-
if len(tuple_identifier) != 2:
771+
if not cls._support_namespaces and len(tuple_identifier) != 2:
751772
raise err(f"Invalid path, hierarchical namespaces are not supported: {identifier}")
752-
753-
return tuple_identifier[0], tuple_identifier[1]
773+
return ".".join(tuple_identifier[:-1]), tuple_identifier[-1]
754774

755775
def _load_file_io(self, properties: Properties = EMPTY_DICT, location: Optional[str] = None) -> FileIO:
756776
return load_file_io({**self.properties, **properties}, location)

pyiceberg/catalog/dynamodb.py

Lines changed: 19 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,8 @@
9696

9797

9898
class DynamoDbCatalog(MetastoreCatalog):
99+
_support_namespaces: bool = True
100+
99101
def __init__(self, name: str, client: Optional["DynamoDBClient"] = None, **properties: str):
100102
"""Dynamodb catalog.
101103
@@ -445,29 +447,32 @@ def list_tables(self, namespace: Union[str, Identifier]) -> List[Identifier]:
445447
return table_identifiers
446448

447449
def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identifier]:
448-
"""List top-level namespaces from the catalog.
449-
450-
We do not support hierarchical namespace.
450+
"""List namespaces from the catalog.
451451
452452
Returns:
453453
List[Identifier]: a List of namespace identifiers.
454454
"""
455-
# Hierarchical namespace is not supported. Return an empty list
455+
level = self.namespace_level(namespace)
456+
conditions = f"{DYNAMODB_COL_IDENTIFIER} = :identifier"
457+
expression_attribute_values = {
458+
":identifier": {
459+
"S": DYNAMODB_NAMESPACE,
460+
}
461+
}
456462
if namespace:
457-
return []
463+
conditions += f" AND begins_with({DYNAMODB_COL_NAMESPACE},:ns)"
464+
expression_attribute_values[":ns"] = {
465+
"S": self.namespace_to_string(namespace) + ".",
466+
}
458467

459468
paginator = self.dynamodb.get_paginator("query")
460469

461470
try:
462471
page_iterator = paginator.paginate(
463472
TableName=self.dynamodb_table_name,
464473
ConsistentRead=True,
465-
KeyConditionExpression=f"{DYNAMODB_COL_IDENTIFIER} = :identifier",
466-
ExpressionAttributeValues={
467-
":identifier": {
468-
"S": DYNAMODB_NAMESPACE,
469-
}
470-
},
474+
KeyConditionExpression=conditions,
475+
ExpressionAttributeValues=expression_attribute_values,
471476
)
472477
except (
473478
self.dynamodb.exceptions.ProvisionedThroughputExceededException,
@@ -477,14 +482,14 @@ def list_namespaces(self, namespace: Union[str, Identifier] = ()) -> List[Identi
477482
) as e:
478483
raise GenericDynamoDbError(e.message) from e
479484

480-
database_identifiers = []
485+
database_identifiers = set()
481486
for page in page_iterator:
482487
for item in page["Items"]:
483488
_dict = _convert_dynamo_item_to_regular_dict(item)
484489
namespace_col = _dict[DYNAMODB_COL_NAMESPACE]
485-
database_identifiers.append(self.identifier_to_tuple(namespace_col))
490+
database_identifiers.add(self.identifier_to_tuple(namespace_col)[:level])
486491

487-
return database_identifiers
492+
return list(database_identifiers)
488493

489494
def load_namespace_properties(self, namespace: Union[str, Identifier]) -> Properties:
490495
"""

pyiceberg/catalog/rest/__init__.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,7 @@ class ListViewsResponse(IcebergBaseModel):
219219
class RestCatalog(Catalog):
220220
uri: str
221221
_session: Session
222+
_support_namespaces: bool = True
222223

223224
def __init__(self, name: str, **properties: str):
224225
"""Rest Catalog.

pyiceberg/catalog/sql.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,8 @@ class SqlCatalog(MetastoreCatalog):
117117
The `SqlCatalog` has a different convention where a `TableIdentifier` requires a `Namespace`.
118118
"""
119119

120+
_support_namespaces: bool = True
121+
120122
def __init__(self, name: str, **properties: str):
121123
super().__init__(name, **properties)
122124

tests/catalog/test_rest.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -544,6 +544,23 @@ def test_list_namespaces_200(rest_mock: Mocker) -> None:
544544
]
545545

546546

547+
def test_list_multipart_namespaces_200(rest_mock: Mocker) -> None:
548+
rest_mock.get(
549+
f"{TEST_URI}v1/namespaces",
550+
json={"namespaces": [["default"], ["multipart"]]},
551+
status_code=200,
552+
request_headers=TEST_HEADERS,
553+
)
554+
assert RestCatalog("rest", uri=TEST_URI, token=TEST_TOKEN).list_namespaces() == [("default",), ("multipart",)]
555+
556+
rest_mock.get(
557+
f"{TEST_URI}v1/namespaces?parent=multipart",
558+
json={"namespaces": [["multipart", "namespace1"], ["multipart", "namespace2"]]},
559+
status_code=200,
560+
request_headers=TEST_HEADERS,
561+
)
562+
563+
547564
def test_list_namespace_with_parent_200(rest_mock: Mocker) -> None:
548565
rest_mock.get(
549566
f"{TEST_URI}v1/namespaces?parent=accounting",

0 commit comments

Comments
 (0)