Skip to content

Commit 6358bbc

Browse files
docs(bigquery): schema resolver strategy (#15164)
Co-authored-by: Cursor Agent <cursoragent@cursor.com>
1 parent 17ca9dd commit 6358bbc

File tree

2 files changed

+14
-1
lines changed

2 files changed

+14
-1
lines changed

metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,13 @@ def test_connection(config_dict: dict) -> TestConnectionReport:
219219
return BigQueryTestConnection.test_connection(config_dict)
220220

221221
def _init_schema_resolver(self) -> SchemaResolver:
222+
"""
223+
The initialization of SchemaResolver prefetches all existing urns and schemas in the env/platform/instance.
224+
Because of that, it's important that all classes requiring a SchemaResolver use this instance, as it already has a pre-populated cache.
225+
An alternative strategy would be to do an on-demand resolution of the urns/schemas.
226+
227+
TODO: prove that pre-fetching is a better strategy than on-demand resolution, or make this behaviour configurable.
228+
"""
222229
schema_resolution_required = (
223230
self.config.use_queries_v2 or self.config.lineage_use_sql_parser
224231
)

metadata-ingestion/src/datahub/sql_parsing/schema_resolver.py

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
import contextlib
2+
import logging
23
import pathlib
34
from dataclasses import dataclass
45
from typing import Dict, List, Optional, Protocol, Set, Tuple
@@ -19,6 +20,8 @@
1920
from datahub.utilities.file_backed_collections import ConnectionWrapper, FileBackedDict
2021
from datahub.utilities.urns.field_paths import get_simple_field_path_from_v2_field_path
2122

23+
logger = logging.getLogger(__name__)
24+
2225
# A lightweight table schema: column -> type mapping.
2326
SchemaInfo = Dict[str, str]
2427

@@ -168,7 +171,10 @@ def resolve_table(self, table: _TableName) -> Tuple[str, Optional[SchemaInfo]]:
168171
self._track_cache_hit()
169172
return urn_mixed, schema_info
170173

171-
# Track cache miss for the final attempt
174+
logger.debug(
175+
f"Schema resolution failed for table {table}. Tried URNs: "
176+
f"primary={urn}, lower={urn_lower}, mixed={urn_mixed}"
177+
)
172178
self._track_cache_miss()
173179

174180
if self._prefers_urn_lower():

0 commit comments

Comments
 (0)