From b05793e4dd8790fc2b607a324b6cb2aa2acbc632 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Mon, 10 Nov 2025 11:06:42 +0100 Subject: [PATCH 1/5] debug(bigquery): add debug logging for temp table inference issue --- .../source/bigquery_v2/bigquery_schema_gen.py | 46 +++++++++++++++++-- .../source/bigquery_v2/queries_extractor.py | 15 ++++++ 2 files changed, 57 insertions(+), 4 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index da05aaf5e4e177..c87ea5f9f5c835 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -527,23 +527,41 @@ def _process_schema( ) elif self.store_table_refs: # Need table_refs to calculate lineage and usage + logger.debug( + f"Lightweight table discovery for dataset {dataset_name} in project {project_id}" + ) + table_count = 0 for table_item in self.schema_api.list_tables(dataset_name, project_id): + table_count += 1 + table_type = getattr(table_item, "table_type", "UNKNOWN") + identifier = BigqueryTableIdentifier( project_id=project_id, dataset=dataset_name, table=table_item.table_id, ) + + logger.debug(f"Processing {table_type}: {identifier.raw_table_name()}") + if not self.config.table_pattern.allowed(identifier.raw_table_name()): + logger.debug( + f"Dropped by table_pattern: {identifier.raw_table_name()}" + ) self.report.report_dropped(identifier.raw_table_name()) continue try: - self.table_refs.add( - str(BigQueryTableRef(identifier).get_sanitized_table_ref()) + table_ref = str( + BigQueryTableRef(identifier).get_sanitized_table_ref() ) + self.table_refs.add(table_ref) + logger.debug(f"Added to table_refs: {table_ref}") except Exception as e: logger.warning( f"Could not create table ref for {table_item.path}: {e}" ) + logger.debug( + f"Lightweight discovery for {dataset_name}: processed {table_count} objects, table_refs now has {len(self.table_refs)} total entries" + ) return if self.config.include_tables: @@ -628,14 +646,24 @@ def _process_table( table_identifier = BigqueryTableIdentifier(project_id, dataset_name, table.name) self.report.report_entity_scanned(table_identifier.raw_table_name()) + logger.debug( + f"Full schema processing - Scanning TABLE: {table_identifier.raw_table_name()}" + ) if not self.config.table_pattern.allowed(table_identifier.raw_table_name()): + logger.debug( + f"Full schema processing - Dropped TABLE by table_pattern: {table_identifier.raw_table_name()}" + ) self.report.report_dropped(table_identifier.raw_table_name()) return if self.store_table_refs: - self.table_refs.add( - str(BigQueryTableRef(table_identifier).get_sanitized_table_ref()) + table_ref = str( + BigQueryTableRef(table_identifier).get_sanitized_table_ref() + ) + self.table_refs.add(table_ref) + logger.debug( + f"Full schema processing - Added TABLE to table_refs: {table_ref}" ) table.column_count = len(columns) @@ -668,13 +696,20 @@ def _process_view( table_identifier = BigqueryTableIdentifier(project_id, dataset_name, view.name) self.report.report_entity_scanned(table_identifier.raw_table_name(), "view") + logger.debug( + f"Full schema processing - Scanning VIEW: {table_identifier.raw_table_name()}" + ) if not self.config.view_pattern.allowed(table_identifier.raw_table_name()): + logger.debug( + f"Full schema processing - Dropped VIEW by view_pattern: {table_identifier.raw_table_name()}" + ) self.report.report_dropped(table_identifier.raw_table_name()) return table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref()) self.table_refs.add(table_ref) + logger.debug(f"Full schema processing - Added VIEW to table_refs: {table_ref}") if view.view_definition: self.view_refs_by_project[project_id].add(table_ref) self.view_definitions[table_ref] = view.view_definition @@ -720,6 +755,9 @@ def _process_snapshot( table_ref = str(BigQueryTableRef(table_identifier).get_sanitized_table_ref()) self.table_refs.add(table_ref) + logger.debug( + f"Full schema processing - Added SNAPSHOT to table_refs: {table_ref}" + ) if snapshot.base_table_identifier: self.snapshot_refs_by_project[project_id].add(table_ref) self.snapshots_by_ref[table_ref] = snapshot diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 8097890d38afcf..7f1d12adc29068 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -245,6 +245,21 @@ def is_temp_table(self, name: str) -> bool: # 1. this name would be allowed by the dataset patterns, and # 2. we have a list of discovered tables, and # 3. it's not in the discovered tables list + + # Debug the individual conditions + is_allowed = self.filters.is_allowed(table) + has_discovered_tables = bool(self.discovered_tables) + table_ref_str = str(BigQueryTableRef(table)) + not_in_discovered = ( + table_ref_str not in self.discovered_tables + if self.discovered_tables + else False + ) + + logger.debug( + f"Temp table check for {name}: is_allowed={is_allowed}, has_discovered_tables={has_discovered_tables}, not_in_discovered={not_in_discovered}, table_ref={table_ref_str}" + ) + if ( self.filters.is_allowed(table) and self.discovered_tables From 784dc09bb2a89362e6cf44042e07a746f5bc3b7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Thu, 13 Nov 2025 11:28:23 +0100 Subject: [PATCH 2/5] fix: standardize case in temp table lookup to match discovered_tables --- .../ingestion/source/bigquery_v2/queries_extractor.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 7f1d12adc29068..d8ef0f4a326fbc 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -249,7 +249,9 @@ def is_temp_table(self, name: str) -> bool: # Debug the individual conditions is_allowed = self.filters.is_allowed(table) has_discovered_tables = bool(self.discovered_tables) - table_ref_str = str(BigQueryTableRef(table)) + table_ref_str = self.identifiers.standardize_identifier_case( + str(BigQueryTableRef(table)) + ) not_in_discovered = ( table_ref_str not in self.discovered_tables if self.discovered_tables @@ -263,7 +265,10 @@ def is_temp_table(self, name: str) -> bool: if ( self.filters.is_allowed(table) and self.discovered_tables - and str(BigQueryTableRef(table)) not in self.discovered_tables + and self.identifiers.standardize_identifier_case( + str(BigQueryTableRef(table)) + ) + not in self.discovered_tables ): logger.debug(f"inferred as temp table {name}") self.report.inferred_temp_tables.add(name) From c176ba5b32016d46dee6703152d0b34b8677479a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Tue, 18 Nov 2025 12:47:22 +0100 Subject: [PATCH 3/5] fix: apply case normalization consistently in is_allowed_table method - Fix case sensitivity bug in is_allowed_table() discovered_tables lookup - Both is_temp_table() and is_allowed_table() now consistently use standardize_identifier_case() - Resolves 'not allowed table' debug messages for legitimate production tables - Ensures convert_urns_to_lowercase config is respected across all table lookups --- .../ingestion/source/bigquery_v2/queries_extractor.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index d8ef0f4a326fbc..c4f43c581cea14 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -283,7 +283,10 @@ def is_allowed_table(self, name: str) -> bool: table = BigqueryTableIdentifier.from_string_name(name) if ( self.discovered_tables - and str(BigQueryTableRef(table)) not in self.discovered_tables + and self.identifiers.standardize_identifier_case( + str(BigQueryTableRef(table)) + ) + not in self.discovered_tables ): logger.debug(f"not allowed table {name}") return False From 5c119bea91a03e4b5f8704b5ec5c429c7de361cb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Sat, 22 Nov 2025 10:30:29 +0100 Subject: [PATCH 4/5] cleanup: remove verbose debug logging while keeping useful temp table inference logs - Removed overly verbose table discovery debug logs from bigquery_schema_gen.py - Simplified temp table inference logging in queries_extractor.py - Kept the essential 'Inferred as temp table' message for troubleshooting - Case normalization fixes remain in place --- .../source/bigquery_v2/bigquery_schema_gen.py | 5 ----- .../source/bigquery_v2/queries_extractor.py | 20 +++---------------- 2 files changed, 3 insertions(+), 22 deletions(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py index c87ea5f9f5c835..cb37cb6e390d7b 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/bigquery_schema_gen.py @@ -530,9 +530,7 @@ def _process_schema( logger.debug( f"Lightweight table discovery for dataset {dataset_name} in project {project_id}" ) - table_count = 0 for table_item in self.schema_api.list_tables(dataset_name, project_id): - table_count += 1 table_type = getattr(table_item, "table_type", "UNKNOWN") identifier = BigqueryTableIdentifier( @@ -559,9 +557,6 @@ def _process_schema( logger.warning( f"Could not create table ref for {table_item.path}: {e}" ) - logger.debug( - f"Lightweight discovery for {dataset_name}: processed {table_count} objects, table_refs now has {len(self.table_refs)} total entries" - ) return if self.config.include_tables: diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index c4f43c581cea14..81597efde3cac7 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -246,22 +246,6 @@ def is_temp_table(self, name: str) -> bool: # 2. we have a list of discovered tables, and # 3. it's not in the discovered tables list - # Debug the individual conditions - is_allowed = self.filters.is_allowed(table) - has_discovered_tables = bool(self.discovered_tables) - table_ref_str = self.identifiers.standardize_identifier_case( - str(BigQueryTableRef(table)) - ) - not_in_discovered = ( - table_ref_str not in self.discovered_tables - if self.discovered_tables - else False - ) - - logger.debug( - f"Temp table check for {name}: is_allowed={is_allowed}, has_discovered_tables={has_discovered_tables}, not_in_discovered={not_in_discovered}, table_ref={table_ref_str}" - ) - if ( self.filters.is_allowed(table) and self.discovered_tables @@ -270,7 +254,9 @@ def is_temp_table(self, name: str) -> bool: ) not in self.discovered_tables ): - logger.debug(f"inferred as temp table {name}") + logger.debug( + f"Inferred as temp table {name} (is_allowed?{self.filters.is_allowed(table)}" + ) self.report.inferred_temp_tables.add(name) return True From 812ef3f4bfb773ee514e452a1e96b2b4a537c54e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sergio=20G=C3=B3mez=20Villamor?= Date: Mon, 24 Nov 2025 10:18:24 +0100 Subject: [PATCH 5/5] Update queries_extractor.py --- .../datahub/ingestion/source/bigquery_v2/queries_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py index 81597efde3cac7..fcd7e8b7491450 100644 --- a/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py +++ b/metadata-ingestion/src/datahub/ingestion/source/bigquery_v2/queries_extractor.py @@ -255,7 +255,7 @@ def is_temp_table(self, name: str) -> bool: not in self.discovered_tables ): logger.debug( - f"Inferred as temp table {name} (is_allowed?{self.filters.is_allowed(table)}" + f"Inferred as temp table {name} (is_allowed?{self.filters.is_allowed(table)})" ) self.report.inferred_temp_tables.add(name) return True