Skip to content

Commit 474a880

Browse files
authored
feat(lookml): Use Looker API to get fields and generate SQL using field chunks with ability to emit partial results (#15060)
1 parent 600e463 commit 474a880

File tree

15 files changed

+2994
-176
lines changed

15 files changed

+2994
-176
lines changed

metadata-ingestion/docs/sources/looker/lookml_post.md

Lines changed: 397 additions & 31 deletions
Large diffs are not rendered by default.

metadata-ingestion/docs/sources/looker/lookml_pre.md

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -122,6 +122,13 @@ jobs:
122122
client_id: ${LOOKER_CLIENT_ID}
123123
client_secret: ${LOOKER_CLIENT_SECRET}
124124
base_url: ${LOOKER_BASE_URL}
125+
# Enable API-based lineage extraction (required for field splitting features)
126+
use_api_for_view_lineage: true
127+
# Optional: Large view handling configuration
128+
# field_threshold_for_splitting: 100
129+
# allow_partial_lineage_results: true
130+
# enable_individual_field_fallback: true
131+
# max_workers_for_parallel_processing: 10
125132
sink:
126133
type: datahub-rest
127134
config:

metadata-ingestion/docs/sources/looker/lookml_recipe.yml

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -18,6 +18,30 @@ source:
1818
client_id: ${LOOKER_CLIENT_ID}
1919
client_secret: ${LOOKER_CLIENT_SECRET}
2020

21+
# Enable API-based lineage extraction (REQUIRED for field splitting features)
22+
# When enabled, uses Looker API to get SQL representation of views for lineage parsing
23+
# This enables field splitting, parallel processing, and individual field fallback for large views
24+
# NOTE: Only works for "reachable views" - views that are referenced by explores in model files
25+
# Unreachable views will fall back to regex-based parsing
26+
use_api_for_view_lineage: true
27+
28+
# Control whether unreachable views are processed
29+
# If true (default), only views referenced by explores are processed
30+
# If false, all views are processed, but unreachable ones use regex parsing instead of API
31+
# emit_reachable_views_only: true
32+
33+
# Optional: Enable API caching for better performance
34+
# use_api_cache_for_view_lineage: true
35+
36+
# Large View Handling (for views with 100+ fields)
37+
# These options help handle large views with many fields by splitting them into chunks
38+
# and processing in parallel for better performance and reliability
39+
# NOTE: Requires 'api' configuration and 'use_api_for_view_lineage: true' to work
40+
# field_threshold_for_splitting: 100 # Split views with more than this many fields (default: 100)
41+
# allow_partial_lineage_results: true # Return partial lineage if some chunks fail (default: true)
42+
# enable_individual_field_fallback: true # Process fields individually if chunk fails (default: true)
43+
# max_workers_for_parallel_processing: 10 # Parallel workers for processing (default: 10, max: 100)
44+
2145
# Alternative to API section above if you want a purely file-based ingestion with no api calls to Looker or if you want to provide platform_instance ids for your connections
2246
# project_name: PROJECT_NAME # See (https://docs.looker.com/data-modeling/getting-started/how-project-works) to understand what is your project name
2347
# connection_to_platform_map:

metadata-ingestion/src/datahub/ingestion/source/looker/looker_lib_wrapper.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -228,10 +228,18 @@ def all_lookml_models(self) -> Sequence[LookmlModel]:
228228
transport_options=self.transport_options,
229229
)
230230

231-
def lookml_model_explore(self, model: str, explore_name: str) -> LookmlModelExplore:
231+
def lookml_model_explore(
    self,
    model: str,
    explore_name: str,
    fields: Optional[List[str]] = None,
) -> LookmlModelExplore:
    """Fetch a LookML explore definition through the Looker SDK client.

    Args:
        model: Name of the LookML model that contains the explore.
        explore_name: Name of the explore to fetch.
        fields: Optional subset of response fields to request; when given,
            it is passed through the wrapper's field mapper before the call.

    Returns:
        The LookmlModelExplore object returned by the Looker API.
    """
    # Count every explore lookup for client-usage reporting.
    self.client_stats.explore_calls += 1
    requested_fields = self.__fields_mapper(fields) if fields else None
    return self.client.lookml_model_explore(
        model,
        explore_name,
        fields=requested_fields,
        transport_options=self.transport_options,
    )
236244

237245
@lru_cache(maxsize=1000)

metadata-ingestion/src/datahub/ingestion/source/looker/lookml_config.py

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
from typing import Any, Dict, Literal, Optional, Union
66

77
import pydantic
8-
from pydantic import model_validator
8+
from pydantic import field_validator, model_validator
99
from pydantic.fields import Field
1010

1111
from datahub.configuration.common import AllowDenyPattern
@@ -211,6 +211,48 @@ class LookMLSourceConfig(
211211
"All if comments are evaluated to true for configured looker_environment value",
212212
)
213213

214+
# --- Large-view handling options ---
# These settings control how views with many fields are split into chunks
# for API-based lineage extraction.
# NOTE(review): per the recipe docs in this commit, these only take effect
# when `use_api_for_view_lineage` is enabled — confirm against the source.

# Above this field count, fields are fetched in multiple chunked API calls.
field_threshold_for_splitting: int = Field(
    100,
    description="When the total number of fields returned by Looker API exceeds this threshold, "
    "the fields will be split into multiple API calls to avoid SQL parsing failures. "
    "This helps provide partial column and table lineage when dealing with large field sets.",
)

# Emit whatever lineage was extracted even if some chunks fail to parse.
allow_partial_lineage_results: bool = Field(
    True,
    description="When enabled, allows partial lineage results to be returned even when some field chunks "
    "fail or when there are SQL parsing errors. This provides better resilience for large field sets "
    "and ensures some lineage information is available rather than complete failure.",
)

# On chunk failure, retry each field of the chunk one at a time to isolate
# the problematic field(s) while keeping lineage for the rest.
enable_individual_field_fallback: bool = Field(
    True,
    description="When enabled, if a field chunk fails, the system will attempt to process each field "
    "individually to maximize information and isolate problematic fields. This helps identify "
    "which specific fields are causing issues while still getting lineage for working fields.",
)

# Thread-pool size for chunk/field processing; clamped to [1, 100] by
# the validate_max_workers validator below.
max_workers_for_parallel_processing: int = Field(
    10,
    description="Maximum number of worker threads to use for parallel processing of field chunks and individual fields. "
    "Set to 1 to process everything sequentially. Higher values can improve performance but may increase memory usage. "
    "Maximum allowed value is 100 to prevent resource exhaustion.",
)
241+
242+
@field_validator("max_workers_for_parallel_processing")
def validate_max_workers(cls, v: int) -> int:
    """Validate the worker count: reject values below 1, clamp values above 100.

    Raises:
        ValueError: if the configured value is less than 1.
    """
    if v < 1:
        raise ValueError(
            f"max_workers_for_parallel_processing must be at least 1, got {v}"
        )
    # In-range values pass through untouched.
    if v <= 100:
        return v
    # Oversized values are clamped (not rejected) so existing recipes keep
    # working; the warning tells the operator what actually happened.
    logger.warning(
        f"max_workers_for_parallel_processing is set to {v}, which exceeds the recommended maximum of 100. "
        f"This may cause resource exhaustion. Using 100 instead."
    )
    return 100
255+
214256
@model_validator(mode="before")
215257
@classmethod
216258
def convert_string_to_connection_def(cls, values: Dict[str, Any]) -> Dict[str, Any]:

metadata-ingestion/src/datahub/ingestion/source/looker/lookml_source.py

Lines changed: 79 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import logging
22
import pathlib
33
import tempfile
4-
from collections import OrderedDict
4+
from collections import OrderedDict, defaultdict
55
from dataclasses import dataclass
66
from datetime import datetime, timezone
77
from typing import Dict, Iterable, List, Optional, Set, Tuple, Union
@@ -709,10 +709,14 @@ def get_internal_workunits(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
709709
# Value: Tuple(model file name, connection name)
710710
view_connection_map: Dict[str, Tuple[str, str]] = {}
711711

712-
# Map of view name to explore name for API-based view lineage
713-
# A view can be referenced by multiple explores, we only need one of the explores to use Looker Query API
714-
# Key: view_name, Value: explore_name
715-
view_to_explore_map: Dict[str, str] = {}
712+
# Map of view name to all possible explores for API-based view lineage
713+
# A view can be referenced by multiple explores, we'll optimize the assignment
714+
# Key: view_name, Value: set of explore_names
715+
view_to_explores: Dict[str, Set[str]] = defaultdict(set)
716+
717+
# Temporary map to keep track of the views in an explore
718+
# Key: explore_name, Value: set of view_names
719+
explore_to_views: Dict[str, Set[str]] = defaultdict(set)
716720

717721
# The ** means "this directory and all subdirectories", and hence should
718722
# include all the files we want.
@@ -789,8 +793,9 @@ def get_internal_workunits(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
789793
for view_name in explore.upstream_views:
790794
if self.source_config.emit_reachable_views_only:
791795
explore_reachable_views.add(view_name.include)
792-
# Build view to explore mapping for API-based view lineage
793-
view_to_explore_map[view_name.include] = explore.name
796+
797+
view_to_explores[view_name.include].add(explore.name)
798+
explore_to_views[explore.name].add(view_name.include)
794799
except Exception as e:
795800
self.reporter.report_warning(
796801
title="Failed to process explores",
@@ -804,6 +809,16 @@ def get_internal_workunits(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
804809
model.connection, set()
805810
)
806811

812+
view_to_explore_map = {}
813+
if view_to_explores and explore_to_views:
814+
view_to_explore_map = self._optimize_views_by_common_explore(
815+
view_to_explores, explore_to_views
816+
)
817+
else:
818+
logger.warning(
819+
f"Either view_to_explores: {view_to_explores} or explore_to_views: {explore_to_views} is empty"
820+
)
821+
807822
project_name = self.get_project_name(model_name)
808823

809824
looker_view_id_cache: LookerViewIdCache = LookerViewIdCache(
@@ -888,9 +903,7 @@ def get_internal_workunits(self) -> Iterable[Union[MetadataWorkUnit, Entity]]:
888903
config=self.source_config,
889904
ctx=self.ctx,
890905
looker_client=self.looker_client,
891-
view_to_explore_map=view_to_explore_map
892-
if view_to_explore_map
893-
else None,
906+
view_to_explore_map=view_to_explore_map,
894907
)
895908
except Exception as e:
896909
self.reporter.report_warning(
@@ -1040,5 +1053,61 @@ def report_skipped_unreachable_views(
10401053
context=(f"Project: {project}, View File Path: {path}"),
10411054
)
10421055

1056+
def _optimize_views_by_common_explore(
1057+
self,
1058+
view_to_explores: Dict[str, Set[str]],
1059+
explore_to_views: Dict[str, Set[str]],
1060+
) -> Dict[str, str]:
1061+
"""
1062+
Optimize view-to-explore mapping by grouping views to minimize API calls.
1063+
1064+
This uses a greedy algorithm that prioritizes explores that appear in the most views,
1065+
maximizing the number of views assigned to the same explore.
1066+
1067+
Args:
1068+
view_to_explores: Dict mapping view_name -> set of explore_names
1069+
explore_to_views: Dict mapping explore_name -> set of view_names
1070+
1071+
Returns:
1072+
Dict mapping view_name -> explore_name (optimized assignment)
1073+
"""
1074+
1075+
# Pre-compute explore sizes
1076+
explore_sizes = {
1077+
explore: len(views) for explore, views in explore_to_views.items()
1078+
}
1079+
1080+
# Build view-to-explore mapping using dynamic programming approach
1081+
view_to_explore: Dict[str, str] = {}
1082+
1083+
# For each view, find the explore with maximum size that contains it
1084+
for view_name, candidate_explores in view_to_explores.items():
1085+
if candidate_explores:
1086+
# Find explore with maximum size using max() with key function
1087+
# This assings the view to the explore with the most views that contains it
1088+
best_explore = max(
1089+
candidate_explores, key=lambda explore: explore_sizes[explore]
1090+
)
1091+
view_to_explore[view_name] = best_explore
1092+
1093+
# Log optimization results
1094+
unique_explores_used = len(set(view_to_explore.values()))
1095+
total_views = len(view_to_explore)
1096+
total_explores = len(explore_to_views)
1097+
1098+
if total_explores > 0:
1099+
efficiency = (1 - unique_explores_used / total_explores) * 100
1100+
logger.info(
1101+
f"View-explore optimization: Using {unique_explores_used}/{total_explores} "
1102+
f"explores for {total_views} views (efficiency: {efficiency:.1f}% savings)"
1103+
)
1104+
else:
1105+
logger.info(
1106+
f"View-explore optimization: No explores to optimize for {total_views} views"
1107+
)
1108+
1109+
logger.debug(f"Final View-to-explore mapping: {view_to_explore}")
1110+
return view_to_explore
1111+
10431112
def get_report(self):
10441113
return self.reporter

0 commit comments

Comments
 (0)