From 7f525e7009feb4c772800965f772b7fcafdf934c Mon Sep 17 00:00:00 2001 From: ForeverAngry <61765732+ForeverAngry@users.noreply.github.com> Date: Sun, 2 Nov 2025 14:25:17 -0500 Subject: [PATCH 1/5] feat: Cache ResidualEvaluator Fixes #2147 - Implement ResidualEvaluatorCache with LRU eviction and thread safety - Cache evaluators by partition spec, expression, case sensitivity, and schema - Fix mypy type annotations and add type ignore for cachetools decorator --- pyiceberg/expressions/visitors.py | 125 ++++++++++++++++++++++++++++-- tests/utils/test_manifest.py | 2 +- 2 files changed, 121 insertions(+), 6 deletions(-) diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index ee8d1e930a..8b80957a33 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -14,7 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import hashlib import math +import threading from abc import ABC, abstractmethod from functools import singledispatch from typing import ( @@ -23,6 +25,7 @@ Dict, Generic, List, + Optional, Set, SupportsFloat, Tuple, @@ -1975,11 +1978,123 @@ def residual_for(self, partition_data: Record) -> BooleanExpression: return self.expr +# ============================================================================= +# ADD THESE BEFORE THE ResidualEvaluator CLASS DEFINITION +# ============================================================================= + +_DEFAULT_RESIDUAL_EVALUATOR_CACHE_SIZE = 128 + + +class ResidualEvaluatorCache: + """Thread-safe LRU cache for ResidualEvaluator instances. + + Caches ResidualEvaluators to avoid repeated instantiation and initialization + overhead when scanning multiple data files with identical partition specs, + expressions, schemas, and case sensitivity settings. + """ + + _cache: Dict[str, ResidualEvaluator] + _maxsize: int + _lock: threading.RLock + + def __init__(self, maxsize: int = _DEFAULT_RESIDUAL_EVALUATOR_CACHE_SIZE) -> None: + """Initialize the cache. + + Args: + maxsize: Maximum number of evaluators to cache. Defaults to 128. + """ + self._cache = {} + self._maxsize = maxsize + self._lock = threading.RLock() + + @staticmethod + def _make_key( + spec_id: int, + expr: BooleanExpression, + case_sensitive: bool, + schema_id: Optional[int] = None, + ) -> str: + """Create deterministic cache key from evaluator parameters. + + Args: + spec_id: Partition spec identifier. + expr: Filter expression tree. + case_sensitive: Case-sensitive flag. + schema_id: Optional schema identifier. + + Returns: + 32-character MD5 hex string cache key. + """ + key_parts = f"{spec_id}#{repr(expr)}#{case_sensitive}#{schema_id}" + return hashlib.md5(key_parts.encode()).hexdigest() + + def get( + self, + spec: PartitionSpec, + expr: BooleanExpression, + case_sensitive: bool, + schema: Schema, + ) -> Optional[ResidualEvaluator]: + """Retrieve cached evaluator if it exists. + + Args: + spec: Partition specification. + expr: Filter expression. + case_sensitive: Case sensitivity flag. + schema: Table schema. + + Returns: + Cached ResidualEvaluator or None. + """ + cache_key = self._make_key(spec.spec_id, expr, case_sensitive, schema.schema_id) + with self._lock: + return self._cache.get(cache_key) + + def put( + self, + spec: PartitionSpec, + expr: BooleanExpression, + case_sensitive: bool, + schema: Schema, + evaluator: ResidualEvaluator, + ) -> None: + """Cache a ResidualEvaluator instance. + + Args: + spec: Partition specification. + expr: Filter expression. + case_sensitive: Case sensitivity flag. + schema: Table schema. + evaluator: ResidualEvaluator to cache. + """ + cache_key = self._make_key(spec.spec_id, expr, case_sensitive, schema.schema_id) + with self._lock: + if len(self._cache) >= self._maxsize: + oldest_key = next(iter(self._cache)) + del self._cache[oldest_key] + self._cache[cache_key] = evaluator + + def clear(self) -> None: + """Clear all cached evaluators.""" + with self._lock: + self._cache.clear() + + +_residual_evaluator_cache = ResidualEvaluatorCache() + + def residual_evaluator_of( spec: PartitionSpec, expr: BooleanExpression, case_sensitive: bool, schema: Schema ) -> ResidualEvaluator: - return ( - UnpartitionedResidualEvaluator(schema=schema, expr=expr) - if spec.is_unpartitioned() - else ResidualEvaluator(spec=spec, expr=expr, schema=schema, case_sensitive=case_sensitive) - ) + cached = _residual_evaluator_cache.get(spec, expr, case_sensitive, schema) + if cached is not None: + return cached + + evaluator: ResidualEvaluator + if spec.is_unpartitioned(): + evaluator = UnpartitionedResidualEvaluator(schema=schema, expr=expr) + else: + evaluator = ResidualEvaluator(spec=spec, expr=expr, schema=schema, case_sensitive=case_sensitive) + + _residual_evaluator_cache.put(spec, expr, case_sensitive, schema, evaluator) + return evaluator diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 51cbf06bc0..4d75688b35 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -48,7 +48,7 @@ @pytest.fixture(autouse=True) def clear_global_manifests_cache() -> None: # Clear the global cache before each test - _manifests.cache_clear() + _manifests.cache_clear() # type: ignore def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: Dict[str, str]) -> None: From 7ead87bfd7948459dec6f693ef18c5209f7c1882 Mon Sep 17 00:00:00 2001 From: ForeverAngry <61765732+ForeverAngry@users.noreply.github.com> Date: Sun, 2 Nov 2025 18:16:42 -0500 Subject: [PATCH 2/5] fix: Update type ignore for cache_clear in test_manifest.py --- tests/utils/test_manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index 4d75688b35..a3b14dc7e1 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -48,7 +48,7 @@ @pytest.fixture(autouse=True) def clear_global_manifests_cache() -> None: # Clear the global cache before each test - _manifests.cache_clear() # type: ignore + _manifests.cache_clear() # type: ignore[attr-defined] def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: Dict[str, str]) -> None: From bbb504bd02677cebf55a270808ac914803de5e79 Mon Sep 17 00:00:00 2001 From: ForeverAngry <61765732+ForeverAngry@users.noreply.github.com> Date: Mon, 3 Nov 2025 15:55:56 -0500 Subject: [PATCH 3/5] fix: Remove type ignore for cache_clear in test_manifest.py to fix linting --- tests/utils/test_manifest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/utils/test_manifest.py b/tests/utils/test_manifest.py index a3b14dc7e1..51cbf06bc0 100644 --- a/tests/utils/test_manifest.py +++ b/tests/utils/test_manifest.py @@ -48,7 +48,7 @@ @pytest.fixture(autouse=True) def clear_global_manifests_cache() -> None: # Clear the global cache before each test - _manifests.cache_clear() # type: ignore[attr-defined] + _manifests.cache_clear() def _verify_metadata_with_fastavro(avro_file: str, expected_metadata: Dict[str, str]) -> None: From 0ec381eb4b5cdfcebcb33856daa3692188d81f3c Mon Sep 17 00:00:00 2001 From: ForeverAngry <61765732+ForeverAngry@users.noreply.github.com> Date: Mon, 3 Nov 2025 20:57:11 -0500 Subject: [PATCH 4/5] chore: Remove leftover comment in visitors.py --- pyiceberg/expressions/visitors.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index 8b80957a33..fca9cb766f 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -1978,10 +1978,6 @@ def residual_for(self, partition_data: Record) -> BooleanExpression: return self.expr -# ============================================================================= -# ADD THESE BEFORE THE ResidualEvaluator CLASS DEFINITION -# ============================================================================= - _DEFAULT_RESIDUAL_EVALUATOR_CACHE_SIZE = 128 From c0dfdaf18f6450538aabb7da4011bb7993766bbc Mon Sep 17 00:00:00 2001 From: ForeverAngry <61765732+ForeverAngry@users.noreply.github.com> Date: Fri, 7 Nov 2025 10:48:01 -0500 Subject: [PATCH 5/5] fix: Use modern type hint syntax (PEP 604) --- pyiceberg/expressions/visitors.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/pyiceberg/expressions/visitors.py b/pyiceberg/expressions/visitors.py index fca9cb766f..4a5d224bb7 100644 --- a/pyiceberg/expressions/visitors.py +++ b/pyiceberg/expressions/visitors.py @@ -25,7 +25,6 @@ Dict, Generic, List, - Optional, Set, SupportsFloat, Tuple, @@ -2008,7 +2007,7 @@ def _make_key( spec_id: int, expr: BooleanExpression, case_sensitive: bool, - schema_id: Optional[int] = None, + schema_id: int | None = None, ) -> str: """Create deterministic cache key from evaluator parameters. @@ -2030,7 +2029,7 @@ def get( expr: BooleanExpression, case_sensitive: bool, schema: Schema, - ) -> Optional[ResidualEvaluator]: + ) -> ResidualEvaluator | None: """Retrieve cached evaluator if it exists. Args: