From c1d651580beb2daeef19f3a1f8d6a5e501bb66cf Mon Sep 17 00:00:00 2001 From: kdt523 Date: Tue, 21 Oct 2025 14:43:34 +0530 Subject: [PATCH 1/5] Fix MaxScoreBulkScorer window bounds bug and add CHANGES entry --- lucene/CHANGES.txt | 2 + .../lucene/search/MaxScoreBulkScorer.java | 15 +++- .../TestMaxScoreBulkScorerFilterBounds.java | 80 +++++++++++++++++++ 3 files changed, 93 insertions(+), 4 deletions(-) create mode 100644 lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 8ba40ef88d06..f3c90d447886 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,6 +7,8 @@ http://s.apache.org/luceneversions API Changes --------------------- +* GITHUB#XXXX: (contributor: kdt523) + * GITHUB#15215: Switch to Java 25 as the minimum required platform. Upgrade to gradle 9.1.0. (Robert Muir, Kaival Parikh, Dawid Weiss) diff --git a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java index 343239e8c7a4..4e6dc894bbac 100644 --- a/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java +++ b/lucene/core/src/java/org/apache/lucene/search/MaxScoreBulkScorer.java @@ -89,11 +89,15 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) thr // Then within these outer windows, it creates inner windows of size WINDOW_SIZE that help // collect matches into a bitset and save the overhead of rebalancing the priority queue on // every match. + // Never iterate beyond this leaf's maxDoc to avoid scoring invalid doc IDs. 
+ final int loopMax = Math.min(max, maxDoc); + int outerWindowMin = min; outer: - while (outerWindowMin < max) { + while (outerWindowMin < loopMax) { int outerWindowMax = computeOuterWindowMax(outerWindowMin); - outerWindowMax = Math.min(outerWindowMax, max); + // Cap outer window by loopMax (which itself is <= maxDoc) + outerWindowMax = Math.min(outerWindowMax, loopMax); while (true) { updateMaxWindowScores(outerWindowMin, outerWindowMax); @@ -178,7 +182,9 @@ private void scoreInnerWindowWithFilter( // Only score an inner window, after that we'll check if the min competitive score has increased // enough for a more favorable partitioning to be used. int innerWindowMin = top.doc; - int innerWindowMax = MathUtil.unsignedMin(max, innerWindowMin + INNER_WINDOW_SIZE); + // Ensure innerWindowMax never exceeds maxDoc + int innerWindowMax = + Math.min(maxDoc, MathUtil.unsignedMin(max, innerWindowMin + INNER_WINDOW_SIZE)); docAndScoreAccBuffer.size = 0; while (top.doc < innerWindowMax) { @@ -241,7 +247,8 @@ private void scoreInnerWindowMultipleEssentialClauses( DisiWrapper top = essentialQueue.top(); int innerWindowMin = top.doc; - int innerWindowMax = MathUtil.unsignedMin(max, innerWindowMin + INNER_WINDOW_SIZE); + int innerWindowMax = + Math.min(maxDoc, MathUtil.unsignedMin(max, innerWindowMin + INNER_WINDOW_SIZE)); int innerWindowSize = innerWindowMax - innerWindowMin; // Collect matches of essential clauses into a bitset diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java new file mode 100644 index 000000000000..3506aa587cd2 --- /dev/null +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search; + +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.document.StringField; +import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.tests.util.LuceneTestCase; + +/** + * Regression test for a bug where MaxScoreBulkScorer could score past leaf maxDoc when a + * restrictive filter and disjunction were used together. + */ +public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase { + + public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception { + Directory dir = new RAMDirectory(); + IndexWriterConfig iwc = new IndexWriterConfig(); + try (IndexWriter w = new IndexWriter(dir, iwc)) { + // Create a small index where one clause matches more docs than the other, and a restrictive + // filter + for (int i = 0; i < 200; i++) { + Document d = new Document(); + // Clause A matches ~1/3 + d.add(new StringField("a", (i % 3 == 0) ? 
"yes" : "no", Field.Store.NO)); + // Clause B matches ~1/9 + d.add(new StringField("b", (i % 9 == 0) ? "yes" : "no", Field.Store.NO)); + // Restrictive filter matches ~1% + d.add(new StringField("f", (i % 100 == 0) ? "on" : "off", Field.Store.NO)); + w.addDocument(d); + } + } + + try (DirectoryReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = new IndexSearcher(reader); + + Query disjunction = + new BooleanQuery.Builder() + .add(new TermQuery(new Term("a", "yes")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("b", "yes")), BooleanClause.Occur.SHOULD) + .build(); + + Query filter = new TermQuery(new Term("f", "on")); + + Query filtered = + new BooleanQuery.Builder() + .add(disjunction, BooleanClause.Occur.SHOULD) + .add(filter, BooleanClause.Occur.FILTER) + .build(); + + // This triggers TOP_SCORES path internally; just execute to ensure no exceptions + TopDocs td = searcher.search(filtered, 10); + assertNotNull(td); + // Optionally assert we got at most 2 hits (since ~200 docs, ~1% filter) but not necessary for + // regression + } finally { + dir.close(); + } + } +} From f84913b9aa35a618b8b57c72b085c1c12d57d2b3 Mon Sep 17 00:00:00 2001 From: kdt523 Date: Tue, 21 Oct 2025 22:27:33 +0530 Subject: [PATCH 2/5] Add CHANGES entry for MaxScoreBulkScorer fix --- lucene/CHANGES.txt | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index 0ea0f41615a5..c04aebb6eb54 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -7,7 +7,7 @@ http://s.apache.org/luceneversions API Changes --------------------- -* GITHUB#XXXX: (contributor: kdt523) +* GITHUB#15324: Fix MaxScoreBulkScorer so that it never calls TermScorer with docID >= maxDoc, which caused an EOFException on norms access. (kdt523) * GITHUB#15215: Switch to Java 25 as the minimum required platform. Upgrade to gradle 9.1.0. 
(Robert Muir, Kaival Parikh, Dawid Weiss) @@ -202,9 +202,6 @@ Optimizations * GITHUB#15261: Implement longValues for MultiFieldNormValues to speedup CombinedQuery (Ge Song) -* GITHUB#15343: Ensure that `AcceptDocs#cost()` only ever calls `BitSets#cardinality()` - once per instance to avoid redundant computation. (Ben Trent) - Bug Fixes --------------------- * GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException From 3fec857377cf40ae4e0a6b9bb1ae2fa93086ad51 Mon Sep 17 00:00:00 2001 From: kdt523 Date: Wed, 29 Oct 2025 11:55:26 +0530 Subject: [PATCH 3/5] Restore the GITHUB#15343 AcceptDocs#cost CHANGES entry --- lucene/CHANGES.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index c04aebb6eb54..ee401b8b8303 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -100,6 +100,9 @@ Bug Fixes * GITHUB#15125: Handle inconsistent schema on flush with index sorts (Nhat Nguyen) +* GITHUB#15343: Ensure that `AcceptDocs#cost()` only ever calls `BitSets#cardinality()` + once per instance to avoid redundant computation. (Ben Trent) + Changes in Runtime Behavior --------------------- * GITHUB#14187: The query cache is now disabled by default. 
(Adrien Grand) From ae6fab5c19f2e887fb49a1807184a3cf3b96eba8 Mon Sep 17 00:00:00 2001 From: kdt523 Date: Wed, 29 Oct 2025 12:01:51 +0530 Subject: [PATCH 4/5] fix test: use newDirectory instead of RAMDirectory in TestMaxScoreBulkScorerFilterBounds --- .../lucene/search/TestMaxScoreBulkScorerFilterBounds.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java index 3506aa587cd2..1411f49cc1bb 100644 --- a/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java +++ b/lucene/core/src/test/org/apache/lucene/search/TestMaxScoreBulkScorerFilterBounds.java @@ -24,7 +24,7 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; -import org.apache.lucene.store.RAMDirectory; +// ...existing code... import org.apache.lucene.tests.util.LuceneTestCase; /** @@ -34,7 +34,7 @@ public class TestMaxScoreBulkScorerFilterBounds extends LuceneTestCase { public void testFilteredDisjunctionDoesNotScorePastMaxDoc() throws Exception { - Directory dir = new RAMDirectory(); + Directory dir = newDirectory(); IndexWriterConfig iwc = new IndexWriterConfig(); try (IndexWriter w = new IndexWriter(dir, iwc)) { // Create a small index where one clause matches more docs than the other, and a restrictive From 9c227569ca7f752c3c89632010ae9b0304214983 Mon Sep 17 00:00:00 2001 From: kdt523 Date: Wed, 29 Oct 2025 12:08:26 +0530 Subject: [PATCH 5/5] fix: resolve CHANGES.txt merge conflict and restore GITHUB#15343 and GITHUB#14963 entries --- lucene/CHANGES.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt index ee401b8b8303..59f493015dce 100644 --- a/lucene/CHANGES.txt +++ b/lucene/CHANGES.txt @@ -205,8 +205,10 @@ Optimizations * GITHUB#15261: Implement 
longValues for MultiFieldNormValues to speedup CombinedQuery (Ge Song) +* GITHUB#15343: Ensure that `AcceptDocs#cost()` only ever calls `BitSets#cardinality()` + once per instance to avoid redundant computation. (Ben Trent) +* GITHUB#14963: Bypass HNSW graph building for tiny segments. (Shubham Chaudhary, Ben Trent) Bug Fixes ---------------------- * GITHUB#14161: PointInSetQuery's constructor now throws IllegalArgumentException instead of UnsupportedOperationException when values are out of order. (Shubham Sharma)