From 25b6e4dac7c7839066e3b996bf602b46cc033e04 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Wed, 29 Oct 2025 12:58:20 -0700 Subject: [PATCH 1/3] Initial implementation of bulk collection from bulk scorer Signed-off-by: Ankit Jain --- .../apache/lucene/search/LeafCollector.java | 29 ++++++++++ .../lucene/search/TopScoreDocCollector.java | 5 ++ .../java/org/apache/lucene/search/Weight.java | 54 ++++++++++++++----- 3 files changed, 76 insertions(+), 12 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java b/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java index f8765c18372d..b019784b3a11 100644 --- a/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java @@ -123,6 +123,35 @@ default void collect(DocIdStream stream) throws IOException { stream.forEach(this::collect); } + /** + * Bulk-collect doc IDs. + * + *

Note: The provided int[] may be reused across calls and should be consumed + * immediately. + * + *

Note: The provided int[] typically only holds a small subset of query matches. + * This method may be called multiple times per segment. + * + *

Like {@link #collect(int)}, it is guaranteed that doc IDs get collected in order, ie. doc + * IDs are collected in order within a int[], and if called twice, all doc IDs from + * the second int[] will be greater than all doc IDs from the first int[]. + * + *

It is legal for callers to mix calls to {@link #collect(int[])}, {@link #collect(DocIdStream)} + * and {@link #collect(int)}. + * + *

The default implementation calls + * {@code + * for(int i = 0; i < count; ++i) { + * collect(docs[i]); + * }; + * }. + */ + default void collect(int[] docs, int count) throws IOException { + for(int i = 0; i < count; ++i) { + collect(docs[i]); + }; + } + /** * Optionally returns an iterator over competitive documents. * diff --git a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java index e878f6f880b8..09a7146f8246 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java @@ -127,6 +127,11 @@ public void collect(int doc) throws IOException { } } + @Override + public void collect(int[] docs, int count) { + collect(docs, count); + } + private void collectCompetitiveHit(int doc, float score) throws IOException { final long code = DocScoreEncoder.encode(doc + docBase, score); topCode = heap.updateTop(code); diff --git a/lucene/core/src/java/org/apache/lucene/search/Weight.java b/lucene/core/src/java/org/apache/lucene/search/Weight.java index 341dd3cadf6a..04e019d791b4 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Weight.java +++ b/lucene/core/src/java/org/apache/lucene/search/Weight.java @@ -275,29 +275,35 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) // collect() because only a subset of collectors produce a competitive iterator, and the set // of implementing classes for two-phase approximations is smaller than the set of doc id set // iterator implementations. + + // Is it better to initialize the buffer within each iterator implementation? 
+ int[] docBuffer = new int[64]; if (twoPhase == null && competitiveIterator == null) { // Optimize simple iterators with collectors that can't skip - scoreIterator(collector, acceptDocs, iterator, max); + scoreIterator(collector, acceptDocs, iterator, max, docBuffer); } else if (competitiveIterator == null) { - scoreTwoPhaseIterator(collector, acceptDocs, iterator, twoPhase, max); + scoreTwoPhaseIterator(collector, acceptDocs, iterator, twoPhase, max, docBuffer); } else if (twoPhase == null) { - scoreCompetitiveIterator(collector, acceptDocs, iterator, competitiveIterator, max); + scoreCompetitiveIterator(collector, acceptDocs, iterator, competitiveIterator, max, docBuffer); } else { scoreTwoPhaseOrCompetitiveIterator( - collector, acceptDocs, iterator, twoPhase, competitiveIterator, max); + collector, acceptDocs, iterator, twoPhase, competitiveIterator, max, docBuffer); } return iterator.docID(); } private static void scoreIterator( - LeafCollector collector, Bits acceptDocs, DocIdSetIterator iterator, int max) + LeafCollector collector, Bits acceptDocs, DocIdSetIterator iterator, int max, int[] docs) throws IOException { + int count = 0; for (int doc = iterator.docID(); doc < max; doc = iterator.nextDoc()) { if (acceptDocs == null || acceptDocs.get(doc)) { - collector.collect(doc); + count = collect(collector, docs, count, doc); } } + + collect(collector, docs, count, -1); } private static void scoreTwoPhaseIterator( @@ -305,13 +311,17 @@ private static void scoreTwoPhaseIterator( Bits acceptDocs, DocIdSetIterator iterator, TwoPhaseIterator twoPhase, - int max) + int max, + int[] docs) throws IOException { + int count = 0; for (int doc = iterator.docID(); doc < max; doc = iterator.nextDoc()) { if ((acceptDocs == null || acceptDocs.get(doc)) && twoPhase.matches()) { - collector.collect(doc); + count = collect(collector, docs, count, doc); } } + + collect(collector, docs, count, -1); } private static void scoreCompetitiveIterator( @@ -319,8 +329,10 @@ private 
static void scoreCompetitiveIterator( Bits acceptDocs, DocIdSetIterator iterator, DocIdSetIterator competitiveIterator, - int max) + int max, + int[] docs) throws IOException { + int count = 0; for (int doc = iterator.docID(); doc < max; ) { assert competitiveIterator.docID() <= doc; // invariant if (competitiveIterator.docID() < doc) { @@ -332,11 +344,13 @@ private static void scoreCompetitiveIterator( } if ((acceptDocs == null || acceptDocs.get(doc))) { - collector.collect(doc); + count = collect(collector, docs, count, doc); } doc = iterator.nextDoc(); } + + collect(collector, docs, count, -1); } private static void scoreTwoPhaseOrCompetitiveIterator( @@ -345,8 +359,10 @@ private static void scoreTwoPhaseOrCompetitiveIterator( DocIdSetIterator iterator, TwoPhaseIterator twoPhase, DocIdSetIterator competitiveIterator, - int max) + int max, + int[] docs) throws IOException { + int count = 0; for (int doc = iterator.docID(); doc < max; ) { assert competitiveIterator.docID() <= doc; // invariant if (competitiveIterator.docID() < doc) { @@ -358,11 +374,25 @@ private static void scoreTwoPhaseOrCompetitiveIterator( } if ((acceptDocs == null || acceptDocs.get(doc)) && twoPhase.matches()) { - collector.collect(doc); + count = collect(collector, docs, count, doc); } doc = iterator.nextDoc(); } + + collect(collector, docs, count, -1); + } + + private static int collect(LeafCollector collector, int[] docs, int count, int docId) throws IOException { + if (count == docs.length || docId == -1) { + collector.collect(docs, count); + count = 0; + } + + // count is always expected to be less than docs.length + docs[count++] = docId; + + return count; } } } From feed60b71f9cc59e72bd0050ac740e5a69b631a2 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Wed, 29 Oct 2025 13:00:39 -0700 Subject: [PATCH 2/3] tidy Signed-off-by: Ankit Jain --- .../apache/lucene/search/LeafCollector.java | 32 ++++++++----------- .../java/org/apache/lucene/search/Weight.java | 6 ++-- 2 files changed, 18 
insertions(+), 20 deletions(-) diff --git a/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java b/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java index b019784b3a11..8d772cbcd6d5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java @@ -126,30 +126,26 @@ default void collect(DocIdStream stream) throws IOException { /** * Bulk-collect doc IDs. * - *

Note: The provided int[] may be reused across calls and should be consumed - * immediately. + *

Note: The provided int[] may be reused across calls and should be consumed immediately. * - *

Note: The provided int[] typically only holds a small subset of query matches. - * This method may be called multiple times per segment. + *

Note: The provided int[] typically only holds a small subset of query matches. This method + * may be called multiple times per segment. * *

Like {@link #collect(int)}, it is guaranteed that doc IDs get collected in order, ie. doc - * IDs are collected in order within a int[], and if called twice, all doc IDs from - * the second int[] will be greater than all doc IDs from the first int[]. - * - *

It is legal for callers to mix calls to {@link #collect(int[])}, {@link #collect(DocIdStream)} - * and {@link #collect(int)}. - * - *

The default implementation calls - * {@code - * for(int i = 0; i < count; ++i) { - * collect(docs[i]); - * }; - * }. + * IDs are collected in order within an int[], and if called twice, all doc IDs from the second + * int[] will be greater than all doc IDs from the first int[]. + * + *

It is legal for callers to mix calls to {@link #collect(int[], int)}, {@link + * #collect(DocIdStream)} and {@link #collect(int)}. + * + *

The default implementation calls {@code for(int i = 0; i < count; ++i) { collect(docs[i]); + * }; }. */ default void collect(int[] docs, int count) throws IOException { - for(int i = 0; i < count; ++i) { + for (int i = 0; i < count; ++i) { collect(docs[i]); - }; + } + ; } /** diff --git a/lucene/core/src/java/org/apache/lucene/search/Weight.java b/lucene/core/src/java/org/apache/lucene/search/Weight.java index 04e019d791b4..e10155e2df01 100644 --- a/lucene/core/src/java/org/apache/lucene/search/Weight.java +++ b/lucene/core/src/java/org/apache/lucene/search/Weight.java @@ -284,7 +284,8 @@ public int score(LeafCollector collector, Bits acceptDocs, int min, int max) } else if (competitiveIterator == null) { scoreTwoPhaseIterator(collector, acceptDocs, iterator, twoPhase, max, docBuffer); } else if (twoPhase == null) { - scoreCompetitiveIterator(collector, acceptDocs, iterator, competitiveIterator, max, docBuffer); + scoreCompetitiveIterator( + collector, acceptDocs, iterator, competitiveIterator, max, docBuffer); } else { scoreTwoPhaseOrCompetitiveIterator( collector, acceptDocs, iterator, twoPhase, competitiveIterator, max, docBuffer); @@ -383,7 +384,8 @@ private static void scoreTwoPhaseOrCompetitiveIterator( collect(collector, docs, count, -1); } - private static int collect(LeafCollector collector, int[] docs, int count, int docId) throws IOException { + private static int collect(LeafCollector collector, int[] docs, int count, int docId) + throws IOException { if (count == docs.length || docId == -1) { collector.collect(docs, count); count = 0; From 82fe4e63f93c79e7c68892fe59ffa2dccd031459 Mon Sep 17 00:00:00 2001 From: Ankit Jain Date: Wed, 29 Oct 2025 18:05:15 -0700 Subject: [PATCH 3/3] Minor fix Signed-off-by: Ankit Jain --- .../src/java/org/apache/lucene/search/LeafCollector.java | 1 - .../java/org/apache/lucene/search/TopScoreDocCollector.java | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git
a/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java b/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java index 8d772cbcd6d5..ac54455559e5 100644 --- a/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/LeafCollector.java @@ -145,7 +145,6 @@ default void collect(int[] docs, int count) throws IOException { for (int i = 0; i < count; ++i) { collect(docs[i]); } - ; } /** diff --git a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java index 09a7146f8246..90fbc4472373 100644 --- a/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java +++ b/lucene/core/src/java/org/apache/lucene/search/TopScoreDocCollector.java @@ -128,8 +128,10 @@ public void collect(int doc) throws IOException { } @Override - public void collect(int[] docs, int count) { - collect(docs, count); + public void collect(int[] docs, int count) throws IOException { + for (int i = 0; i < count; ++i) { + collect(docs[i]); + } } private void collectCompetitiveHit(int doc, float score) throws IOException {