-
Notifications
You must be signed in to change notification settings - Fork 25.6k
POC-2 - Auto prefiltering for semantic text queries #137739
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
6d68f0b
6e4ff43
9d6a2c8
eda1d9e
77e74bf
a34531c
3459d31
36e2695
7cfadd5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,41 @@ | ||
| /* | ||
| * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one | ||
| * or more contributor license agreements. Licensed under the "Elastic License | ||
| * 2.0", the "GNU Affero General Public License v3.0 only", and the "Server Side | ||
| * Public License v 1"; you may not use this file except in compliance with, at | ||
| * your election, the "Elastic License 2.0", the "GNU Affero General Public | ||
| * License v3.0 only", or the "Server Side Public License, v 1". | ||
| */ | ||
|
|
||
| package org.elasticsearch.index.query.support; | ||
|
|
||
| import org.elasticsearch.index.query.QueryBuilder; | ||
|
|
||
| import java.util.Deque; | ||
| import java.util.LinkedList; | ||
| import java.util.List; | ||
|
|
||
| /** | ||
| * During query parsing this keeps track of the current prefiltering level. | ||
| */ | ||
| public final class AutoPrefilteringScope implements AutoCloseable { | ||
|
|
||
| private final Deque<List<QueryBuilder>> prefiltersStack = new LinkedList<>(); | ||
|
|
||
| public List<QueryBuilder> getPrefilters() { | ||
| return prefiltersStack.stream().flatMap(List::stream).toList(); | ||
| } | ||
|
|
||
| public void push(List<QueryBuilder> prefilters) { | ||
| prefiltersStack.push(prefilters); | ||
| } | ||
|
|
||
| public void pop() { | ||
| prefiltersStack.pop(); | ||
| } | ||
|
|
||
| @Override | ||
| public void close() { | ||
| pop(); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change | ||
|---|---|---|---|---|
|
|
@@ -56,6 +56,9 @@ | |||
| * {@link org.apache.lucene.search.KnnByteVectorQuery}. | ||||
| */ | ||||
| public class KnnVectorQueryBuilder extends AbstractQueryBuilder<KnnVectorQueryBuilder> { | ||||
|
|
||||
| public static final TransportVersion AUTO_PREFILTERING = TransportVersion.fromName("knn_vector_query_auto_prefiltering"); | ||||
|
|
||||
| public static final String NAME = "knn"; | ||||
| private static final int NUM_CANDS_LIMIT = 10_000; | ||||
| private static final float NUM_CANDS_MULTIPLICATIVE_FACTOR = 1.5f; | ||||
|
|
@@ -121,7 +124,7 @@ public static KnnVectorQueryBuilder fromXContent(XContentParser parser) { | |||
| return PARSER.apply(parser, null); | ||||
| } | ||||
|
|
||||
| private static final TransportVersion VISIT_PERCENTAGE = TransportVersion.fromName("visit_percentage"); | ||||
| public static final TransportVersion VISIT_PERCENTAGE = TransportVersion.fromName("visit_percentage"); | ||||
|
|
||||
| private final String fieldName; | ||||
| private final VectorData queryVector; | ||||
|
|
@@ -133,6 +136,7 @@ public static KnnVectorQueryBuilder fromXContent(XContentParser parser) { | |||
| private final QueryVectorBuilder queryVectorBuilder; | ||||
| private final Supplier<float[]> queryVectorSupplier; | ||||
| private final RescoreVectorBuilder rescoreVectorBuilder; | ||||
| private boolean isAutoPrefiltering = false; | ||||
|
|
||||
| public KnnVectorQueryBuilder( | ||||
| String fieldName, | ||||
|
|
@@ -302,6 +306,9 @@ public KnnVectorQueryBuilder(StreamInput in) throws IOException { | |||
| } else { | ||||
| this.rescoreVectorBuilder = null; | ||||
| } | ||||
| if (in.getTransportVersion().supports(AUTO_PREFILTERING)) { | ||||
| this.isAutoPrefiltering = in.readBoolean(); | ||||
| } | ||||
|
|
||||
| this.queryVectorSupplier = null; | ||||
| } | ||||
|
|
@@ -357,6 +364,10 @@ public KnnVectorQueryBuilder addFilterQueries(List<QueryBuilder> filterQueries) | |||
| return this; | ||||
| } | ||||
|
|
||||
| public boolean isAutoPrefiltering() { | ||||
| return isAutoPrefiltering; | ||||
| } | ||||
|
|
||||
| @Override | ||||
| protected void doWriteTo(StreamOutput out) throws IOException { | ||||
| if (queryVectorSupplier != null) { | ||||
|
|
@@ -417,6 +428,9 @@ protected void doWriteTo(StreamOutput out) throws IOException { | |||
| if (out.getTransportVersion().supports(TransportVersions.V_8_18_0)) { | ||||
| out.writeOptionalWriteable(rescoreVectorBuilder); | ||||
| } | ||||
| if (out.getTransportVersion().supports(AUTO_PREFILTERING)) { | ||||
| out.writeBoolean(isAutoPrefiltering); | ||||
| } | ||||
| } | ||||
|
|
||||
| @Override | ||||
|
|
@@ -479,7 +493,7 @@ protected QueryBuilder doRewrite(QueryRewriteContext ctx) throws IOException { | |||
| visitPercentage, | ||||
| rescoreVectorBuilder, | ||||
| vectorSimilarity | ||||
| ).boost(boost).queryName(queryName).addFilterQueries(filterQueries); | ||||
| ).boost(boost).queryName(queryName).addFilterQueries(filterQueries).setAutoPrefiltering(isAutoPrefiltering); | ||||
| } | ||||
| if (queryVectorBuilder != null) { | ||||
| SetOnce<float[]> toSet = new SetOnce<>(); | ||||
|
|
@@ -509,7 +523,7 @@ protected QueryBuilder doRewrite(QueryRewriteContext ctx) throws IOException { | |||
| visitPercentage, | ||||
| rescoreVectorBuilder, | ||||
| vectorSimilarity | ||||
| ).boost(boost).queryName(queryName).addFilterQueries(filterQueries); | ||||
| ).boost(boost).queryName(queryName).addFilterQueries(filterQueries).setAutoPrefiltering(isAutoPrefiltering); | ||||
| } | ||||
| boolean changed = false; | ||||
| List<QueryBuilder> rewrittenQueries = new ArrayList<>(filterQueries.size()); | ||||
|
|
@@ -534,7 +548,7 @@ protected QueryBuilder doRewrite(QueryRewriteContext ctx) throws IOException { | |||
| visitPercentage, | ||||
| rescoreVectorBuilder, | ||||
| vectorSimilarity | ||||
| ).boost(boost).queryName(queryName).addFilterQueries(rewrittenQueries); | ||||
| ).boost(boost).queryName(queryName).addFilterQueries(rewrittenQueries).setAutoPrefiltering(isAutoPrefiltering); | ||||
| } | ||||
| if (ctx.convertToInnerHitsRewriteContext() != null) { | ||||
| QueryBuilder exactKnnQuery = new ExactKnnQueryBuilder(queryVector, fieldName, vectorSimilarity); | ||||
|
|
@@ -579,8 +593,9 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { | |||
| } | ||||
| DenseVectorFieldType vectorFieldType = (DenseVectorFieldType) fieldType; | ||||
|
|
||||
| List<Query> filtersInitial = new ArrayList<>(filterQueries.size()); | ||||
| for (QueryBuilder query : this.filterQueries) { | ||||
| List<QueryBuilder> allApplicableFilters = getAllApplicableFilters(context); | ||||
| List<Query> filtersInitial = new ArrayList<>(allApplicableFilters.size()); | ||||
| for (QueryBuilder query : allApplicableFilters) { | ||||
| filtersInitial.add(query.toQuery(context)); | ||||
| } | ||||
| if (context.getAliasFilter() != null) { | ||||
|
|
@@ -650,6 +665,14 @@ protected Query doToQuery(SearchExecutionContext context) throws IOException { | |||
| ); | ||||
| } | ||||
|
|
||||
| private List<QueryBuilder> getAllApplicableFilters(SearchExecutionContext context) { | ||||
| List<QueryBuilder> applicableFilters = new ArrayList<>(filterQueries); | ||||
| if (isAutoPrefiltering) { | ||||
| applicableFilters.addAll(context.autoPrefilteringScope().getPrefilters()); | ||||
| } | ||||
| return applicableFilters; | ||||
| } | ||||
|
Comment on lines
+668
to
+674
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. we should not mix and match user provided filters and auto-prefilters. It might lead to surprising behaviors.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Specifically, as a user, I would expect the filters I provide particularly are the only ones applied.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Which, I think we can safely assert that filters are empty here, right? Since the only thing that applies
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Right. As things are today, it is impossible to have auto and explicit prefilters at the same time. I'll safe-guard against mixing them.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, but unfortunately we are programmatically adding prefilters in Line 161 in fdeeded
Not sure how to enforce this. But we would have to mess up to apply auto prefiltering to a user knn query, as there is no way to enable auto prefiltering for a user knn query.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
🤔 This is messy, we need to ensure that users cannot do this.
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Just to be clear, users cannot end up getting auto-prefilters on their bespoke knn queries. They cannot enable
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
My worry is that somebody in the future will try to enable it. Can we add a test or something to signal this shouldn't be allowed?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Ah, I can add a test where we have explicit knn query under a bool and check it doesn't get any extra prefilters. With some commentary too. If a PR popped that changes that test people would at least have to read the warning, etc. Sounds good?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. yep! Thats great! Just making sure we have an adequate fence |
||||
|
|
||||
| private static Query buildFilterQuery(List<Query> filters) { | ||||
| BooleanQuery.Builder builder = new BooleanQuery.Builder(); | ||||
| for (Query f : filters) { | ||||
|
|
@@ -671,7 +694,8 @@ protected int doHashCode() { | |||
| filterQueries, | ||||
| vectorSimilarity, | ||||
| queryVectorBuilder, | ||||
| rescoreVectorBuilder | ||||
| rescoreVectorBuilder, | ||||
| isAutoPrefiltering | ||||
| ); | ||||
| } | ||||
|
|
||||
|
|
@@ -685,11 +709,17 @@ protected boolean doEquals(KnnVectorQueryBuilder other) { | |||
| && Objects.equals(filterQueries, other.filterQueries) | ||||
| && Objects.equals(vectorSimilarity, other.vectorSimilarity) | ||||
| && Objects.equals(queryVectorBuilder, other.queryVectorBuilder) | ||||
| && Objects.equals(rescoreVectorBuilder, other.rescoreVectorBuilder); | ||||
| && Objects.equals(rescoreVectorBuilder, other.rescoreVectorBuilder) | ||||
| && isAutoPrefiltering == other.isAutoPrefiltering; | ||||
| } | ||||
|
|
||||
| @Override | ||||
| public TransportVersion getMinimalSupportedVersion() { | ||||
| return TransportVersions.V_8_0_0; | ||||
| } | ||||
|
|
||||
| public KnnVectorQueryBuilder setAutoPrefiltering(boolean isAutoPrefiltering) { | ||||
| this.isAutoPrefiltering = isAutoPrefiltering; | ||||
| return this; | ||||
| } | ||||
| } | ||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| 9216000 |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1 +1 @@ | ||
| inference_api_eis_authorization_persistent_task,9215000 | ||
| knn_vector_query_auto_prefiltering,9216000 |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This mustNot thing is tricky, since we already need to iterate all of it again when pushing down stream, why not have this
collectPrefiltershave aQueryBuilderargument, then we bypass the self concern, and then this can simply returnQueryBuilderwhich is a single BooleanQuery we pass down (as it will be a boolean query anyways ultimately once its finally used right?)There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There is no self concern for
must_notas we do not apply prefilters to those. I'll add assertions inaddBooleanClausesto communicate that contract and keep things simple.