Skip to content

Commit e806bb6

Browse files
authored
handle fast field search (#5906)
* handle fast field search
* add term set query tests
1 parent 3f79868 commit e806bb6

File tree

11 files changed

+319
-31
lines changed

11 files changed

+319
-31
lines changed

quickwit/quickwit-doc-mapper/src/query_builder.rs

Lines changed: 95 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -45,12 +45,86 @@ impl<'a> QueryAstVisitor<'a> for RangeQueryFields {
4545
}
4646
}
4747

48-
struct ExistsQueryFastFields {
49-
fields: HashSet<FastFieldWarmupInfo>,
48+
/// Term Queries on fields which are fast but not indexed.
49+
struct TermSearchOnColumnar<'f> {
50+
fields: &'f mut HashSet<FastFieldWarmupInfo>,
5051
schema: Schema,
5152
}
53+
impl<'a, 'f> QueryAstVisitor<'a> for TermSearchOnColumnar<'f> {
54+
type Err = Infallible;
55+
56+
fn visit_term_set(&mut self, term_set_query: &'a TermSetQuery) -> Result<(), Infallible> {
57+
for field in term_set_query.terms_per_field.keys() {
58+
if let Some((_field, field_entry, path)) =
59+
find_field_or_hit_dynamic(field, &self.schema)
60+
{
61+
if field_entry.is_fast() && !field_entry.is_indexed() {
62+
self.fields.insert(FastFieldWarmupInfo {
63+
name: if path.is_empty() {
64+
field_entry.name().to_string()
65+
} else {
66+
format!("{}.{}", field_entry.name(), path)
67+
},
68+
with_subfields: false,
69+
});
70+
}
71+
}
72+
}
73+
Ok(())
74+
}
75+
76+
fn visit_term(
77+
&mut self,
78+
term_query: &'a quickwit_query::query_ast::TermQuery,
79+
) -> Result<(), Infallible> {
80+
if let Some((_field, field_entry, path)) =
81+
find_field_or_hit_dynamic(&term_query.field, &self.schema)
82+
{
83+
if field_entry.is_fast() && !field_entry.is_indexed() {
84+
self.fields.insert(FastFieldWarmupInfo {
85+
name: if path.is_empty() {
86+
field_entry.name().to_string()
87+
} else {
88+
format!("{}.{}", field_entry.name(), path)
89+
},
90+
with_subfields: false,
91+
});
92+
}
93+
}
94+
Ok(())
95+
}
96+
/// We also need to visit full text queries because they can be converted to term queries
97+
/// on fast fields. We only care about the field being fast and not indexed AND the tokenizer
98+
/// being `raw` or None.
99+
fn visit_full_text(&mut self, full_text_query: &'a FullTextQuery) -> Result<(), Infallible> {
100+
if let Some((_field, field_entry, path)) =
101+
find_field_or_hit_dynamic(&full_text_query.field, &self.schema)
102+
{
103+
if field_entry.is_fast()
104+
&& !field_entry.is_indexed()
105+
&& (full_text_query.params.tokenizer.is_none()
106+
|| full_text_query.params.tokenizer.as_deref() == Some("raw"))
107+
{
108+
self.fields.insert(FastFieldWarmupInfo {
109+
name: if path.is_empty() {
110+
field_entry.name().to_string()
111+
} else {
112+
format!("{}.{}", field_entry.name(), path)
113+
},
114+
with_subfields: false,
115+
});
116+
}
117+
}
118+
Ok(())
119+
}
120+
}
52121

53-
impl<'a> QueryAstVisitor<'a> for ExistsQueryFastFields {
122+
struct ExistsQueryFastFields<'f> {
123+
fields: &'f mut HashSet<FastFieldWarmupInfo>,
124+
schema: Schema,
125+
}
126+
127+
impl<'a, 'f> QueryAstVisitor<'a> for ExistsQueryFastFields<'f> {
54128
type Err = Infallible;
55129

56130
fn visit_exists(&mut self, exists_query: &'a FieldPresenceQuery) -> Result<(), Infallible> {
@@ -88,18 +162,11 @@ pub(crate) fn build_query(
88162
search_fields: &[String],
89163
with_validation: bool,
90164
) -> Result<(Box<dyn Query>, WarmupInfo), QueryParserError> {
91-
let mut range_query_fields = RangeQueryFields::default();
92-
// This cannot fail. The error type is Infallible.
93-
let _: Result<(), Infallible> = range_query_fields.visit(query_ast);
165+
let mut fast_fields: HashSet<FastFieldWarmupInfo> = HashSet::new();
94166

95-
let mut exists_query_fields = ExistsQueryFastFields {
96-
fields: HashSet::new(),
97-
schema: schema.clone(),
98-
};
167+
let mut range_query_fields = RangeQueryFields::default();
99168
// This cannot fail. The error type is Infallible.
100-
let _: Result<(), Infallible> = exists_query_fields.visit(query_ast);
101-
102-
let mut fast_fields = HashSet::new();
169+
let Ok(_) = range_query_fields.visit(query_ast);
103170
let range_query_fast_fields =
104171
range_query_fields
105172
.range_query_field_names
@@ -109,7 +176,18 @@ pub(crate) fn build_query(
109176
with_subfields: false,
110177
});
111178
fast_fields.extend(range_query_fast_fields);
112-
fast_fields.extend(exists_query_fields.fields);
179+
180+
let Ok(_) = TermSearchOnColumnar {
181+
fields: &mut fast_fields,
182+
schema: schema.clone(),
183+
}
184+
.visit(query_ast);
185+
186+
let Ok(_) = ExistsQueryFastFields {
187+
fields: &mut fast_fields,
188+
schema: schema.clone(),
189+
}
190+
.visit(query_ast);
113191

114192
let query = query_ast.build_tantivy_query(
115193
&schema,
@@ -125,6 +203,9 @@ pub(crate) fn build_query(
125203
let mut terms_grouped_by_field: HashMap<Field, HashMap<_, bool>> = Default::default();
126204
query.query_terms(&mut |term, need_position| {
127205
let field = term.field();
206+
if !schema.get_field_entry(field).is_indexed() {
207+
return;
208+
}
128209
*terms_grouped_by_field
129210
.entry(field)
130211
.or_default()

quickwit/quickwit-query/src/query_ast/utils.rs

Lines changed: 14 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -17,15 +17,15 @@ use tantivy::json_utils::convert_to_fast_value_and_append_to_json_term;
1717
use tantivy::query::TermQuery as TantivyTermQuery;
1818
use tantivy::schema::{
1919
Field, FieldEntry, FieldType, IndexRecordOption, JsonObjectOptions, Schema as TantivySchema,
20-
Type,
20+
TextFieldIndexing, Type,
2121
};
2222

2323
use crate::InvalidQuery;
2424
use crate::MatchAllOrNone::MatchNone as TantivyEmptyQuery;
2525
use crate::json_literal::InterpretUserInput;
2626
use crate::query_ast::full_text_query::FullTextParams;
2727
use crate::query_ast::tantivy_query_ast::{TantivyBoolQuery, TantivyQueryAst};
28-
use crate::tokenizers::TokenizerManager;
28+
use crate::tokenizers::{RAW_TOKENIZER_NAME, TokenizerManager};
2929

3030
pub(crate) const DYNAMIC_FIELD_NAME: &str = "_dynamic";
3131

@@ -147,12 +147,18 @@ fn compute_query_with_field(
147147
Ok(make_term_query(term))
148148
}
149149
FieldType::Str(text_options) => {
150-
let text_field_indexing = text_options.get_indexing_options().ok_or_else(|| {
151-
InvalidQuery::SchemaError(format!(
152-
"field {} is not full-text searchable",
153-
field_entry.name()
154-
))
155-
})?;
150+
let columnar_opt = TextFieldIndexing::default()
151+
.set_fieldnorms(false)
152+
.set_tokenizer(RAW_TOKENIZER_NAME);
153+
let text_field_indexing = text_options
154+
.get_indexing_options()
155+
.or_else(|| text_options.is_fast().then_some(&columnar_opt))
156+
.ok_or_else(|| {
157+
InvalidQuery::SchemaError(format!(
158+
"field {} is not full-text searchable",
159+
field_entry.name()
160+
))
161+
})?;
156162
let terms = full_text_params.tokenize_text_into_terms(
157163
field,
158164
value,

quickwit/quickwit-query/src/tokenizers/mod.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ use self::chinese_compatible::ChineseTokenizer;
2828
pub use self::code_tokenizer::CodeTokenizer;
2929
#[cfg(feature = "multilang")]
3030
pub use self::multilang::MultiLangTokenizer;
31-
pub use self::tokenizer_manager::TokenizerManager;
31+
pub use self::tokenizer_manager::{RAW_TOKENIZER_NAME, TokenizerManager};
3232

3333
pub const DEFAULT_REMOVE_TOKEN_LENGTH: usize = 255;
3434

quickwit/quickwit-query/src/tokenizers/tokenizer_manager.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ use tantivy::tokenizer::{
2222

2323
use crate::DEFAULT_REMOVE_TOKEN_LENGTH;
2424

25-
const RAW_TOKENIZER_NAME: &str = "raw";
25+
pub const RAW_TOKENIZER_NAME: &str = "raw";
2626
const LOWERCASE_TOKENIZER_NAME: &str = "lowercase";
2727
const RAW_LOWERCASE_TOKENIZER_NAME: &str = "raw_lowercase";
2828

quickwit/rest-api-tests/scenarii/es_compatibility/0020-stats.yaml

Lines changed: 12 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,12 @@ expected:
6464
_all:
6565
primaries:
6666
docs:
67-
count: 100
67+
count: 102
6868
total:
6969
segments:
70-
count: 1
70+
count: 2
7171
docs:
72-
count: 100
72+
count: 102
7373
indices:
7474
gharchive:
7575
primaries:
@@ -80,6 +80,15 @@ expected:
8080
count: 1
8181
docs:
8282
count: 100
83+
fast_only:
84+
primaries:
85+
docs:
86+
count: 2
87+
total:
88+
segments:
89+
count: 1
90+
docs:
91+
count: 2
8392
empty_index:
8493
primaries:
8594
docs:

quickwit/rest-api-tests/scenarii/es_compatibility/0021-cat-indices.yaml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5,11 +5,13 @@ endpoint: "_cat/indices?format=json"
55
expected:
66
- index: empty_index
77
docs.count: '0'
8-
- dataset.size: 222.8kb
8+
- index: fast_only
9+
docs.count: '2'
10+
- index: gharchive
11+
dataset.size: 222.8kb
912
docs.count: '100'
1013
docs.deleted: '0'
1114
health: green
12-
index: gharchive
1315
pri: '1'
1416
pri.store.size:
1517
$expect: 270 < float(val[:-2]) < 280
Lines changed: 133 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,133 @@
1+
# Search for a term in a field that is not indexed but is a fast field
2+
engines:
3+
- quickwit
4+
endpoint: "fast_only/_search"
5+
params:
6+
size: 0
7+
json:
8+
query:
9+
term:
10+
fast_text: "abc-123"
11+
expected:
12+
hits:
13+
total:
14+
value: 1
15+
relation: "eq"
16+
--- # term query with no matches
17+
engines:
18+
- quickwit
19+
endpoint: "fast_only/_search"
20+
params:
21+
size: 0
22+
json:
23+
query:
24+
term:
25+
fast_text: "zzz"
26+
expected:
27+
hits:
28+
total:
29+
value: 0
30+
relation: "eq"
31+
32+
--- # term set query with partial match
33+
engines:
34+
- quickwit
35+
endpoint: "fast_only/_search"
36+
params:
37+
size: 0
38+
json:
39+
query:
40+
terms:
41+
fast_text:
42+
- "abc-123"
43+
- "zzz"
44+
expected:
45+
hits:
46+
total:
47+
value: 1
48+
relation: "eq"
49+
50+
--- # term set query with multiple matches
51+
engines:
52+
- quickwit
53+
endpoint: "fast_only/_search"
54+
params:
55+
size: 0
56+
json:
57+
query:
58+
terms:
59+
fast_text:
60+
- "abc-123"
61+
- "def-456"
62+
expected:
63+
hits:
64+
total:
65+
value: 2
66+
relation: "eq"
67+
68+
--- # term query on nested JSON field
69+
engines:
70+
- quickwit
71+
endpoint: "fast_only/_search"
72+
params:
73+
size: 0
74+
json:
75+
query:
76+
term:
77+
obj.nested_text: "abc-123"
78+
expected:
79+
hits:
80+
total:
81+
value: 1
82+
relation: "eq"
83+
84+
--- # term query on nested JSON field with no matches
85+
engines:
86+
- quickwit
87+
endpoint: "fast_only/_search"
88+
params:
89+
size: 0
90+
json:
91+
query:
92+
term:
93+
obj.nested_text: "zzz"
94+
expected:
95+
hits:
96+
total:
97+
value: 0
98+
relation: "eq"
99+
100+
--- # term set query on nested JSON field with multiple matches
101+
engines:
102+
- quickwit
103+
endpoint: "fast_only/_search"
104+
params:
105+
size: 0
106+
json:
107+
query:
108+
terms:
109+
obj.nested_text:
110+
- "abc-123"
111+
- "ghi-789"
112+
expected:
113+
hits:
114+
total:
115+
value: 2
116+
relation: "eq"
117+
118+
--- # term set query on nested JSON field with no matches
119+
engines:
120+
- quickwit
121+
endpoint: "fast_only/_search"
122+
params:
123+
size: 0
124+
json:
125+
query:
126+
terms:
127+
obj.nested_text:
128+
- "zzz"
129+
expected:
130+
hits:
131+
total:
132+
value: 0
133+
relation: "eq"

0 commit comments

Comments
 (0)