Skip to content
This repository was archived by the owner on Sep 30, 2024. It is now read-only.

Commit 80a8177

Browse files
authored
Search: improve keyword search prototype (#52233)
We have an experimental search type called `patterntype:keyword`. In testing it on Cody-style queries, it had worse relevance than our ripgrep implementation, and was sometimes quite slow.

This PR makes improvements to query analysis:
* Reduce the number of tokens we search by using a more aggressive stopword list
* Make stemming cheaper and less noisy by using the stem if it's a prefix of the original
* Limit the max number of tokens we'll search over
* Remove language detection because it was too noisy and made it hard to compare to other search strategies

It also improves ranking:
* Enable Zoekt's keyword scoring to rank documents by (approximate) BM25
* Remove unused ranking logic related to "match groups"

Addresses https://github.com/sourcegraph/sourcegraph/issues/50786
1 parent 5a553d7 commit 80a8177

File tree

10 files changed

+1073
-408
lines changed

10 files changed

+1073
-408
lines changed

internal/search/client/client.go

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -117,14 +117,17 @@ func (s *searchClient) Plan(
117117
}
118118
tr.LazyPrintf("parsing done")
119119

120+
features := ToFeatures(featureflag.FromContext(ctx), s.logger)
121+
features.KeywordScoring = searchType == query.SearchTypeKeyword
122+
120123
inputs := &search.Inputs{
121124
Plan: plan,
122125
Query: plan.ToQ(),
123126
OriginalQuery: searchQuery,
124127
SearchMode: searchMode,
125128
UserSettings: settings,
126129
OnSourcegraphDotCom: sourcegraphDotComMode,
127-
Features: ToFeatures(featureflag.FromContext(ctx), s.logger),
130+
Features: features,
128131
PatternType: searchType,
129132
Protocol: protocol,
130133
SanitizeSearchPatterns: sanitizeSearchPatterns(ctx, s.db, s.logger), // Experimental: check site config to see if search sanitization is enabled

internal/search/keyword/BUILD.bazel

Lines changed: 2 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/search/keyword/match_groups.go

Lines changed: 0 additions & 86 deletions
This file was deleted.

internal/search/keyword/match_groups_test.go

Lines changed: 0 additions & 83 deletions
This file was deleted.

internal/search/keyword/query_transformer.go

Lines changed: 12 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,11 @@ package keyword
33
import (
44
"strings"
55

6-
"github.com/go-enry/go-enry/v2"
7-
"github.com/kljensen/snowball"
8-
96
"github.com/sourcegraph/sourcegraph/internal/search/query"
107
)
118

9+
const maxTransformedPatterns = 10
10+
1211
type keywordQuery struct {
1312
query query.Basic
1413
patterns []string
@@ -44,7 +43,6 @@ func nodeToPatternsAndParameters(rootNode query.Node) ([]string, []query.Paramet
4443
// Only search file content
4544
{Field: query.FieldType, Value: "file"},
4645
}
47-
seenLangParameter := false
4846

4947
switch operator.Kind {
5048
case query.And:
@@ -58,9 +56,6 @@ func nodeToPatternsAndParameters(rootNode query.Node) ([]string, []query.Paramet
5856
if op.Field != query.FieldCount && op.Field != query.FieldCase && op.Field != query.FieldType {
5957
parameters = append(parameters, op)
6058
}
61-
if op.Field == query.FieldLang {
62-
seenLangParameter = true
63-
}
6459
case query.Pattern:
6560
patterns = append(patterns, op.Value)
6661
}
@@ -69,22 +64,6 @@ func nodeToPatternsAndParameters(rootNode query.Node) ([]string, []query.Paramet
6964
patterns = concatNodeToPatterns(operator)
7065
}
7166

72-
// Check if any of the patterns can be substituted as a lang: filter
73-
if !seenLangParameter {
74-
langPatternIdx := -1
75-
for idx, pattern := range patterns {
76-
langAlias, ok := enry.GetLanguageByAlias(pattern)
77-
if ok {
78-
parameters = append(parameters, query.Parameter{Field: query.FieldLang, Value: langAlias})
79-
langPatternIdx = idx
80-
break
81-
}
82-
}
83-
if langPatternIdx >= 0 {
84-
patterns = removeStringAtIndex(patterns, langPatternIdx)
85-
}
86-
}
87-
8867
return patterns, parameters
8968
}
9069

@@ -107,18 +86,19 @@ func transformPatterns(patterns []string) []string {
10786
}
10887

10988
for _, pattern := range patterns {
110-
patternLowerCase := strings.ToLower(pattern)
111-
112-
if stopWords.Has(patternLowerCase) {
89+
pattern = strings.ToLower(pattern)
90+
pattern = removePunctuation(pattern)
91+
if len(pattern) < 3 || isCommonTerm(pattern) {
11392
continue
11493
}
115-
add(patternLowerCase)
11694

117-
stemmed, err := snowball.Stem(patternLowerCase, "english", false)
118-
if err != nil {
119-
continue
120-
}
121-
add(stemmed)
95+
pattern = stemTerm(pattern)
96+
add(pattern)
97+
}
98+
99+
// To maintain decent latency, limit the number of patterns we search.
100+
if len(transformedPatterns) > maxTransformedPatterns {
101+
transformedPatterns = transformedPatterns[:maxTransformedPatterns]
122102
}
123103

124104
return transformedPatterns

internal/search/keyword/query_transformer_test.go

Lines changed: 8 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -11,41 +11,29 @@ import (
1111
func TestTransformPattern(t *testing.T) {
1212
patterns := []string{
1313
"compute",
14-
"K",
15-
"Means",
14+
"K", // very short terms should be removed
15+
"Means", // stop words should be removed
1616
"Clustering",
17-
"convert",
17+
"implement", // common code-related terms should be removed
1818
"int",
1919
"to",
2020
"string",
2121
"finding",
22-
"time",
23-
"elapsed",
22+
"\"time", // leading punctuation should be removed
23+
"elapsed\"", // trailing punctuation should be removed
2424
"using",
2525
"a",
2626
"timer",
2727
"computing",
28+
"!?", // punctuation-only token should be removed
2829
}
2930
wantPatterns := []string{
30-
"compute",
3131
"comput",
32-
"k",
33-
"means",
34-
"mean",
35-
"clustering",
3632
"cluster",
37-
"convert",
3833
"int",
3934
"string",
40-
"finding",
41-
"find",
42-
"time",
43-
"elapsed",
4435
"elaps",
45-
"using",
46-
"use",
4736
"timer",
48-
"computing",
4937
}
5038

5139
gotPatterns := transformPatterns(patterns)
@@ -75,8 +63,8 @@ func TestQueryStringToKeywordQuery(t *testing.T) {
7563
},
7664
{
7765
query: "K MEANS CLUSTERING in python",
78-
wantQuery: autogold.Expect("count:99999999 type:file lang:Python (k OR means OR mean OR clustering OR cluster)"),
79-
wantPatterns: autogold.Expect([]string{"k", "means", "mean", "clustering", "cluster"}),
66+
wantQuery: autogold.Expect("count:99999999 type:file (cluster OR python)"),
67+
wantPatterns: autogold.Expect([]string{"cluster", "python"}),
8068
},
8169
}
8270

0 commit comments

Comments
 (0)