From 0e0fe5a61116f8977d0793f2361b1b68a420f44e Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 2 Dec 2025 15:36:48 +0100 Subject: [PATCH 1/2] Continuing tweaking search relevance --- config/synonyms.yml | 6 +- .../INavigationTraversable.cs | 53 +++++-- .../package-lock.json | 36 ++--- .../Search/DocumentationDocument.cs | 9 ++ .../DocumentationGenerator.cs | 6 +- .../Elasticsearch/ElasticsearchExporter.cs | 143 +++++++++++------- .../ElasticsearchMarkdownExporter.cs | 89 ++++++++--- .../Exporters/IMarkdownExporter.cs | 1 + .../Adapters/Search/ElasticsearchGateway.cs | 94 ++++++++---- .../Search/SearchBootstrapFixture.cs | 3 +- .../SearchRelevanceTests.cs | 21 ++- 11 files changed, 303 insertions(+), 158 deletions(-) diff --git a/config/synonyms.yml b/config/synonyms.yml index 79403ec52..036f47704 100644 --- a/config/synonyms.yml +++ b/config/synonyms.yml @@ -1,6 +1,8 @@ synonyms: - [ ".net", "c#", "csharp", "dotnet", "net" ] - - [ "esql", "es|ql" ] + - [ "esql", "es|ql => esql" ] + - [ "data-stream", "data stream", "datastream => data-streams"] + - [ "data-streams", "data streams", "datastreams"] - [ "motlp", "managed otlp" ] - [ "s3", "aws s3", "amazon s3" ] - [ "es", "elasticsearch" ] @@ -27,7 +29,7 @@ synonyms: - [ "edot", "elastic distribution of opentelemetry" ] - [ "k8s", "kubernetes" ] - [ "ecs", "elastic common schema" ] - - [ "ml", "machine learning" ] + - [ "machine-learning", "machine learning", "ml => machine learning" ] - [ "eis", "elastic inference service" ] - [ "traffic filter", "network security" ] - [ "sso", "single sign-on" ] diff --git a/src/Elastic.Documentation.Navigation/INavigationTraversable.cs b/src/Elastic.Documentation.Navigation/INavigationTraversable.cs index 27ff3c5c0..3afbe96fe 100644 --- a/src/Elastic.Documentation.Navigation/INavigationTraversable.cs +++ b/src/Elastic.Documentation.Navigation/INavigationTraversable.cs @@ -7,6 +7,42 @@ namespace Elastic.Documentation.Navigation; +public static class NavigationExtensions +{ + extension(INavigationItem navigationItem) + { + public INavigationItem[] GetParents() + { + var parents = new List(); + var parent = navigationItem.Parent; + do + { + if (parent is null) + continue; + if (parents.All(i => i.Url != parent.Url)) + parents.Add(parent); + + parent = parent.Parent; + } while (parent != null); + + return [.. parents]; + } + + public int NavigationDepth => navigationItem.GetParents().Length; + + public string? NavigationSection + { + get + { + var parents = navigationItem.GetParents(); + if (parents.Length <= 1) + return navigationItem.NavigationTitle.ToLowerInvariant(); + return parents.Reverse().Skip(1).FirstOrDefault()?.NavigationTitle.ToLowerInvariant(); + } + } + } +} + public interface INavigationTraversable { ConditionalWeakTable NavigationDocumentationFileLookup { get; } @@ -71,22 +107,7 @@ INavigationItem GetNavigationFor(IDocumentationFile file) => NavigationDocumentationFileLookup.TryGetValue(file, out var navigation) ? navigation : throw new InvalidOperationException($"Could not find {file.NavigationTitle} in navigation"); - INavigationItem[] GetParents(INavigationItem current) - { - var parents = new List(); - var parent = current.Parent; - do - { - if (parent is null) - continue; - if (parents.All(i => i.Url != parent.Url)) - parents.Add(parent); - - parent = parent.Parent; - } while (parent != null); - - return [.. parents]; - } + INavigationItem[] GetParents(INavigationItem current) => current.GetParents(); INavigationItem[] GetParentsOfMarkdownFile(IDocumentationFile file) => NavigationDocumentationFileLookup.TryGetValue(file, out var navigation) ? GetParents(navigation) : []; diff --git a/src/Elastic.Documentation.Site/package-lock.json b/src/Elastic.Documentation.Site/package-lock.json index 19a97384b..615a512ca 100644 --- a/src/Elastic.Documentation.Site/package-lock.json +++ b/src/Elastic.Documentation.Site/package-lock.json @@ -149,7 +149,6 @@ "integrity": "sha512-2BCOP7TN8M+gVDj7/ht3hsaO/B/n5oDbiAyyvnRlNOs+u1o+JWNYTQrmpuNp1/Wq2gcFrI01JAW+paEKDMx/CA==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@babel/code-frame": "^7.27.1", "@babel/generator": "^7.28.3", @@ -1998,7 +1997,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=18" }, @@ -2022,7 +2020,6 @@ } ], "license": "MIT", - "peer": true, "engines": { "node": ">=18" } @@ -2328,7 +2325,6 @@ "version": "11.13.5", "resolved": "https://registry.npmjs.org/@emotion/css/-/css-11.13.5.tgz", "integrity": "sha512-wQdD0Xhkn3Qy2VNcIzbLP9MR8TafI0MJb7BEAXKp+w4+XqErksWR4OXomuDzPsN4InLdGhVe6EYcn2ZIUCpB8w==", - "peer": true, "dependencies": { "@emotion/babel-plugin": "^11.13.5", "@emotion/cache": "^11.13.5", @@ -2351,7 +2347,6 @@ "version": "11.14.0", "resolved": "https://registry.npmjs.org/@emotion/react/-/react-11.14.0.tgz", "integrity": "sha512-O000MLDBDdk/EohJPFUqvnp4qnHeYkVP5B0xEG0D/L7cOKP9kefu2DXn8dj74cQfsEzUqh+sr1RzFqiL1o+PpA==", - "peer": true, "dependencies": { "@babel/runtime": "^7.18.3", "@emotion/babel-plugin": "^11.13.5", @@ -4479,7 +4474,6 @@ "resolved": "https://registry.npmjs.org/@opentelemetry/api/-/api-1.9.0.tgz", "integrity": "sha512-3giAOQvZiH5F9bMlMiv8+GSPMeqg0dbaeo58/0SlA9sxSqZhnUtxzX9/2FzyhS9sWQf5S0GJE0AKBrFqjpeYcg==", "license": "Apache-2.0", - "peer": true, "engines": { "node": ">=8.0.0" } @@ -4873,7 +4867,6 @@ "integrity": "sha512-erH9GdLe8Boie0mCO8hXn8Qt/pCACsOFlKp8UHNMlPaizUtCDkCOQqwmSi+VyrJ3dMMCOc/qBwTSGAJaJE8/Kw==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@mischnic/json-sourcemap": "^0.1.1", "@parcel/cache": "2.16.0", @@ -7380,7 +7373,6 @@ "resolved": "https://registry.npmjs.org/@trivago/prettier-plugin-sort-imports/-/prettier-plugin-sort-imports-5.2.2.tgz", "integrity": "sha512-fYDQA9e6yTNmA13TLVSA+WMQRc5Bn/c0EUBditUHNfMMxN7M82c38b1kEggVE3pLpZ0FwkwJkUEKMiOi52JXFA==", "dev": true, - "peer": true, "dependencies": { "@babel/generator": "^7.26.5", "@babel/parser": "^7.26.7", @@ -7433,7 +7425,8 @@ "version": "5.0.4", "resolved": "https://registry.npmjs.org/@types/aria-query/-/aria-query-5.0.4.tgz", "integrity": "sha512-rfT93uj5s0PRL7EzccGMs3brplhcrghnDoV26NqKhCAS1hVo+WdNsPvE/yb6ilfr5hi2MEk6d5EWJTKdxg8jVw==", - "dev": true + "dev": true, + "peer": true }, "node_modules/@types/babel__core": { "version": "7.20.5", @@ -7648,7 +7641,6 @@ "version": "18.3.23", "resolved": "https://registry.npmjs.org/@types/react/-/react-18.3.23.tgz", "integrity": "sha512-/LDXMQh55EzZQ0uVAZmKKhfENivEvWz6E+EYzh+/MCjMhNsotd+ZHhBGIjFDTi6+fz0OhQQQLbTgdQIxxCsC0w==", - "peer": true, "dependencies": { "@types/prop-types": "*", "csstype": "^3.0.2" @@ -7780,7 +7772,6 @@ "integrity": "sha512-tK3GPFWbirvNgsNKto+UmB/cRtn6TZfyw0D6IKrW55n6Vbs7KJoZtI//kpTKzE/DUmmnAFD8/Ca46s7Obs92/w==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@typescript-eslint/scope-manager": "8.46.4", "@typescript-eslint/types": "8.46.4", @@ -8310,7 +8301,6 @@ "version": "8.15.0", "resolved": "https://registry.npmjs.org/acorn/-/acorn-8.15.0.tgz", "integrity": "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg==", - "peer": true, "bin": { "acorn": "bin/acorn" }, @@ -8837,7 +8827,6 @@ "url": "https://github.com/sponsors/ai" } ], - "peer": true, "dependencies": { "caniuse-lite": "^1.0.30001726", "electron-to-chromium": "^1.5.173", @@ -9458,7 +9447,8 @@ "version": "0.5.16", "resolved": "https://registry.npmjs.org/dom-accessibility-api/-/dom-accessibility-api-0.5.16.tgz", "integrity": "sha512-X7BJ2yElsnOJ30pZF4uIIDfBEVgF4XEBxL9Bxhy6dnrm5hkzqmsWHGTiHqRiITNhMyFLyAiWndIJP7Z1NTteDg==", - "dev": true + "dev": true, + "peer": true }, "node_modules/dompurify": { "version": "3.2.7", @@ -9716,7 +9706,6 @@ "integrity": "sha512-BhHmn2yNOFA9H9JmmIVKJmd288g9hrVRDkdoIgRCRuSySRUHH7r/DI6aAXW9T1WwUuY3DFgrcaqB+deURBLR5g==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "@eslint-community/eslint-utils": "^4.8.0", "@eslint-community/regexpp": "^4.12.1", @@ -12338,7 +12327,6 @@ "integrity": "sha512-Cvc9WUhxSMEo4McES3P7oK3QaXldCfNWp7pl2NNeiIFlCoLr3kfq9kb1fxftiwk1FLV7CvpvDfonxtzUDeSOPg==", "dev": true, "license": "MIT", - "peer": true, "dependencies": { "cssstyle": "^4.2.1", "data-urls": "^5.0.0", @@ -12895,6 +12883,7 @@ "resolved": "https://registry.npmjs.org/lz-string/-/lz-string-1.5.0.tgz", "integrity": "sha512-h5bgJWpxJNswbU7qCrV0tIKQCaS3blPDrqKWx+QxzuzL1zGUzij9XCWLrSLsJPu5t+eWA/ycetzYAO5IOMcWAQ==", "dev": true, + "peer": true, "bin": { "lz-string": "bin/bin.js" } @@ -13141,7 +13130,6 @@ "version": "2.30.1", "resolved": "https://registry.npmjs.org/moment/-/moment-2.30.1.tgz", "integrity": "sha512-uEmtNhbDOrWPFS+hdjFCBfy9f2YoyzRpwcl+DqpC6taX21FzsTLQVbMV/W7PzNSX6x/bhC1zA3c2UQ5NzH6how==", - "peer": true, "engines": { "node": "*" } @@ -13754,7 +13742,6 @@ "url": "https://github.com/sponsors/ai" } ], - "peer": true, "dependencies": { "nanoid": "^3.3.11", "picocolors": "^1.1.1", @@ -13801,7 +13788,6 @@ "resolved": "https://registry.npmjs.org/prettier/-/prettier-3.6.2.tgz", "integrity": "sha512-I7AIg5boAr5R0FFtJ6rCfD+LFsWHp81dolrFD8S79U9tb8Az2nGrJncnMSnys+bpQJfRUzqs9hnA81OAA3hCuQ==", "dev": true, - "peer": true, "bin": { "prettier": "bin/prettier.cjs" }, @@ -13896,6 +13882,7 @@ "resolved": "https://registry.npmjs.org/pretty-format/-/pretty-format-27.5.1.tgz", "integrity": "sha512-Qb1gy5OrP5+zDf2Bvnzdl3jsTf1qXVMazbvCoKhtKqVs4/YK4ozX4gKQJJVyNe+cajNPn0KoC0MC3FUmaHWEmQ==", "dev": true, + "peer": true, "dependencies": { "ansi-regex": "^5.0.1", "ansi-styles": "^5.0.0", @@ -13910,6 +13897,7 @@ "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-5.2.0.tgz", "integrity": "sha512-Cxwpt2SfTzTtXcfOlzGEee8O+c+MmUgGrNiBcXnuWxuFJHe6a5Hz7qwhwe5OgaSYI0IJvkLqWX1ASG+cJOkEiA==", "dev": true, + "peer": true, "engines": { "node": ">=10" }, @@ -14059,7 +14047,6 @@ "version": "18.3.1", "resolved": "https://registry.npmjs.org/react/-/react-18.3.1.tgz", "integrity": "sha512-wS+hAgJShR0KhEvPJArfuPVN1+Hz1t0Y6n5jLrGQbkb4urgPE/0Rve+1kMB1v/oWgHgm4WIcV+i7F2pTVj+2iQ==", - "peer": true, "dependencies": { "loose-envify": "^1.1.0" }, @@ -14082,7 +14069,6 @@ "version": "18.3.1", "resolved": "https://registry.npmjs.org/react-dom/-/react-dom-18.3.1.tgz", "integrity": "sha512-5m4nQKp+rZRb09LNH59GM4BxTh9251/ylbKIbpe7TpGxfJ+9kv6BLkLBXIjjspbgbnIBNqlI23tRnTWT0snUIw==", - "peer": true, "dependencies": { "loose-envify": "^1.1.0", "scheduler": "^0.23.2" @@ -14419,7 +14405,6 @@ "version": "4.2.1", "resolved": "https://registry.npmjs.org/redux/-/redux-4.2.1.tgz", "integrity": "sha512-LAUYz4lc+Do8/g7aeRa8JkyDErK6ekstQaqWQrNRW//MY1TvCEpMtpTWvlQ+FPbWCx+Xixu/6SHt5N0HR+SB4w==", - "peer": true, "dependencies": { "@babel/runtime": "^7.9.2" } @@ -15479,7 +15464,8 @@ "node_modules/tslib": { "version": "1.14.1", "resolved": "https://registry.npmjs.org/tslib/-/tslib-1.14.1.tgz", - "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==" + "integrity": "sha512-Xni35NKzjgMrwevysHTCArtLDpPvye8zV/0E4EyYn43P7/7qvQwPh9BGkHewbMulVntbigmcT7rdX3BNo9wRJg==", + "peer": true }, "node_modules/type-check": { "version": "0.4.0", @@ -15526,7 +15512,6 @@ "resolved": "https://registry.npmjs.org/typescript/-/typescript-5.9.3.tgz", "integrity": "sha512-jl1vZzPDinLr9eUt3J/t7V6FgNEw9QjvBPdysz9KfQDD41fQrC2Y4vKQdiaUpFT4bXlb1RHhLpp8wtm6M5TgSw==", "license": "Apache-2.0", - "peer": true, "bin": { "tsc": "bin/tsc", "tsserver": "bin/tsserver" @@ -16372,8 +16357,7 @@ "version": "0.15.1", "resolved": "https://registry.npmjs.org/zone.js/-/zone.js-0.15.1.tgz", "integrity": "sha512-XE96n56IQpJM7NAoXswY3XRLcWFW83xe0BiAOeMD7K5k5xecOeul3Qcpx6GqEeeHNkW5DWL5zOyTbEfB4eti8w==", - "license": "MIT", - "peer": true + "license": "MIT" }, "node_modules/zustand": { "version": "5.0.8", diff --git a/src/Elastic.Documentation/Search/DocumentationDocument.cs b/src/Elastic.Documentation/Search/DocumentationDocument.cs index 3ca50c854..cb16b1fcd 100644 --- a/src/Elastic.Documentation/Search/DocumentationDocument.cs +++ b/src/Elastic.Documentation/Search/DocumentationDocument.cs @@ -25,6 +25,15 @@ public record DocumentationDocument [JsonPropertyName("url")] public string Url { get; set; } = string.Empty; + [JsonPropertyName("navigation_depth")] + public int NavigationDepth { get; set; } = 50; //default to a high number so that omission gets penalized. + + [JsonPropertyName("navigation_table_of_contents")] + public int NavigationTableOfContents { get; set; } = 50; //default to a high number so that omission gets penalized. + + [JsonPropertyName("navigation_section")] + public string? NavigationSection { get; set; } + /// The date of the batch update this document was part of last. /// This date could be higher than the date_last_updated. [JsonPropertyName("batch_index_date")] diff --git a/src/Elastic.Markdown/DocumentationGenerator.cs b/src/Elastic.Markdown/DocumentationGenerator.cs index 067f1ece2..2942bed95 100644 --- a/src/Elastic.Markdown/DocumentationGenerator.cs +++ b/src/Elastic.Markdown/DocumentationGenerator.cs @@ -69,6 +69,7 @@ public DocumentationGenerator( _logger = logFactory.CreateLogger(nameof(DocumentationGenerator)); DocumentationSet = docSet; + PositionalNavigation = positionalNavigation ?? docSet; Context = docSet.Context; var productVersionInferrer = new ProductVersionInferrerService(DocumentationSet.Context.ProductsConfiguration, DocumentationSet.Context.VersionsConfiguration); HtmlWriter = new HtmlWriter(DocumentationSet, _writeFileSystem, new DescriptionGenerator(), positionalNavigation, navigationHtmlWriter, legacyUrlMapper, productVersionInferrer); @@ -83,6 +84,8 @@ public DocumentationGenerator( _logger.LogInformation("Output directory: {OutputPath} Exists: {OutputPathExists}", docSet.OutputDirectory, docSet.OutputDirectory.Exists); } + private INavigationTraversable PositionalNavigation { get; } + public GenerationState? GetPreviousGenerationState() { var stateFile = DocumentationSet.OutputStateFile; @@ -256,7 +259,7 @@ private async Task ProcessFile(HashSet offendingFiles, DocumentationFile foreach (var exporter in _markdownExporters) { var document = context.MarkdownDocument ??= await markdown.ParseFullAsync(DocumentationSet.TryFindDocumentByRelativePath, ctx); - var navigationItem = DocumentationSet.FindNavigationByMarkdown(markdown); + var navigationItem = PositionalNavigation.GetNavigationFor(markdown); _ = await exporter.ExportAsync(new MarkdownExportFileContext { BuildContext = Context, @@ -265,6 +268,7 @@ private async Task ProcessFile(HashSet offendingFiles, DocumentationFile SourceFile = markdown, DefaultOutputFile = outputFile, DocumentationSet = DocumentationSet, + PositionaNavigation = PositionalNavigation, NavigationItem = navigationItem }, ctx); } diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs index ae8c7b517..4e96e09f9 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchExporter.cs @@ -20,7 +20,8 @@ public class ElasticsearchLexicalExporter( IDiagnosticsCollector collector, ElasticsearchEndpoint endpoint, string indexNamespace, - DistributedTransport transport + DistributedTransport transport, + string[] indexTimeSynonyms ) : ElasticsearchExporter, CatalogIndexChannel> (logFactory, collector, endpoint, transport, o => new(o), t => new(t) @@ -33,7 +34,7 @@ DistributedTransport transport { "batch_index_date", d.BatchIndexDate.ToString("o") } }), GetMapping = () => CreateMapping(null), - GetMappingSettings = () => CreateMappingSetting($"docs-{indexNamespace}"), + GetMappingSettings = () => CreateMappingSetting($"docs-{indexNamespace}", indexTimeSynonyms), IndexFormat = $"{endpoint.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}-{{0:yyyy.MM.dd.HHmmss}}", ActiveSearchAlias = $"{endpoint.IndexNamePrefix.Replace("semantic", "lexical").ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}" @@ -44,14 +45,15 @@ public class ElasticsearchSemanticExporter( IDiagnosticsCollector collector, ElasticsearchEndpoint endpoint, string indexNamespace, - DistributedTransport transport + DistributedTransport transport, + string[] indexTimeSynonyms ) : ElasticsearchExporter, SemanticIndexChannel> (logFactory, collector, endpoint, transport, o => new(o), t => new(t) { BulkOperationIdLookup = d => d.Url, GetMapping = (inferenceId, _) => CreateMapping(inferenceId), - GetMappingSettings = (_, _) => CreateMappingSetting($"docs-{indexNamespace}"), + GetMappingSettings = (_, _) => CreateMappingSetting($"docs-{indexNamespace}", indexTimeSynonyms), IndexFormat = $"{endpoint.IndexNamePrefix.ToLowerInvariant()}-{indexNamespace.ToLowerInvariant()}-{{0:yyyy.MM.dd.HHmmss}}", ActiveSearchAlias = $"{endpoint.IndexNamePrefix}-{indexNamespace.ToLowerInvariant()}", IndexNumThreads = endpoint.IndexNumThreads, @@ -103,7 +105,14 @@ Func createOptions _logger.LogError(e, "Failed to export document"); _collector.EmitGlobalError("Elasticsearch export: failed to export document", e); }; - options.ServerRejectionCallback = items => _logger.LogInformation("Server rejection: {Rejection}", items.First().Item2); + options.ServerRejectionCallback = items => + { + foreach (var (doc, responseItem) in items) + { + _collector.EmitGlobalError( + $"Server rejection: {responseItem.Status} {responseItem.Error?.Type} {responseItem.Error?.Reason} for document {doc.Url}"); + } + }; Channel = createChannel(options); _logger.LogInformation("Created {Channel} Elasticsearch target for indexing", typeof(TChannel).Name); } @@ -140,54 +149,69 @@ public async ValueTask TryWrite(DocumentationDocument document, Cancel ctx return false; } - - protected static string CreateMappingSetting(string synonymSetName) => + protected static string CreateMappingSetting(string synonymSetName, string[] synonyms) + { + var indexTimeSynonyms = $"[{string.Join(",", synonyms.Select(r => $"\"{r}\""))}]"; // language=json - $$""" - { - "analysis": { - "analyzer": { - "synonyms_analyzer": { - "tokenizer": "group_tokenizer", - "filter": [ - "lowercase", - "synonyms_filter", - "kstem" - ] - }, - "highlight_analyzer": { - "tokenizer": "group_tokenizer", - "filter": [ - "lowercase", - "english_stop" - ] - }, - "hierarchy_analyzer": { "tokenizer": "path_tokenizer" } - }, - "filter": { - "synonyms_filter": { - "type": "synonym_graph", - "synonyms_set": "{{synonymSetName}}", - "updateable": true - }, - "english_stop": { - "type": "stop", - "stopwords": "_english_" - } - }, - "tokenizer": { - "group_tokenizer": { - "type": "char_group", - "tokenize_on_chars": [ "whitespace", ",", ";", "?", "!", "(", ")", "&", "'", "\"", "/", "[", "]", "{", "}" ] - }, - "path_tokenizer": { - "type": "path_hierarchy", - "delimiter": "/" - } - } - } - } - """; + return + $$$""" + { + "analysis": { + "analyzer": { + "synonyms_fixed_analyzer": { + "tokenizer": "group_tokenizer", + "filter": [ + "lowercase", + "synonyms_fixed_filter", + "kstem" + ] + }, + "synonyms_analyzer": { + "tokenizer": "group_tokenizer", + "filter": [ + "lowercase", + "synonyms_filter", + "kstem" + ] + }, + "highlight_analyzer": { + "tokenizer": "group_tokenizer", + "filter": [ + "lowercase", + "english_stop" + ] + }, + "hierarchy_analyzer": { "tokenizer": "path_tokenizer" } + }, + "filter": { + "synonyms_fixed_filter": { + "type": "synonym_graph", + "synonyms": {{{indexTimeSynonyms}}} + }, + "synonyms_filter": { + "type": "synonym_graph", + "synonyms_set": "{{{synonymSetName}}}", + "updateable": true + }, + "english_stop": { + "type": "stop", + "stopwords": "_english_" + } + }, + "tokenizer": { + "group_tokenizer": { + "type": "char_group", + "tokenize_on_chars": [ "whitespace", ",", ";", "?", "!", "(", ")", "&", "'", "\"", "/", "[", "]", "{", "}" ] + }, + "path_tokenizer": { + "type": "path_hierarchy", + "delimiter": "/" + } + } + } + } + """; + } protected static string CreateMapping(string? inferenceId) => $$""" @@ -200,6 +224,9 @@ protected static string CreateMapping(string? inferenceId) => "prefix": { "type": "text", "analyzer" : "hierarchy_analyzer" } } }, + "navigation_depth" : { "type" : "rank_feature", "positive_score_impact": false }, + "navigation_table_of_contents" : { "type" : "rank_feature", "positive_score_impact": false }, + "navigation_section" : { "type" : "keyword" }, "hidden" : { "type" : "boolean" }, @@ -234,9 +261,16 @@ protected static string CreateMapping(string? inferenceId) => "hash" : { "type" : "keyword" }, "search_title": { "type": "text", + "analyzer": "synonyms_fixed_analyzer", "search_analyzer": "synonyms_analyzer", "fields": { - "completion": { "type": "search_as_you_type" } + "completion": { + "type": "search_as_you_type", + "analyzer": "synonyms_fixed_analyzer", + "search_analyzer": "synonyms_analyzer", + "term_vector": "with_positions_offsets", + "index_options": "offsets" + } } }, "title": { @@ -256,15 +290,18 @@ protected static string CreateMapping(string? inferenceId) => }, "stripped_body": { "type": "text", + "analyzer": "synonyms_fixed_analyzer", "search_analyzer": "synonyms_analyzer", "term_vector": "with_positions_offsets" }, "headings": { "type": "text", + "analyzer": "synonyms_fixed_analyzer", "search_analyzer": "synonyms_analyzer" }, "abstract": { "type" : "text", + "analyzer": "synonyms_fixed_analyzer", "search_analyzer": "synonyms_analyzer", "fields" : { {{(!string.IsNullOrWhiteSpace(inferenceId) ? $"\"semantic_text\": {{{InferenceMapping(inferenceId)}}}" : "")}} diff --git a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs index 815e3deb7..cedd1eb67 100644 --- a/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/Elasticsearch/ElasticsearchMarkdownExporter.cs @@ -17,10 +17,11 @@ using Elastic.Markdown.Helpers; using Elastic.Transport; using Elastic.Transport.Products.Elasticsearch; -using Markdig.Parsers; using Markdig.Syntax; using Microsoft.Extensions.Logging; using NetEscapades.EnumGenerators; +using static System.StringSplitOptions; +using MarkdownParser = Markdig.Parsers.MarkdownParser; namespace Elastic.Markdown.Exporters.Elasticsearch; @@ -46,6 +47,7 @@ public class ElasticsearchMarkdownExporter : IMarkdownExporter, IDisposable private readonly IReadOnlyCollection _synonyms; private readonly VersionsConfiguration _versionsConfiguration; + private readonly string _fixedSynonymsHash; public ElasticsearchMarkdownExporter( ILoggerFactory logFactory, @@ -89,8 +91,18 @@ IDocumentationConfigurationContext context }; _transport = new DistributedTransport(configuration); - _lexicalChannel = new ElasticsearchLexicalExporter(logFactory, collector, es, indexNamespace, _transport); - _semanticChannel = new ElasticsearchSemanticExporter(logFactory, collector, es, indexNamespace, _transport); + + string[] fixedSynonyms = ["esql", "data-stream", "data-streams", "machine-learning"]; + var indexTimeSynonyms = _synonyms.Aggregate(new List(), (acc, synonym) => + { + var id = synonym.Split(",", RemoveEmptyEntries)[0].Trim(); + acc.Add(new SynonymRule { Id = id, Synonyms = synonym }); + return acc; + }).Where(r => fixedSynonyms.Contains(r.Id)).Select(r => r.Synonyms).ToArray(); + _fixedSynonymsHash = HashedBulkUpdate.CreateHash(string.Join(",", indexTimeSynonyms)); + + _lexicalChannel = new ElasticsearchLexicalExporter(logFactory, collector, es, indexNamespace, _transport, indexTimeSynonyms); + _semanticChannel = new ElasticsearchSemanticExporter(logFactory, collector, es, indexNamespace, _transport, indexTimeSynonyms); } /// @@ -99,7 +111,7 @@ public async ValueTask StartAsync(Cancel ctx = default) _currentLexicalHash = await _lexicalChannel.Channel.GetIndexTemplateHashAsync(ctx) ?? string.Empty; _currentSemanticHash = await _semanticChannel.Channel.GetIndexTemplateHashAsync(ctx) ?? string.Empty; - await PublishSynonymsAsync($"docs-{_indexNamespace}", ctx); + await PublishSynonymsAsync(ctx); _ = await _lexicalChannel.Channel.BootstrapElasticsearchAsync(BootstrapMethod.Failure, null, ctx); // if the previous hash does not match the current hash, we know already we want to multiplex to a new index @@ -146,25 +158,24 @@ public async ValueTask StartAsync(Cancel ctx = default) async ValueTask IndexExists(string name) => (await _transport.HeadAsync(name, ctx)).ApiCallDetails.HasSuccessfulStatusCode; } - private async Task PublishSynonymsAsync(string setName, CancellationToken ctx) + private async Task PublishSynonymsAsync(Cancel ctx) { + var setName = $"docs-{_indexNamespace}"; _logger.LogInformation("Publishing synonym set '{SetName}' to Elasticsearch", setName); var synonymRules = _synonyms.Aggregate(new List(), (acc, synonym) => { - acc.Add(new SynonymRule - { - Id = synonym.Split(",", StringSplitOptions.RemoveEmptyEntries)[0].Trim(), - Synonyms = synonym - }); + var id = synonym.Split(",", RemoveEmptyEntries)[0].Trim(); + acc.Add(new SynonymRule { Id = id, Synonyms = synonym }); return acc; }); - var synonymsSet = new SynonymsSet - { - Synonyms = synonymRules - }; + var synonymsSet = new SynonymsSet { Synonyms = synonymRules }; + await PutSynonyms(synonymsSet, setName, ctx); + } + private async Task PutSynonyms(SynonymsSet synonymsSet, string setName, Cancel ctx) + { var json = JsonSerializer.Serialize(synonymsSet, SynonymSerializerContext.Default.SynonymsSet); var response = await _transport.PutAsync($"_synonyms/{setName}", PostData.String(json), ctx); @@ -393,23 +404,56 @@ private void AssignDocumentMetadata(DocumentationDocument doc) var semanticHash = _semanticChannel.Channel.ChannelHash; var lexicalHash = _lexicalChannel.Channel.ChannelHash; var hash = HashedBulkUpdate.CreateHash(semanticHash, lexicalHash, - doc.Url, doc.Type, doc.Body ?? string.Empty, string.Join(",", doc.Headings.OrderBy(h => h)), doc.Url + doc.Url, doc.Type, doc.StrippedBody ?? string.Empty, string.Join(",", doc.Headings.OrderBy(h => h)), + doc.SearchTitle ?? string.Empty, + doc.NavigationSection ?? string.Empty, doc.NavigationDepth.ToString("N0"), + doc.NavigationTableOfContents.ToString("N0"), + _fixedSynonymsHash ); doc.Hash = hash; doc.LastUpdated = _batchIndexDate; doc.BatchIndexDate = _batchIndexDate; } - private void CommonEnrichments(DocumentationDocument doc) + private void CommonEnrichments(DocumentationDocument doc, INavigationItem? navigationItem) { - var urlComponents = doc.Url.Split('/'); - doc.SearchTitle = $"{doc.Title} - ({string.Join(" ", urlComponents)}"; + doc.SearchTitle = CreateSearchTitle(); + // if we have no navigation, initialize to 20 since rank_feature would score 0 too high + doc.NavigationDepth = navigationItem?.NavigationDepth ?? 20; + doc.NavigationTableOfContents = navigationItem switch + { + IRootNavigationItem => Math.Min(2 * doc.NavigationDepth, 48), + INodeNavigationItem => 50, + _ => 100 + }; + doc.NavigationSection = navigationItem?.NavigationSection; + if (doc.Type == "api") + doc.NavigationSection = "api"; + + // this section gets promoted in the navigation we don't want it to be promoted in the search results + // e.g. `Use high-contrast mode in Kibana - ( docs cloud-account high contrast` + if (doc.NavigationSection == "manage your cloud account and preferences") + doc.NavigationDepth *= 2; + + string CreateSearchTitle() + { + // skip doc and the section + var split = new[] { '/', ' ', '-', '.', '_' }; + var urlComponents = new HashSet( + doc.Url.Split('/', RemoveEmptyEntries).Skip(2) + .SelectMany(c => c.Split(split, RemoveEmptyEntries)).ToArray() + ); + var title = doc.Title ?? string.Empty; + //skip tokens already part of the title we don't want to influence TF/IDF + var tokensInTitle = new HashSet(title.Split(split, RemoveEmptyEntries).Select(t => t.ToLowerInvariant())); + return $"{doc.Title} - {string.Join(" ", urlComponents.Where(c => !tokensInTitle.Contains(c.ToLowerInvariant())))}"; + } } public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, Cancel ctx) { var file = fileContext.SourceFile; - INavigationTraversable navigation = fileContext.DocumentationSet; + var navigation = fileContext.PositionaNavigation; var currentNavigation = navigation.GetNavigationFor(file); var url = currentNavigation.Url; @@ -420,7 +464,6 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, return true; } - // Remove the first h1 because we already have the title // and we don't want it to appear in the body var h1 = fileContext.Document.Descendants().FirstOrDefault(h => h.Level == 1); @@ -461,8 +504,8 @@ public async ValueTask ExportAsync(MarkdownExportFileContext fileContext, Hidden = fileContext.NavigationItem.Hidden }; + CommonEnrichments(doc, currentNavigation); AssignDocumentMetadata(doc); - CommonEnrichments(doc); if (_indexStrategy == IngestStrategy.Multiplex) return await _lexicalChannel.TryWrite(doc, ctx) && await _semanticChannel.TryWrite(doc, ctx); @@ -481,7 +524,6 @@ public async ValueTask FinishExportAsync(IDirectoryInfo outputFolder, Canc await foreach (var doc in exporter.ExportDocuments(limitPerSource: null, ctx)) { - AssignDocumentMetadata(doc); var document = MarkdownParser.Parse(doc.Body ?? string.Empty); doc.Body = LlmMarkdownExporter.ConvertToLlmMarkdown(document, _context); @@ -497,7 +539,8 @@ public async ValueTask FinishExportAsync(IDirectoryInfo outputFolder, Canc : string.Empty; doc.Abstract = @abstract; doc.Headings = headings; - CommonEnrichments(doc); + CommonEnrichments(doc, null); + AssignDocumentMetadata(doc); // Write to channels following the multiplex or reindex strategy if (_indexStrategy == IngestStrategy.Multiplex) diff --git a/src/Elastic.Markdown/Exporters/IMarkdownExporter.cs b/src/Elastic.Markdown/Exporters/IMarkdownExporter.cs index f67cd528c..57bf0383c 100644 --- a/src/Elastic.Markdown/Exporters/IMarkdownExporter.cs +++ b/src/Elastic.Markdown/Exporters/IMarkdownExporter.cs @@ -20,6 +20,7 @@ public record MarkdownExportFileContext public required IFileInfo DefaultOutputFile { get; init; } public required DocumentationSet DocumentationSet { get; init; } public required INavigationItem NavigationItem { get; init; } + public required INavigationTraversable PositionaNavigation { get; init; } } public interface IMarkdownExporter diff --git a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs index f7d17105b..2893e11b3 100644 --- a/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs +++ b/src/api/Elastic.Documentation.Api.Infrastructure/Adapters/Search/ElasticsearchGateway.cs @@ -25,6 +25,15 @@ internal sealed record DocumentDto [JsonPropertyName("url")] public required string Url { get; init; } + [JsonPropertyName("navigation_depth")] + public int NavigationDepth { get; init; } + + [JsonPropertyName("navigation_table_of_contents")] + public int NavigationTableOfContents { get; init; } + + [JsonPropertyName("navigation_section")] + public string? NavigationSection { get; init; } + [JsonPropertyName("description")] public string? Description { get; init; } @@ -97,37 +106,30 @@ public ElasticsearchGateway(ElasticsearchOptions elasticsearchOptions, ILogger(), StringSplitOptions.RemoveEmptyEntries); - if (tokens is ["datastream" or "datastreams" or "data-stream" or "data-streams"]) - { - // /docs/api/doc/kibana/operation/operation-delete-fleet-epm-packages-pkgname-pkgversion-datastream-assets - // Is the only page that uses "datastream" instead of "data streams" this gives it an N of 1 in the entire corpus - // which is hard to fix through tweaking boosting, should update the page to use "data streams" instead - searchQuery = "data streams"; - tokens = ["data", "streams"]; - } var query = - (Query)new MultiMatchQuery + (Query)new ConstantScoreQuery { - Query = searchQuery, Operator = Operator.And, Type = TextQueryType.BoolPrefix, - Analyzer = "synonyms_analyzer", - Boost = 2.0f, - Fields = new[] + Filter = new MultiMatchQuery { - "search_title.completion", - "search_title.completion._2gram", - "search_title.completion._3gram" - } + Query = searchQuery, Operator = Operator.And, Type = TextQueryType.BoolPrefix, + Analyzer = "synonyms_analyzer", + Boost = 1.0f, + Fields = new[] + { + "search_title.completion", + "search_title.completion._2gram", + "search_title.completion._3gram" + } + }, + Boost = 6.0f } || new MultiMatchQuery { Query = searchQuery, Operator = Operator.And, Type = TextQueryType.BestFields, Analyzer = "synonyms_analyzer", - Boost = 0.2f, - Fields = new[] - { - "stripped_body" - } + Boost = 0.1f, + Fields = new[] { "stripped_body" } }; // If the search term is a single word, boost the URL match // This is to ensure that URLs that contain the search term are ranked higher than URLs that don't @@ -142,18 +144,45 @@ private static Query BuildLexicalQuery(string searchQuery) Field = Infer.Field(f => f.Url.Suffix("match")), Query = searchQuery }, - Boost = 1 + Boost = 0.3f + }; + } + + if (tokens.Length > 2) + { + query |= new MultiMatchQuery + { + Query = searchQuery, Operator = Operator.And, Type = TextQueryType.Phrase, + Analyzer = "synonyms_analyzer", + Boost = 0.2f, + Fields = new[] { "stripped_body" } }; } return new BoostingQuery { - Positive = query, + Positive = new BoolQuery + { + Must = [query], + Filter = [DocumentFilter], + Should = [ + new RankFeatureQuery { + Field = Infer.Field(f => f.NavigationDepth), + Boost = 0.8f + }, + new RankFeatureQuery { + Field = Infer.Field(f => f.NavigationTableOfContents), + Boost = 0.8f + }, + new TermQuery { Field = Infer.Field(f => f.NavigationSection ), Value = "reference", Boost = 0.15f }, + new TermQuery { Field = Infer.Field(f => f.NavigationSection ), Value = "getting-started", Boost = 0.1f } + ] + }, NegativeBoost = 0.8, Negative = new MultiMatchQuery { - Query = "plugin client integration", Operator = Operator.Or, Fields = new[] { "search_title", "headings", "url.match" } - } + Query = "plugin client integration glossary", Operator = Operator.Or, Fields = new[] { "search_title", "url.match" } + }, }; } @@ -164,7 +193,7 @@ private static Query BuildSemanticQuery(string searchQuery) => (Query)new SemanticQuery("title.semantic_text", searchQuery) { Boost = 5.0f } || new SemanticQuery("abstract.semantic_text", searchQuery) { Boost = 3.0f }; - private static Query BuildFilter() => !(Query)new TermsQuery(Infer.Field(f => f.Url.Suffix("keyword")), + private static Query DocumentFilter { get; } = !(Query)new TermsQuery(Infer.Field(f => f.Url.Suffix("keyword")), new TermsQueryField(["/docs", "/docs/", "/docs/404", "/docs/404/"])) && !(Query)new TermQuery { Field = Infer.Field(f => f.Hidden), Value = true }; @@ -186,7 +215,6 @@ private static Query BuildFilter() => !(Query)new TermsQuery(Infer.Field r // .Rrf(rrf => rrf @@ -217,7 +245,7 @@ private static Query BuildFilter() => !(Query)new TermsQuery(Infer.Field h .RequireFieldMatch(true) .Fields(f => f - .Add(Infer.Field(d => d.Title), hf => hf + .Add(Infer.Field(d => d.SearchTitle.Suffix("completion")), hf => hf .FragmentSize(150) .NumberOfFragments(3) .NoMatchSize(150) @@ -226,7 +254,7 @@ private static Query BuildFilter() => !(Query)new TermsQuery(Infer.Field q.Match(m => m - .Field(d => d.Title) + .Field(d => d.SearchTitle.Suffix("completion")) .Query(searchQuery) .Analyzer("highlight_analyzer") )) @@ -282,10 +310,10 @@ private static (int TotalHits, List Results) ProcessSearchResp if (highlights != null) { if (highlights.TryGetValue("stripped_body", out var bodyHighlights) && bodyHighlights.Count > 0) - highlightedBody = string.Join(". ", bodyHighlights.Select(h => h.TrimEnd('.'))); + highlightedBody = string.Join(". ", bodyHighlights.Select(h => h.TrimEnd('.', ' ', '-'))); - if (highlights.TryGetValue("title", out var titleHighlights) && titleHighlights.Count > 0) - highlightedTitle = string.Join(". ", titleHighlights.Select(h => h.TrimEnd('.'))); + if (highlights.TryGetValue("search_title.completion", out var titleHighlights) && titleHighlights.Count > 0) + highlightedTitle = string.Join(". ", titleHighlights.Select(h => h.TrimEnd('.', ' ', '-'))); } return new SearchResultItem diff --git a/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs b/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs index 8438bd58f..bac870191 100644 --- a/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs +++ b/tests-integration/Elastic.Assembler.IntegrationTests/Search/SearchBootstrapFixture.cs @@ -181,7 +181,8 @@ private async ValueTask IsIndexingNeeded() collector, endpoint, "dev", // index namespace - transport + transport, + [] ); // Get the current hash from Elasticsearch index template diff --git a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs index cd0007ff1..f063b4a12 100644 --- a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs +++ b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs @@ -36,10 +36,25 @@ public class SearchRelevanceTests(ITestOutputHelper output) { "data-streams", "/docs/manage-data/data-store/data-streams", null }, { "datastream", "/docs/manage-data/data-store/data-streams", null }, { "data stream", "/docs/manage-data/data-store/data-streams", null }, - { "saml sso", "/docs/deploy-manage/users-roles/cloud-organization/register-elastic-cloud-saml-in-okta", ["/docs/deploy-manage/users-roles/cloud-organization/configure-saml-authentication"] }, + { "saml sso", "/docs/deploy-manage/users-roles/cloud-organization/configure-saml-authentication", ["/docs/deploy-manage/users-roles/cloud-organization/configure-saml-authentication"] }, { "templates", "/docs/manage-data/data-store/templates", null}, - { "query dsl", "/docs/explore-analyze/query-filter/languages/querydsl", null}, - { "querydsl", "/docs/explore-analyze/query-filter/languages/querydsl", null} + { "query dsl", "/docs/reference/query-languages/querydsl", ["/docs/explore-analyze/query-filter/languages/querydsl"]}, + { "querydsl", "/docs/reference/query-languages/querydsl", ["/docs/explore-analyze/query-filter/languages/querydsl"]}, + { "Agent policy", "/docs/reference/fleet/agent-policy", null}, + { "aliases", "/docs/manage-data/data-store/aliases", null}, + { "Kibana privilege", "/docs/deploy-manage/users-roles/cluster-or-deployment-auth/kibana-privileges", null}, + { "lens", "/docs/explore-analyze/visualize/lens", null}, + { "machine learning node", "/docs/deploy-manage/autoscaling/autoscaling-in-ece-and-ech", ["/docs/deploy-manage/distributed-architecture/clusters-nodes-shards/node-roles"]}, + { "machine learning", "/docs/reference/machine-learning", null}, + { "ml", "/docs/reference/machine-learning", null}, + { "elasticsearch", "/docs/reference/elasticsearch", null}, + { "kibana", "/docs/reference/kibana", null}, + { "cloud", "/docs/reference/cloud", null}, + { "logstash", "/docs/reference/logstash", null}, + { "esql", "/docs/reference/query-languages/esql", null}, + { "ES|QL", "/docs/reference/query-languages/esql", null}, + { "Output plugins for Logstash", "/docs/reference/logstash/plugins/output-plugins", null}, + { "Sending data to Elastic Cloud Hosted", "/docs/solutions/observability/get-started/quickstart-elastic-cloud-otel-endpoint", ["/docs/reference/logstash/connecting-to-cloud"]}, }; [Theory] From 7b3077c433f71a498dadf7d81a5a49878903cee2 Mon Sep 17 00:00:00 2001 From: Martijn Laarman Date: Tue, 2 Dec 2025 20:33:47 +0100 Subject: [PATCH 2/2] update test assertion --- .../Search.IntegrationTests/SearchRelevanceTests.cs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs index f063b4a12..5a657cde0 100644 --- a/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs +++ b/tests-integration/Search.IntegrationTests/SearchRelevanceTests.cs @@ -44,7 +44,7 @@ public class SearchRelevanceTests(ITestOutputHelper output) { "aliases", "/docs/manage-data/data-store/aliases", null}, { "Kibana privilege", "/docs/deploy-manage/users-roles/cluster-or-deployment-auth/kibana-privileges", null}, { "lens", "/docs/explore-analyze/visualize/lens", null}, - { "machine learning node", "/docs/deploy-manage/autoscaling/autoscaling-in-ece-and-ech", ["/docs/deploy-manage/distributed-architecture/clusters-nodes-shards/node-roles"]}, + { "machine learning node", "/docs/deploy-manage/autoscaling/autoscaling-in-ece-and-ech", null }, { "machine learning", "/docs/reference/machine-learning", null}, { "ml", "/docs/reference/machine-learning", null}, { "elasticsearch", "/docs/reference/elasticsearch", null},