Merge pull request #4468 from ClickHouse/search-filtering

dhtclk · web-flow · commit 3825267408bb · 2025-09-22T16:18:23.000-05:00
search filtering by doc_type
diff --git a/scripts/search/index_pages.py b/scripts/search/index_pages.py
@@ -38,7 +38,7 @@ def read_metadata(text):
     for part in parts:
         parts = part.split(":")
         if len(parts) == 2:
-            if parts[0] in ['title', 'description', 'slug', 'keywords', 'score']:
+            if parts[0] in ['title', 'description', 'slug', 'keywords', 'score', 'doc_type']:
                 metadata[parts[0]] = int(parts[1].strip()) if parts[0] == 'score' else parts[1].strip()
     return metadata
 
@@ -215,12 +215,12 @@ def update_page_links(directory, base_directory, page_path, url, content, base_u
                 c_page = os.path.abspath(os.path.join(os.path.dirname(page_path), './' + target))
             metadata, _ = parse_metadata_and_content(directory, base_directory, c_page, log_snippet_failure=False)
             if 'slug' in metadata:
-                link_data.append((url, f'{base_url}{metadata.get('slug')}'))
+                link_data.append((url, f"{base_url}{metadata.get('slug')}"))
             else:
                 fail = True
         elif target.startswith('/'):  # ignore external links
             target = target.removesuffix('/')
-            link_data.append((url, f'{base_url}{target}'))
+            link_data.append((url, f"{base_url}{target}"))
     if fail:
         print(f"Warning: couldn't resolve link for {page_path}")
 
@@ -248,7 +248,8 @@ def parse_markdown_content(metadata, content, base_url):
             'lvl0': current_h1,
             'lvl1': current_h1
         },
-        'score': metadata.get('score', 0)
+        'score': metadata.get('score', 0),
+        'doc_type': metadata.get('doc_type', '')
     }
     for line in lines:
         if line.startswith('# '):
@@ -266,8 +267,7 @@ def parse_markdown_content(metadata, content, base_url):
             current_subdoc['type'] = 'lvl1'
             current_subdoc['object_id'] = custom_slugify(heading_slug)
             current_subdoc['hierarchy']['lvl1'] = current_h1
-            current_subdoc['hierarchy']['lvl0'] = current_h1 if metadata.get('title', '') == '' else metadata.get(
-                'title', '')
+            current_subdoc['hierarchy']['lvl0'] = current_h1 if metadata.get('title', '') == '' else metadata.get('title', '')
         elif line.startswith('## '):
             if current_subdoc:
                 yield from split_large_document(current_subdoc)
@@ -293,7 +293,8 @@ def parse_markdown_content(metadata, content, base_url):
                     'lvl0': current_h1 if metadata.get('title', '') == '' else metadata.get('title', ''),
                     'lvl1': current_h1,
                     'lvl2': current_h2,
-                }
+                },
+                'doc_type': metadata.get('doc_type', '')
             }
         elif line.startswith('### '):
             # note we send users to the h2 or h1 even on ###
@@ -322,7 +323,8 @@ def parse_markdown_content(metadata, content, base_url):
                     'lvl1': current_h1,
                     'lvl2': current_h2,
                     'lvl3': current_h3,
-                }
+                },
+                'doc_type': metadata.get('doc_type', '')
             }
         elif line.startswith('#### '):
             if current_subdoc:
@@ -348,7 +350,8 @@ def parse_markdown_content(metadata, content, base_url):
                     'lvl2': current_h2,
                     'lvl3': current_h3,
                     'lvl4': current_h4,
-                }
+                },
+                'doc_type': metadata.get('doc_type', '')
             }
         elif current_subdoc:
             current_subdoc['content'] += line + '\n'
@@ -410,9 +413,9 @@ def compute_page_rank(link_data, damping_factor=0.85, max_iter=100, tol=1e-6):
 def create_new_index(client, index_name):
     try:
         client.delete_index(index_name)
-        print(f'Temporary index \'{index_name}\' deleted successfully.')
+        print(f"Temporary index '{index_name}' deleted successfully.")
     except:
-        print(f'Temporary index \'{index_name}\' does not exist or could not be deleted')
+        print(f"Temporary index '{index_name}' does not exist or could not be deleted")
     client.set_settings(index_name, settings['settings'])
     client.save_rules(index_name, settings['rules'])
     print(f"Settings applied to temporary index '{index_name}'.")
@@ -442,9 +445,19 @@ def main(base_directory, algolia_app_id, algolia_api_key, algolia_index_name,
         else:
             for d in batch:
                 print(f"{d['url']} - {d['page_rank']}")
-        print(f'{'processed' if dry_run else 'indexed'} {len(batch)} records')
+            # Print a sample record to verify doc_type is included
+            if batch:
+                print("\n--- Sample record ---")
+                sample_record = batch[0]
+                print(f"Title: {sample_record.get('title', 'N/A')}")
+                print(f"URL: {sample_record.get('url', 'N/A')}")
+                print(f"Type: {sample_record.get('type', 'N/A')}")
+                print(f"Doc Type: {sample_record.get('doc_type', 'N/A')}")
+                print(f"Keywords: {sample_record.get('keywords', 'N/A')}")
+                print("--- End sample ---\n")
+        print(f"{'processed' if dry_run else 'indexed'} {len(batch)} records")
         t += len(batch)
-    print(f'total {'processed' if dry_run else 'indexed'} {t} records')
+    print(f"total {'processed' if dry_run else 'indexed'} {t} records")
     if not dry_run:
         print('switching temporary index...', end='')
         client.operation_index(temp_index_name, {"operation": "move", "destination": algolia_index_name})
@@ -471,4 +484,4 @@ def main(base_directory, algolia_app_id, algolia_api_key, algolia_index_name,
     if args.dry_run:
         print('Dry running, not sending results to Algolia.')
     main(args.base_directory, args.algolia_app_id, args.algolia_api_key, args.algolia_index_name,
-         dry_run=args.dry_run)
+         dry_run=args.dry_run)
diff --git a/scripts/search/settings.json b/scripts/search/settings.json
@@ -21,7 +21,8 @@
       "url",
       "url_without_anchor",
       "type",
-      "title"
+      "title",
+      "doc_type"
     ],
     "camelCaseAttributes": [
       "h1",
@@ -51,7 +52,9 @@
       "a",
       "an"
     ],
-    "attributesForFaceting": null,
+    "attributesForFaceting": [
+      "doc_type"
+    ],
     "attributesToSnippet": [
       "content:15",
       "title:10"
diff --git a/src/theme/SearchBar/docTypeSelector.jsx b/src/theme/SearchBar/docTypeSelector.jsx
@@ -0,0 +1,47 @@
+import React from 'react';
+// doc_type values offered as filter options below (presumably the same strings stored in the search index's doc_type attribute — TODO confirm against the indexer).
+const DOC_TYPES = {
+  GUIDE: 'guide',
+  REFERENCE: 'reference', 
+  CHANGELOG: 'changelog',
+  LANDINGPAGE: 'landing-page',
+};
+// Single-choice dropdown: calls onSelectionChange(null) when "All docs" is picked, otherwise onSelectionChange([value]) with the chosen doc type.
+export function DocTypeSelector({ selectedDocTypes, onSelectionChange, className }) {
+  const handleChange = (event) => {
+    const value = event.target.value;
+    if (value === 'all') {
+      onSelectionChange(null);
+    } else {
+      onSelectionChange([value]);
+    }
+  };
+  // Only a selection of exactly one doc type maps to an option; null, empty or multi-value selections display "All docs".
+  const currentValue = selectedDocTypes?.length === 1 ? selectedDocTypes[0] : 'all';
+  // Inline styles reference the --docsearch-* CSS variables for border, background and text colour.
+  return (
+    <select 
+      value={currentValue}
+      onChange={handleChange}
+      className={className}
+      style={{
+        padding: '6px 12px',
+        borderRadius: '6px',
+        border: '1px solid var(--docsearch-searchbox-shadow)',
+        backgroundColor: 'var(--docsearch-modal-background)',
+        color: 'var(--docsearch-text-color)',
+        fontSize: '14px',
+        minWidth: '140px',
+        cursor: 'pointer'
+      }}
+    >
+      <option value="all">All docs</option>
+      <option value={DOC_TYPES.GUIDE}>Guides</option>
+      <option value={DOC_TYPES.REFERENCE}>Reference</option>
+      <option value={DOC_TYPES.CHANGELOG}>Changelog</option>
+      <option value={DOC_TYPES.LANDINGPAGE}>Landing Pages</option>
+    </select>
+  );
+}
+
+export { DOC_TYPES };
diff --git a/src/theme/SearchBar/index.js b/src/theme/SearchBar/index.js
@@ -1,4 +1,4 @@
-import React, { useCallback, useMemo, useRef } from 'react';
+import React, { useCallback, useMemo, useRef, useState } from 'react';
 import { DocSearchButton, useDocSearchKeyboardEvents } from '@docsearch/react';
 import Head from '@docusaurus/Head';
 import { useHistory } from '@docusaurus/router';
@@ -21,6 +21,7 @@ import {
 } from './utils/searchConfig';
 import { SearchHit } from './searchHit';
 import { SearchResultsFooter } from './searchResultsFooter';
+import { DocTypeSelector } from './docTypeSelector';
 
 function DocSearch({ contextualSearch, externalUrlRegex, ...props }) {
   const queryIDRef = useRef(null);
@@ -31,6 +32,9 @@ function DocSearch({ contextualSearch, externalUrlRegex, ...props }) {
   const history = useHistory();
   const searchButtonRef = useRef(null);
   
+  // Doc type filtering state
+  const [selectedDocTypes, setSelectedDocTypes] = useState(null);
+  
   // Use the modal management hook
   const {
     isOpen,
@@ -43,8 +47,13 @@ function DocSearch({ contextualSearch, externalUrlRegex, ...props }) {
     importDocSearchModalIfNeeded
   } = useDocSearchModal();
 
-  // Configure search parameters
-  const searchParameters = createSearchParameters(props, contextualSearch, contextualSearchFacetFilters);
+  // Configure search parameters with doc_type filter
+  const searchParameters = createSearchParameters(
+    props, 
+    contextualSearch, 
+    contextualSearchFacetFilters,
+    selectedDocTypes
+  );
 
   useEffect(() => {
     initializeSearchAnalytics(props.appId, props.apiKey);
@@ -66,6 +75,10 @@ function DocSearch({ contextualSearch, externalUrlRegex, ...props }) {
     });
   }, [props.transformItems, processSearchResultUrl, currentLocale]);
 
+  const handleDocTypeChange = useCallback((docTypes) => {
+    setSelectedDocTypes(docTypes);
+  }, []);
+
   const resultsFooterComponent = useMemo(
     () =>
       // eslint-disable-next-line react/no-unstable-nested-components
@@ -130,23 +143,40 @@ function DocSearch({ contextualSearch, externalUrlRegex, ...props }) {
             DocSearchModal &&
             searchContainer &&
             createPortal(
-                <DocSearchModal
-                    onClose={onClose}
-                    initialScrollY={window.scrollY}
-                    initialQuery={initialQuery}
-                    navigator={navigator}
-                    transformItems={transformItems}
-                    hitComponent={SearchHit}
-                    transformSearchClient={transformSearchClient}
-                    {...(props.searchPagePath && {
-                      resultsFooterComponent,
-                    })}
-                    {...props}
-                    insights={true}
-                    searchParameters={searchParameters}
-                    placeholder={translations.placeholder}
-                    translations={translations.modal}
-                />,
+                <>               
+                  <DocSearchModal
+                      onClose={onClose}
+                      initialScrollY={window.scrollY}
+                      initialQuery={initialQuery}
+                      navigator={navigator}
+                      transformItems={transformItems}
+                      hitComponent={SearchHit}
+                      transformSearchClient={transformSearchClient}
+                      {...(props.searchPagePath && {
+                        resultsFooterComponent,
+                      })}
+                      {...props}
+                      insights={true}
+                      searchParameters={searchParameters}
+                      placeholder={translations.placeholder}
+                      translations={translations.modal}
+                  />
+                  
+                  {/* Selector positioned as overlay */}
+                  <div style={{
+                    position: 'fixed',
+                    top: window.innerWidth < 768 ? '55px' : '120px',
+                    right: window.innerWidth < 768 ? 'calc(50% - 185px)' : 'calc(50% - 255px)',
+                    zIndex: 10000,
+                    backgroundColor: 'var(--docsearch-modal-background)',
+                    boxShadow: '0 2px 8px rgba(0,0,0,0.1)'
+                  }}>
+                    <DocTypeSelector 
+                      selectedDocTypes={selectedDocTypes}
+                      onSelectionChange={handleDocTypeChange}
+                    />
+                  </div>
+                </>,
                 searchContainer,
             )}
       </>
diff --git a/src/theme/SearchBar/searchHit.jsx b/src/theme/SearchBar/searchHit.jsx
@@ -10,20 +10,44 @@ export function SearchHit({ hit, children }) {
     .slice(0, 3) // Take first 3 segments max
     .map(segment => segment.replace(/-/g, ' ').replace(/\b\w/g, l => l.toUpperCase()));
   
+  // Format doc_type for display, stripping quotes and formatting
+  const formatDocType = (docType) => {
+    if (!docType) return null;
+    // Remove surrounding quotes and format
+    const cleaned = docType.replace(/^'|'$/g, '');
+    return cleaned.replace(/-/g, ' ').replace(/\b\w/g, l => l.toUpperCase());
+  };
+  
+  const docTypeDisplay = formatDocType(hit.doc_type);
+  
   return (
     <Link onClick={handleClick} to={hit.url}>
       {children}
-      {breadcrumbs.length > 0 && (
-        <span style={{ 
-          fontSize: '10px', 
-          color: '#888',
-          display: 'block',
-          lineHeight: '1',
-          marginBottom: '12px'
-        }}>
-          {breadcrumbs.join(' › ')}
-        </span>
-      )}
+      <div style={{ 
+        fontSize: '10px', 
+        color: '#888',
+        lineHeight: '1',
+        marginBottom: '12px'
+      }}>
+        {/* Doc type badge */}
+        {docTypeDisplay && (
+          <span style={{ 
+            backgroundColor: '#f3f4f6',
+            color: '#374151',
+            padding: '2px 6px',
+            borderRadius: '3px',
+            marginRight: '8px',
+            fontWeight: '500'
+          }}>
+            {docTypeDisplay}
+          </span>
+        )}
+        
+        {/* Breadcrumbs */}
+        {breadcrumbs.length > 0 && (
+          <span>{breadcrumbs.join(' › ')}</span>
+        )}
+      </div>
     </Link>
   );
 }
diff --git a/src/theme/SearchBar/utils/searchConfig.js b/src/theme/SearchBar/utils/searchConfig.js