Skip to content

Commit a9e14d6

Browse files
authored
Merge pull request #3205 from ClickHouse/click_analytics
click analytics for search
2 parents 353a97a + 24a4300 commit a9e14d6

File tree

13 files changed

+626
-8595
lines changed

13 files changed

+626
-8595
lines changed

.github/workflows/build-search.yml

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ env:
1515

1616
jobs:
1717
update-search:
18-
if: github.event.pull_request.merged == true && contains(github.event.pull_request.labels.*.name, 'update search') && github.event.pull_request.base.ref == 'main'
19-
#if: contains(github.event.pull_request.labels.*.name, 'update search') # Updated to trigger directly on PRs with the label
18+
if: |
19+
github.event_name == 'workflow_dispatch' ||
20+
github.event_name == 'schedule' ||
21+
(github.event_name == 'pull_request' && github.event.pull_request.merged == true && contains(github.event.pull_request.labels.*.name, 'update search') && github.event.pull_request.base.ref == 'main') #if: contains(github.event.pull_request.labels.*.name, 'update search') # Updated to trigger directly on PRs with the label
2022
runs-on: ubuntu-latest
2123

2224
steps:

docs/en/engines/table-engines/integrations/s3queue.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,8 @@ sidebar_position: 181
44
sidebar_label: S3Queue
55
---
66

7+
import ScalePlanFeatureBadge from '@theme/badges/ScalePlanFeatureBadge'
8+
79
# S3Queue Table Engine
810

911
This engine provides integration with [Amazon S3](https://aws.amazon.com/s3/) ecosystem and allows streaming import. This engine is similar to the [Kafka](../../../engines/table-engines/integrations/kafka.md), [RabbitMQ](../../../engines/table-engines/integrations/rabbitmq.md) engines, but provides S3-specific features.
@@ -194,6 +196,8 @@ Engine supports all s3 related settings. For more information about S3 settings
194196

195197
## S3 role-based access
196198

199+
<ScalePlanFeatureBadge feature="S3 Role-Based Access" />
200+
197201
The s3Queue table engine supports role-based access.
198202
Refer to the documentation [here](https://clickhouse.com/docs/en/cloud/security/secure-s3) for steps to configure a role to access your bucket.
199203

package.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,8 @@
5252
"remark-docusaurus-tabs": "^0.2.0",
5353
"remark-link-rewrite": "^1.0.7",
5454
"remark-math": "^6.0.0",
55-
"sass": "^1.83.1"
55+
"sass": "^1.83.1",
56+
"search-insights": "^2.17.3"
5657
},
5758
"devDependencies": {
5859
"@argos-ci/cli": "^2.5.5",

scripts/search/README.md

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -61,3 +61,7 @@ Algolia configs to try:
6161
- minProximity - 1
6262
- minWordSizefor2Typos - 7
6363
- minWordSizefor1Typo - 3
64+
65+
Implement:
66+
- per page ranking as metadata
67+
- omit page from index

scripts/search/compute_ndcg.py

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,10 @@
11
import csv
22
import math
33
import argparse
4+
from urllib.parse import urlparse
5+
from bs4 import BeautifulSoup
6+
7+
import requests
48
from algoliasearch.search.client import SearchClientSync
59

610
ALGOLIA_INDEX_NAME = "clickhouse"
@@ -25,6 +29,33 @@ def compute_dcg(relevance_scores):
2529
return sum(rel / math.log2(idx + 2) for idx, rel in enumerate(relevance_scores))
2630

2731

32+
def verify_link(link):
    """Verify that a link is reachable and, if it carries a #anchor, that the anchor exists.

    Empty/None links are treated as valid (blank CSV cells are allowed).
    On any failure this prints an error and exits the process with status 1,
    matching the fail-fast behaviour expected by the calling script.

    Returns True when the link (and anchor, if any) checks out.
    """
    # Blank cells in the results CSV are allowed - nothing to check.
    if not link:
        return True
    parsed_url = urlparse(link)
    base_url = f"{parsed_url.scheme}://{parsed_url.netloc}{parsed_url.path}"
    anchor = parsed_url.fragment  # Extract anchor if present

    try:
        response = requests.get(base_url, timeout=10, allow_redirects=True)
        response.raise_for_status()  # Raise an error for HTTP failures (4xx, 5xx)
    except requests.RequestException as e:
        print(f"Error: Unable to reach {base_url} - {e}")
        exit(1)

    # If no anchor, the base page is sufficient
    if not anchor:
        return True

    # Parse the page and check for the anchor - both `id` and legacy `name` attributes.
    soup = BeautifulSoup(response.text, 'html.parser')
    element = soup.find(id=anchor) or soup.find(attrs={"name": anchor})

    if not element:
        print(f"Error: Anchor #{anchor} not found in {base_url}")
        exit(1)
    return True
58+
2859
def compute_ndcg(expected_links, retrieved_links, k):
2960
"""Compute normalized DCG."""
3061
relevance_scores = [1 if link in expected_links else 0 for link in retrieved_links[:k]]
@@ -36,14 +67,26 @@ def compute_ndcg(expected_links, retrieved_links, k):
3667
return dcg / idcg if idcg > 0 else 0
3768

3869

39-
def main(input_csv, detailed, k=3):
70+
def main(input_csv, detailed, validate, k=3):
4071
"""Main function to compute nDCG for search terms in a CSV."""
4172
with open(input_csv, mode='r', newline='', encoding='utf-8') as file:
4273
reader = csv.reader(file)
4374
rows = list(reader)
4475
results = []
4576
total_ndcg = 0
4677

78+
if validate:
79+
print("Validating links...")
80+
for row in rows:
81+
for link in row[1:4]:
82+
if link:
83+
print(f"Checking link {link}...", end="")
84+
verify_link(link)
85+
print("OK")
86+
print("All links validated.")
87+
else:
88+
print("Skipping link validation.")
89+
4790
for row in rows:
4891
term = row[0]
4992
# Remove duplicates in expected links - can happen as some docs return same url
@@ -97,6 +140,12 @@ def main(input_csv, detailed, k=3):
97140
action="store_true",
98141
help="Print detailed results for each search term."
99142
)
143+
parser.add_argument(
144+
"-v",
145+
"--validate",
146+
action="store_true",
147+
help="Validate links."
148+
)
100149
args = parser.parse_args()
101150

102-
main(args.input_csv, args.detailed)
151+
main(args.input_csv, args.detailed, args.validate)

scripts/search/index_pages.py

Lines changed: 11 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -186,6 +186,14 @@ def extract_links_from_content(content):
186186
return re.findall(link_pattern, content)
187187

188188

189+
def remove_markdown_links(text):
    """Strip Markdown link syntax from text, keeping the visible labels.

    Inline links ``[label](url)`` are replaced by their label; autolinks
    like ``<http://example.com>`` are removed entirely.
    """
    # Inline Markdown links: keep only the link text, drop the URL.
    without_inline = re.sub(r'\[([^\]]+)\]\([^)]+\)', r'\1', text)
    # Angle-bracket autolinks carry no label, so they vanish completely.
    return re.sub(r'<https?://[^>]+>', '', without_inline)
195+
196+
189197
# best effort at creating links between docs - handling both md and urls. Challenge here some files import others
190198
# e.g. /opt/clickhouse-docs/docs/en/sql-reference/formats.mdx - we don't recursively resolve here
191199
def update_page_links(directory, base_directory, page_path, url, content):
@@ -249,7 +257,8 @@ def parse_markdown_content(metadata, content):
249257
current_subdoc['type'] = 'lvl1'
250258
current_subdoc['object_id'] = custom_slugify(heading_slug)
251259
current_subdoc['hierarchy']['lvl1'] = current_h1
252-
current_subdoc['hierarchy']['lvl0'] = current_h1 if metadata.get('title', '') == '' else metadata.get('title', '')
260+
current_subdoc['hierarchy']['lvl0'] = current_h1 if metadata.get('title', '') == '' else metadata.get(
261+
'title', '')
253262
elif line.startswith('## '):
254263
if current_subdoc:
255264
yield from split_large_document(current_subdoc)
@@ -356,6 +365,7 @@ def process_markdown_directory(directory, base_directory):
356365
sub_doc['anchor'] = anchor
357366
update_page_links(directory, base_directory, metadata.get('file_path', ''), sub_doc['url'],
358367
sub_doc['content'])
368+
sub_doc['content'] = remove_markdown_links(sub_doc['content'])
359369
yield sub_doc
360370

361371

scripts/search/results.csv

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ keeper,https://clickhouse.com/docs/en/guides/sre/keeper/clickhouse-keeper,https:
8383
type,https://clickhouse.com/docs/en/sql-reference/data-types,https://clickhouse.com/docs/en/sql-reference/functions/type-conversion-functions,
8484
nullable,https://clickhouse.com/docs/en/sql-reference/data-types/nullable,https://clickhouse.com/docs/en/cloud/bestpractices/avoid-nullable-columns,https://clickhouse.com/docs/en/sql-reference/functions/functions-for-nulls
8585
projection,https://clickhouse.com/docs/en/sql-reference/statements/alter/projection,https://clickhouse.com/docs/en/engines/table-engines/mergetree-family/mergetree#projections,https://clickhouse.com/docs/knowledgebase/projection_example
86-
jdbc,https://clickhouse.com/docs/en/engines/table-engines/integrations/jdbc,https://clickhouse.com/docs/en/interfaces/jdbc,https://clickhouse.com/docs/en/integrations/java/jdbc-driver
86+
jdbc,https://clickhouse.com/docs/en/interfaces/jdbc,https://clickhouse.com/docs/en/integrations/java/jdbc-v2, https://clickhouse.com/docs/en/engines/table-engines/integrations/jdbc
8787
ifnull,https://clickhouse.com/docs/en/sql-reference/functions/functions-for-nulls#ifnull,https://clickhouse.com/docs/en/sql-reference/functions/conditional-functions,
8888
any,https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/any,https://clickhouse.com/docs/en/sql-reference/aggregate-functions/reference/first_value,
8989
optimize,https://clickhouse.com/docs/en/sql-reference/statements/optimize,https://clickhouse.com/docs/en/optimize,https://clickhouse.com/docs/en/cloud/bestpractices/avoid-optimize-final
@@ -138,7 +138,7 @@ nested,https://clickhouse.com/docs/en/sql-reference/data-types/nested-data-struc
138138
sample,https://clickhouse.com/docs/en/sql-reference/statements/select/sample,,
139139
distinct,https://clickhouse.com/docs/en/sql-reference/statements/select/distinct,,
140140
hash,https://clickhouse.com/docs/en/sql-reference/functions/hash-functions,,
141-
codec,https://clickhouse.com/docs/en/sql-reference/statements/create/table#column_compression_codec,https://clickhouse.com/docs/en/data-compression/compression-in-clickhouse,https://clickhouse.com/docs/observability/use-cases/observability/schema-design#using-codecs
141+
codec,https://clickhouse.com/docs/en/sql-reference/statements/create/table#column_compression_codec,https://clickhouse.com/docs/en/data-compression/compression-in-clickhouse,https://clickhouse.com/docs/en/use-cases/observability/schema-design#using-codecs
142142
timestamp,https://clickhouse.com/docs/en/sql-reference/functions/date-time-functions#timestamp,,
143143
drop,https://clickhouse.com/docs/en/sql-reference/statements/drop,,
144144
parquet,https://clickhouse.com/docs/en/integrations/data-formats/parquet,https://clickhouse.com/docs/knowledgebase/ingest-parquet-files-in-s3,
@@ -151,7 +151,7 @@ materi,https://clickhouse.com/docs/en/materialized-view,,
151151
max_threads,https://clickhouse.com/docs/en/operations/settings/settings#max_threads,,
152152
limit,https://clickhouse.com/docs/en/sql-reference/statements/select/limit,,
153153
toint,https://clickhouse.com/docs/en/sql-reference/functions/type-conversion-functions,,
154-
shard,https://clickhouse.com/docs/concepts/concepts/glossary#shard,https://clickhouse.com/docs/en/concepts/glossary,
154+
shard,https://clickhouse.com/docs/en/concepts/glossary#shard,https://clickhouse.com/docs/en/concepts/glossary,
155155
timeout,https://clickhouse.com/docs/en/operations/server-configuration-parameters/settings,https://clickhouse.com/docs/en/operations/settings/settings#timeout_overflow_mode,
156156
date_diff,https://clickhouse.com/docs/en/sql-reference/functions/date-time-functions#date_diff,,
157157
default,https://clickhouse.com/docs/en/operations/settings/settings-users,https://clickhouse.com/docs/knowledgebase/remove-default-user,
@@ -181,7 +181,7 @@ config.xml,https://clickhouse.com/docs/en/operations/server-configuration-parame
181181
where,https://clickhouse.com/docs/en/sql-reference/statements/select/where,,
182182
replica,https://clickhouse.com/docs/en/architecture/horizontal-scaling,,
183183
last,https://clickhouse.com/docs/en/sql-reference/window-functions/last_value,,
184-
version,https://clickhouse.com/docs/en/sql-reference/functions/other-functions#version,https://clickhouse.com/docswhich-clickhouse-version-to-use-in-production,
184+
version,https://clickhouse.com/docs/en/sql-reference/functions/other-functions#version,https://clickhouse.com/docs/en/faq/operations/production,
185185
greatest,https://clickhouse.com/docs/en/sql-reference/functions/conditional-functions#greatest,https://clickhouse.com/docs/en/sql-reference/functions/other-functions#greatest,
186186
nan,https://clickhouse.com/docs/en/sql-reference/data-types/float#nan-and-inf,,
187187
log,https://clickhouse.com/docs/en/sql-reference/functions/math-functions#log,,
@@ -206,7 +206,8 @@ min_insert_block_size_rows,https://clickhouse.com/docs/en/operations/settings/se
206206
allow_experimental_parallel_reading_from_replicas,https://clickhouse.com/docs/en/operations/settings/settings#allow_experimental_parallel_reading_from_replicas,,
207207
join_algorithm,https://clickhouse.com/docs/en/operations/settings/settings#join_algorithm,,
208208
max_memory_usage,https://clickhouse.com/docs/en/operations/settings/settings#max_memory_usage,,
209-
max_bytes_before_external_group_by,https://clickhouse.com/docs/en/operations/settings/settings#max_bytes_before_external_group_by,,
209+
max_bytes_before_external_group_by,https://clickhouse.com/docs/en/operations/settings/settings#max_bytes_before_external_group_by,https://clickhouse.com/docs/en/operations/settings/query-complexity#settings-max_bytes_before_external_group_by,https://clickhouse.com/docs/en/sql-reference/statements/select/group-by#group-by-in-external-memory
210210
max_bytes_before_external_sort,https://clickhouse.com/docs/en/operations/settings/settings#max_bytes_before_external_sort,,
211211
result_overflow_mode,https://clickhouse.com/docs/en/operations/settings/settings#result_overflow_mode,,
212212
use_query_cache,https://clickhouse.com/docs/en/operations/settings/settings#use_query_cache,,
213+
date time best effort,https://clickhouse.com/docs/en/sql-reference/functions/type-conversion-functions#parsedatetimebesteffortusorzero,https://clickhouse.com/docs/en/sql-reference/functions/type-conversion-functions#parsedatetimebesteffort

scripts/search/settings.json

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
"url_without_anchor",
2323
"type"
2424
],
25-
"camelCaseAttributes": [],
25+
"camelCaseAttributes": [
26+
"h2"
27+
],
2628
"attributeCriteriaComputedByMinProximity": false,
2729
"distinct": true,
2830
"unretrievableAttributes": null,

src/lib/google/google.js

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,41 @@
1+
/**
 * Extracts the user ID from the Google Analytics device ID.
 * @example `GA1.1.xxxxxxxxxx.xxxxxxxxxx => xxxxxxxxxx-xxxxxxxxxx`
 * @link https://support.google.com/analytics/answer/11397207
 * @param {string | undefined} gaCookie - Raw `_ga` cookie value.
 * @returns {string | undefined} The `clientId-timestamp` pair, or undefined
 * when the cookie is missing or malformed.
 */
const extractGoogleAnalyticsUserIdFromCookie = (gaCookie) => {
  // No cookie, nothing to extract.
  if (!gaCookie) {
    return undefined;
  }
  // The device ID is the last two dot-separated segments; everything before
  // them is the Google Analytics version/tracker prefix (e.g. `GA1.1`).
  const idSegments = gaCookie.split('.').slice(-2);
  if (idSegments.length !== 2) {
    return undefined;
  }
  const [clientPart, timestampPart] = idSegments;
  return `${clientPart}-${timestampPart}`;
};
16+
17+
/**
 * Reads a cookie value out of `document.cookie` by name.
 * @param {string} cookieName - Name of the cookie to look up.
 * @returns {string | undefined} The decoded cookie value, or undefined when
 * the cookie is absent or there is no DOM (e.g. React Native / SSR).
 */
const getBrowserCookie = (cookieName) => {
  // In React Native environments, `document.cookie` doesn't exist.
  if (typeof document !== 'object' || typeof document.cookie !== 'string') {
    return undefined;
  }
  const prefix = `${cookieName}=`;
  // First entry whose trimmed form starts with `name=` wins, as in a manual scan.
  const entry = decodeURIComponent(document.cookie)
    .split(';')
    .map((part) => part.trim())
    .find((part) => part.startsWith(prefix));
  return entry === undefined ? undefined : entry.substring(prefix.length);
};
33+
34+
/**
 * Returns the Google Analytics User ID from a browser cookie name.
 * @example `getGoogleAnalyticsUserIdFromBrowserCookie('_ga')`
 * @param {string} cookieName - Cookie holding the GA device ID (typically `_ga`).
 * @returns {string | undefined} The extracted user ID, or undefined when the
 * cookie is unavailable.
 */
export const getGoogleAnalyticsUserIdFromBrowserCookie = (cookieName) => {
  const rawCookie = getBrowserCookie(cookieName);
  if (!rawCookie) {
    return undefined;
  }
  return extractGoogleAnalyticsUserIdFromCookie(rawCookie);
};

0 commit comments

Comments
 (0)