|
8 | 8 |
|
9 | 9 | import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch'; |
10 | 10 | import { createDecipheriv } from 'node:crypto'; |
| 11 | +import { Readable } from 'node:stream'; |
11 | 12 | import { z } from 'zod'; |
12 | 13 | import { at, iv, k1 } from '../constants'; |
13 | 14 | import { McpToolContext, declareTool } from './tool-registry'; |
@@ -198,12 +199,10 @@ function createDocSearchHandler({ logger }: McpToolContext) { |
198 | 199 | // Only fetch content from angular.dev |
199 | 200 | if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) { |
200 | 201 | const response = await fetch(url); |
201 | | - if (response.ok) { |
202 | | - const html = await response.text(); |
203 | | - const mainContent = extractMainContent(html); |
204 | | - if (mainContent) { |
205 | | - topContent = stripHtml(mainContent); |
206 | | - } |
| 202 | + if (response.ok && response.body) { |
| 203 | + topContent = await extractMainContent( |
| 204 | + Readable.fromWeb(response.body, { encoding: 'utf-8' }), |
| 205 | + ); |
207 | 206 | } |
208 | 207 | } |
209 | 208 | } catch (e) { |
@@ -246,46 +245,53 @@ function createDocSearchHandler({ logger }: McpToolContext) { |
246 | 245 | } |
247 | 246 |
|
248 | 247 | /** |
249 | | - * Strips HTML tags from a string using a regular expression. |
| 248 | + * Extracts the text content of the `<main>` element by streaming an HTML response. |
250 | 249 | * |
251 | | - * NOTE: This is a basic implementation and is not a full, correct HTML parser. It is, however, |
252 | | - * appropriate for this tool's specific use case because its input is always from a |
253 | | - * trusted source (angular.dev) and its output is consumed by a non-browser environment (an LLM). |
254 | | - * |
255 | | - * The regex first tries to match a complete tag (`<...>`). If it fails, it falls back to matching |
256 | | - * an incomplete tag (e.g., `<script`). |
257 | | - * |
258 | | - * @param html The HTML string to strip. |
259 | | - * @returns The text content of the HTML. |
| 250 | + * @param htmlStream A readable stream of the HTML content of a page. |
| 251 | + * @returns A promise that resolves to the text content of the `<main>` element, or `undefined` if not found. |
260 | 252 | */ |
261 | | -function stripHtml(html: string): string { |
262 | | - return html |
263 | | - .replace(/<[^>]*>|<[a-zA-Z0-9/]+/g, '') |
264 | | - .replace(/&lt;/g, '<') |
265 | | - .replace(/&gt;/g, '>') |
266 | | - .replace(/&amp;/g, '&') |
267 | | - .trim(); |
268 | | -} |
| 253 | +async function extractMainContent(htmlStream: Readable): Promise<string | undefined> { |
| 254 | + const { RewritingStream } = await import('parse5-html-rewriting-stream'); |
269 | 255 |
|
270 | | -/** |
271 | | - * Extracts the content of the `<main>` element from an HTML string. |
272 | | - * |
273 | | - * @param html The HTML content of a page. |
274 | | - * @returns The content of the `<main>` element, or `undefined` if not found. |
275 | | - */ |
276 | | -function extractMainContent(html: string): string | undefined { |
277 | | - const mainTagStart = html.indexOf('<main'); |
278 | | - if (mainTagStart === -1) { |
279 | | - return undefined; |
280 | | - } |
| 256 | + const rewriter = new RewritingStream(); |
| 257 | + let mainTextContent = ''; |
| 258 | + let inMainElement = false; |
| 259 | + let mainTagFound = false; |
| 260 | + |
| 261 | + rewriter.on('startTag', (tag) => { |
| 262 | + if (tag.tagName === 'main') { |
| 263 | + inMainElement = true; |
| 264 | + mainTagFound = true; |
| 265 | + } |
| 266 | + }); |
| 267 | + |
| 268 | + rewriter.on('endTag', (tag) => { |
| 269 | + if (tag.tagName === 'main') { |
| 270 | + inMainElement = false; |
| 271 | + } |
| 272 | + }); |
281 | 273 |
|
282 | | - const mainTagEnd = html.lastIndexOf('</main>'); |
283 | | - if (mainTagEnd <= mainTagStart) { |
284 | | - return undefined; |
285 | | - } |
| 274 | + // Only capture text content, and only when inside the <main> element. |
| 275 | + rewriter.on('text', (text) => { |
| 276 | + if (inMainElement) { |
| 277 | + mainTextContent += text.text; |
| 278 | + } |
| 279 | + }); |
| 280 | + |
| 281 | + return new Promise((resolve, reject) => { |
| 282 | + htmlStream |
| 283 | + .pipe(rewriter) |
| 284 | + .on('finish', () => { |
| 285 | + if (!mainTagFound) { |
| 286 | + resolve(undefined); |
| 287 | + |
| 288 | + return; |
| 289 | + } |
286 | 290 |
|
287 | | - // Add 7 to include '</main>' |
288 | | - return html.substring(mainTagStart, mainTagEnd + 7); |
| 291 | + resolve(mainTextContent.trim()); |
| 292 | + }) |
| 293 | + .on('error', reject); |
| 294 | + }); |
289 | 295 | } |
290 | 296 |
|
291 | 297 | /** |
|
0 commit comments