Skip to content

Commit 434daef

Browse files
committed
refactor(@angular/cli): use streaming HTML parser in search documentation tool
The `search_documentation` MCP tool previously extracted and cleaned documentation content from fetched HTML using regular expressions and string searching. This approach was not robust and could produce incorrect results, and it buffered the entire HTML response in memory before processing.

This commit refactors the implementation to use `parse5-html-rewriting-stream`, which is already a dependency in the workspace. The new implementation streams the `fetch` response directly into a single-pass parser that collects only the text nodes inside the `<main>` element, so extracting the content and stripping the HTML tags happen in the same step. This change makes the parsing more reliable, more efficient, and more memory-friendly.
1 parent 4aba526 commit 434daef

File tree

4 files changed

+52
-41
lines changed

4 files changed

+52
-41
lines changed

packages/angular/cli/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ ts_project(
5858
":node_modules/jsonc-parser",
5959
":node_modules/npm-package-arg",
6060
":node_modules/pacote",
61+
":node_modules/parse5-html-rewriting-stream",
6162
":node_modules/resolve",
6263
":node_modules/yargs",
6364
":node_modules/zod",

packages/angular/cli/package.json

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"listr2": "9.0.4",
3737
"npm-package-arg": "13.0.1",
3838
"pacote": "21.0.3",
39+
"parse5-html-rewriting-stream": "8.0.0",
3940
"resolve": "1.22.10",
4041
"semver": "7.7.3",
4142
"yargs": "18.0.0",

packages/angular/cli/src/commands/mcp/tools/doc-search.ts

Lines changed: 47 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88

99
import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch';
1010
import { createDecipheriv } from 'node:crypto';
11+
import { Readable } from 'node:stream';
1112
import { z } from 'zod';
1213
import { at, iv, k1 } from '../constants';
1314
import { McpToolContext, declareTool } from './tool-registry';
@@ -198,12 +199,10 @@ function createDocSearchHandler({ logger }: McpToolContext) {
198199
// Only fetch content from angular.dev
199200
if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) {
200201
const response = await fetch(url);
201-
if (response.ok) {
202-
const html = await response.text();
203-
const mainContent = extractMainContent(html);
204-
if (mainContent) {
205-
topContent = stripHtml(mainContent);
206-
}
202+
if (response.ok && response.body) {
203+
topContent = await extractMainContent(
204+
Readable.fromWeb(response.body, { encoding: 'utf-8' }),
205+
);
207206
}
208207
}
209208
} catch (e) {
@@ -246,46 +245,53 @@ function createDocSearchHandler({ logger }: McpToolContext) {
246245
}
247246

248247
/**
249-
* Strips HTML tags from a string using a regular expression.
248+
* Extracts the text content of the `<main>` element by streaming an HTML response.
250249
*
251-
* NOTE: This is a basic implementation and is not a full, correct HTML parser. It is, however,
252-
* appropriate for this tool's specific use case because its input is always from a
253-
* trusted source (angular.dev) and its output is consumed by a non-browser environment (an LLM).
254-
*
255-
* The regex first tries to match a complete tag (`<...>`). If it fails, it falls back to matching
256-
* an incomplete tag (e.g., `<script`).
257-
*
258-
* @param html The HTML string to strip.
259-
* @returns The text content of the HTML.
250+
* @param htmlStream A readable stream of the HTML content of a page.
251+
* @returns A promise that resolves to the text content of the `<main>` element, or `undefined` if not found.
260252
*/
261-
function stripHtml(html: string): string {
262-
return html
263-
.replace(/<[^>]*>|<[a-zA-Z0-9/]+/g, '')
264-
.replace(/&lt;/g, '<')
265-
.replace(/&gt;/g, '>')
266-
.replace(/&amp;/g, '&')
267-
.trim();
268-
}
253+
async function extractMainContent(htmlStream: Readable): Promise<string | undefined> {
254+
const { RewritingStream } = await import('parse5-html-rewriting-stream');
269255

270-
/**
271-
* Extracts the content of the `<main>` element from an HTML string.
272-
*
273-
* @param html The HTML content of a page.
274-
* @returns The content of the `<main>` element, or `undefined` if not found.
275-
*/
276-
function extractMainContent(html: string): string | undefined {
277-
const mainTagStart = html.indexOf('<main');
278-
if (mainTagStart === -1) {
279-
return undefined;
280-
}
256+
const rewriter = new RewritingStream();
257+
let mainTextContent = '';
258+
let inMainElement = false;
259+
let mainTagFound = false;
260+
261+
rewriter.on('startTag', (tag) => {
262+
if (tag.tagName === 'main') {
263+
inMainElement = true;
264+
mainTagFound = true;
265+
}
266+
});
267+
268+
rewriter.on('endTag', (tag) => {
269+
if (tag.tagName === 'main') {
270+
inMainElement = false;
271+
}
272+
});
281273

282-
const mainTagEnd = html.lastIndexOf('</main>');
283-
if (mainTagEnd <= mainTagStart) {
284-
return undefined;
285-
}
274+
// Only capture text content, and only when inside the <main> element.
275+
rewriter.on('text', (text) => {
276+
if (inMainElement) {
277+
mainTextContent += text.text;
278+
}
279+
});
280+
281+
return new Promise((resolve, reject) => {
282+
htmlStream
283+
.pipe(rewriter)
284+
.on('finish', () => {
285+
if (!mainTagFound) {
286+
resolve(undefined);
287+
288+
return;
289+
}
286290

287-
// Add 7 to include '</main>'
288-
return html.substring(mainTagStart, mainTagEnd + 7);
291+
resolve(mainTextContent.trim());
292+
})
293+
.on('error', reject);
294+
});
289295
}
290296

291297
/**

pnpm-lock.yaml

Lines changed: 3 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)