refactor(@angular/cli): use streaming HTML parser in search documentation tool

clydin · clydin · commit 434daef99207 · 2025-10-21T06:49:47.000-04:00
The `search_documentation` MCP tool previously used a regular expression and string searching to extract and clean documentation content from fetched HTML. This approach was not robust and could produce incorrect results. It also buffered the entire HTML response in memory before processing.

This commit refactors the implementation to use `parse5-html-rewriting-stream`, which is already a dependency in the workspace. The new implementation streams the `fetch` response directly into a single-pass parser that simultaneously extracts the `<main>` element's content and strips all HTML tags.

This change makes the parsing more reliable, efficient, and memory-friendly.
diff --git a/packages/angular/cli/BUILD.bazel b/packages/angular/cli/BUILD.bazel
@@ -58,6 +58,7 @@ ts_project(
         ":node_modules/jsonc-parser",
         ":node_modules/npm-package-arg",
         ":node_modules/pacote",
+        ":node_modules/parse5-html-rewriting-stream",
         ":node_modules/resolve",
         ":node_modules/yargs",
         ":node_modules/zod",
diff --git a/packages/angular/cli/package.json b/packages/angular/cli/package.json
@@ -36,6 +36,7 @@
     "listr2": "9.0.4",
     "npm-package-arg": "13.0.1",
     "pacote": "21.0.3",
+    "parse5-html-rewriting-stream": "8.0.0",
     "resolve": "1.22.10",
     "semver": "7.7.3",
     "yargs": "18.0.0",
diff --git a/packages/angular/cli/src/commands/mcp/tools/doc-search.ts b/packages/angular/cli/src/commands/mcp/tools/doc-search.ts
@@ -8,6 +8,7 @@
 
 import type { LegacySearchMethodProps, SearchResponse } from 'algoliasearch';
 import { createDecipheriv } from 'node:crypto';
+import { Readable } from 'node:stream';
 import { z } from 'zod';
 import { at, iv, k1 } from '../constants';
 import { McpToolContext, declareTool } from './tool-registry';
@@ -198,12 +199,10 @@ function createDocSearchHandler({ logger }: McpToolContext) {
         // Only fetch content from angular.dev
         if (url.hostname === 'angular.dev' || url.hostname.endsWith('.angular.dev')) {
           const response = await fetch(url);
-          if (response.ok) {
-            const html = await response.text();
-            const mainContent = extractMainContent(html);
-            if (mainContent) {
-              topContent = stripHtml(mainContent);
-            }
+          if (response.ok && response.body) {
+            topContent = await extractMainContent(
+              Readable.fromWeb(response.body, { encoding: 'utf-8' }),
+            );
           }
         }
       } catch (e) {
@@ -246,46 +245,53 @@ function createDocSearchHandler({ logger }: McpToolContext) {
 }
 
 /**
- * Strips HTML tags from a string using a regular expression.
+ * Extracts the text content of the `<main>` element by streaming an HTML response.
  *
- * NOTE: This is a basic implementation and is not a full, correct HTML parser. It is, however,
- * appropriate for this tool's specific use case because its input is always from a
- * trusted source (angular.dev) and its output is consumed by a non-browser environment (an LLM).
- *
- * The regex first tries to match a complete tag (`<...>`). If it fails, it falls back to matching
- * an incomplete tag (e.g., `<script`).
- *
- * @param html The HTML string to strip.
- * @returns The text content of the HTML.
+ * @param htmlStream A readable stream of the HTML content of a page.
+ * @returns A promise that resolves to the text content of the `<main>` element, or `undefined` if not found.
  */
-function stripHtml(html: string): string {
-  return html
-    .replace(/<[^>]*>|<[a-zA-Z0-9/]+/g, '')
-    .replace(/&lt;/g, '<')
-    .replace(/&gt;/g, '>')
-    .replace(/&amp;/g, '&')
-    .trim();
-}
+async function extractMainContent(htmlStream: Readable): Promise<string | undefined> {
+  const { RewritingStream } = await import('parse5-html-rewriting-stream');
 
-/**
- * Extracts the content of the `<main>` element from an HTML string.
- *
- * @param html The HTML content of a page.
- * @returns The content of the `<main>` element, or `undefined` if not found.
- */
-function extractMainContent(html: string): string | undefined {
-  const mainTagStart = html.indexOf('<main');
-  if (mainTagStart === -1) {
-    return undefined;
-  }
+  const rewriter = new RewritingStream();
+  let mainTextContent = '';
+  let inMainElement = false;
+  let mainTagFound = false;
+
+  rewriter.on('startTag', (tag) => {
+    if (tag.tagName === 'main') {
+      inMainElement = true;
+      mainTagFound = true;
+    }
+  });
+
+  rewriter.on('endTag', (tag) => {
+    if (tag.tagName === 'main') {
+      inMainElement = false;
+    }
+  });
 
-  const mainTagEnd = html.lastIndexOf('</main>');
-  if (mainTagEnd <= mainTagStart) {
-    return undefined;
-  }
+  // Only capture text content, and only when inside the <main> element.
+  rewriter.on('text', (text) => {
+    if (inMainElement) {
+      mainTextContent += text.text;
+    }
+  });
+
+  return new Promise((resolve, reject) => {
+    htmlStream
+      .pipe(rewriter)
+      .on('finish', () => {
+        if (!mainTagFound) {
+          resolve(undefined);
+
+          return;
+        }
 
-  // Add 7 to include '</main>'
-  return html.substring(mainTagStart, mainTagEnd + 7);
+        resolve(mainTextContent.trim());
+      })
+      .on('error', reject);
+  });
 }
 
 /**
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml