Skip to content

Commit 2f78652

Browse files
authored
better search scraping error handling (don't fail on single page) (#58004)
1 parent 0d41564 commit 2f78652

File tree

4 files changed

+269
-53
lines changed

4 files changed

+269
-53
lines changed

.github/workflows/index-general-search.yml

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -180,6 +180,19 @@ jobs:
180180
181181
ls -lh /tmp/records
182182
183+
- name: Check for scraping failures
184+
id: check-failures
185+
run: |
186+
if [ -f /tmp/records/failures-summary.json ]; then
187+
FAILED_PAGES=$(jq -r '.totalFailedPages' /tmp/records/failures-summary.json)
188+
echo "failed_pages=$FAILED_PAGES" >> $GITHUB_OUTPUT
189+
echo "has_failures=true" >> $GITHUB_OUTPUT
190+
echo "⚠️ Warning: $FAILED_PAGES page(s) failed to scrape"
191+
else
192+
echo "has_failures=false" >> $GITHUB_OUTPUT
193+
echo "✅ All pages scraped successfully"
194+
fi
195+
183196
- name: Check that Elasticsearch is accessible
184197
run: |
185198
curl --fail --retry-connrefused --retry 5 -I ${{ env.ELASTICSEARCH_URL }}
@@ -211,6 +224,19 @@ jobs:
211224
FASTLY_SURROGATE_KEY: api-search:${{ matrix.language }}
212225
run: npm run purge-fastly-edge-cache
213226

227+
- name: Alert on scraping failures
228+
if: ${{ steps.check-failures.outputs.has_failures == 'true' && github.event_name != 'workflow_dispatch' }}
229+
uses: ./.github/actions/slack-alert
230+
with:
231+
slack_channel_id: ${{ secrets.DOCS_ALERTS_SLACK_CHANNEL_ID }}
232+
slack_token: ${{ secrets.SLACK_DOCS_BOT_TOKEN }}
233+
message: |
234+
:warning: ${{ steps.check-failures.outputs.failed_pages }} page(s) failed to scrape for general search indexing (language: ${{ matrix.language }})
235+
236+
The indexing completed but some pages could not be scraped. This may affect search results for those pages.
237+
238+
Workflow: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}
239+
214240
- uses: ./.github/actions/slack-alert
215241
if: ${{ failure() && github.event_name != 'workflow_dispatch' }}
216242
with:

src/search/scripts/scrape/lib/build-records.ts

Lines changed: 111 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -48,14 +48,26 @@ const MIN_TIME = parseInt(process.env.BUILD_RECORDS_MIN_TIME || '200', 10)
4848
// when multiple docs match on a certain keyword(s).
4949
const FORCE_0_POPULARITY_PRODUCTS = new Set(['contributing'])
5050

51+
interface FailedPage {
52+
url?: string
53+
relativePath?: string
54+
error: string
55+
errorType: string
56+
}
57+
58+
export interface BuildRecordsResult {
59+
records: Record[]
60+
failedPages: FailedPage[]
61+
}
62+
5163
export default async function buildRecords(
5264
indexName: string,
5365
indexablePages: Page[],
5466
indexVersion: string,
5567
languageCode: string,
5668
redirects: Redirects,
5769
config: Config = {} as Config,
58-
): Promise<Record[]> {
70+
): Promise<BuildRecordsResult> {
5971
// Determine the page version from the index version
6072
const pageVersion = getAllVersionsKeyFromIndexVersion(indexVersion)
6173

@@ -96,6 +108,9 @@ export default async function buildRecords(
96108

97109
const hasPopularPages = Object.keys(popularPages).length > 0
98110

111+
// Track failed pages
112+
const failedPages: FailedPage[] = []
113+
99114
const waiter = domwaiter(permalinks, { maxConcurrent: MAX_CONCURRENT, minTime: MIN_TIME })
100115
.on('page', (page) => {
101116
if (!noMarkers) process.stdout.write(pageMarker)
@@ -114,23 +129,105 @@ export default async function buildRecords(
114129
records.push(newRecord)
115130
})
116131
.on('error', (err) => {
117-
if (err instanceof HTTPError && !err.response.ok) {
118-
console.log(
119-
'\n' +
120-
boxen(chalk.bold(err.request.requestUrl?.pathname), {
121-
title: chalk.red('The URL it failed on was'),
122-
padding: 1,
123-
borderColor: 'red',
124-
}) +
125-
'\n',
126-
)
132+
// Track the failure
133+
const url = (err as any).url
134+
const relativePath = (err as any).relativePath
135+
136+
// Check for HTTPError by name since it may come from a different module
137+
if ((err instanceof HTTPError || err?.name === 'HTTPError') && (err as any).response) {
138+
const httpErr = err as any
139+
failedPages.push({
140+
url: httpErr.request?.requestUrl?.pathname || url,
141+
relativePath,
142+
error: err.message,
143+
errorType: `HTTP ${httpErr.response?.statusCode || 'Error'}`,
144+
})
145+
146+
if (!noMarkers) process.stdout.write(chalk.red('✗'))
147+
} else if (err instanceof Error) {
148+
// Classify non-HTTP errors: use the underlying cause's code (falling back to the error name) and flag timeouts
149+
const errorType = (err.cause as any)?.code || err.name
150+
const isTimeout =
151+
errorType === 'UND_ERR_HEADERS_TIMEOUT' ||
152+
errorType === 'UND_ERR_CONNECT_TIMEOUT' ||
153+
err.message.includes('timed out')
154+
155+
failedPages.push({
156+
url,
157+
relativePath,
158+
error: err.message,
159+
errorType: isTimeout ? 'Timeout' : errorType || 'Unknown Error',
160+
})
161+
162+
if (!noMarkers) process.stdout.write(chalk.red('✗'))
127163
} else {
128-
console.error(err)
164+
failedPages.push({
165+
url,
166+
relativePath,
167+
error: String(err),
168+
errorType: 'Unknown Error',
169+
})
170+
171+
if (!noMarkers) process.stdout.write(chalk.red('✗'))
129172
}
130173
})
131174

132-
return eventToPromise(waiter, 'done').then(() => {
175+
// Wait for 'done' event but ignore 'error' events (they're handled by the error listener above)
176+
return eventToPromise(waiter, 'done', { ignoreErrors: true }).then(() => {
133177
console.log('\nrecords in index: ', records.length)
134-
return records
178+
179+
// Report failed pages if any
180+
if (failedPages.length > 0) {
181+
console.log(
182+
'\n' +
183+
boxen(
184+
chalk.bold.red(`${failedPages.length} page(s) failed to scrape\n\n`) +
185+
failedPages
186+
.slice(0, 10) // Show first 10 failures
187+
.map((failure, idx) => {
188+
return (
189+
chalk.gray(`${idx + 1}. `) +
190+
chalk.yellow(failure.errorType) +
191+
'\n' +
192+
(failure.relativePath
193+
? chalk.cyan(` Path: `) + failure.relativePath + '\n'
194+
: '') +
195+
(failure.url ? chalk.cyan(` URL: `) + failure.url + '\n' : '') +
196+
chalk.gray(` Error: ${failure.error}`)
197+
)
198+
})
199+
.join('\n\n') +
200+
(failedPages.length > 10
201+
? `\n\n${chalk.gray(`... and ${failedPages.length - 10} more`)}`
202+
: ''),
203+
{
204+
title: chalk.red('⚠ Failed Pages'),
205+
padding: 1,
206+
borderColor: 'yellow',
207+
},
208+
) +
209+
'\n',
210+
)
211+
212+
// Log suggestion
213+
console.log(
214+
chalk.yellow(
215+
`💡 Tip: These failures won't stop the scraping process. The script will continue with the remaining pages.`,
216+
),
217+
)
218+
219+
if (failedPages.some((f) => f.errorType === 'Timeout')) {
220+
console.log(
221+
chalk.gray(
222+
` For timeout errors, try: export BUILD_RECORDS_MAX_CONCURRENT=50 (currently ${MAX_CONCURRENT})`,
223+
),
224+
)
225+
}
226+
}
227+
228+
return {
229+
records,
230+
failedPages,
231+
}
135232
})
136233
}

src/search/scripts/scrape/lib/domwaiter.ts

Lines changed: 93 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,15 @@ interface DomWaiterOptions {
3232
export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {}): EventEmitter {
3333
const emitter = new EventEmitter()
3434

35+
// Add a default no-op error handler to prevent EventEmitter from throwing
36+
// when errors are emitted before the caller attaches their error handler.
37+
// Listeners the caller adds later run in addition to this one; they do not replace it.
38+
const defaultErrorHandler = () => {
39+
// No-op: prevents EventEmitter from throwing
40+
// External handlers will still receive the error
41+
}
42+
emitter.on('error', defaultErrorHandler)
43+
3544
const defaults = {
3645
parseDOM: true,
3746
json: false,
@@ -43,7 +52,12 @@ export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {
4352
const limiter = new Bottleneck(opts)
4453

4554
pages.forEach((page) => {
46-
limiter.schedule(() => getPage(page, emitter, opts))
55+
limiter
56+
.schedule(() => getPage(page, emitter, opts))
57+
.catch((err) => {
58+
// Forward any rejection from the scheduled job to the 'error' listeners so it never becomes an unhandled rejection
59+
emitter.emit('error', err)
60+
})
4761
})
4862

4963
limiter.on('idle', () => {
@@ -58,46 +72,87 @@ export default function domwaiter(pages: Permalink[], opts: DomWaiterOptions = {
5872
}
5973

6074
async function getPage(page: Permalink, emitter: EventEmitter, opts: DomWaiterOptions) {
61-
emitter.emit('beforePageLoad', page)
75+
// Wrap everything in a try-catch to ensure no errors escape
76+
try {
77+
emitter.emit('beforePageLoad', page)
6278

63-
if (opts.json) {
64-
try {
65-
const response = await fetchWithRetry(page.url!, undefined, { retries: 3, timeout: 60000 })
66-
if (!response.ok) {
67-
throw new HTTPError(
68-
`HTTP ${response.status}: ${response.statusText}`,
69-
{ ok: response.ok, statusCode: response.status },
70-
{ requestUrl: { pathname: page.url } },
71-
)
72-
}
73-
const json = await response.json()
74-
const pageCopy = Object.assign({}, page, { json })
75-
emitter.emit('page', pageCopy)
76-
} catch (err) {
77-
if (err instanceof Error) {
78-
err.message = `Failed to fetch ${page.url}: ${err.message}`
79+
if (opts.json) {
80+
try {
81+
const response = await fetchWithRetry(page.url!, undefined, {
82+
retries: 3,
83+
throwHttpErrors: false,
84+
timeout: 60000,
85+
})
86+
if (!response.ok) {
87+
const httpError = new HTTPError(
88+
`HTTP ${response.status}: ${response.statusText}`,
89+
{ ok: response.ok, statusCode: response.status },
90+
{ requestUrl: { pathname: page.url } },
91+
)
92+
// Add URL and path info directly to the HTTPError
93+
;(httpError as any).url = page.url
94+
;(httpError as any).relativePath = page.relativePath
95+
// Emit error instead of throwing
96+
emitter.emit('error', httpError)
97+
return // Exit early, don't continue processing
98+
}
99+
const json = await response.json()
100+
const pageCopy = Object.assign({}, page, { json })
101+
emitter.emit('page', pageCopy)
102+
} catch (err) {
103+
// Enhance error with URL information
104+
if (err instanceof Error && page.url) {
105+
const enhancedError = new Error(err.message, { cause: err.cause })
106+
enhancedError.name = err.name
107+
enhancedError.stack = err.stack
108+
;(enhancedError as any).url = page.url
109+
;(enhancedError as any).relativePath = page.relativePath
110+
emitter.emit('error', enhancedError)
111+
} else {
112+
emitter.emit('error', err)
113+
}
79114
}
80-
emitter.emit('error', err)
81-
}
82-
} else {
83-
try {
84-
const response = await fetchWithRetry(page.url!, undefined, { retries: 3, timeout: 60000 })
85-
if (!response.ok) {
86-
throw new HTTPError(
87-
`HTTP ${response.status}: ${response.statusText}`,
88-
{ ok: response.ok, statusCode: response.status },
89-
{ requestUrl: { pathname: page.url } },
90-
)
91-
}
92-
const body = await response.text()
93-
const pageCopy = Object.assign({}, page, { body })
94-
if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
95-
emitter.emit('page', pageCopy)
96-
} catch (err) {
97-
if (err instanceof Error) {
98-
err.message = `Failed to fetch ${page.url}: ${err.message}`
115+
} else {
116+
try {
117+
const response = await fetchWithRetry(page.url!, undefined, {
118+
retries: 3,
119+
throwHttpErrors: false,
120+
timeout: 60000,
121+
})
122+
if (!response.ok) {
123+
const httpError = new HTTPError(
124+
`HTTP ${response.status}: ${response.statusText}`,
125+
{ ok: response.ok, statusCode: response.status },
126+
{ requestUrl: { pathname: page.url } },
127+
)
128+
// Add URL and path info directly to the HTTPError
129+
;(httpError as any).url = page.url
130+
;(httpError as any).relativePath = page.relativePath
131+
// Emit error instead of throwing
132+
emitter.emit('error', httpError)
133+
return // Exit early, don't continue processing
134+
}
135+
const body = await response.text()
136+
const pageCopy = Object.assign({}, page, { body })
137+
if (opts.parseDOM) (pageCopy as any).$ = cheerio.load(body)
138+
emitter.emit('page', pageCopy)
139+
} catch (err) {
140+
// Enhance error with URL information
141+
if (err instanceof Error && page.url) {
142+
const enhancedError = new Error(err.message, { cause: err.cause })
143+
enhancedError.name = err.name
144+
enhancedError.stack = err.stack
145+
;(enhancedError as any).url = page.url
146+
;(enhancedError as any).relativePath = page.relativePath
147+
emitter.emit('error', enhancedError)
148+
} else {
149+
emitter.emit('error', err)
150+
}
99151
}
100-
emitter.emit('error', err)
101152
}
153+
} catch (err) {
154+
// Ultimate catch-all to ensure nothing escapes
155+
console.error('Unexpected error in getPage:', err)
156+
emitter.emit('error', err)
102157
}
103158
}

0 commit comments

Comments
 (0)