From 7f6d5d029d8f1ae2ef49b784899059574dc7cc84 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Mar 2025 16:30:35 -0700 Subject: [PATCH 1/3] direct fetch: don't allow direct fetch if no mime type provided --- src/util/recorder.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index bbaffa436..675308d90 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -1404,7 +1404,7 @@ export class Recorder extends EventEmitter { mime = ct.split(";")[0]; } - const result = !isHTMLMime(mime); + const result = !!mime && !isHTMLMime(mime); if (result) { logger.info( From f33d35011165eb4faf0531b56cfa974a1fe17c3c Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Wed, 19 Mar 2025 18:48:26 -0700 Subject: [PATCH 2/3] several fixes for sso-redirect workflow: - if a seed page redirects to another page, and then back (such as for sso), ensure original seed is used for link extraction - don't allow direct fetch if no mime type at all - don't add --lang if using profile, display warning, as language override may invalidate profile settings - add temp extra delay if seed page redirects, to ensure any sso-related redirects finish --- src/crawler.ts | 47 +++++++++++++++++++++++++++++++++++++------ src/replaycrawler.ts | 2 +- src/util/constants.ts | 1 + 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/src/crawler.ts b/src/crawler.ts index 61c7516f3..d992e17d8 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -47,6 +47,7 @@ import { ExitCodes, InterruptReason, BxFunctionBindings, + SEED_REDIRECT_ADD_DELAY, } from "./util/constants.js"; import { AdBlockRules, BlockRuleDecl, BlockRules } from "./util/blockrules.js"; @@ -592,7 +593,14 @@ export class Crawler { extraChromeArgs() { const args = []; if (this.params.lang) { - args.push(`--accept-lang=${this.params.lang}`); + if (this.params.profile) { + logger.warn( + "Ignoring --lang option with profile, using language configured in the profile", + { 
lang: this.params.lang }, + ); + } else { + args.push(`--accept-lang=${this.params.lang}`); + } } return args; } @@ -2123,6 +2131,8 @@ self.__bx_behaviors.selectMainBehavior(); const respUrl = resp.url().split("#")[0]; const isChromeError = page.url().startsWith("chrome-error://"); + let thisPageDelay = 0; + let originalSeedId = null; if ( depth === 0 && @@ -2131,6 +2141,7 @@ self.__bx_behaviors.selectMainBehavior(); respUrl + "/" !== url && !downloadResponse ) { + originalSeedId = data.seedId; data.seedId = await this.crawlState.addExtraSeed( this.seeds, this.numOriginalSeeds, @@ -2142,6 +2153,7 @@ self.__bx_behaviors.selectMainBehavior(); newUrl: respUrl, seedId: data.seedId, }); + thisPageDelay = SEED_REDIRECT_ADD_DELAY; } const status = resp.status(); @@ -2228,7 +2240,7 @@ self.__bx_behaviors.selectMainBehavior(); await this.netIdle(page, logDetails); - await this.awaitPageLoad(page.mainFrame(), logDetails); + await this.awaitPageLoad(page.mainFrame(), thisPageDelay, logDetails); // skip extraction if at max depth if (seed.isAtMaxDepth(depth, extraHops)) { @@ -2242,6 +2254,27 @@ self.__bx_behaviors.selectMainBehavior(); "links", ); + const pageUrl = page.url().split("#")[0]; + + if (depth === 0 && respUrl !== urlNoHash) { + if (pageUrl === urlNoHash && originalSeedId !== null) { + logger.info("Seed page redirected back to original seed", { pageUrl }); + data.seedId = originalSeedId; + } else { + data.seedId = await this.crawlState.addExtraSeed( + this.seeds, + this.numOriginalSeeds, + data.seedId, + pageUrl, + ); + logger.info("Seed page redirected, adding redirected seed", { + origUrl: respUrl, + newUrl: pageUrl, + seedId: data.seedId, + }); + } + } + await this.extractLinks(page, data, this.params.selectLinks, logDetails); } @@ -2263,7 +2296,7 @@ self.__bx_behaviors.selectMainBehavior(); } } - async awaitPageLoad(frame: Frame, logDetails: LogDetails) { + async awaitPageLoad(frame: Frame, tempDelay: number, logDetails: LogDetails) { if 
(this.params.behaviorOpts) { try { await timedRun( @@ -2279,11 +2312,13 @@ self.__bx_behaviors.selectMainBehavior(); } } - if (this.params.postLoadDelay) { + const delay = tempDelay + this.params.postLoadDelay; + + if (delay) { logger.info("Awaiting post load delay", { - seconds: this.params.postLoadDelay, + seconds: delay, }); - await sleep(this.params.postLoadDelay); + await sleep(delay); } } diff --git a/src/replaycrawler.ts b/src/replaycrawler.ts index 75abfc4ef..aa50a6cfd 100644 --- a/src/replaycrawler.ts +++ b/src/replaycrawler.ts @@ -450,7 +450,7 @@ export class ReplayCrawler extends Crawler { // optionally reload (todo: reevaluate if this is needed) // await page.reload(); - await this.awaitPageLoad(replayFrame, logDetails); + await this.awaitPageLoad(replayFrame, 0, logDetails); data.isHTMLPage = true; diff --git a/src/util/constants.ts b/src/util/constants.ts index 0f75df739..a2d4eab78 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -38,6 +38,7 @@ export const DEFAULT_MAX_RETRIES = 2; export const FETCH_HEADERS_TIMEOUT_SECS = 30; export const PAGE_OP_TIMEOUT_SECS = 5; export const SITEMAP_INITIAL_FETCH_TIMEOUT_SECS = 30; +export const SEED_REDIRECT_ADD_DELAY = 20; export type ExtractSelector = { selector: string; From 34e1579a42c817e79427e3e9dfd5e2d48fe914f4 Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 20 Mar 2025 19:16:21 -0700 Subject: [PATCH 3/3] edge cases: check for page responses which are non-400 / or missing mime, possibly a captcha/sso check, remove from dupe check to allow recapture --- src/util/recorder.ts | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/src/util/recorder.ts b/src/util/recorder.ts index 675308d90..699db6a8f 100644 --- a/src/util/recorder.ts +++ b/src/util/recorder.ts @@ -507,7 +507,7 @@ export class Recorder extends EventEmitter { return; } - this.serializeToWARC(reqresp).catch((e) => + this.serializeToWARC(reqresp, true).catch((e) => logger.warn("Error Serializing to 
 WARC", e, "recorder"),
     );
   }
@@ -1327,7 +1327,7 @@ export class Recorder extends EventEmitter {
     return reqresp;
   }
 
-  async serializeToWARC(reqresp: RequestResponseInfo) {
+  async serializeToWARC(reqresp: RequestResponseInfo, fromFinished = false) {
     // always include in pageinfo record if going to serialize to WARC
     // even if serialization does not happen
     this.addPageRecord(reqresp);
@@ -1371,6 +1371,15 @@ export class Recorder extends EventEmitter {
 
     const requestRecord = createRequest(reqresp, responseRecord, this.pageid);
     this.writer.writeRecordPair(responseRecord, requestRecord);
+
+    // edge case: from finished response load, and page response and no mime type or status != 200, possibly a captcha/sso page
+    // allow it to be captured again
+    if (
+      fromFinished && url === this.pageUrl &&
+      (!reqresp.getMimeType() || status !== 200)
+    ) {
+      await this.crawlState.removeDupe(WRITE_DUPE_KEY, url, status);
+    }
   }
 
   async directFetchCapture({