
Commit 60e7136

Add support for --robots crawler flag to Browsertrix
1 parent: aa03965

7 files changed: 37 additions, 0 deletions

backend/btrixcloud/models.py (2 additions, 0 deletions)

@@ -376,6 +376,8 @@ class RawCrawlConfig(BaseModel):
 
     saveStorage: Optional[bool] = False
 
+    robots: Optional[bool] = False
+
 
 # ============================================================================
 class CrawlConfigIn(BaseModel):
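
The new `robots` field is optional and defaults to `False`, so existing crawl configs continue to validate unchanged and robots.txt checking stays off unless explicitly enabled. A minimal standalone sketch of that behavior (not Browsertrix code; the `CrawlConfigSketch` model name is hypothetical, field names are taken from the diff above):

```python
# Standalone sketch (not Browsertrix code): the field is optional and
# defaults to False, so configs that omit "robots" keep today's behavior.
from typing import Optional

from pydantic import BaseModel


class CrawlConfigSketch(BaseModel):
    saveStorage: Optional[bool] = False
    robots: Optional[bool] = False


print(CrawlConfigSketch().robots)             # False: robots.txt checking stays off
print(CrawlConfigSketch(robots=True).robots)  # True: opt in per workflow
```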

frontend/docs/docs/user-guide/workflow-setup.md (4 additions, 0 deletions)

@@ -97,6 +97,10 @@ Refer to a specific [_Crawl Scope_ option](#crawl-scope-options) for details on
 
 **These credentials WILL BE WRITTEN into the archive.** We recommend exercising caution and only archiving with dedicated archival accounts, changing your password or deleting the account when finished.
 
+### Skip Pages Disallowed By Robots.txt
+
+When enabled, the crawler will check for a [Robots Exclusion Protocol](https://www.rfc-editor.org/rfc/rfc9309.html) file at /robots.txt for each host encountered during crawling and skip any pages that are disallowed by the rules found therein.
+
 ### Include Any Linked Page
 
 When enabled, the crawler will visit all the links it finds within each URL defined in the [URL input field](#crawl-start-url-urls-to-crawl) under _Crawl Scope_.
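
For readers unfamiliar with the Robots Exclusion Protocol, the per-host check described in the new docs section can be sketched with Python's standard-library `urllib.robotparser`. This is only an illustration of the protocol semantics, not the crawler's actual implementation (Browsertrix Crawler handles this internally when the flag is set); the `is_allowed` helper and the user agent string are assumptions for the example.

```python
# Illustration only: one robots.txt parser per host, fetched lazily, then
# each candidate URL is checked against that host's Disallow rules.
from urllib.parse import urlsplit
from urllib.robotparser import RobotFileParser

parsers: dict[str, RobotFileParser] = {}  # cache: one parser per origin


def is_allowed(url: str, user_agent: str = "ExampleCrawler") -> bool:
    parts = urlsplit(url)
    origin = f"{parts.scheme}://{parts.netloc}"
    if origin not in parsers:
        rp = RobotFileParser(f"{origin}/robots.txt")
        rp.read()  # fetch and parse the host's robots.txt
        parsers[origin] = rp
    return parsers[origin].can_fetch(user_agent, url)


# Example: a crawler would skip queuing this page if it is disallowed.
if not is_allowed("https://example.com/private/page.html"):
    print("skipping disallowed page")
```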

frontend/src/components/ui/config-details.ts (8 additions, 0 deletions)

@@ -462,6 +462,10 @@ export class ConfigDetails extends BtrixElement {
             msg("Include Any Linked Page (“one hop out”)"),
             Boolean(config.extraHops),
           )}
+          ${this.renderSetting(
+            msg("Skip Pages Disallowed By Robots.txt"),
+            Boolean(config.robots),
+          )}
           ${this.renderSetting(
             msg("Fail Crawl If Not Logged In"),
             Boolean(config.failOnContentCheck),
@@ -536,6 +540,10 @@
             msg("Include Any Linked Page (“one hop out”)"),
             Boolean(primarySeedConfig?.extraHops ?? config.extraHops),
           )}
+          ${this.renderSetting(
+            msg("Skip Pages Disallowed By Robots.txt"),
+            Boolean(config.robots),
+          )}
           ${this.renderSetting(
             msg("Check For Sitemap"),
             Boolean(config.useSitemap),

frontend/src/features/crawl-workflows/workflow-editor.ts (16 additions, 0 deletions)

@@ -1036,6 +1036,12 @@ export class WorkflowEditor extends BtrixElement {
           </sl-checkbox>
         `)}
         ${this.renderHelpTextCol(infoTextFor["includeLinkedPages"], false)}
+        ${inputCol(html`
+          <sl-checkbox name="robots" ?checked=${this.formState.robots}>
+            ${msg("Skip pages disallowed by robots.txt")}
+          </sl-checkbox>
+        `)}
+        ${this.renderHelpTextCol(infoTextFor["robots"], false)}
         ${inputCol(html`
           <sl-checkbox
             name="failOnContentCheck"
@@ -1553,6 +1559,12 @@ https://example.net`}
           </sl-checkbox>
         `)}
         ${this.renderHelpTextCol(infoTextFor["includeLinkedPages"], false)}
+        ${inputCol(html`
+          <sl-checkbox name="robots" ?checked=${this.formState.robots}>
+            ${msg("Skip pages disallowed by robots.txt")}
+          </sl-checkbox>
+        `)}
+        ${this.renderHelpTextCol(infoTextFor["robots"], false)}
         ${inputCol(html`
           <sl-checkbox name="useSitemap" ?checked=${this.formState.useSitemap}>
             ${msg("Check for sitemap")}
@@ -3263,6 +3275,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
     | "failOnFailedSeed"
     | "failOnContentCheck"
     | "saveStorage"
+    | "robots"
   > {
     const jsonSeeds = this.formState.seedListFormat === SeedListFormat.JSON;
 
@@ -3282,6 +3295,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       failOnFailedSeed: this.formState.failOnFailedSeed,
       failOnContentCheck: this.formState.failOnContentCheck,
       saveStorage: this.formState.saveStorage,
+      robots: this.formState.robots,
     };
 
     return config;
@@ -3295,6 +3309,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
     | "failOnFailedSeed"
     | "failOnContentCheck"
    | "saveStorage"
+    | "robots"
   > {
     const primarySeedUrl = this.formState.primarySeedUrl;
     const includeUrlList = this.formState.customIncludeUrlList
@@ -3327,6 +3342,7 @@ https://archiveweb.page/images/${"logo.svg"}`}
       failOnFailedSeed: false,
       failOnContentCheck: this.formState.failOnContentCheck,
      saveStorage: this.formState.saveStorage,
+      robots: this.formState.robots,
     };
     return config;
   }

frontend/src/strings/crawl-workflows/infoText.ts (3 additions, 0 deletions)

@@ -85,6 +85,9 @@ export const infoTextFor = {
   saveStorage: msg(
     `Include data from the browser's local and session storage in the web archive.`,
   ),
+  robots: msg(
+    `Check for a /robots.txt for each host and skip any disallowed pages.`,
+  ),
 } as const satisfies Partial<Record<Field, string | TemplateResult>>;
 
 export default infoTextFor;

frontend/src/types/crawler.ts (1 addition, 0 deletions)

@@ -55,6 +55,7 @@ export type SeedConfig = Expand<
     customBehaviors: string[];
     clickSelector: string;
     saveStorage?: boolean;
+    robots?: boolean;
   }
 >;

frontend/src/utils/workflow.ts (3 additions, 0 deletions)

@@ -184,6 +184,7 @@ export type FormState = {
   selectLinks: string[];
   clickSelector: string;
   saveStorage: WorkflowParams["config"]["saveStorage"];
+  robots: WorkflowParams["config"]["robots"];
 };
 
 export type FormStateField = keyof FormState;
@@ -246,6 +247,7 @@ export const getDefaultFormState = (): FormState => ({
   clickSelector: DEFAULT_AUTOCLICK_SELECTOR,
   customBehavior: false,
   saveStorage: false,
+  robots: false,
 });
 
 export const mapSeedToUrl = (arr: Seed[]) =>
@@ -416,6 +418,7 @@ export function getInitialFormState(params: {
       params.initialWorkflow.crawlerChannel || defaultFormState.crawlerChannel,
     proxyId: params.initialWorkflow.proxyId || defaultFormState.proxyId,
     saveStorage: params.initialWorkflow.config.saveStorage,
+    robots: params.initialWorkflow.config.robots,
     ...formState,
   };
 }
