Commit 0090df9

feat: add maxCrawlDepth crawler option (#3045)
Adds the `BasicCrawlerOptions.maxCrawlDepth` option. It works with `addRequests` and `enqueueLinks` **called from the crawling context** (these have the current request's depth bound to them and can propagate it to newly enqueued requests). Closes #2633
1 parent 8c8a817 commit 0090df9
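
A minimal usage sketch (assuming an otherwise default crawler setup; the start URL is illustrative). With `maxCrawlDepth: 1`, the start request runs at depth 0, links found on it run at depth 1, and anything enqueued from depth-1 pages is skipped with the new 'depth' reason:

import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    maxCrawlDepth: 1,
    async requestHandler({ request, enqueueLinks, log }) {
        log.info(`Processing ${request.url}`);
        // The context-bound enqueueLinks knows this request's depth and
        // stamps crawlDepth on everything it enqueues.
        await enqueueLinks();
    },
});

await crawler.run(['https://crawlee.dev']);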

12 files changed: +345, -76 lines

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 52 additions & 2 deletions
@@ -266,6 +266,13 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      */
     maxRequestsPerCrawl?: number;
 
+    /**
+     * Maximum depth of the crawl. If not set, the crawl will continue until all requests are processed.
+     * Setting this to `0` will only process the initial requests, skipping all links enqueued by `crawlingContext.enqueueLinks` and `crawlingContext.addRequests`.
+     * Passing `1` will process the initial requests and all links enqueued by `crawlingContext.enqueueLinks` and `crawlingContext.addRequests` in the handler for initial requests.
+     */
+    maxCrawlDepth?: number;
+
     /**
      * Custom options passed to the underlying {@apilink AutoscaledPool} constructor.
      * > *NOTE:* The {@apilink AutoscaledPoolOptions.runTaskFunction|`runTaskFunction`}
@@ -516,6 +523,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
     protected requestHandlerTimeoutMillis!: number;
     protected internalTimeoutMillis: number;
     protected maxRequestRetries: number;
+    protected maxCrawlDepth?: number;
     protected sameDomainDelayMillis: number;
     protected domainAccessedTime: Map<string, number>;
     protected maxSessionRotations: number;
@@ -559,6 +567,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
         sameDomainDelaySecs: ow.optional.number,
         maxSessionRotations: ow.optional.number,
         maxRequestsPerCrawl: ow.optional.number,
+        maxCrawlDepth: ow.optional.number,
         autoscaledPoolOptions: ow.optional.object,
         sessionPoolOptions: ow.optional.object,
         useSessionPool: ow.optional.boolean,
@@ -600,6 +609,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
             sameDomainDelaySecs = 0,
             maxSessionRotations = 10,
             maxRequestsPerCrawl,
+            maxCrawlDepth,
             autoscaledPoolOptions = {},
             keepAlive,
             sessionPoolOptions = {},
@@ -711,6 +721,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
         }
 
         this.maxRequestRetries = maxRequestRetries;
+        this.maxCrawlDepth = maxCrawlDepth;
         this.sameDomainDelayMillis = sameDomainDelaySecs * 1000;
         this.maxSessionRotations = maxSessionRotations;
         this.stats = new Statistics({
@@ -1112,8 +1123,10 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
 
         const skippedBecauseOfRobots = new Set<string>();
         const skippedBecauseOfLimit = new Set<string>();
+        const skippedBecauseOfMaxCrawlDepth = new Set<string>();
 
         const isAllowedBasedOnRobotsTxtFile = this.isAllowedBasedOnRobotsTxtFile.bind(this);
+        const maxCrawlDepth = this.maxCrawlDepth;
 
         async function* filteredRequests() {
             let yieldedRequestCount = 0;
@@ -1126,6 +1139,11 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
                     continue;
                 }
 
+                if (maxCrawlDepth !== undefined && (request as any).crawlDepth > maxCrawlDepth) {
+                    skippedBecauseOfMaxCrawlDepth.add(url);
+                    continue;
+                }
+
                 if (await isAllowedBasedOnRobotsTxtFile(url)) {
                     yield request;
                     yieldedRequestCount += 1;
@@ -1143,7 +1161,11 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
             });
         }
 
-        if (skippedBecauseOfRobots.size > 0 || skippedBecauseOfLimit.size > 0) {
+        if (
+            skippedBecauseOfRobots.size > 0 ||
+            skippedBecauseOfLimit.size > 0 ||
+            skippedBecauseOfMaxCrawlDepth.size > 0
+        ) {
             await Promise.all(
                 [...skippedBecauseOfRobots]
                     .map((url) => {
@@ -1153,6 +1175,9 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
                 [...skippedBecauseOfLimit].map((url) => {
                     return this.handleSkippedRequest({ url, reason: 'limit' });
                 }),
+                [...skippedBecauseOfMaxCrawlDepth].map((url) => {
+                    return this.handleSkippedRequest({ url, reason: 'depth' });
+                }),
             ),
         );
     }
@@ -1480,10 +1505,35 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
             robotsTxtFile: await this.getRobotsTxtFileForUrl(request!.url),
             onSkippedRequest: this.handleSkippedRequest,
             limit: this.calculateEnqueuedRequestLimit(options.limit),
+            transformRequestFunction: (newRequest) => {
+                newRequest.crawlDepth = (request?.crawlDepth ?? 0) + 1;
+
+                if (this.maxCrawlDepth !== undefined && newRequest.crawlDepth > this.maxCrawlDepth) {
+                    newRequest.skippedReason = 'depth';
+                    return false;
+                }
+
+                return options.transformRequestFunction?.(newRequest) ?? newRequest;
+            },
             ...options,
         });
     },
-    addRequests: this.addRequests.bind(this),
+    addRequests: async (requests: RequestsLike, options: CrawlerAddRequestsOptions = {}) => {
+        const newRequestDepth = (request?.crawlDepth ?? 0) + 1;
+
+        async function* injectDepth() {
+            for await (const rq of requests) {
+                if (typeof rq === 'string') {
+                    yield { url: rq, crawlDepth: newRequestDepth };
+                } else {
+                    rq.crawlDepth ??= newRequestDepth;
+                    yield rq;
+                }
+            }
+        }
+
+        return this.addRequests(injectDepth(), options);
+    },
     pushData: this.pushData.bind(this),
     useState: this.useState.bind(this),
     sendRequest: createSendRequest(this.httpClient, request!, session, () => crawlingContext.proxyInfo?.url),
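
The context-bound `addRequests` above normalizes both bare URL strings and request objects while stamping the child depth. The same wrapper, sketched standalone (illustrative names, not the actual crawlee internals):

type RequestLike = string | { url: string; crawlDepth?: number };

// Stamp parentDepth + 1 on each request; bare URLs are promoted to request
// objects, while an explicitly set crawlDepth is respected (??= fills gaps only).
async function* injectDepth(
    requests: Iterable<RequestLike> | AsyncIterable<RequestLike>,
    parentDepth: number,
) {
    const childDepth = parentDepth + 1;
    for await (const rq of requests) {
        if (typeof rq === 'string') {
            yield { url: rq, crawlDepth: childDepth };
        } else {
            rq.crawlDepth ??= childDepth;
            yield rq;
        }
    }
}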

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 39 additions & 17 deletions
@@ -1,6 +1,7 @@
 import type {
     Awaitable,
     BasicCrawlerOptions,
+    BasicCrawlingContext,
     CrawlingContext,
     Dictionary,
     EnqueueLinksOptions,
@@ -623,6 +624,7 @@ export abstract class BrowserCrawler<
             crawlingContext.proxyInfo = browserControllerInstance.launchContext.proxyInfo as ProxyInfo;
         }
 
+        const contextEnqueueLinks = crawlingContext.enqueueLinks;
         crawlingContext.enqueueLinks = async (enqueueOptions) => {
             return browserCrawlerEnqueueLinks({
                 options: { ...enqueueOptions, limit: this.calculateEnqueuedRequestLimit(enqueueOptions?.limit) },
@@ -632,6 +634,7 @@ export abstract class BrowserCrawler<
                 onSkippedRequest: this.handleSkippedRequest,
                 originalRequestUrl: crawlingContext.request.url,
                 finalRequestUrl: crawlingContext.request.loadedUrl,
+                enqueueLinks: contextEnqueueLinks,
             });
         };
     }
@@ -802,35 +805,54 @@ interface EnqueueLinksInternalOptions {
 }
 
 /** @internal */
-export async function browserCrawlerEnqueueLinks({
-    options,
-    page,
-    requestQueue,
-    robotsTxtFile,
-    onSkippedRequest,
-    originalRequestUrl,
-    finalRequestUrl,
-}: EnqueueLinksInternalOptions) {
+interface BoundEnqueueLinksInternalOptions {
+    enqueueLinks: BasicCrawlingContext['enqueueLinks'];
+    options?: ReadonlyDeep<Omit<EnqueueLinksOptions, 'requestQueue'>> & Pick<EnqueueLinksOptions, 'requestQueue'>;
+    originalRequestUrl: string;
+    finalRequestUrl?: string;
+    page: CommonPage;
+}
+
+/** @internal */
+function containsEnqueueLinks(
+    options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions,
+): options is BoundEnqueueLinksInternalOptions {
+    return !!(options as BoundEnqueueLinksInternalOptions).enqueueLinks;
+}
+
+/** @internal */
+export async function browserCrawlerEnqueueLinks(
    options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions,
+) {
+    const { options: enqueueLinksOptions, finalRequestUrl, originalRequestUrl, page } = options;
+
     const baseUrl = resolveBaseUrlForEnqueueLinksFiltering({
-        enqueueStrategy: options?.strategy,
+        enqueueStrategy: enqueueLinksOptions?.strategy,
         finalRequestUrl,
         originalRequestUrl,
-        userProvidedBaseUrl: options?.baseUrl,
+        userProvidedBaseUrl: enqueueLinksOptions?.baseUrl,
     });
 
     const urls = await extractUrlsFromPage(
         page as any,
-        options?.selector ?? 'a',
-        options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl,
+        enqueueLinksOptions?.selector ?? 'a',
+        enqueueLinksOptions?.baseUrl ?? finalRequestUrl ?? originalRequestUrl,
     );
 
+    if (containsEnqueueLinks(options)) {
+        return options.enqueueLinks({
+            urls,
+            baseUrl,
+            ...enqueueLinksOptions,
+        });
+    }
     return enqueueLinks({
-        requestQueue,
-        robotsTxtFile,
-        onSkippedRequest,
+        requestQueue: options.requestQueue,
+        robotsTxtFile: options.robotsTxtFile,
+        onSkippedRequest: options.onSkippedRequest,
         urls,
         baseUrl,
-        ...(options as EnqueueLinksOptions),
+        ...(enqueueLinksOptions as EnqueueLinksOptions),
     });
 }
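
Both this helper and the cheerio one below accept a union of the raw internal options and the new bound variant, discriminated at runtime by `containsEnqueueLinks`, a property-presence type guard. The pattern in isolation (hypothetical names):

interface RawOptions {
    requestQueue: object;
}

interface BoundOptions {
    enqueueLinks: (options: Record<string, unknown>) => Promise<unknown>;
}

// A truthy `enqueueLinks` property marks the bound variant.
function isBound(options: RawOptions | BoundOptions): options is BoundOptions {
    return !!(options as BoundOptions).enqueueLinks;
}

async function dispatch(options: RawOptions | BoundOptions) {
    if (isBound(options)) {
        // Narrowed to BoundOptions: delegating to the context-bound function
        // keeps the depth-stamping logic from basic-crawler in the loop.
        return options.enqueueLinks({});
    }
    // Narrowed to RawOptions: the raw core enqueue path runs instead.
    return options.requestQueue;
}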

packages/cheerio-crawler/src/internals/cheerio-crawler.ts

Lines changed: 38 additions & 17 deletions
@@ -2,6 +2,7 @@ import type { IncomingMessage } from 'node:http';
 import { text as readStreamToString } from 'node:stream/consumers';
 
 import type {
+    BasicCrawlingContext,
     Configuration,
     EnqueueLinksOptions,
     ErrorHandler,
@@ -185,6 +186,8 @@ export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
             _useHtmlParser2: true,
         } as CheerioOptions);
 
+        const originalEnqueueLinks = crawlingContext.enqueueLinks;
+
         return {
             dom,
             $,
@@ -198,6 +201,7 @@ export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
                     onSkippedRequest: this.handleSkippedRequest,
                     originalRequestUrl: crawlingContext.request.url,
                     finalRequestUrl: crawlingContext.request.loadedUrl,
+                    enqueueLinks: originalEnqueueLinks,
                 });
             },
         };
@@ -247,40 +251,57 @@ interface EnqueueLinksInternalOptions {
     finalRequestUrl?: string;
 }
 
+interface BoundEnqueueLinksInternalOptions {
+    enqueueLinks: BasicCrawlingContext['enqueueLinks'];
+    options?: EnqueueLinksOptions;
+    $: cheerio.CheerioAPI | null;
+    originalRequestUrl: string;
+    finalRequestUrl?: string;
+}
+
+/** @internal */
+function containsEnqueueLinks(
+    options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions,
+): options is BoundEnqueueLinksInternalOptions {
+    return !!(options as BoundEnqueueLinksInternalOptions).enqueueLinks;
+}
+
 /** @internal */
-export async function cheerioCrawlerEnqueueLinks({
-    options,
-    $,
-    requestQueue,
-    robotsTxtFile,
-    onSkippedRequest,
-    originalRequestUrl,
-    finalRequestUrl,
-}: EnqueueLinksInternalOptions) {
+export async function cheerioCrawlerEnqueueLinks(
+    options: EnqueueLinksInternalOptions | BoundEnqueueLinksInternalOptions,
+) {
+    const { options: enqueueLinksOptions, $, originalRequestUrl, finalRequestUrl } = options;
     if (!$) {
         throw new Error('Cannot enqueue links because the DOM is not available.');
     }
 
     const baseUrl = resolveBaseUrlForEnqueueLinksFiltering({
-        enqueueStrategy: options?.strategy,
+        enqueueStrategy: enqueueLinksOptions?.strategy,
         finalRequestUrl,
         originalRequestUrl,
-        userProvidedBaseUrl: options?.baseUrl,
+        userProvidedBaseUrl: enqueueLinksOptions?.baseUrl,
     });
 
     const urls = extractUrlsFromCheerio(
         $,
-        options?.selector ?? 'a',
-        options?.baseUrl ?? finalRequestUrl ?? originalRequestUrl,
+        enqueueLinksOptions?.selector ?? 'a',
+        enqueueLinksOptions?.baseUrl ?? finalRequestUrl ?? originalRequestUrl,
    );
 
+    if (containsEnqueueLinks(options)) {
+        return options.enqueueLinks({
+            urls,
+            baseUrl,
+            ...enqueueLinksOptions,
+        });
+    }
     return enqueueLinks({
-        requestQueue,
-        robotsTxtFile,
-        onSkippedRequest,
+        requestQueue: options.requestQueue,
+        robotsTxtFile: options.robotsTxtFile,
+        onSkippedRequest: options.onSkippedRequest,
         urls,
         baseUrl,
-        ...options,
+        ...enqueueLinksOptions,
     });
 }
packages/core/src/enqueue_links/enqueue_links.ts

Lines changed: 5 additions & 2 deletions
@@ -401,11 +401,14 @@ export async function enqueueLinks(
         }
     }
 
-    async function reportSkippedRequests(skippedRequests: { url: string }[], reason: SkippedRequestReason) {
+    async function reportSkippedRequests(
+        skippedRequests: { url: string; skippedReason?: SkippedRequestReason }[],
+        reason: SkippedRequestReason,
+    ) {
         if (onSkippedRequest && skippedRequests.length > 0) {
             await Promise.all(
                 skippedRequests.map((request) => {
-                    return onSkippedRequest({ url: request.url, reason });
+                    return onSkippedRequest({ url: request.url, reason: request.skippedReason ?? reason });
                 }),
             );
         }
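
The net effect: a per-request `skippedReason` (for example 'depth', set by the depth-capping `transformRequestFunction`) now takes precedence over the batch-level reason. The precedence rule, sketched standalone:

type Reason = 'robotsTxt' | 'limit' | 'filters' | 'redirect' | 'depth';

// The per-request reason wins; the batch-level reason is only a fallback.
function resolveReason(request: { url: string; skippedReason?: Reason }, batchReason: Reason): Reason {
    return request.skippedReason ?? batchReason;
}

// resolveReason({ url: 'https://a.example', skippedReason: 'depth' }, 'filters') === 'depth'
// resolveReason({ url: 'https://b.example' }, 'filters') === 'filters'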

packages/core/src/enqueue_links/shared.ts

Lines changed: 1 addition & 1 deletion
@@ -47,7 +47,7 @@ export type RegExpObject = { regexp: RegExp } & Pick<
 
 export type RegExpInput = RegExp | RegExpObject;
 
-export type SkippedRequestReason = 'robotsTxt' | 'limit' | 'filters' | 'redirect';
+export type SkippedRequestReason = 'robotsTxt' | 'limit' | 'filters' | 'redirect' | 'depth';
 
 export type SkippedRequestCallback = (args: {
     url: string;
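
With 'depth' in the union, a skipped-request callback can single out depth-capped URLs. A sketch, assuming the crawler exposes such a hook as an `onSkippedRequest` option (that option is not part of this diff):

import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    maxCrawlDepth: 2,
    // Assumed option matching SkippedRequestCallback; this commit only
    // extends the reason union, the option itself is not shown here.
    onSkippedRequest: async ({ url, reason }) => {
        if (reason === 'depth') console.log(`Skipped (too deep): ${url}`);
    },
    async requestHandler({ enqueueLinks }) {
        await enqueueLinks();
    },
});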
