Skip to content

Commit 7bf8f0b

Browse files
janbuchar and barjin authored
feat: Sitemap-based request list implementation (#2498)
This introduces an alternative RequestList implementation based on sitemaps. It should be possible to use this in tandem with RequestProvider in BasicCrawler, just like with the current RequestList.

---------

Co-authored-by: Jindřich Bär <jindrichbar@gmail.com>
1 parent bf01cbd commit 7bf8f0b

File tree

10 files changed

+1218
-196
lines changed

10 files changed

+1218
-196
lines changed

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ import type {
1414
DatasetExportOptions,
1515
FinalStatistics,
1616
GetUserDataFromRequest,
17+
IRequestList,
1718
ProxyInfo,
1819
Request,
19-
RequestList,
2020
RequestOptions,
2121
RequestProvider,
2222
RouterHandler,
@@ -171,7 +171,7 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
171171
* > Alternatively, `requests` parameter of {@apilink BasicCrawler.run|`crawler.run()`} could be used to enqueue the initial requests -
172172
* it is a shortcut for running `crawler.addRequests()` before the `crawler.run()`.
173173
*/
174-
requestList?: RequestList;
174+
requestList?: IRequestList;
175175

176176
/**
177177
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
@@ -445,7 +445,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
445445
* A reference to the underlying {@apilink IRequestList} implementation that manages the crawler's {@apilink Request|requests}.
446446
* Only available if used by the crawler.
447447
*/
448-
requestList?: RequestList;
448+
requestList?: IRequestList;
449449

450450
/**
451451
* Dynamic queue of URLs to be processed. This is useful for recursive crawling of websites.
@@ -1170,7 +1170,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
11701170
* adding it back to the queue after the timeout passes. Returns `true` if the request
11711171
* should be ignored and will be reclaimed to the queue once ready.
11721172
*/
1173-
protected delayRequest(request: Request, source: RequestList | RequestProvider) {
1173+
protected delayRequest(request: Request, source: IRequestList | RequestProvider) {
11741174
const domain = getDomain(request.url);
11751175

11761176
if (!domain || !request) {
@@ -1415,7 +1415,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
14151415
protected async _requestFunctionErrorHandler(
14161416
error: Error,
14171417
crawlingContext: Context,
1418-
source: RequestList | RequestProvider,
1418+
source: IRequestList | RequestProvider,
14191419
): Promise<void> {
14201420
const { request } = crawlingContext;
14211421
request.pushErrorMessage(error);

packages/core/src/storages/index.ts

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,3 +8,4 @@ export { RequestQueue as RequestQueueV2 } from './request_queue_v2';
88
export * from './storage_manager';
99
export * from './utils';
1010
export * from './access_checking';
11+
export * from './sitemap_request_list';

packages/core/src/storages/request_list.ts

Lines changed: 93 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,80 @@ export const REQUESTS_PERSISTENCE_KEY = 'REQUEST_LIST_REQUESTS';
2020

2121
const CONTENT_TYPE_BINARY = 'application/octet-stream';
2222

23+
/**
24+
* Represents a static list of URLs to crawl.
25+
*/
26+
export interface IRequestList {
27+
/**
28+
* Returns the total number of unique requests present in the list.
29+
*/
30+
length(): number;
31+
32+
/**
33+
* Returns `true` if all requests were already handled and there are no more left.
34+
*/
35+
isFinished(): Promise<boolean>;
36+
37+
/**
38+
* Resolves to `true` if the next call to {@apilink IRequestList.fetchNextRequest} function
39+
* would return `null`, otherwise it resolves to `false`.
40+
* Note that even if the list is empty, there might be some pending requests currently being processed.
41+
*/
42+
isEmpty(): Promise<boolean>;
43+
44+
/**
45+
* Returns the number of handled requests.
46+
*/
47+
handledCount(): number;
48+
49+
/**
50+
* Persists the current state of the `IRequestList` into the default {@apilink KeyValueStore}.
51+
* The state is persisted automatically in regular intervals, but calling this method manually
52+
* is useful in cases where you want to have the most current state available after you pause
53+
* or stop fetching its requests. For example after you pause or abort a crawl. Or just before
54+
* a server migration.
55+
*/
56+
persistState(): Promise<void>;
57+
58+
/**
59+
* Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed
60+
* using the {@apilink IRequestList.reclaimRequest} function, if there is any.
61+
* Otherwise it gets the next request from sources.
62+
*
63+
* The function's `Promise` resolves to `null` if there are no more
64+
* requests to process.
65+
*/
66+
fetchNextRequest(): Promise<Request | null>;
67+
68+
/**
69+
* Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed
70+
* using the {@apilink IRequestList.reclaimRequest} function, if there is any.
71+
* Otherwise it gets the next request from sources.
72+
*
73+
* The iteration ends (without yielding `null`) once there are no more requests to process.
74+
*
75+
* Can be used to iterate over the `IRequestList` instance in a `for await .. of` loop.
76+
* Provides an alternative for the repeated use of `fetchNextRequest`.
77+
*/
78+
[Symbol.asyncIterator](): AsyncGenerator<Request>;
79+
80+
/**
81+
* Reclaims request to the list if its processing failed.
82+
* The request will become available again in the next call to `fetchNextRequest()`.
83+
*/
84+
reclaimRequest(request: Request): Promise<void>;
85+
86+
/**
87+
* Marks request as handled after successful processing.
88+
*/
89+
markRequestHandled(request: Request): Promise<void>;
90+
91+
/**
92+
* @internal
93+
*/
94+
inProgress: Set<string>;
95+
}
96+
2397
export interface RequestListOptions {
2498
/**
2599
* An array of sources of URLs for the {@apilink RequestList}. It can be either an array of strings,
@@ -229,7 +303,7 @@ export interface RequestListOptions {
229303
* ```
230304
* @category Sources
231305
*/
232-
export class RequestList {
306+
export class RequestList implements IRequestList {
233307
private log = log.child({ prefix: 'RequestList' });
234308

235309
/**
@@ -431,11 +505,7 @@ export class RequestList {
431505
}
432506

433507
/**
434-
* Persists the current state of the `RequestList` into the default {@apilink KeyValueStore}.
435-
* The state is persisted automatically in regular intervals, but calling this method manually
436-
* is useful in cases where you want to have the most current state available after you pause
437-
* or stop fetching its requests. For example after you pause or abort a crawl. Or just before
438-
* a server migration.
508+
* @inheritDoc
439509
*/
440510
async persistState(): Promise<void> {
441511
if (!this.persistStateKey) {
@@ -570,9 +640,7 @@ export class RequestList {
570640
}
571641

572642
/**
573-
* Resolves to `true` if the next call to {@apilink RequestList.fetchNextRequest} function
574-
* would return `null`, otherwise it resolves to `false`.
575-
* Note that even if the list is empty, there might be some pending requests currently being processed.
643+
* @inheritDoc
576644
*/
577645
async isEmpty(): Promise<boolean> {
578646
this._ensureIsInitialized();
@@ -581,7 +649,7 @@ export class RequestList {
581649
}
582650

583651
/**
584-
* Returns `true` if all requests were already handled and there are no more left.
652+
* @inheritDoc
585653
*/
586654
async isFinished(): Promise<boolean> {
587655
this._ensureIsInitialized();
@@ -590,12 +658,7 @@ export class RequestList {
590658
}
591659

592660
/**
593-
* Gets the next {@apilink Request} to process. First, the function gets a request previously reclaimed
594-
* using the {@apilink RequestList.reclaimRequest} function, if there is any.
595-
* Otherwise it gets the next request from sources.
596-
*
597-
* The function's `Promise` resolves to `null` if there are no more
598-
* requests to process.
661+
* @inheritDoc
599662
*/
600663
async fetchNextRequest(): Promise<Request | null> {
601664
this._ensureIsInitialized();
@@ -621,6 +684,17 @@ export class RequestList {
621684
return null;
622685
}
623686

687+
/**
688+
* @inheritDoc
689+
*/
690+
async *[Symbol.asyncIterator]() {
691+
while (true) {
692+
const req = await this.fetchNextRequest();
693+
if (!req) break;
694+
yield req;
695+
}
696+
}
697+
624698
private ensureRequest(requestLike: Request | RequestOptions, index: number): Request {
625699
if (requestLike instanceof Request) {
626700
return requestLike;
@@ -631,7 +705,7 @@ export class RequestList {
631705
}
632706

633707
/**
634-
* Marks request as handled after successful processing.
708+
* @inheritDoc
635709
*/
636710
async markRequestHandled(request: Request): Promise<void> {
637711
const { uniqueKey } = request;
@@ -645,8 +719,7 @@ export class RequestList {
645719
}
646720

647721
/**
648-
* Reclaims request to the list if its processing failed.
649-
* The request will become available in the next `this.fetchNextRequest()`.
722+
* @inheritDoc
650723
*/
651724
async reclaimRequest(request: Request): Promise<void> {
652725
const { uniqueKey } = request;
@@ -798,7 +871,7 @@ export class RequestList {
798871
}
799872

800873
/**
801-
* Returns number of handled requests.
874+
* @inheritDoc
802875
*/
803876
handledCount(): number {
804877
this._ensureIsInitialized();

0 commit comments

Comments
 (0)