Commit 764f992

feat: add onSkippedRequest option (#2916)
Related: #2910
1 parent: 99af95e

13 files changed (+190 -4 lines)

packages/basic-crawler/src/internals/basic-crawler.ts

Lines changed: 26 additions & 1 deletion
@@ -21,6 +21,7 @@ import type {
     RouterRoutes,
     Session,
     SessionPoolOptions,
+    SkippedRequestCallback,
     Source,
     StatisticsOptions,
     StatisticState,
@@ -349,6 +350,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      */
     respectRobotsTxtFile?: boolean;

+    /**
+     * When a request is skipped for some reason, you can use this callback to act on it.
+     * This is currently fired only for requests skipped based on robots.txt file.
+     */
+    onSkippedRequest?: SkippedRequestCallback;
+
     /** @internal */
     log?: Log;

@@ -517,6 +524,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
    protected httpClient: BaseHttpClient;
    protected retryOnBlocked: boolean;
    protected respectRobotsTxtFile: boolean;
+   protected onSkippedRequest?: SkippedRequestCallback;
    private _closeEvents?: boolean;

    private experiments: CrawlerExperiments;
@@ -552,6 +560,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

            retryOnBlocked: ow.optional.boolean,
            respectRobotsTxtFile: ow.optional.boolean,
+           onSkippedRequest: ow.optional.function,
            httpClient: ow.optional.object,

            // AutoscaledPool shorthands
@@ -595,6 +604,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

            retryOnBlocked = false,
            respectRobotsTxtFile = false,
+           onSkippedRequest,

            // internal
            log = defaultLog.child({ prefix: this.constructor.name }),
@@ -668,6 +678,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext

        this.retryOnBlocked = retryOnBlocked;
        this.respectRobotsTxtFile = respectRobotsTxtFile;
+       this.onSkippedRequest = onSkippedRequest;

        this._handlePropertyNameChange({
            newName: 'requestHandlerTimeoutSecs',
@@ -1059,13 +1070,22 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
                allowedRequests.push(request);
            } else {
                skipped.add(url);
+               await this.onSkippedRequest?.({ url, reason: 'robotsTxt' });
            }
        }

        if (skipped.size > 0) {
            this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
                skipped: [...skipped],
            });
+
+           if (this.onSkippedRequest) {
+               await Promise.all(
+                   [...skipped].map((url) => {
+                       return this.onSkippedRequest!({ url, reason: 'robotsTxt' });
+                   }),
+               );
+           }
        }

        return requestQueue.addRequestsBatched(allowedRequests, options);
@@ -1355,12 +1375,16 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
        }

        if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
-           this.log.debug(
+           this.log.warning(
                `Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`,
            );
            request.state = RequestState.SKIPPED;
            request.noRetry = true;
            await source.markRequestHandled(request);
+           await this.onSkippedRequest?.({
+               url: request.url,
+               reason: 'robotsTxt',
+           });
            return;
        }

@@ -1385,6 +1409,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
                // specify the RQ first to allow overriding it
                requestQueue: await this.getRequestQueue(),
                robotsTxtFile: await this.getRobotsTxtFileForUrl(request!.url),
+               onSkippedRequest: this.onSkippedRequest,
                ...options,
            });
        },
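
For reference, a minimal usage sketch of the new crawler-level option. The crawler class, target URL, and logging below are illustrative, not part of this commit:

```ts
import { CheerioCrawler } from 'crawlee';

const crawler = new CheerioCrawler({
    // Only process URLs allowed by the site's robots.txt.
    respectRobotsTxtFile: true,
    // Added in this commit: called for requests skipped because of robots.txt.
    onSkippedRequest: async ({ url, reason }) => {
        console.log(`Skipped ${url} (reason: ${reason})`);
    },
    async requestHandler({ request, enqueueLinks }) {
        console.log(`Processing ${request.url}`);
        // Disallowed links discovered here also trigger the callback,
        // because the crawler forwards it to enqueueLinks internally.
        await enqueueLinks();
    },
});

await crawler.run(['https://example.com']);
```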

packages/browser-crawler/src/internals/browser-crawler.ts

Lines changed: 5 additions & 0 deletions
@@ -11,6 +11,7 @@ import type {
    RequestHandler,
    RequestProvider,
    Session,
+   SkippedRequestCallback,
} from '@crawlee/basic';
import {
    BASIC_CRAWLER_TIMEOUT_BUFFER_SECS,
@@ -626,6 +627,7 @@ export abstract class BrowserCrawler<
            page,
            requestQueue: await this.getRequestQueue(),
            robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
+           onSkippedRequest: this.onSkippedRequest,
            originalRequestUrl: crawlingContext.request.url,
            finalRequestUrl: crawlingContext.request.loadedUrl,
        });
@@ -792,6 +794,7 @@ interface EnqueueLinksInternalOptions {
    page: CommonPage;
    requestQueue: RequestProvider;
    robotsTxtFile?: RobotsTxtFile;
+   onSkippedRequest?: SkippedRequestCallback;
    originalRequestUrl: string;
    finalRequestUrl?: string;
}
@@ -802,6 +805,7 @@ export async function browserCrawlerEnqueueLinks({
    page,
    requestQueue,
    robotsTxtFile,
+   onSkippedRequest,
    originalRequestUrl,
    finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -821,6 +825,7 @@ export async function browserCrawlerEnqueueLinks({
    return enqueueLinks({
        requestQueue,
        robotsTxtFile,
+       onSkippedRequest,
        urls,
        baseUrl,
        ...(options as EnqueueLinksOptions),

packages/cheerio-crawler/src/internals/cheerio-crawler.ts

Lines changed: 5 additions & 0 deletions
@@ -12,6 +12,7 @@ import type {
    RequestHandler,
    RequestProvider,
    RouterRoutes,
+   SkippedRequestCallback,
} from '@crawlee/http';
import { enqueueLinks, HttpCrawler, resolveBaseUrlForEnqueueLinksFiltering, Router } from '@crawlee/http';
import type { Dictionary } from '@crawlee/types';
@@ -194,6 +195,7 @@ export class CheerioCrawler extends HttpCrawler<CheerioCrawlingContext> {
            $,
            requestQueue: await this.getRequestQueue(),
            robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
+           onSkippedRequest: this.onSkippedRequest,
            originalRequestUrl: crawlingContext.request.url,
            finalRequestUrl: crawlingContext.request.loadedUrl,
        });
@@ -240,6 +242,7 @@ interface EnqueueLinksInternalOptions {
    $: cheerio.CheerioAPI | null;
    requestQueue: RequestProvider;
    robotsTxtFile?: RobotsTxtFile;
+   onSkippedRequest?: SkippedRequestCallback;
    originalRequestUrl: string;
    finalRequestUrl?: string;
}
@@ -250,6 +253,7 @@ export async function cheerioCrawlerEnqueueLinks({
    $,
    requestQueue,
    robotsTxtFile,
+   onSkippedRequest,
    originalRequestUrl,
    finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -273,6 +277,7 @@ export async function cheerioCrawlerEnqueueLinks({
    return enqueueLinks({
        requestQueue,
        robotsTxtFile,
+       onSkippedRequest,
        urls,
        baseUrl,
        ...options,

packages/core/src/enqueue_links/enqueue_links.ts

Lines changed: 27 additions & 2 deletions
@@ -1,4 +1,4 @@
-import type { BatchAddRequestsResult, Dictionary } from '@crawlee/types';
+import type { Awaitable, BatchAddRequestsResult, Dictionary } from '@crawlee/types';
 import { type RobotsTxtFile } from '@crawlee/utils';
 import ow from 'ow';
 import { getDomain } from 'tldts';
@@ -18,6 +18,8 @@ import {
    filterRequestsByPatterns,
} from './shared';

+export type SkippedRequestCallback = (args: { url: string; reason: 'robotsTxt' }) => Awaitable<void>;
+
export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
    /** Limit the amount of actually enqueued URLs to this number. Useful for testing across the entire crawling scope. */
    limit?: number;
@@ -165,6 +167,12 @@ export interface EnqueueLinksOptions extends RequestQueueOperationOptions {
     * If provided, disallowed URLs will be ignored.
     */
    robotsTxtFile?: RobotsTxtFile;
+
+   /**
+    * When a request is skipped for some reason, you can use this callback to act on it.
+    * This is currently fired only for requests skipped based on robots.txt file.
+    */
+   onSkippedRequest?: SkippedRequestCallback;
}

/**
@@ -264,6 +272,7 @@ export async function enqueueLinks(
        urls: ow.array.ofType(ow.string),
        requestQueue: ow.object.hasKeys('fetchNextRequest', 'addRequest'),
        robotsTxtFile: ow.optional.object.hasKeys('isAllowed'),
+       onSkippedRequest: ow.optional.function,
        forefront: ow.optional.boolean,
        skipNavigation: ow.optional.boolean,
        limit: ow.optional.number,
@@ -295,6 +304,7 @@ export async function enqueueLinks(
        forefront,
        waitForAllRequestsToBeAdded,
        robotsTxtFile,
+       onSkippedRequest,
    } = options;

    const urlExcludePatternObjects: UrlPatternObject[] = [];
@@ -373,9 +383,24 @@ export async function enqueueLinks(
    let requestOptions = createRequestOptions(urls, options);

    if (robotsTxtFile) {
+       const skippedRequests: RequestOptions[] = [];
+
        requestOptions = requestOptions.filter((request) => {
-           return robotsTxtFile.isAllowed(request.url);
+           if (robotsTxtFile.isAllowed(request.url)) {
+               return true;
+           }
+
+           skippedRequests.push(request);
+           return false;
        });
+
+       if (onSkippedRequest && skippedRequests.length > 0) {
+           await Promise.all(
+               skippedRequests.map((request) => {
+                   return onSkippedRequest({ url: request.url, reason: 'robotsTxt' });
+               }),
+           );
+       }
    }

    if (transformRequestFunction) {
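
The callback can also be supplied per `enqueueLinks()` call. A sketch, assuming `SkippedRequestCallback` and `Dataset` are re-exported from the `crawlee` metapackage like other core exports; the glob pattern and dataset usage are illustrative:

```ts
import { Dataset, type SkippedRequestCallback } from 'crawlee';

// Record every URL that enqueueLinks drops because robots.txt disallows it.
const onSkippedRequest: SkippedRequestCallback = async ({ url, reason }) => {
    await Dataset.pushData({ url, reason, skippedAt: new Date().toISOString() });
};

// Inside any request handler:
//   await enqueueLinks({ globs: ['https://example.com/**'], onSkippedRequest });
```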

packages/jsdom-crawler/src/internals/jsdom-crawler.ts

Lines changed: 5 additions & 0 deletions
@@ -11,6 +11,7 @@ import type {
    RequestHandler,
    RequestProvider,
    RouterRoutes,
+   SkippedRequestCallback,
} from '@crawlee/http';
import {
    enqueueLinks,
@@ -305,6 +306,7 @@ export class JSDOMCrawler extends HttpCrawler<JSDOMCrawlingContext> {
            window,
            requestQueue: await this.getRequestQueue(),
            robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
+           onSkippedRequest: this.onSkippedRequest,
            originalRequestUrl: crawlingContext.request.url,
            finalRequestUrl: crawlingContext.request.loadedUrl,
        });
@@ -345,6 +347,7 @@ interface EnqueueLinksInternalOptions {
    window: DOMWindow | null;
    requestQueue: RequestProvider;
    robotsTxtFile?: RobotsTxtFile;
+   onSkippedRequest?: SkippedRequestCallback;
    originalRequestUrl: string;
    finalRequestUrl?: string;
}
@@ -355,6 +358,7 @@ export async function domCrawlerEnqueueLinks({
    window,
    requestQueue,
    robotsTxtFile,
+   onSkippedRequest,
    originalRequestUrl,
    finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -378,6 +382,7 @@ export async function domCrawlerEnqueueLinks({
    return enqueueLinks({
        requestQueue,
        robotsTxtFile,
+       onSkippedRequest,
        urls,
        baseUrl,
        ...options,

packages/linkedom-crawler/src/internals/linkedom-crawler.ts

Lines changed: 5 additions & 0 deletions
@@ -10,6 +10,7 @@ import type {
    RequestHandler,
    RequestProvider,
    RouterRoutes,
+   SkippedRequestCallback,
} from '@crawlee/http';
import {
    enqueueLinks,
@@ -188,6 +189,7 @@ export class LinkeDOMCrawler extends HttpCrawler<LinkeDOMCrawlingContext> {
            window: document.defaultView,
            requestQueue: await this.getRequestQueue(),
            robotsTxtFile: await this.getRobotsTxtFileForUrl(crawlingContext.request.url),
+           onSkippedRequest: this.onSkippedRequest,
            originalRequestUrl: crawlingContext.request.url,
            finalRequestUrl: crawlingContext.request.loadedUrl,
        });
@@ -228,6 +230,7 @@ interface EnqueueLinksInternalOptions {
    window: Window | null;
    requestQueue: RequestProvider;
    robotsTxtFile?: RobotsTxtFile;
+   onSkippedRequest?: SkippedRequestCallback;
    originalRequestUrl: string;
    finalRequestUrl?: string;
}
@@ -238,6 +241,7 @@ export async function linkedomCrawlerEnqueueLinks({
    window,
    requestQueue,
    robotsTxtFile,
+   onSkippedRequest,
    originalRequestUrl,
    finalRequestUrl,
}: EnqueueLinksInternalOptions) {
@@ -261,6 +265,7 @@ export async function linkedomCrawlerEnqueueLinks({
    return enqueueLinks({
        requestQueue,
        robotsTxtFile,
+       onSkippedRequest,
        urls,
        baseUrl,
        ...options,
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+{
+    "actorSpecification": 1,
+    "name": "test-adaptive-playwright-robots-file",
+    "version": "0.0",
+    "buildTag": "latest",
+    "env": null
+}
Lines changed: 7 additions & 0 deletions
@@ -0,0 +1,7 @@
+.idea
+.DS_Store
+node_modules
+package-lock.json
+apify_storage
+crawlee_storage
+storage
Lines changed: 25 additions & 0 deletions
@@ -0,0 +1,25 @@
+FROM node:20 AS builder
+
+COPY /packages ./packages
+COPY /package*.json ./
+RUN npm --quiet set progress=false \
+    && npm install --only=prod --no-optional --no-audit \
+    && npm update
+
+FROM apify/actor-node-playwright-chrome:20-beta
+
+RUN rm -r node_modules
+COPY --from=builder /node_modules ./node_modules
+COPY --from=builder /packages ./packages
+COPY --from=builder /package*.json ./
+COPY /.actor ./.actor
+COPY /main.js ./
+
+RUN echo "Installed NPM packages:" \
+    && (npm list --only=prod --no-optional --all || true) \
+    && echo "Node.js version:" \
+    && node --version \
+    && echo "NPM version:" \
+    && npm --version
+
+ENV PLAYWRIGHT_EXECUTABLE_PATH=/usr/bin/google-chrome
