@@ -21,6 +21,7 @@ import type {
     RouterRoutes,
     Session,
     SessionPoolOptions,
+    SkippedRequestCallback,
     Source,
     StatisticsOptions,
     StatisticState,
@@ -349,6 +350,12 @@ export interface BasicCrawlerOptions<Context extends CrawlingContext = BasicCraw
      */
     respectRobotsTxtFile?: boolean;
 
+    /**
+     * When a request is skipped for some reason, you can use this callback to act on it.
+     * This is currently fired only for requests skipped based on the robots.txt file.
+     */
+    onSkippedRequest?: SkippedRequestCallback;
+
     /** @internal */
     log?: Log;
 
@@ -517,6 +524,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
     protected httpClient: BaseHttpClient;
     protected retryOnBlocked: boolean;
     protected respectRobotsTxtFile: boolean;
+    protected onSkippedRequest?: SkippedRequestCallback;
     private _closeEvents?: boolean;
 
     private experiments: CrawlerExperiments;
@@ -552,6 +560,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
 
         retryOnBlocked: ow.optional.boolean,
         respectRobotsTxtFile: ow.optional.boolean,
+        onSkippedRequest: ow.optional.function,
         httpClient: ow.optional.object,
 
         // AutoscaledPool shorthands
@@ -595,6 +604,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
 
             retryOnBlocked = false,
             respectRobotsTxtFile = false,
+            onSkippedRequest,
 
             // internal
             log = defaultLog.child({ prefix: this.constructor.name }),
@@ -668,6 +678,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
 
         this.retryOnBlocked = retryOnBlocked;
         this.respectRobotsTxtFile = respectRobotsTxtFile;
+        this.onSkippedRequest = onSkippedRequest;
 
         this._handlePropertyNameChange({
             newName: 'requestHandlerTimeoutSecs',
@@ -1059,13 +1070,22 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
                 allowedRequests.push(request);
             } else {
                 skipped.add(url);
+                await this.onSkippedRequest?.({ url, reason: 'robotsTxt' });
             }
         }
 
         if (skipped.size > 0) {
             this.log.warning(`Some requests were skipped because they were disallowed based on the robots.txt file`, {
                 skipped: [...skipped],
             });
+
+            if (this.onSkippedRequest) {
+                await Promise.all(
+                    [...skipped].map((url) => {
+                        return this.onSkippedRequest!({ url, reason: 'robotsTxt' });
+                    }),
+                );
+            }
         }
 
         return requestQueue.addRequestsBatched(allowedRequests, options);
@@ -1355,12 +1375,16 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
         }
 
         if (!(await this.isAllowedBasedOnRobotsTxtFile(request.url))) {
-            this.log.debug(
+            this.log.warning(
                 `Skipping request ${request.url} (${request.id}) because it is disallowed based on robots.txt`,
             );
             request.state = RequestState.SKIPPED;
             request.noRetry = true;
             await source.markRequestHandled(request);
+            await this.onSkippedRequest?.({
+                url: request.url,
+                reason: 'robotsTxt',
+            });
             return;
         }
 
@@ -1385,6 +1409,7 @@ export class BasicCrawler<Context extends CrawlingContext = BasicCrawlingContext
                 // specify the RQ first to allow overriding it
                 requestQueue: await this.getRequestQueue(),
                 robotsTxtFile: await this.getRobotsTxtFileForUrl(request!.url),
+                onSkippedRequest: this.onSkippedRequest,
                 ...options,
             });
         },
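
For context, a minimal usage sketch of the new option follows. It assumes the callback receives the { url, reason } object used throughout this diff and that onSkippedRequest is accepted alongside the other public crawler options; exact typings and exports may differ.

import { BasicCrawler } from 'crawlee';

// Collect the URLs the crawler refuses to enqueue or process
// because the robots.txt file disallows them.
const skippedUrls: string[] = [];

const crawler = new BasicCrawler({
    respectRobotsTxtFile: true,
    // Fired for every request skipped based on the robots.txt file.
    onSkippedRequest: async ({ url, reason }) => {
        skippedUrls.push(url);
        console.warn(`Skipped ${url} (reason: ${reason})`);
    },
    requestHandler: async ({ request, log }) => {
        log.info(`Processing ${request.url}`);
    },
});

await crawler.run(['https://example.com/allowed', 'https://example.com/private']);
console.log(`${skippedUrls.length} request(s) skipped by robots.txt`, skippedUrls);

Collecting the skipped URLs this way makes it easy to report them or re-queue them later, instead of only seeing them in the warning log.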