Commit 2f7fc7c

fix: handle redirect cookies (#1521)
1 parent 6bfe1ce commit 2f7fc7c

2 files changed: +98 −4 lines changed

packages/http-crawler/src/internals/http-crawler.ts

Lines changed: 18 additions & 4 deletions
@@ -23,7 +23,7 @@ import type { Awaitable, Dictionary } from '@crawlee/types';
 import type { RequestLike, ResponseLike } from 'content-type';
 import contentTypeParser from 'content-type';
 import mime from 'mime-types';
-import type { OptionsInit, Method, Request as GotRequest, Response as GotResponse, GotOptionsInit } from 'got-scraping';
+import type { OptionsInit, Method, Request as GotRequest, Response as GotResponse, GotOptionsInit, Options } from 'got-scraping';
 import { gotScraping, TimeoutError } from 'got-scraping';
 import type { JsonValue } from 'type-fest';
 import { extname } from 'node:path';
@@ -539,7 +539,10 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
         gotOptions.headers ??= {};
         Reflect.deleteProperty(gotOptions.headers, 'Cookie');
         Reflect.deleteProperty(gotOptions.headers, 'cookie');
-        gotOptions.headers.Cookie = mergedCookie;
+
+        if (mergedCookie !== '') {
+            gotOptions.headers.Cookie = mergedCookie;
+        }
     }
 
     /**
@@ -551,7 +554,7 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
         const opts = this._getRequestOptions(request, session, proxyUrl, gotOptions);
 
         try {
-            return await this._requestAsBrowser(opts);
+            return await this._requestAsBrowser(opts, session);
         } catch (e) {
             if (e instanceof TimeoutError) {
                 this._handleRequestTimeout(session);
@@ -729,10 +732,21 @@ export class HttpCrawler<Context extends InternalHttpCrawlingContext<any, any, H
     /**
      * @internal wraps public utility for mocking purposes
      */
-    private _requestAsBrowser = (options: OptionsInit & { isStream: true }) => {
+    private _requestAsBrowser = (options: OptionsInit & { isStream: true }, session?: Session) => {
         return new Promise<IncomingMessage>((resolve, reject) => {
             const stream = gotScraping(options);
 
+            stream.on('redirect', (updatedOptions: Options, redirectResponse: IncomingMessage) => {
+                if (this.persistCookiesPerSession) {
+                    session!.setCookiesFromResponse(redirectResponse);
+
+                    const cookieString = session!.getCookieString(updatedOptions.url!.toString());
+                    if (cookieString !== '') {
+                        updatedOptions.headers.Cookie = cookieString;
+                    }
+                }
+            });
+
             stream.on('error', reject);
             stream.on('response', () => {
                 resolve(addResponsePropertiesToStream(stream));
test/core/crawlers/http_crawler.test.ts

Lines changed: 80 additions & 0 deletions
@@ -22,6 +22,25 @@ router.set('/invalidContentType', (req, res) => {
     res.end(`<html><head><title>Example Domain</title></head></html>`);
 });
 
+router.set('/redirectAndCookies', (req, res) => {
+    res.setHeader('content-type', 'text/html');
+    res.setHeader('set-cookie', 'foo=bar');
+    res.setHeader('location', '/cookies');
+    res.statusCode = 302;
+    res.end();
+});
+
+router.set('/cookies', (req, res) => {
+    res.setHeader('content-type', 'text/html');
+    res.end(JSON.stringify(req.headers.cookie));
+});
+
+router.set('/redirectWithoutCookies', (req, res) => {
+    res.setHeader('location', '/cookies');
+    res.statusCode = 302;
+    res.end();
+});
+
 let server: http.Server;
 let url: string;
 
@@ -151,3 +170,64 @@ test('invalid content type defaults to octet-stream', async () => {
         },
     ]);
 });
+
+test('handles cookies from redirects', async () => {
+    const results: string[] = [];
+
+    const crawler = new HttpCrawler({
+        sessionPoolOptions: {
+            maxPoolSize: 1,
+        },
+        handlePageFunction: async ({ body }) => {
+            results.push(JSON.parse(body.toString()));
+        },
+    });
+
+    await crawler.run([`${url}/redirectAndCookies`]);
+
+    expect(results).toStrictEqual([
+        'foo=bar',
+    ]);
+});
+
+test('handles cookies from redirects - no empty cookie header', async () => {
+    const results: string[] = [];
+
+    const crawler = new HttpCrawler({
+        sessionPoolOptions: {
+            maxPoolSize: 1,
+        },
+        handlePageFunction: async ({ body }) => {
+            const str = body.toString();
+
+            if (str !== '') {
+                results.push(JSON.parse(str));
+            }
+        },
+    });
+
+    await crawler.run([`${url}/redirectWithoutCookies`]);
+
+    expect(results).toStrictEqual([]);
+});
+
+test('no empty cookie header', async () => {
+    const results: string[] = [];
+
+    const crawler = new HttpCrawler({
+        sessionPoolOptions: {
+            maxPoolSize: 1,
+        },
+        handlePageFunction: async ({ body }) => {
+            const str = body.toString();
+
+            if (str !== '') {
+                results.push(JSON.parse(str));
+            }
+        },
+    });
+
+    await crawler.run([`${url}/cookies`]);
+
+    expect(results).toStrictEqual([]);
+});
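
The same behaviour can be exercised outside the test suite. The sketch below is a self-contained reproduction of the /redirectAndCookies → /cookies scenario covered by the tests above; the /login and /profile routes and the port handling are hypothetical stand-ins, while the HttpCrawler options mirror the tests.

// Sketch only: minimal end-to-end reproduction of the redirect-cookie scenario.
import http from 'node:http';
import type { AddressInfo } from 'node:net';
import { HttpCrawler } from '@crawlee/http';

const server = http.createServer((req, res) => {
    if (req.url === '/login') {
        // The redirect response sets a cookie that should reach the redirect target.
        res.writeHead(302, { 'set-cookie': 'foo=bar', location: '/profile' });
        res.end();
        return;
    }
    // Echo whatever Cookie header arrived after the redirect was followed.
    res.setHeader('content-type', 'text/html');
    res.end(req.headers.cookie ?? '');
});

server.listen(0, async () => {
    const { port } = server.address() as AddressInfo;

    const crawler = new HttpCrawler({
        sessionPoolOptions: { maxPoolSize: 1 },
        handlePageFunction: async ({ body }) => {
            console.log(body.toString()); // prints 'foo=bar' with the fix applied
        },
    });

    await crawler.run([`http://127.0.0.1:${port}/login`]);
    server.close();
});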
