Skip to content
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions docs/guides/custom-http-client/custom-http-client.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,8 @@ The <ApiLink to="basic-crawler/class/BasicCrawler">`BasicCrawler`</ApiLink> clas

Crawlee provides several HTTP client implementations out of the box:

- **`GotScrapingHttpClient`** (default) - Uses the `got-scraping` library for browser-like requests with support for custom headers, browser fingerprints, and proxies.
- **`ImpitHttpClient`** - Uses the `impit` library for making requests that closely mimic browser behavior.
- **`ImpitHttpClient`** (default) - Uses the `impit` library for making requests that closely mimic browser behavior.
- **`GotScrapingHttpClient`** - Uses the `got-scraping` library for browser-like requests with support for custom headers, browser fingerprints, and proxies. This was the default HTTP client in Crawlee v3.
- **`FetchHttpClient`** - Simple implementation using the native `fetch` API (does not support proxies).

## Implementing a custom HTTP client
Expand Down
2 changes: 1 addition & 1 deletion docs/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
"@crawlee/stagehand": "workspace:*",
"apify": "*",
"crawlee": "workspace:*",
"impit": "^0.7.1",
"impit": "^0.14.2",
"pino": "^9.6.0",
"playwright-extra": "^4.3.6",
"puppeteer-extra": "^3.3.6",
Expand Down
5 changes: 4 additions & 1 deletion packages/basic-crawler/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@
"@apify/timeout": "^0.3.2",
"@apify/utilities": "^2.15.5",
"@crawlee/core": "workspace:*",
"@crawlee/got-scraping-client": "workspace:*",
"@crawlee/http-client": "workspace:^",
"@crawlee/types": "workspace:*",
"@crawlee/utils": "workspace:*",
"csv-stringify": "^6.5.2",
Expand All @@ -51,5 +51,8 @@
"tldts": "^7.0.6",
"tslib": "^2.8.1",
"type-fest": "^4.41.0"
},
"optionalDependencies": {
"@crawlee/impit-client": "workspace:^"
}
}
29 changes: 26 additions & 3 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ import {
EnqueueStrategy,
EventType,
KeyValueStore,
log,
LogLevel,
mergeCookies,
MissingSessionError,
Expand All @@ -61,7 +62,7 @@ import {
Statistics,
validators,
} from '@crawlee/core';
import { GotScrapingHttpClient } from '@crawlee/got-scraping-client';
import { FetchHttpClient } from '@crawlee/http-client';
import type {
Awaitable,
BaseHttpClient,
Expand All @@ -86,6 +87,28 @@ import { cryptoRandomObjectId } from '@apify/utilities';

import { createSendRequest } from './send-request.js';

class LazyDefaultHttpClient implements BaseHttpClient {

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This charade is necessary because of AdaptivePlaywrightCrawler, as it doesn't call the async init() on the HTTP-based crawler.

This way, we defer the async import of impit until the actual usage (sendRequest() callsite).

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oomph. Wouldn't calling init in AdaptivePlaywrightCrawler be less annoying? That being said, the rest of the PR is clean, and this isn't too bad, either. So treat this comment as non-blocking.

Copy link
Copy Markdown
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed, but I won't deal w/ AdaptivePlaywrightCrawler internals if I can choose not to 😄

private _delegate?: BaseHttpClient;
private readonly _delegatePromise: Promise<BaseHttpClient>;

/**
 * Starts resolving the delegate HTTP client immediately, without blocking construction.
 *
 * The optional `@crawlee/impit-client` package is loaded through a dynamic import. If the
 * import rejects, or constructing `ImpitHttpClient` throws, a warning is logged and a
 * `FetchHttpClient` is used instead, so the stored promise still resolves to a usable client.
 *
 * @param options Forwarded unchanged to whichever client gets created. Its optional `logger`
 *   receives the fallback warning; the shared `log` is used when no logger is supplied.
 */
constructor(options?: { logger?: CrawleeLogger }) {
    const resolveDelegate = async (): Promise<BaseHttpClient> => {
        try {
            const { ImpitHttpClient } = await import('@crawlee/impit-client');
            return new ImpitHttpClient(options);
        } catch (error: unknown) {
            // This handler is reached for any failure on the path above, not only for a missing
            // package, so report the actual reason instead of asserting "not installed".
            const reason = error instanceof Error ? error.message : String(error);
            (options?.logger ?? log).warning(
                'Optional dependency @crawlee/impit-client could not be loaded. ' +
                    'Falling back to native fetch — proxy support and browser fingerprinting are unavailable. ' +
                    `Reason: ${reason}`,
            );
            return new FetchHttpClient(options);
        }
    };

    this._delegatePromise = resolveDelegate();
}

/**
 * Forwards the request to the lazily resolved delegate client.
 *
 * When `_delegate` is still unset, the call awaits `_delegatePromise` and caches the resolved
 * client in `_delegate`; later calls reuse the cached instance. All arguments are passed
 * through unchanged and the delegate's result is returned as-is.
 */
async sendRequest(...args: Parameters<BaseHttpClient['sendRequest']>): Promise<Response> {
this._delegate ??= await this._delegatePromise;
Comment thread
barjin marked this conversation as resolved.
Outdated
return this._delegate.sendRequest(...args);
}
}

export interface BasicCrawlingContext<UserData extends Dictionary = Dictionary> extends CrawlingContext<UserData> {}

/**
Expand Down Expand Up @@ -377,7 +400,7 @@ export interface BasicCrawlerOptions<

/**
* HTTP client implementation for the `sendRequest` context helper and for plain HTTP crawling.
* Defaults to a new instance of {@apilink GotScrapingHttpClient}
* Defaults to {@apilink ImpitHttpClient} when `@crawlee/impit-client` is installed, otherwise {@apilink FetchHttpClient}.
*/
httpClient?: BaseHttpClient;

Expand Down Expand Up @@ -807,7 +830,7 @@ export class BasicCrawler<
this.requestManager = new RequestManagerTandem(requestList, () => this.openOwnedRequestQueue());
}

this.httpClient = httpClient ?? new GotScrapingHttpClient({ logger: this.log });
this.httpClient = httpClient ?? new LazyDefaultHttpClient({ logger: this.log });
this.proxyConfiguration = proxyConfiguration;
this.statusMessageLoggingInterval = statusMessageLoggingInterval;
this.statusMessageCallback = statusMessageCallback as StatusMessageCallback;
Expand Down
1 change: 1 addition & 0 deletions packages/crawlee/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,7 @@
},
"dependencies": {
"@crawlee/basic": "workspace:*",
"@crawlee/impit-client": "workspace:*",
"@crawlee/browser": "workspace:*",
"@crawlee/browser-pool": "workspace:*",
"@crawlee/cheerio": "workspace:*",
Expand Down
2 changes: 1 addition & 1 deletion packages/impit-client/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@
"@apify/datastructures": "^2.0.3",
"@crawlee/http-client": "workspace:*",
"@crawlee/types": "workspace:*",
"impit": "^0.14.1",
"impit": "^0.14.2",
"tough-cookie": "^6.0.0"
}
}
7 changes: 5 additions & 2 deletions packages/impit-client/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,15 @@ export class ImpitHttpClient extends BaseHttpClient {

private resolveImpitBrowser(fingerprint?: SessionFingerprint): ImpitBrowser | undefined {
if (!fingerprint?.browser) return undefined;
const versions = IMPIT_VERSIONS_BY_BROWSER[fingerprint.browser];
if (!versions?.length) return undefined;

const cached = this.impitBrowserByFingerprint.get(fingerprint);
if (cached) return cached;

// impit can only impersonate Chrome and Firefox. Map other (Chromium-based or
// unsupported) families like `edge`/`safari` onto Chrome so the request still
// carries realistic browser headers instead of impit's bare `*/*` defaults.
const versions = IMPIT_VERSIONS_BY_BROWSER[fingerprint.browser] ?? IMPIT_VERSIONS_BY_BROWSER.chrome!;

const picked = versions[Math.floor(Math.random() * versions.length)];
this.impitBrowserByFingerprint.set(fingerprint, picked);
return picked;
Expand Down
Loading
Loading