Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
35 changes: 26 additions & 9 deletions packages/basic-crawler/src/internals/basic-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,14 @@ import type {
RequestTransform,
RouterHandler,
RouterRoutes,
RouteSchemas,
RoutesFromSchemas,
SkippedRequestCallback,
Source,
StatisticsOptions,
StatisticState,
StorageIdentifier,
TypedRequestsLike,
} from '@crawlee/core';
import {
AutoscaledPool,
Expand Down Expand Up @@ -110,7 +113,7 @@ export type ErrorHandler<

export interface StatusMessageCallbackParams<
Context extends CrawlingContext = BasicCrawlingContext,
Crawler extends BasicCrawler<any> = BasicCrawler<Context>,
Crawler extends BasicCrawler<any, any, any, any> = BasicCrawler<Context>,
> {
state: StatisticState;
crawler: Crawler;
Expand All @@ -120,7 +123,7 @@ export interface StatusMessageCallbackParams<

export type StatusMessageCallback<
Context extends CrawlingContext = BasicCrawlingContext,
Crawler extends BasicCrawler<any> = BasicCrawler<Context>,
Crawler extends BasicCrawler<any, any, any, any> = BasicCrawler<Context>,
> = (params: StatusMessageCallbackParams<Context, Crawler>) => Awaitable<void>;

export type RequireContextPipeline<
Expand All @@ -134,6 +137,7 @@ export interface BasicCrawlerOptions<
Context extends CrawlingContext = CrawlingContext,
ContextExtension = Dictionary<never>,
ExtendedContext extends Context = Context & ContextExtension,
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
> {
/**
* User-provided function that performs the logic of the crawler. It is called for each URL to crawl.
Expand All @@ -152,7 +156,7 @@ export interface BasicCrawlerOptions<
* The exceptions are logged to the request using the
* {@apilink Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
*/
requestHandler?: RequestHandler<ExtendedContext>;
requestHandler?: RouterHandler<ExtendedContext, Routes> | RequestHandler<ExtendedContext>;

/**
* Allows the user to extend the crawling context passed to the request handler with custom functionality.
Expand Down Expand Up @@ -512,6 +516,7 @@ export class BasicCrawler<
Context extends CrawlingContext = CrawlingContext,
ContextExtension = Dictionary<never>,
ExtendedContext extends Context = Context & ContextExtension,
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
> {
protected static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE';

Expand Down Expand Up @@ -580,7 +585,10 @@ export class BasicCrawler<
* Default {@apilink Router} instance that will be used if we don't specify any {@apilink BasicCrawlerOptions.requestHandler|`requestHandler`}.
* See {@apilink Router.addHandler|`router.addHandler()`} and {@apilink Router.addDefaultHandler|`router.addDefaultHandler()`}.
*/
readonly router: RouterHandler<Context> = Router.create<Context>();
readonly router: RouterHandler<Context, Routes> = Router.create<Context>() as unknown as RouterHandler<
Context,
Routes
>;

private _basicContextPipeline?: ContextPipeline<{ request: Request }, CrawlingContext>;

Expand Down Expand Up @@ -705,7 +713,7 @@ export class BasicCrawler<
* All `BasicCrawler` parameters are passed via an options object.
*/
constructor(
options: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext> &
options: BasicCrawlerOptions<Context, ContextExtension, ExtendedContext, Routes> &
RequireContextPipeline<CrawlingContext, Context> = {} as any, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX
) {
ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape));
Expand Down Expand Up @@ -1263,7 +1271,7 @@ export class BasicCrawler<
* @param [requests] The requests to add.
* @param [options] Options for the request queue.
*/
async run(requests?: RequestsLike, options?: CrawlerRunOptions): Promise<FinalStatistics> {
async run(requests?: TypedRequestsLike<Routes>, options?: CrawlerRunOptions): Promise<FinalStatistics> {
if (this.running) {
throw new Error(
'This crawler instance is already running, you can add more requests to it via `crawler.addRequests()`.',
Expand Down Expand Up @@ -1534,7 +1542,7 @@ export class BasicCrawler<
* @param options Options for the request queue
*/
async addRequests(
requests: ReadonlyDeep<RequestsLike>,
requests: ReadonlyDeep<TypedRequestsLike<Routes>>,
options: CrawlerAddRequestsOptions = {},
): Promise<CrawlerAddRequestsResult> {
await this.getRequestManager();
Expand Down Expand Up @@ -2345,9 +2353,18 @@ export interface CrawlerRunOptions extends CrawlerAddRequestsOptions {
* await crawler.run();
* ```
*/
export function createBasicRouter<
Context extends BasicCrawlingContext = BasicCrawlingContext,
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
>(routes?: RouterRoutes<Context, Routes>): RouterHandler<Context, Routes>;
export function createBasicRouter<
Context extends BasicCrawlingContext = BasicCrawlingContext,
UserData extends Dictionary = GetUserDataFromRequest<Context['request']>,
>(routes?: RouterRoutes<Context, UserData>) {
return Router.create<Context>(routes);
>(routes?: RouterRoutes<Context, Record<string, UserData>>): RouterHandler<Context, Record<string, UserData>>;
export function createBasicRouter<
Context extends BasicCrawlingContext = BasicCrawlingContext,
const Schemas extends RouteSchemas = RouteSchemas,
>(schemas: Schemas): RouterHandler<Context, RoutesFromSchemas<Schemas>>;
export function createBasicRouter(routesOrSchemas?: any): any {
return Router.create(routesOrSchemas);
}
8 changes: 6 additions & 2 deletions packages/browser-crawler/src/internals/browser-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,12 @@ import type {
Dictionary,
EnqueueLinksOptions,
ErrorHandler,
GetUserDataFromRequest,
IRequestManager,
LoadedRequest,
Request,
RequestHandler,
RouterHandler,
SkippedRequestCallback,
} from '@crawlee/basic';
import {
Expand Down Expand Up @@ -106,6 +108,7 @@ export interface BrowserCrawlerOptions<
ContextExtension = Dictionary<never>,
ExtendedContext extends Context = Context & ContextExtension,
InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions,
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
__BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray<InternalBrowserPoolOptions['browserPlugins']>,
__BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>,
__LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>,
Expand Down Expand Up @@ -149,7 +152,7 @@ export interface BrowserCrawlerOptions<
* The exceptions are logged to the request using the
* {@apilink Request.pushErrorMessage|`Request.pushErrorMessage()`} function.
*/
requestHandler?: RequestHandler<ExtendedContext>;
requestHandler?: RouterHandler<ExtendedContext, Routes> | RequestHandler<ExtendedContext>;

/**
* User-provided function that allows modifying the request object before it gets retried by the crawler.
Expand Down Expand Up @@ -313,8 +316,9 @@ export abstract class BrowserCrawler<
>,
ContextExtension = Dictionary<never>,
ExtendedContext extends Context = Context & ContextExtension,
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
GoToOptions extends Dictionary = Dictionary,
> extends BasicCrawler<Context, ContextExtension, ExtendedContext> {
> extends BasicCrawler<Context, ContextExtension, ExtendedContext, Routes> {
/**
* A reference to the underlying browser pool that manages the crawler's browsers. Typed as
* {@apilink IBrowserPool} so custom implementations can be plugged in via the `browserPool` constructor option.
Expand Down
27 changes: 22 additions & 5 deletions packages/cheerio-crawler/src/internals/cheerio-crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,10 @@ import type {
InternalHttpHook,
IRequestManager,
RequestHandler,
RouterHandler,
RouterRoutes,
RouteSchemas,
RoutesFromSchemas,
SkippedRequestCallback,
} from '@crawlee/http';
import {
Expand All @@ -34,7 +37,8 @@ export interface CheerioCrawlerOptions<
ExtendedContext extends CheerioCrawlingContext = CheerioCrawlingContext & ContextExtension,
UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
> extends HttpCrawlerOptions<CheerioCrawlingContext<UserData, JSONData>, ContextExtension, ExtendedContext> {}
Routes extends Record<keyof Routes, Dictionary> = Record<string, UserData>,
> extends HttpCrawlerOptions<CheerioCrawlingContext<UserData, JSONData>, ContextExtension, ExtendedContext, Routes> {}

export type CheerioHook<
UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler
Expand Down Expand Up @@ -179,11 +183,15 @@ export type CheerioRequestHandler<
export class CheerioCrawler<
ContextExtension = Dictionary<never>,
ExtendedContext extends CheerioCrawlingContext = CheerioCrawlingContext & ContextExtension,
> extends HttpCrawler<CheerioCrawlingContext, ContextExtension, ExtendedContext> {
Routes extends Record<keyof Routes, Dictionary> = Record<
string,
GetUserDataFromRequest<CheerioCrawlingContext['request']>
>,
> extends HttpCrawler<CheerioCrawlingContext, ContextExtension, ExtendedContext, Routes> {
/**
* All `CheerioCrawler` parameters are passed via an options object.
*/
constructor(options?: CheerioCrawlerOptions<ContextExtension, ExtendedContext>) {
constructor(options?: CheerioCrawlerOptions<ContextExtension, ExtendedContext, any, any, Routes>) {
const { contextPipelineBuilder, ...rest } = options ?? {};

super({
Expand Down Expand Up @@ -361,9 +369,18 @@ export async function cheerioCrawlerEnqueueLinks(
* await crawler.run();
* ```
*/
export function createCheerioRouter<
Context extends CheerioCrawlingContext = CheerioCrawlingContext,
Routes extends Record<keyof Routes, Dictionary> = Record<string, GetUserDataFromRequest<Context['request']>>,
>(routes?: RouterRoutes<Context, Routes>): RouterHandler<Context, Routes>;
export function createCheerioRouter<
Context extends CheerioCrawlingContext = CheerioCrawlingContext,
UserData extends Dictionary = GetUserDataFromRequest<Context['request']>,
>(routes?: RouterRoutes<Context, UserData>) {
return Router.create<Context>(routes);
>(routes?: RouterRoutes<Context, Record<string, UserData>>): RouterHandler<Context, Record<string, UserData>>;
export function createCheerioRouter<
Context extends CheerioCrawlingContext = CheerioCrawlingContext,
const Schemas extends RouteSchemas = RouteSchemas,
>(schemas: Schemas): RouterHandler<Context, RoutesFromSchemas<Schemas>>;
export function createCheerioRouter(routesOrSchemas?: any): any {
return Router.create(routesOrSchemas);
}
1 change: 1 addition & 0 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@
"@crawlee/types": "workspace:*",
"@crawlee/utils": "workspace:*",
"@sapphire/async-queue": "^1.5.5",
"@standard-schema/spec": "^1.0.0",
"@vladfrangu/async_event_emitter": "^2.4.6",
"csv-stringify": "^6.5.2",
"json5": "^2.2.3",
Expand Down
69 changes: 68 additions & 1 deletion packages/core/src/crawlers/crawler_commons.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ import type { ReadonlyDeep, SetRequired } from 'type-fest';
import type { Configuration } from '../configuration.js';
import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links.js';
import type { CrawleeLogger } from '../log.js';
import type { Request, Source } from '../request.js';
import type { Request, RequestOptions, Source } from '../request.js';
import type { Dataset } from '../storages/dataset.js';
import { KeyValueStore, type RecordOptions } from '../storages/key_value_store.js';
import type { RequestQueueOperationOptions } from '../storages/request_queue.js';
Expand All @@ -13,6 +13,73 @@ import type { StorageIdentifier } from '../storages/storage_instance_manager.js'
/** @internal */
export type IsAny<T> = 0 extends 1 & T ? true : false;

/**
 * A request input (URL string, request-options object, or {@apilink Request}) whose `userData` is typed
 * according to its `label`, based on a router's route map.
 *
 * When the route map is open (the default `Record<string, ...>`), this resolves to `string | Source`, i.e. the
 * regular loose input. When the map declares concrete labels, a request-options object that provides a `label`
 * must use one of the declared labels and gets its options typed via `RequestOptions<Routes[Label]>`;
 * options objects without a label (`label?: undefined`) fall back to the default `RequestOptions` typing.
 *
 * @typeParam Routes Route map from label to the `userData` shape expected for that label.
 */
export type LabeledSource<Routes extends Record<keyof Routes, Dictionary>> = string extends keyof Routes
// Open route map: `keyof Routes` covers every string, so no per-label typing is possible.
? string | Source
:
// Closed route map: plain URL strings and `Request` (used here without type arguments) are accepted as-is.
| string
| Request
// Options objects may carry `requestsFromUrl` / `regex` and must match one of the variants below.
| ({ requestsFromUrl?: string; regex?: RegExp } & (
// One variant per declared string label; indexing the mapped type turns it into a union of those variants.
| {
[Label in keyof Routes & string]: Omit<Partial<RequestOptions<Routes[Label]>>, 'label'> & {
label: Label;
};
}[keyof Routes & string]
// Unlabeled variant: `label` must be absent or `undefined`.
| (Omit<Partial<RequestOptions>, 'label'> & { label?: undefined })
));

/**
 * A collection of {@apilink LabeledSource} inputs: a plain array, a synchronous iterable, or an
 * asynchronous iterable. Intended for the label-aware `addRequests`/`run` methods of a crawler that is
 * bound to a typed router.
 *
 * @typeParam Routes Route map from label to the `userData` shape expected for that label.
 */
export type TypedRequestsLike<Routes extends Record<keyof Routes, Dictionary>> =
    | Array<LabeledSource<Routes>>
    | Iterable<LabeledSource<Routes>>
    | AsyncIterable<LabeledSource<Routes>>;

/**
 * Signature of the label-aware `addRequests` method exposed on a request handler's context when the
 * crawler is bound to a typed router. It mirrors {@apilink RestrictedCrawlingContext.addRequests}, except
 * that the sources are {@apilink LabeledSource} values typed by the route map.
 *
 * @typeParam Routes Route map from label to the `userData` shape expected for that label.
 */
export type TypedContextAddRequests<Routes extends Record<keyof Routes, Dictionary>> = (
    requestsLike: ReadonlyDeep<Array<LabeledSource<Routes>>>,
    options?: ReadonlyDeep<RequestQueueOperationOptions>,
) => Promise<void>;

/**
 * An `enqueueLinks`-options object with its `label`/`userData` retyped according to a router's route map: a
 * declared `label` requires the matching `userData` shape (unknown labels are rejected), while unlabeled
 * calls keep loose `userData`. Returns the options unchanged when the route map is open (the default).
 *
 * @typeParam Options The original options type of the `enqueueLinks` method being retyped.
 * @typeParam Routes Route map from label to the `userData` shape expected for that label.
 */
type TypedEnqueueLinksOptions<Options, Routes extends Record<keyof Routes, Dictionary>> = string extends keyof Routes
// Open route map: `keyof Routes` covers every string, so the options are passed through untouched.
? Options
// Closed route map: drop the loose `label`/`userData` and re-add them as a label-discriminated union.
: Omit<Options, 'label' | 'userData'> &
(
// One `{ label, userData? }` variant per declared string label.
| { [Label in keyof Routes & string]: { label: Label; userData?: Routes[Label] } }[keyof Routes & string]
// Unlabeled variant: `label` absent or `undefined`, `userData` is any dictionary.
| { label?: undefined; userData?: Dictionary }
);

/**
 * Transforms a context's existing `enqueueLinks` method so that the `label`/`userData` in its options follow
 * the router's route map, while preserving everything else about the signature (argument optionality and
 * return type, which differ between crawler types).
 *
 * @typeParam EnqueueLinks The type of the context's current `enqueueLinks` member.
 * @typeParam Routes Route map from label to the `userData` shape expected for that label.
 */
export type TypedContextEnqueueLinks<
EnqueueLinks,
Routes extends Record<keyof Routes, Dictionary>,
// First pattern: a single optional `options` parameter; the result keeps the parameter optional.
// NOTE(review): the branch order looks deliberate (optional form tried before the required form) - confirm before reordering.
> = EnqueueLinks extends (options?: infer Options) => infer Result
? (options?: TypedEnqueueLinksOptions<Options, Routes>) => Result
// Second pattern: a single required `options` parameter; the result keeps the parameter required.
: EnqueueLinks extends (options: infer Options) => infer Result
? (options: TypedEnqueueLinksOptions<Options, Routes>) => Result
// Anything that matches neither pattern is returned unchanged.
: EnqueueLinks;

/** @internal */
export type WithRequired<T, K extends keyof T> = T & { [P in K]-?: T[P] };

Expand Down
26 changes: 26 additions & 0 deletions packages/core/src/errors.ts
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,32 @@ export class CriticalError extends NonRetryableError {}
*/
export class MissingRouteError extends CriticalError {}

/**
 * Thrown when a request's `userData` does not match the {@apilink RouteSchemas|Standard Schema} registered for its label.
 *
 * As the `userData` does not change between attempts, this error is non-retryable.
 *
 * The error message lists every issue on its own line as `- <path>: <message>`, where `<path>` is the
 * dot-joined issue path (omitted, together with the colon, when the issue has no path).
 */
export class RequestValidationError extends NonRetryableError {
    /**
     * @param label Label of the route whose schema rejected the request's `userData`.
     * @param issues Issues reported by the schema. Each issue has a `message` and an optional `path` whose
     *   segments are either bare property keys or `{ key }` wrapper objects.
     */
    constructor(
        readonly label: string | symbol,
        readonly issues: readonly {
            readonly message: string;
            readonly path?: readonly (PropertyKey | { key: PropertyKey })[];
        }[],
    ) {
        const details = issues
            .map((issue) => {
                const path = (issue.path ?? [])
                    .map((segment) => {
                        // Unwrap `{ key }` segments so that only the property key itself is printed.
                        const key = typeof segment === 'object' ? segment.key : segment;
                        // `Array.prototype.join` throws a TypeError on symbol elements, which would replace
                        // the validation error with an unrelated crash - stringify symbols explicitly.
                        return typeof key === 'symbol' ? String(key) : key;
                    })
                    .join('.');
                return `- ${path ? `${path}: ` : ''}${issue.message}`;
            })
            .join('\n');

        super(`Request userData for label '${String(label)}' failed schema validation:\n${details}`);
    }
}

/**
* Errors of `RetryRequestError` type will always be retried by the crawler.
*
Expand Down
Loading
Loading