diff --git a/packages/basic-crawler/src/internals/basic-crawler.ts b/packages/basic-crawler/src/internals/basic-crawler.ts index eeb4d4adfea4..2f49cb743be8 100644 --- a/packages/basic-crawler/src/internals/basic-crawler.ts +++ b/packages/basic-crawler/src/internals/basic-crawler.ts @@ -21,11 +21,14 @@ import type { RequestTransform, RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, SkippedRequestCallback, Source, StatisticsOptions, StatisticState, StorageIdentifier, + TypedRequestsLike, } from '@crawlee/core'; import { AutoscaledPool, @@ -110,7 +113,7 @@ export type ErrorHandler< export interface StatusMessageCallbackParams< Context extends CrawlingContext = BasicCrawlingContext, - Crawler extends BasicCrawler = BasicCrawler, + Crawler extends BasicCrawler = BasicCrawler, > { state: StatisticState; crawler: Crawler; @@ -120,7 +123,7 @@ export interface StatusMessageCallbackParams< export type StatusMessageCallback< Context extends CrawlingContext = BasicCrawlingContext, - Crawler extends BasicCrawler = BasicCrawler, + Crawler extends BasicCrawler = BasicCrawler, > = (params: StatusMessageCallbackParams) => Awaitable; export type RequireContextPipeline< @@ -134,6 +137,7 @@ export interface BasicCrawlerOptions< Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary, ExtendedContext extends Context = Context & ContextExtension, + Routes extends Record = Record>, > { /** * User-provided function that performs the logic of the crawler. It is called for each URL to crawl. @@ -152,7 +156,7 @@ export interface BasicCrawlerOptions< * The exceptions are logged to the request using the * {@apilink Request.pushErrorMessage|`Request.pushErrorMessage()`} function. */ - requestHandler?: RequestHandler; + requestHandler?: RouterHandler | RequestHandler; /** * Allows the user to extend the crawling context passed to the request handler with custom functionality. 
@@ -512,6 +516,7 @@ export class BasicCrawler< Context extends CrawlingContext = CrawlingContext, ContextExtension = Dictionary, ExtendedContext extends Context = Context & ContextExtension, + Routes extends Record = Record>, > { protected static readonly CRAWLEE_STATE_KEY = 'CRAWLEE_STATE'; @@ -580,7 +585,10 @@ export class BasicCrawler< * Default {@apilink Router} instance that will be used if we don't specify any {@apilink BasicCrawlerOptions.requestHandler|`requestHandler`}. * See {@apilink Router.addHandler|`router.addHandler()`} and {@apilink Router.addDefaultHandler|`router.addDefaultHandler()`}. */ - readonly router: RouterHandler = Router.create(); + readonly router: RouterHandler = Router.create() as unknown as RouterHandler< + Context, + Routes + >; private _basicContextPipeline?: ContextPipeline<{ request: Request }, CrawlingContext>; @@ -705,7 +713,7 @@ export class BasicCrawler< * All `BasicCrawler` parameters are passed via an options object. */ constructor( - options: BasicCrawlerOptions & + options: BasicCrawlerOptions & RequireContextPipeline = {} as any, // cast because the constructor logic handles missing `contextPipelineBuilder` - the type is just for DX ) { ow(options, 'BasicCrawlerOptions', ow.object.exactShape(BasicCrawler.optionsShape)); @@ -1263,7 +1271,7 @@ export class BasicCrawler< * @param [requests] The requests to add. * @param [options] Options for the request queue. 
*/ - async run(requests?: RequestsLike, options?: CrawlerRunOptions): Promise { + async run(requests?: TypedRequestsLike, options?: CrawlerRunOptions): Promise { if (this.running) { throw new Error( 'This crawler instance is already running, you can add more requests to it via `crawler.addRequests()`.', @@ -1534,7 +1542,7 @@ export class BasicCrawler< * @param options Options for the request queue */ async addRequests( - requests: ReadonlyDeep, + requests: ReadonlyDeep>, options: CrawlerAddRequestsOptions = {}, ): Promise { await this.getRequestManager(); @@ -2345,9 +2353,18 @@ export interface CrawlerRunOptions extends CrawlerAddRequestsOptions { * await crawler.run(); * ``` */ +export function createBasicRouter< + Context extends BasicCrawlingContext = BasicCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createBasicRouter< Context extends BasicCrawlingContext = BasicCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createBasicRouter< + Context extends BasicCrawlingContext = BasicCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createBasicRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/browser-crawler/src/internals/browser-crawler.ts b/packages/browser-crawler/src/internals/browser-crawler.ts index e93c52b3fa20..8302a5117b0c 100644 --- a/packages/browser-crawler/src/internals/browser-crawler.ts +++ b/packages/browser-crawler/src/internals/browser-crawler.ts @@ -7,10 +7,12 @@ import type { Dictionary, EnqueueLinksOptions, ErrorHandler, + GetUserDataFromRequest, IRequestManager, LoadedRequest, Request, RequestHandler, + RouterHandler, SkippedRequestCallback, } from '@crawlee/basic'; import { @@ -106,6 +108,7 @@ export interface 
BrowserCrawlerOptions< ContextExtension = Dictionary, ExtendedContext extends Context = Context & ContextExtension, InternalBrowserPoolOptions extends BrowserPoolOptions = BrowserPoolOptions, + Routes extends Record = Record>, __BrowserPlugins extends BrowserPlugin[] = InferBrowserPluginArray, __BrowserControllerReturn extends BrowserController = ReturnType<__BrowserPlugins[number]['createController']>, __LaunchContextReturn extends LaunchContext = ReturnType<__BrowserPlugins[number]['createLaunchContext']>, @@ -149,7 +152,7 @@ export interface BrowserCrawlerOptions< * The exceptions are logged to the request using the * {@apilink Request.pushErrorMessage|`Request.pushErrorMessage()`} function. */ - requestHandler?: RequestHandler; + requestHandler?: RouterHandler | RequestHandler; /** * User-provided function that allows modifying the request object before it gets retried by the crawler. @@ -313,8 +316,9 @@ export abstract class BrowserCrawler< >, ContextExtension = Dictionary, ExtendedContext extends Context = Context & ContextExtension, + Routes extends Record = Record>, GoToOptions extends Dictionary = Dictionary, -> extends BasicCrawler { +> extends BasicCrawler { /** * A reference to the underlying browser pool that manages the crawler's browsers. Typed as * {@apilink IBrowserPool} so custom implementations can be plugged in via the `browserPool` constructor option. 
diff --git a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts index bb3b0da76ebf..46a5c5ec6365 100644 --- a/packages/cheerio-crawler/src/internals/cheerio-crawler.ts +++ b/packages/cheerio-crawler/src/internals/cheerio-crawler.ts @@ -8,7 +8,10 @@ import type { InternalHttpHook, IRequestManager, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, SkippedRequestCallback, } from '@crawlee/http'; import { @@ -34,7 +37,8 @@ export interface CheerioCrawlerOptions< ExtendedContext extends CheerioCrawlingContext = CheerioCrawlingContext & ContextExtension, UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler -> extends HttpCrawlerOptions, ContextExtension, ExtendedContext> {} + Routes extends Record = Record, +> extends HttpCrawlerOptions, ContextExtension, ExtendedContext, Routes> {} export type CheerioHook< UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler @@ -179,11 +183,15 @@ export type CheerioRequestHandler< export class CheerioCrawler< ContextExtension = Dictionary, ExtendedContext extends CheerioCrawlingContext = CheerioCrawlingContext & ContextExtension, -> extends HttpCrawler { + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, +> extends HttpCrawler { /** * All `CheerioCrawler` parameters are passed via an options object. */ - constructor(options?: CheerioCrawlerOptions) { + constructor(options?: CheerioCrawlerOptions) { const { contextPipelineBuilder, ...rest } = options ?? 
{}; super({ @@ -361,9 +369,18 @@ export async function cheerioCrawlerEnqueueLinks( * await crawler.run(); * ``` */ +export function createCheerioRouter< + Context extends CheerioCrawlingContext = CheerioCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createCheerioRouter< Context extends CheerioCrawlingContext = CheerioCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createCheerioRouter< + Context extends CheerioCrawlingContext = CheerioCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createCheerioRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/core/package.json b/packages/core/package.json index 0bef02294a6a..07bd847a272e 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -57,6 +57,7 @@ "@crawlee/types": "workspace:*", "@crawlee/utils": "workspace:*", "@sapphire/async-queue": "^1.5.5", + "@standard-schema/spec": "^1.0.0", "@vladfrangu/async_event_emitter": "^2.4.6", "csv-stringify": "^6.5.2", "json5": "^2.2.3", diff --git a/packages/core/src/crawlers/crawler_commons.ts b/packages/core/src/crawlers/crawler_commons.ts index 7f0c921e42e9..9071bbfc4c72 100644 --- a/packages/core/src/crawlers/crawler_commons.ts +++ b/packages/core/src/crawlers/crawler_commons.ts @@ -4,7 +4,7 @@ import type { ReadonlyDeep, SetRequired } from 'type-fest'; import type { Configuration } from '../configuration.js'; import type { EnqueueLinksOptions } from '../enqueue_links/enqueue_links.js'; import type { CrawleeLogger } from '../log.js'; -import type { Request, Source } from '../request.js'; +import type { Request, RequestOptions, Source } from '../request.js'; import type { Dataset } from '../storages/dataset.js'; import { 
KeyValueStore, type RecordOptions } from '../storages/key_value_store.js'; import type { RequestQueueOperationOptions } from '../storages/request_queue.js'; @@ -13,6 +13,73 @@ import type { StorageIdentifier } from '../storages/storage_instance_manager.js' /** @internal */ export type IsAny = 0 extends 1 & T ? true : false; +/** + * A request input (URL string, request-options object, or {@apilink Request}) whose `userData` is typed + * according to its `label`, based on a router's route map. + * + * When the route map is open (the default `Record`), this is just the regular loose + * {@apilink Source} input. When the map declares concrete labels, providing a `label` requires the matching + * `userData` shape and rejects labels not present in the map; unlabeled requests keep loose `userData`. + */ +export type LabeledSource> = string extends keyof Routes + ? string | Source + : + | string + | Request + | ({ requestsFromUrl?: string; regex?: RegExp } & ( + | { + [Label in keyof Routes & string]: Omit>, 'label'> & { + label: Label; + }; + }[keyof Routes & string] + | (Omit, 'label'> & { label?: undefined }) + )); + +/** + * The iterable/array of {@apilink LabeledSource} inputs accepted by the label-aware `addRequests`/`run` + * methods of a crawler bound to a typed router. + */ +export type TypedRequestsLike> = + | AsyncIterable> + | Iterable> + | LabeledSource[]; + +/** + * The label-aware `addRequests` method signature exposed on a request handler's context when the crawler is + * bound to a typed router. Mirrors {@apilink RestrictedCrawlingContext.addRequests} with typed sources. + */ +export type TypedContextAddRequests> = ( + requestsLike: ReadonlyDeep[]>, + options?: ReadonlyDeep, +) => Promise; + +/** + * An `enqueueLinks`-options object with its `label`/`userData` retyped according to a router's route map: a + * declared `label` requires the matching `userData` shape (unknown labels are rejected), while unlabeled + * calls keep loose `userData`. 
Returns the options unchanged when the route map is open (the default). + */ +type TypedEnqueueLinksOptions> = string extends keyof Routes + ? Options + : Omit & + ( + | { [Label in keyof Routes & string]: { label: Label; userData?: Routes[Label] } }[keyof Routes & string] + | { label?: undefined; userData?: Dictionary } + ); + +/** + * Transforms a context's existing `enqueueLinks` method so that the `label`/`userData` in its options follow + * the router's route map, while preserving everything else about the signature (argument optionality and + * return type, which differ between crawler types). + */ +export type TypedContextEnqueueLinks< + EnqueueLinks, + Routes extends Record, +> = EnqueueLinks extends (options?: infer Options) => infer Result + ? (options?: TypedEnqueueLinksOptions) => Result + : EnqueueLinks extends (options: infer Options) => infer Result + ? (options: TypedEnqueueLinksOptions) => Result + : EnqueueLinks; + /** @internal */ export type WithRequired = T & { [P in K]-?: T[P] }; diff --git a/packages/core/src/errors.ts b/packages/core/src/errors.ts index 3d4301b305db..92a30dbe68c9 100644 --- a/packages/core/src/errors.ts +++ b/packages/core/src/errors.ts @@ -16,6 +16,32 @@ export class CriticalError extends NonRetryableError {} */ export class MissingRouteError extends CriticalError {} +/** + * Thrown when a request's `userData` does not match the {@apilink RouteSchemas|Standard Schema} registered for its label. + * + * As the `userData` does not change between attempts, this error is non-retryable. + */ +export class RequestValidationError extends NonRetryableError { + constructor( + readonly label: string | symbol, + readonly issues: readonly { + readonly message: string; + readonly path?: readonly (PropertyKey | { key: PropertyKey })[]; + }[], + ) { + const details = issues + .map((issue) => { + const path = (issue.path ?? []) + .map((segment) => String(typeof segment === 'object' ? segment.key : segment)) /* String(): Array#join throws a TypeError on symbol path segments */ + .join('.'); + return `- ${path ? 
`${path}: ` : ''}${issue.message}`; + }) + .join('\n'); + + super(`Request userData for label '${String(label)}' failed schema validation:\n${details}`); + } +} + /** * Errors of `RetryRequestError` type will always be retried by the crawler. * diff --git a/packages/core/src/router.ts b/packages/core/src/router.ts index 4d0897389746..012a6ca916b7 100644 --- a/packages/core/src/router.ts +++ b/packages/core/src/router.ts @@ -1,22 +1,74 @@ import type { Dictionary } from '@crawlee/types'; +import type { StandardSchemaV1 } from '@standard-schema/spec'; -import type { CrawlingContext, LoadedRequest, RestrictedCrawlingContext } from './crawlers/crawler_commons.js'; -import { MissingRouteError } from './errors.js'; +import type { + CrawlingContext, + LoadedRequest, + RestrictedCrawlingContext, + TypedContextAddRequests, + TypedContextEnqueueLinks, +} from './crawlers/crawler_commons.js'; +import { MissingRouteError, RequestValidationError } from './errors.js'; import type { Request } from './request.js'; import type { Awaitable } from './typedefs.js'; const defaultRoute = Symbol('default-route'); +/** + * The crawling context received by a route handler, with `request.userData` narrowed to `UserData`, and + * `addRequests`/`enqueueLinks` typed according to the router's route map (`Routes`) so that enqueuing a + * request under a declared label requires the matching `userData` shape. + */ +export type RouterHandlerContext< + Context, + UserData extends Dictionary, + Routes extends Record, +> = Omit & { + request: LoadedRequest>; + addRequests: TypedContextAddRequests; +} & (Context extends { enqueueLinks: infer EnqueueLinks } + ? { enqueueLinks: TypedContextEnqueueLinks } + : {}); + +/** + * The set of labels accepted by {@apilink Router.addHandler}. When the router declares a concrete + * route map (e.g. `{ PRODUCT: ...; CATEGORY: ... }`), only those labels (plus symbols) are + * allowed — unknown labels become a compile-time error. 
When the map is left open (the default + * `Record`), any string or symbol label is accepted, preserving the original behaviour. + */ +export type RouterLabel> = string extends keyof Routes + ? string | symbol + : (keyof Routes & string) | symbol; + +/** + * A map of request labels to a [Standard Schema](https://standardschema.dev) (Zod, Valibot, ArkType, …) + * validating that label's `request.userData`. Pass it to {@apilink Router.create} or a `createXRouter` + * factory to derive the per-label `request.userData` types *and* validate them at runtime before the + * matching handler runs. + */ +export type RouteSchemas = Record; + +/** + * Derives a route map (label → `userData` type) from a {@apilink RouteSchemas} map by inferring each + * schema's output type. Outputs that are not object-shaped fall back to a plain {@apilink Dictionary}. + */ +export type RoutesFromSchemas = { + [Label in keyof Schemas]: StandardSchemaV1.InferOutput extends Dictionary + ? StandardSchemaV1.InferOutput + : Dictionary; +}; + export interface RouterHandler< Context extends Omit = CrawlingContext, -> extends Router { + Routes extends Record = Record>, +> extends Router { (ctx: Context): Awaitable; } export type GetUserDataFromRequest = T extends Request ? Y : never; -export type RouterRoutes = { - [label in string | symbol]: (ctx: Omit & { request: Request }) => Awaitable; +export type RouterRoutes> = { + [Label in keyof Routes]: (ctx: Omit & { request: Request }) => Awaitable; }; /** @@ -83,9 +135,57 @@ export type RouterRoutes = { * ctx.log.info('...'); * }); * ``` + * + * ## Typed labels + * + * To get `request.userData` typed per label, declare a route map and pass it as the second type + * argument. 
The label passed to {@apilink Router.addHandler} then drives the type of `request.userData`, + * and unknown labels are rejected at compile time: + * + * ```ts + * import { createCheerioRouter, CheerioCrawlingContext } from 'crawlee'; + * + * interface Routes { + * PRODUCT: { sku: string; price: number }; + * CATEGORY: { categoryId: string }; + * } + * + * const router = createCheerioRouter(); + * + * router.addHandler('PRODUCT', async ({ request }) => { + * request.userData.sku; // string + * request.userData.price; // number + * }); + * + * router.addHandler('TYPO', async () => {}); // compile error: not a known label + * ``` + * + * ## Schema-validated labels + * + * Passing a [Standard Schema](https://standardschema.dev) per label both infers the `request.userData` + * types *and* validates them at runtime before the handler runs (replacing `request.userData` with the + * parsed value). A failing request throws a {@apilink RequestValidationError}. + * + * ```ts + * import { z } from 'zod'; + * import { createCheerioRouter } from 'crawlee'; + * + * const router = createCheerioRouter({ + * PRODUCT: z.object({ sku: z.string(), price: z.number() }), + * CATEGORY: z.object({ categoryId: z.string() }), + * }); + * + * router.addHandler('PRODUCT', async ({ request }) => { + * request.userData.price; // number, inferred from the schema and validated at runtime + * }); + * ``` */ -export class Router> { +export class Router< + Context extends Omit, + Routes extends Record = Record>, +> { private readonly routes: Map Awaitable> = new Map(); + private readonly schemas: Map = new Map(); private readonly middlewares: ((ctx: Context) => Awaitable)[] = []; /** @@ -95,26 +195,54 @@ export class Router( + label: Label, + handler: (ctx: RouterHandlerContext) => Awaitable, + ): void; + + /** + * Registers new route handler for given label, explicitly typing `request.userData` via the + * `UserData` type argument. 
Useful when the router has no declared route map (the open default) + * and you want to type a single handler, or to register a handler under a `symbol` label. */ addHandler>( - label: string | symbol, - handler: (ctx: Omit & { request: LoadedRequest> }) => Awaitable, - ) { + label: RouterLabel, + handler: (ctx: RouterHandlerContext) => Awaitable, + ): void; + + addHandler(label: string | symbol, handler: (ctx: any) => Awaitable): void { this.validate(label); this.routes.set(label, handler); } /** - * Registers default route handler. + * Registers default route handler. As a fallback it can receive any request (including labels not + * declared in the route map), so `request.userData` defaults to the context's `userData` type + * (loosely typed by default). Pass an explicit `UserData` type argument to narrow it. */ addDefaultHandler>( - handler: (ctx: Omit & { request: LoadedRequest> }) => Awaitable, + handler: (ctx: RouterHandlerContext) => Awaitable, ) { this.validate(defaultRoute); this.routes.set(defaultRoute, handler); } + /** + * Registers {@apilink RouteSchemas|Standard Schema} validators for the given labels. Before a matching + * route handler runs, `request.userData` is validated against the label's schema and replaced with the + * parsed value; a failing request throws a {@apilink RequestValidationError}. + */ + addSchemas(schemas: Partial>) { + for (const [label, schema] of Object.entries(schemas)) { + if (schema) { + this.schemas.set(label, schema as StandardSchemaV1); + } + } + } + /** * Registers a middleware that will be fired before the matching route handler. * Multiple middlewares can be registered, they will be fired in the same order. @@ -142,6 +270,27 @@ export class Router` constraint and falls through to the second overload, + // where it is treated as the legacy flat `userData` shape shared by all handlers. The third overload + // accepts a Standard Schema per label, inferring the route map and validating `userData` at runtime. 
+ static create< + Context extends Omit = CrawlingContext, + Routes extends Record = Record>, + >(routes?: RouterRoutes): RouterHandler; + static create< Context extends Omit = CrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, - >(routes?: RouterRoutes): RouterHandler { - const router = new Router(); + >(routes?: RouterRoutes>): RouterHandler>; + + static create< + Context extends Omit = CrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, + >(schemas: Schemas): RouterHandler>; + + static create = CrawlingContext>( + routesOrSchemas?: Record Awaitable) | StandardSchemaV1>, + ): RouterHandler { + const router = new Router(); const obj = Object.create(Function.prototype); obj.addHandler = router.addHandler.bind(router); obj.addDefaultHandler = router.addDefaultHandler.bind(router); + obj.addSchemas = router.addSchemas.bind(router); obj.getHandler = router.getHandler.bind(router); obj.use = router.use.bind(router); - for (const [label, handler] of Object.entries(routes ?? {})) { - router.addHandler(label, handler); + for (const [label, value] of Object.entries(routesOrSchemas ?? {})) { + if (typeof value === 'function' && !('~standard' in value)) { /* callable Standard Schemas (e.g. ArkType types) are functions too, so only a function without the `~standard` marker is a route handler */ + router.addHandler(label, value as (ctx: any) => Awaitable); + } else { + router.schemas.set(label, value as StandardSchemaV1); + } } const func = async function (context: Context) { const { url, loadedUrl, label } = context.request; context.log.debug('Page opened.', { label, url: loadedUrl ?? 
url }); + await router.validateRequest(context); + for (const middleware of router.middlewares) { await middleware(context); } @@ -204,6 +390,6 @@ export class Router; + return func as unknown as RouterHandler; } } diff --git a/packages/http-crawler/src/internals/file-download.ts b/packages/http-crawler/src/internals/file-download.ts index 81dd217d0618..08f6b4ae9f11 100644 --- a/packages/http-crawler/src/internals/file-download.ts +++ b/packages/http-crawler/src/internals/file-download.ts @@ -6,7 +6,16 @@ import type { CrawlingContext, LoadedRequest, Request } from '@crawlee/core'; import { ResponseWithUrl } from '@crawlee/http-client'; import type { Dictionary } from '@crawlee/types'; -import type { ErrorHandler, GetUserDataFromRequest, InternalHttpHook, RequestHandler, RouterRoutes } from '../index.js'; +import type { + ErrorHandler, + GetUserDataFromRequest, + InternalHttpHook, + RequestHandler, + RouterHandler, + RouterRoutes, + RouteSchemas, + RoutesFromSchemas, +} from '../index.js'; import { Router } from '../index.js'; import { parseContentTypeFromResponse } from './utils.js'; @@ -251,9 +260,18 @@ function trackBodyConsumption(response: Response): { response: ResponseWithUrl; * await crawler.run(); * ``` */ +export function createFileRouter< + Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createFileRouter< Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createFileRouter< + Context extends FileDownloadCrawlingContext = FileDownloadCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createFileRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); 
} diff --git a/packages/http-crawler/src/internals/http-crawler.ts b/packages/http-crawler/src/internals/http-crawler.ts index 288a83fc1855..f9b34b2b2e34 100644 --- a/packages/http-crawler/src/internals/http-crawler.ts +++ b/packages/http-crawler/src/internals/http-crawler.ts @@ -11,7 +11,10 @@ import type { Request as CrawleeRequest, RequestHandler, RequireContextPipeline, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, } from '@crawlee/basic'; import { BasicCrawler, @@ -61,7 +64,8 @@ export interface HttpCrawlerOptions< Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext, ContextExtension = Dictionary, ExtendedContext extends Context = Context & ContextExtension, -> extends BasicCrawlerOptions { + Routes extends Record = Record>, +> extends BasicCrawlerOptions { /** * Timeout in which the HTTP request to the resource needs to finish, given in seconds. */ @@ -312,7 +316,8 @@ export class HttpCrawler< Context extends InternalHttpCrawlingContext = InternalHttpCrawlingContext, ContextExtension = Dictionary, ExtendedContext extends Context = Context & ContextExtension, -> extends BasicCrawler { + Routes extends Record = Record>, +> extends BasicCrawler { protected preNavigationHooks: InternalHttpHook[]; protected postNavigationHooks: (( crawlingContext: CrawlingContextWithResponse, @@ -838,9 +843,18 @@ interface RequestFunctionOptions { * await crawler.run(); * ``` */ +export function createHttpRouter< + Context extends HttpCrawlingContext = HttpCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createHttpRouter< Context extends HttpCrawlingContext = HttpCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createHttpRouter< + Context extends HttpCrawlingContext = HttpCrawlingContext, + const Schemas extends RouteSchemas = 
RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createHttpRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/jsdom-crawler/src/internals/jsdom-crawler.ts b/packages/jsdom-crawler/src/internals/jsdom-crawler.ts index fc8debd5c9e7..4be258afcb29 100644 --- a/packages/jsdom-crawler/src/internals/jsdom-crawler.ts +++ b/packages/jsdom-crawler/src/internals/jsdom-crawler.ts @@ -8,7 +8,10 @@ import type { InternalHttpHook, IRequestManager, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, SkippedRequestCallback, } from '@crawlee/http'; import { @@ -38,7 +41,8 @@ export interface JSDOMCrawlerOptions< ExtendedContext extends JSDOMCrawlingContext = JSDOMCrawlingContext & ContextExtension, UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler -> extends HttpCrawlerOptions, ContextExtension, ExtendedContext> { + Routes extends Record = Record, +> extends HttpCrawlerOptions, ContextExtension, ExtendedContext, Routes> { /** * Download and run scripts. 
*/ @@ -183,7 +187,11 @@ const resources = new ResourceLoader({ export class JSDOMCrawler< ContextExtension = Dictionary, ExtendedContext extends JSDOMCrawlingContext = JSDOMCrawlingContext & ContextExtension, -> extends HttpCrawler { + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, +> extends HttpCrawler { protected static override optionsShape = { ...HttpCrawler.optionsShape, runScripts: ow.optional.boolean, @@ -194,7 +202,7 @@ export class JSDOMCrawler< protected hideInternalConsole: boolean; protected virtualConsole: VirtualConsole | null = null; - constructor(options: JSDOMCrawlerOptions = {}) { + constructor(options: JSDOMCrawlerOptions = {}) { const { runScripts = false, hideInternalConsole = false, contextPipelineBuilder, ...httpOptions } = options; super({ @@ -492,9 +500,18 @@ function extractUrlsFromWindow(window: DOMWindow, selector: string, baseUrl: str * await crawler.run(); * ``` */ +export function createJSDOMRouter< + Context extends JSDOMCrawlingContext = JSDOMCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createJSDOMRouter< Context extends JSDOMCrawlingContext = JSDOMCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createJSDOMRouter< + Context extends JSDOMCrawlingContext = JSDOMCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createJSDOMRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts index 1d4065949d8c..84df2554841b 100644 --- a/packages/linkedom-crawler/src/internals/linkedom-crawler.ts +++ b/packages/linkedom-crawler/src/internals/linkedom-crawler.ts @@ -8,7 +8,10 @@ import type { 
InternalHttpHook, IRequestManager, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, SkippedRequestCallback, } from '@crawlee/http'; import { @@ -34,7 +37,8 @@ export interface LinkeDOMCrawlerOptions< ExtendedContext extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext & ContextExtension, UserData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler JSONData extends Dictionary = any, // with default to Dictionary we cant use a typed router in untyped crawler -> extends HttpCrawlerOptions, ContextExtension, ExtendedContext> {} + Routes extends Record = Record, +> extends HttpCrawlerOptions, ContextExtension, ExtendedContext, Routes> {} export interface LinkeDOMCrawlerEnqueueLinksOptions extends Omit {} @@ -166,10 +170,14 @@ export type LinkeDOMRequestHandler< export class LinkeDOMCrawler< ContextExtension = Dictionary, ExtendedContext extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext & ContextExtension, -> extends HttpCrawler { + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, +> extends HttpCrawler { private static parser = new DOMParser(); - constructor(options: LinkeDOMCrawlerOptions) { + constructor(options: LinkeDOMCrawlerOptions) { const { contextPipelineBuilder, ...rest } = options; super({ @@ -382,9 +390,18 @@ function extractUrlsFromWindow(window: Window, selector: string, baseUrl: string * await crawler.run(); * ``` */ +export function createLinkeDOMRouter< + Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createLinkeDOMRouter< Context extends LinkeDOMCrawlingContext = LinkeDOMCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createLinkeDOMRouter< + Context extends LinkeDOMCrawlingContext 
= LinkeDOMCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createLinkeDOMRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts index 2c359e51b752..438255534fdc 100644 --- a/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/adaptive-playwright-crawler.ts @@ -13,7 +13,10 @@ import type { GetUserDataFromRequest, RequestQueue, RestrictedCrawlingContext, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, StatisticPersistedState, StatisticsOptions, StatisticState, @@ -168,8 +171,17 @@ interface AdaptivePostNavigationHook extends BrowserHook< export interface AdaptivePlaywrightCrawlerOptions< ExtendedContext extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, > extends Omit< - BasicCrawlerOptions, + BasicCrawlerOptions< + AdaptivePlaywrightCrawlerContext, + ExtendedContext, + AdaptivePlaywrightCrawlerContext & ExtendedContext, + Routes + >, 'preNavigationHooks' | 'postNavigationHooks' > { /** @@ -275,7 +287,16 @@ type LogProxyCall = [log: CrawleeLogger, method: (typeof proxyLogMethods)[number */ export class AdaptivePlaywrightCrawler< ExtendedContext extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, -> extends BasicCrawler { + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, +> extends BasicCrawler< + AdaptivePlaywrightCrawlerContext, + ExtendedContext, + AdaptivePlaywrightCrawlerContext & ExtendedContext, + Routes +> { private renderingTypePredictor: NonNullable; private resultChecker: NonNullable; private resultComparator: NonNullable; @@ -289,7 +310,7 @@ export class 
AdaptivePlaywrightCrawler< private teardownHooks: (() => Promise)[] = []; - constructor(options: AdaptivePlaywrightCrawlerOptions = {}) { + constructor(options: AdaptivePlaywrightCrawlerOptions = {}) { const { requestHandler, renderingTypeDetectionRatio = 0.1, @@ -779,9 +800,18 @@ export class AdaptivePlaywrightCrawler< } } +export function createAdaptivePlaywrightRouter< + Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createAdaptivePlaywrightRouter< Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createAdaptivePlaywrightRouter< + Context extends AdaptivePlaywrightCrawlerContext = AdaptivePlaywrightCrawlerContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createAdaptivePlaywrightRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/playwright-crawler/src/internals/playwright-crawler.ts b/packages/playwright-crawler/src/internals/playwright-crawler.ts index 42431bbbc446..c1dbf6722d2f 100644 --- a/packages/playwright-crawler/src/internals/playwright-crawler.ts +++ b/packages/playwright-crawler/src/internals/playwright-crawler.ts @@ -4,7 +4,10 @@ import type { BrowserHook, GetUserDataFromRequest, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, } from '@crawlee/browser'; import { BrowserCrawler, RequestState, Router, serviceLocator } from '@crawlee/browser'; import type { BrowserPoolOptions, PlaywrightPlugin } from '@crawlee/browser-pool'; @@ -35,13 +38,18 @@ export interface PlaywrightHook extends BrowserHook { export interface PlaywrightCrawlerOptions< ContextExtension = Dictionary, 
ExtendedContext extends PlaywrightCrawlingContext = PlaywrightCrawlingContext & ContextExtension, + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, > extends BrowserCrawlerOptions< Page, Response, PlaywrightCrawlingContext, ContextExtension, ExtendedContext, - { browserPlugins: [PlaywrightPlugin] } + { browserPlugins: [PlaywrightPlugin] }, + Routes > { /** * The same options as used by {@apilink launchPlaywright}. @@ -70,7 +78,7 @@ export interface PlaywrightCrawlerOptions< * The exceptions are logged to the request using the * {@apilink Request.pushErrorMessage} function. */ - requestHandler?: RequestHandler; + requestHandler?: RouterHandler | RequestHandler; /** * Async functions that are sequentially evaluated before the navigation. Good for setting additional cookies @@ -177,6 +185,10 @@ export interface PlaywrightCrawlerOptions< export class PlaywrightCrawler< ContextExtension = Dictionary, ExtendedContext extends PlaywrightCrawlingContext = PlaywrightCrawlingContext & ContextExtension, + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, > extends BrowserCrawler< Page, Response, @@ -184,7 +196,8 @@ export class PlaywrightCrawler< LaunchOptions, PlaywrightCrawlingContext, ContextExtension, - ExtendedContext + ExtendedContext, + Routes > { protected static override optionsShape = { ...BrowserCrawler.optionsShape, @@ -197,7 +210,7 @@ export class PlaywrightCrawler< /** * All `PlaywrightCrawler` parameters are passed via an options object. 
*/ - constructor(options: PlaywrightCrawlerOptions = {}) { + constructor(options: PlaywrightCrawlerOptions = {}) { ow(options, 'PlaywrightCrawlerOptions', ow.object.exactShape(PlaywrightCrawler.optionsShape)); const { launchContext = {}, headless, contextPipelineBuilder, ...browserCrawlerOptions } = options; @@ -344,9 +357,18 @@ export function handleCloudflareChallengeHook(options?: HandleCloudflareChalleng * await crawler.run(); * ``` */ +export function createPlaywrightRouter< + Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createPlaywrightRouter< Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createPlaywrightRouter< + Context extends PlaywrightCrawlingContext = PlaywrightCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createPlaywrightRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts index cd6680bbab00..0b66748aba0c 100644 --- a/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts +++ b/packages/puppeteer-crawler/src/internals/puppeteer-crawler.ts @@ -3,7 +3,10 @@ import type { BrowserCrawlingContext, BrowserHook, GetUserDataFromRequest, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, } from '@crawlee/browser'; import { BrowserCrawler, RequestState, Router } from '@crawlee/browser'; import type { BrowserPoolOptions, PuppeteerPlugin } from '@crawlee/browser-pool'; @@ -35,13 +38,18 @@ export interface PuppeteerHook extends BrowserHook {} export interface PuppeteerCrawlerOptions< 
ContextExtension = Dictionary, ExtendedContext extends PuppeteerCrawlingContext = PuppeteerCrawlingContext & ContextExtension, + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, > extends BrowserCrawlerOptions< Page, HTTPResponse, PuppeteerCrawlingContext, ContextExtension, ExtendedContext, - { browserPlugins: [PuppeteerPlugin] } + { browserPlugins: [PuppeteerPlugin] }, + Routes > { /** * Options used by {@apilink launchPuppeteer} to start new Puppeteer instances. @@ -153,6 +161,10 @@ export interface PuppeteerCrawlerOptions< export class PuppeteerCrawler< ContextExtension = Dictionary, ExtendedContext extends PuppeteerCrawlingContext = PuppeteerCrawlingContext & ContextExtension, + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, > extends BrowserCrawler< Page, HTTPResponse, @@ -160,7 +172,8 @@ export class PuppeteerCrawler< LaunchOptions, PuppeteerCrawlingContext, ContextExtension, - ExtendedContext + ExtendedContext, + Routes > { protected static override optionsShape = { ...BrowserCrawler.optionsShape, @@ -170,7 +183,7 @@ export class PuppeteerCrawler< /** * All `PuppeteerCrawler` parameters are passed via an options object. 
*/ - constructor(options: PuppeteerCrawlerOptions = {}) { + constructor(options: PuppeteerCrawlerOptions = {}) { ow(options, 'PuppeteerCrawlerOptions', ow.object.exactShape(PuppeteerCrawler.optionsShape)); const { @@ -307,9 +320,18 @@ export class PuppeteerCrawler< * await crawler.run(); * ``` */ +export function createPuppeteerRouter< + Context extends PuppeteerCrawlingContext = PuppeteerCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createPuppeteerRouter< Context extends PuppeteerCrawlingContext = PuppeteerCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createPuppeteerRouter< + Context extends PuppeteerCrawlingContext = PuppeteerCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createPuppeteerRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/packages/stagehand-crawler/src/internals/stagehand-crawler.ts b/packages/stagehand-crawler/src/internals/stagehand-crawler.ts index 75fe0ca79dce..457764db2081 100644 --- a/packages/stagehand-crawler/src/internals/stagehand-crawler.ts +++ b/packages/stagehand-crawler/src/internals/stagehand-crawler.ts @@ -20,7 +20,10 @@ import type { GetUserDataFromRequest, LoadedContext, RequestHandler, + RouterHandler, RouterRoutes, + RouteSchemas, + RoutesFromSchemas, } from '@crawlee/browser'; import { BrowserCrawler, Router } from '@crawlee/browser'; import type { BrowserPoolOptions } from '@crawlee/browser-pool'; @@ -247,13 +250,18 @@ export interface StagehandRequestHandler extends RequestHandler, ExtendedContext extends StagehandCrawlingContext = StagehandCrawlingContext & ContextExtension, + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, > extends BrowserCrawlerOptions< StagehandPage, 
Response, StagehandCrawlingContext, ContextExtension, ExtendedContext, - { browserPlugins: [StagehandPlugin] } + { browserPlugins: [StagehandPlugin] }, + Routes > { /** * Stagehand-specific configuration options. @@ -310,7 +318,11 @@ export interface StagehandCrawlerOptions< * } * ``` */ - requestHandler?: StagehandRequestHandler; + // Both union members must share the exact same call signature, otherwise TS cannot contextually type + // an inline `requestHandler({ page, request, ... })`. `StagehandRequestHandler` wraps the context in + // `LoadedContext`, while the router member uses `ExtendedContext`; since `StagehandCrawlingContext` + // already carries a `LoadedRequest`, using `ExtendedContext` for both keeps the signatures identical. + requestHandler?: RouterHandler | RequestHandler; /** * Async functions that are sequentially evaluated before the navigation. @@ -375,6 +387,10 @@ export interface StagehandCrawlerOptions< export class StagehandCrawler< ContextExtension = Dictionary, ExtendedContext extends StagehandCrawlingContext = StagehandCrawlingContext & ContextExtension, + Routes extends Record = Record< + string, + GetUserDataFromRequest + >, > extends BrowserCrawler< StagehandPage, Response, @@ -382,7 +398,8 @@ export class StagehandCrawler< LaunchOptions, StagehandCrawlingContext, ContextExtension, - ExtendedContext + ExtendedContext, + Routes > { protected static override optionsShape = { ...BrowserCrawler.optionsShape, @@ -395,7 +412,7 @@ export class StagehandCrawler< * * @param options - Crawler configuration options */ - constructor(options: StagehandCrawlerOptions = {}) { + constructor(options: StagehandCrawlerOptions = {}) { ow(options, 'StagehandCrawlerOptions', ow.object.exactShape(StagehandCrawler.optionsShape)); const { stagehandOptions = {}, launchContext = {}, contextPipelineBuilder, ...browserCrawlerOptions } = options; @@ -505,9 +522,18 @@ export class StagehandCrawler< * }); * ``` */ +export function createStagehandRouter< + Context 
extends StagehandCrawlingContext = StagehandCrawlingContext, + Routes extends Record = Record>, +>(routes?: RouterRoutes): RouterHandler; export function createStagehandRouter< Context extends StagehandCrawlingContext = StagehandCrawlingContext, UserData extends Dictionary = GetUserDataFromRequest, ->(routes?: RouterRoutes) { - return Router.create(routes); +>(routes?: RouterRoutes>): RouterHandler>; +export function createStagehandRouter< + Context extends StagehandCrawlingContext = StagehandCrawlingContext, + const Schemas extends RouteSchemas = RouteSchemas, +>(schemas: Schemas): RouterHandler>; +export function createStagehandRouter(routesOrSchemas?: any): any { + return Router.create(routesOrSchemas); } diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index 2c9cf535b126..7c709dd0ce8f 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -439,6 +439,9 @@ importers: '@sapphire/async-queue': specifier: ^1.5.5 version: 1.5.5 + '@standard-schema/spec': + specifier: ^1.0.0 + version: 1.1.0 '@vladfrangu/async_event_emitter': specifier: ^2.4.6 version: 2.4.7 diff --git a/test/core/router.test.ts b/test/core/router.test.ts index 49aa0036a505..15c3aa447ebf 100644 --- a/test/core/router.test.ts +++ b/test/core/router.test.ts @@ -1,7 +1,18 @@ import { BasicCrawler } from '@crawlee/basic'; import type { CrawlingContext } from '@crawlee/core'; -import { MissingRouteError, Router } from '@crawlee/core'; -import { createPlaywrightRouter, type PlaywrightCrawlingContext } from 'crawlee'; +import { MissingRouteError, RequestValidationError, Router } from '@crawlee/core'; +import { + CheerioCrawler, + type CheerioCrawlingContext, + createCheerioRouter, + createPlaywrightRouter, + createPuppeteerRouter, + PlaywrightCrawler, + type PlaywrightCrawlingContext, + PuppeteerCrawler, + type PuppeteerCrawlingContext, +} from 'crawlee'; +import { z } from 'zod'; describe('Router', () => { test('should be callable and route based on the label', async () => { @@ -173,4 +184,198 @@ 
describe('Router', () => { testType<'bar'>(ctx.request.userData.foo); }); }); + + test('addHandler infers userData from a declared route map', async () => { + const testType = (t: T): void => {}; + + interface Routes { + PRODUCT: { sku: string; price: number }; + CATEGORY: { categoryId: string }; + } + + const router: Router = { + addHandler: () => {}, + addDefaultHandler: () => {}, + } as any; + + router.addHandler('PRODUCT', (ctx) => { + testType(ctx.request.userData.sku); + testType(ctx.request.userData.price); + }); + + router.addHandler('CATEGORY', (ctx) => { + testType(ctx.request.userData.categoryId); + }); + + // @ts-expect-error unknown labels are rejected when a route map is declared + router.addHandler('UNKNOWN', () => {}); + + router.addDefaultHandler((ctx) => { + // the default handler is a fallback for any request, so userData stays loosely typed + testType>(ctx.request.userData); + }); + }); + + test('factory infers userData from a route map passed as the second type argument', async () => { + const testType = (t: T): void => {}; + + interface Routes { + PRODUCT: { sku: string; price: number }; + CATEGORY: { categoryId: string }; + } + + // the documented two-argument form: `Routes` is the second type argument of the factory + const router = createCheerioRouter(); + + router.addHandler('PRODUCT', (ctx) => { + testType(ctx.request.userData.sku); + testType(ctx.request.userData.price); + }); + + router.addHandler('CATEGORY', (ctx) => { + testType(ctx.request.userData.categoryId); + }); + + // @ts-expect-error unknown labels are rejected when a route map is declared + router.addHandler('UNKNOWN', () => {}); + }); + + test('factory keeps the legacy flat-userData generic working (backwards compatibility)', async () => { + const testType = (t: T): void => {}; + + // a flat `userData` shape (with a scalar field) resolves to the legacy open-map router, + // so any label is accepted and `userData` is typed as the passed shape + const router = 
createCheerioRouter(); + + router.addHandler('anyLabel', (ctx) => { + testType(ctx.request.userData.token); + }); + + router.addHandler('anotherLabel', (ctx) => { + testType(ctx.request.userData.token); + }); + }); + + test('schema map infers userData types and validates at runtime', async () => { + const testType = (t: T): void => {}; + + const logs: string[] = []; + const router = createCheerioRouter({ + PRODUCT: z.object({ sku: z.string(), price: z.coerce.number() }), + CATEGORY: z.object({ categoryId: z.string() }), + }); + + router.addHandler('PRODUCT', async (ctx) => { + // inferred from the schema (note: price is coerced to a number) + testType(ctx.request.userData.sku); + testType(ctx.request.userData.price); + logs.push(`product ${ctx.request.userData.sku} @ ${ctx.request.userData.price}`); + }); + + const log = { info: vitest.fn(), warn: vitest.fn(), debug: vitest.fn() }; + + // valid userData passes and is replaced with the parsed (coerced) value + const validRequest = { + loadedUrl: 'https://example.com/p', + label: 'PRODUCT', + userData: { sku: 'A1', price: '42' }, + }; + await router({ request: validRequest, log } as any); + expect(logs).toEqual(['product A1 @ 42']); + expect(validRequest.userData.price).toBe(42); + + // invalid userData throws a RequestValidationError before the handler runs + await expect( + router({ + request: { loadedUrl: 'https://example.com/p', label: 'PRODUCT', userData: { sku: 123 } }, + log, + } as any), + ).rejects.toThrow(RequestValidationError); + }); + + test('crawler infers the route map from a typed requestHandler and types addRequests/context', () => { + // type-level only: the block is never executed, it just has to type-check + const typeOnly = async () => { + interface Routes { + PRODUCT: { sku: string; price: number }; + CATEGORY: { categoryId: string }; + } + + const router = createCheerioRouter(); + + router.addHandler('PRODUCT', async ({ addRequests, enqueueLinks }) => { + // context methods are typed from the 
route map + await addRequests([{ url: 'https://e.com/c', label: 'CATEGORY', userData: { categoryId: 'c1' } }]); + await enqueueLinks({ urls: ['https://e.com/p'], label: 'PRODUCT', userData: { sku: 's', price: 1 } }); + // @ts-expect-error wrong userData shape for the label + await addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { categoryId: 'x' } }]); + // @ts-expect-error label not present in the route map + await addRequests([{ url: 'https://e.com/x', label: 'NOPE' }]); + }); + + // the crawler infers `Routes` from the typed router passed as `requestHandler` + const crawler = new CheerioCrawler({ requestHandler: router }); + + await crawler.addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { sku: 's', price: 1 } }]); + await crawler.run([ + 'https://e.com', + { url: 'https://e.com/c', label: 'CATEGORY', userData: { categoryId: 'c1' } }, + ]); + // @ts-expect-error wrong userData shape for the label + await crawler.addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { categoryId: 'x' } }]); + // @ts-expect-error label not present in the route map + await crawler.addRequests([{ url: 'https://e.com/x', label: 'NOPE' }]); + }; + + expect(typeof typeOnly).toBe('function'); + }); + + test('browser crawler also infers the route map from a typed requestHandler', () => { + // type-level only: never executed + const typeOnly = async () => { + interface Routes { + PRODUCT: { sku: string }; + } + + const router = createPlaywrightRouter(); + + router.addHandler('PRODUCT', async ({ addRequests }) => { + await addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { sku: 's' } }]); + // @ts-expect-error wrong userData shape for the label + await addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { sku: 1 } }]); + // @ts-expect-error label not present in the route map + await addRequests([{ url: 'https://e.com/x', label: 'NOPE' }]); + }); + + const crawler = new PlaywrightCrawler({ requestHandler: 
router }); + + await crawler.addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { sku: 's' } }]); + // @ts-expect-error wrong userData shape for the label + await crawler.addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { sku: 1 } }]); + // @ts-expect-error label not present in the route map + await crawler.addRequests([{ url: 'https://e.com/x', label: 'NOPE' }]); + }; + + expect(typeof typeOnly).toBe('function'); + }); + + test('puppeteer crawler infers the route map too (inherited requestHandler path)', () => { + // type-level only: never executed + const typeOnly = async () => { + interface Routes { + PRODUCT: { sku: string }; + } + + const router = createPuppeteerRouter(); + const crawler = new PuppeteerCrawler({ requestHandler: router }); + + await crawler.addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { sku: 's' } }]); + // @ts-expect-error wrong userData shape for the label + await crawler.addRequests([{ url: 'https://e.com/p', label: 'PRODUCT', userData: { sku: 1 } }]); + // @ts-expect-error label not present in the route map + await crawler.addRequests([{ url: 'https://e.com/x', label: 'NOPE' }]); + }; + + expect(typeof typeOnly).toBe('function'); + }); });