Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions docs/upgrading/upgrading_v4.md
Original file line number Diff line number Diff line change
Expand Up @@ -619,6 +619,10 @@ The high-level storage classes (`Dataset`, `KeyValueStore`, `RequestQueue`) now

`timeoutSecs` and `doNotRetryTimeouts` were removed from `RecordOptions` (used by `KeyValueStore.setValue`). Only `contentType` remains.

### `maybeStringify` is removed

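The `maybeStringify` helper exported from `@crawlee/core` has been removed. Value (de)serialization now lives entirely in the `KeyValueStore` frontend: writing serializes the value (and infers its content type), reading parses it back, and the storage client acts as a plain byte transport. If you imported `maybeStringify` directly, use the `serializeValue` / `parseValue` functions exported from `@crawlee/core` instead. Note the differences: `maybeStringify(value, options)` wrote the inferred content type back into `options.contentType` and JSON-stringified every value that had no content type, whereas `serializeValue(value, contentType?)` returns a `{ value, contentType }` pair and, when no content type is given, passes strings through as `text/plain; charset=utf-8` and buffers/streams as `application/octet-stream`.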

### `KeyValueStoreIteratorOptions` simplified

`exclusiveStartKey` and `collection` were removed. Only `prefix` remains.
Expand Down
1 change: 1 addition & 0 deletions packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@
"@crawlee/utils": "workspace:*",
"@sapphire/async-queue": "^1.5.5",
"@vladfrangu/async_event_emitter": "^2.4.6",
"content-type": "^1.0.5",
"csv-stringify": "^6.5.2",
"json5": "^2.2.3",
"minimatch": "^10.0.1",
Expand Down
1 change: 1 addition & 0 deletions packages/core/src/storages/index.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
export * from './dataset.js';
export * from './key_value_store.js';
export * from './key_value_store_codec.js';
export * from './request_list.js';
export type * from './request_loader.js';
export type * from './request_manager.js';
Expand Down
97 changes: 50 additions & 47 deletions packages/core/src/storages/key_value_store.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,53 +6,21 @@ import JSON5 from 'json5';
import ow, { ArgumentError } from 'ow';

import { KEY_VALUE_STORE_KEY_REGEX } from '@apify/consts';
import { jsonStringifyExtended } from '@apify/utilities';

import { Configuration } from '../configuration.js';
import { serviceLocator } from '../service_locator.js';
import type { Awaitable } from '../typedefs.js';
import { checkStorageAccess } from './access_checking.js';
import { parseValue, serializeValue } from './key_value_store_codec.js';
import type { StorageIdentifier } from './storage_instance_manager.js';
import type { StorageOpenOptions } from './utils.js';
import { resolveStorageIdentifier } from './storage_instance_manager.js';
import { createDualIterable, purgeDefaultStorages } from './utils.js';
import { isBuffer, isStream } from '@crawlee/utils';

/**
 * Numeric default of 1000 used by this module.
 * NOTE(review): presumably the default page size when listing store keys — the code that
 * consumes this constant is outside the visible hunk, so confirm before relying on it.
 *
 * @internal
 */
const KVS_KEYS_DEFAULT_LIMIT = 1000;

/**
 * Serializes `value` to pretty-printed JSON unless the caller already chose a content type.
 *
 * When `options.contentType` is `null` or `undefined`, this function sets it (on the passed-in
 * object) to `application/json; charset=utf-8` and returns the JSON string. Otherwise the value
 * is returned untouched.
 *
 * @throws Error when the value cannot be stringified, or when stringification yields `undefined`.
 * @ignore
 */
export const maybeStringify = <T>(value: T, options: { contentType?: string }) => {
    // An explicit content type means the caller is responsible for the value's encoding.
    if (options.contentType != null) return value;

    options.contentType = 'application/json; charset=utf-8';

    let json: T;
    try {
        // Indented output is easier to debug; the size overhead is negligible once compressed.
        json = jsonStringifyExtended(value as Dictionary, null, 2) as unknown as T;
    } catch (e) {
        const error = e as Error;
        // Swap the engine's cryptic message for a more meaningful one.
        if (error.message?.includes('Invalid string length')) {
            error.message = 'Object is too large';
        }
        throw new Error(`The "value" parameter cannot be stringified to JSON: ${error.message}`);
    }

    if (json === undefined) {
        throw new Error(
            'The "value" parameter was stringified to JSON and returned undefined. ' +
                "Make sure you're not trying to stringify an undefined value.",
        );
    }

    return json;
};

/**
* The `KeyValueStore` class represents a key-value store, a simple data storage that is used
* for saving and reading data records or files. Each data record is
Expand Down Expand Up @@ -232,7 +200,48 @@ export class KeyValueStore {
ow(key, ow.string.nonEmpty);
const record = await this.client.getValue(key);

return (record?.value as T) ?? defaultValue ?? null;
const parsed = record ? parseValue(record.value, record.contentType ?? null) : undefined;

return (parsed as T) ?? defaultValue ?? null;
}

/**
 * Fetches a record from the key-value store and returns its value exactly as the storage
 * client delivered it, together with the stored content type. No parsing is applied.
 *
 * Reach for this when the raw bytes are what you want — e.g. to feed them into your own
 * parser (`simdjson`, a custom XML library, etc.) or to forward them verbatim.
 *
 * There is intentionally no matching `setRecord` method: {@apilink KeyValueStore.setValue}
 * already passes a `Buffer` (or `string` / `Stream`) through unchanged when an explicit
 * `contentType` is provided. To write pre-serialized bytes, call
 * `setValue(key, buffer, { contentType: 'application/json; charset=utf-8' })`.
 *
 * **Example usage:**
 * ```javascript
 * const store = await KeyValueStore.open();
 * const record = await store.getRecord('huge.json');
 * if (record) {
 *     const data = simdjson.parse(record.value);
 * }
 * ```
 *
 * @param key
 *   Unique key of the record. It can be at most 256 characters long and only consist
 *   of the following characters: `a`-`z`, `A`-`Z`, `0`-`9` and `!-_.'()`
 * @returns The raw value plus its content type (`null` when the client reports none),
 *   or `null` if the record does not exist.
 */
async getRecord(key: string): Promise<{ value: Buffer; contentType: string | null } | null> {
    checkStorageAccess();
    ow(key, ow.string.nonEmpty);

    const stored = await this.client.getValue(key);

    return stored ? { value: stored.value as Buffer, contentType: stored.contentType ?? null } : null;
}

/**
Expand Down Expand Up @@ -301,7 +310,7 @@ export class KeyValueStore {
const results: T[] = [];
for (const item of page) {
const record = await this.client.getValue(item.key);
if (record) results.push(mapRecord(item.key, record.value));
if (record) results.push(mapRecord(item.key, parseValue(record.value, record.contentType ?? null)));
}
yield results;
}
Expand Down Expand Up @@ -375,15 +384,9 @@ export class KeyValueStore {
message: `The "key" argument "${key}" must be at most 256 characters long and only contain the following characters: a-zA-Z0-9!-_.'()`,
})),
);
if (
options.contentType &&
!(
ow.isValid(value, ow.any(ow.string, ow.uint8Array)) ||
(ow.isValid(value, ow.object) && typeof (value as Dictionary).pipe === 'function')
)
) {
if (options.contentType && !(typeof value === 'string' || isBuffer(value) || isStream(value))) {
throw new ArgumentError(
'The "value" parameter must be a String, Buffer or Stream when "options.contentType" is specified.',
'The "value" parameter must be a String, Buffer, ArrayBuffer, TypedArray, or Stream when "options.contentType" is specified.',
this.setValue,
);
}
Expand Down Expand Up @@ -417,12 +420,12 @@ export class KeyValueStore {
// In this case delete the record.
if (value === null) return this.client.deleteValue(key);

value = maybeStringify(value, optionsCopy);
const serialized = serializeValue(value, optionsCopy.contentType);

return this.client.setValue({
key,
value,
contentType: optionsCopy.contentType,
value: serialized.value,
contentType: serialized.contentType,
});
}

Expand Down
131 changes: 131 additions & 0 deletions packages/core/src/storages/key_value_store_codec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,131 @@
import type { Dictionary } from '@crawlee/types';
import contentTypeParser from 'content-type';
import { isBuffer, isStream } from '@crawlee/utils';
import JSON5 from 'json5';

import { jsonStringifyExtended } from '@apify/utilities';

// Bare media type (without parameters) that `parseValue` compares against to decide on JSON parsing.
const CONTENT_TYPE_JSON = 'application/json';
// Media types whose bodies are decoded to text, all matched case-insensitively:
// exactly `application/json`, any `application/…xml` subtype, and everything under `text/`.
const STRINGIFIABLE_CONTENT_TYPE_RXS = [new RegExp(`^${CONTENT_TYPE_JSON}$`, 'i'), /^application\/.*xml$/i, /^text\//i];

/**
 * Canonical write path for key-value store records.
 *
 * When a content type is provided, the value passes through unchanged — it is the caller's
 * responsibility to supply a String/Buffer/Stream (the frontend validates this).
 *
 * When no content type is provided, it is inferred from the value's shape:
 * - Buffer / typed array / ArrayBuffer / stream → `application/octet-stream` (passthrough)
 * - `string` → `text/plain; charset=utf-8` (passthrough)
 * - anything else → `application/json; charset=utf-8` (serialized via `jsonStringifyExtended`)
 *
 * Does NOT drain streams — that is storage mechanics and stays in the storage client.
 *
 * Backend-independent.
 *
 * @param value The value to store.
 * @param contentType Explicit content type; when `null`/`undefined` it is inferred as described above.
 * @returns The value to hand to the storage client together with the content type to store.
 * @throws Error when the value has to be JSON-serialized but cannot be (including the case
 *   where serialization yields `undefined`).
 */
export function serializeValue(
    value: unknown,
    contentType?: string,
): {
    value: Buffer | ArrayBuffer | ArrayBufferView | string | NodeJS.ReadableStream | ReadableStream;
    contentType: string;
} {
    // `!= null` deliberately covers both `null` and `undefined`.
    if (contentType != null) {
        return { value: value as Buffer | string | NodeJS.ReadableStream | ReadableStream, contentType };
    }

    if (isStream(value) || isBuffer(value)) {
        return {
            value,
            contentType: 'application/octet-stream',
        };
    }

    if (typeof value === 'string') {
        return { value, contentType: 'text/plain; charset=utf-8' };
    }

    let serialized: string;
    try {
        // Format JSON to simplify debugging, the overheads with compression is negligible
        serialized = jsonStringifyExtended(value as Dictionary, null, 2);
    } catch (e) {
        // Read the message defensively: anything can be thrown (e.g. from a custom `toJSON`),
        // and the caught object is left unmodified.
        const thrownMessage = (e as Partial<Error> | null | undefined)?.message;
        const reason = typeof thrownMessage === 'string' ? thrownMessage : String(e);
        // Give more meaningful error message
        const detail = reason.includes('Invalid string length') ? 'Object is too large' : reason;
        throw new Error(`The "value" parameter cannot be stringified to JSON: ${detail}`);
    }

    // The serializer yields `undefined` for inputs such as `undefined` itself.
    if (serialized === undefined) {
        throw new Error(
            'The "value" parameter was stringified to JSON and returned undefined. ' +
                "Make sure you're not trying to stringify an undefined value.",
        );
    }

    return { value: serialized, contentType: 'application/json; charset=utf-8' };
}

/**
 * Canonical read path for the {@apilink KeyValueStore} frontend: interprets a raw body
 * according to its content type header. Backend-independent.
 *
 * - `application/json` bodies are decoded and parsed (with JSON5) into a value.
 * - `application/*xml` and `text/*` bodies are decoded and returned as strings.
 * - Any other body is returned as-is.
 *
 * The body is also returned as-is when the header is `null`, when the header cannot be
 * parsed, or when it names a charset that is not a known `Buffer` encoding.
 *
 * @param body Raw record bytes.
 * @param contentTypeHeader Stored content type header, or `null` when there is none.
 * @returns The parsed value, the decoded string, or the original body.
 */
export function parseValue(
    body: Buffer | ArrayBuffer,
    contentTypeHeader: string | null,
): string | Buffer | ArrayBuffer | Record<string, unknown> {
    // Without a content type there is nothing to base an interpretation on.
    if (contentTypeHeader === null) return body;

    let mediaType: string;
    let charset: BufferEncoding;
    try {
        const { type, parameters } = contentTypeParser.parse(contentTypeHeader);
        mediaType = type;
        charset = parameters.charset as BufferEncoding;
    } catch {
        // A malformed header is treated like an unknown type: keep the raw bytes.
        return body;
    }

    // Decoding something that is not text would only produce a mangled string.
    if (!areDataStringifiable(mediaType, charset)) return body;

    const text = isomorphicBufferToString(body, charset);
    if (mediaType === CONTENT_TYPE_JSON) return JSON5.parse(text);

    return text;
}

/**
 * Decodes binary data to a string in both Node.js and browser environments.
 *
 * - Node.js `Buffer` instances are decoded with the requested `encoding`.
 * - `ArrayBuffer`s and other typed-array views are decoded as UTF-8 via `TextDecoder`;
 *   `encoding` is ignored for them, as browser decoding here only handles UTF-8.
 *
 * Typed arrays that are not Buffers (e.g. a plain `Uint8Array`) are decoded from their bytes
 * rather than via their `toString()`, which would produce a comma-separated list of numbers.
 *
 * @param buffer Binary data to decode.
 * @param encoding Encoding used for `Buffer` inputs; `undefined` falls back to Buffer's default.
 * @returns The decoded text.
 */
function isomorphicBufferToString(buffer: Buffer | ArrayBuffer, encoding: BufferEncoding): string {
    // `Buffer` may be missing in browsers, so probe for it before calling its statics.
    if (typeof Buffer !== 'undefined' && Buffer.isBuffer(buffer)) {
        return buffer.toString(encoding);
    }

    const input: ArrayBuffer | ArrayBufferView = buffer;
    // Respect the view's window into its backing store; a bare ArrayBuffer is wrapped whole.
    const bytes = ArrayBuffer.isView(input)
        ? new Uint8Array(input.buffer, input.byteOffset, input.byteLength)
        : new Uint8Array(input);

    // Browser decoding only works with UTF-8.
    return new TextDecoder().decode(bytes);
}

/**
 * Tells whether a body declared with the given charset can be decoded to a string.
 *
 * A missing/empty charset is accepted optimistically (assumed to be UTF-8); otherwise the
 * charset must be an encoding that `Buffer` knows.
 */
function isCharsetStringifiable(charset: string): charset is BufferEncoding {
    return !charset || Buffer.isEncoding(charset);
}

/**
 * Tells whether the media type is one of the text-like types listed in
 * `STRINGIFIABLE_CONTENT_TYPE_RXS`. An empty or missing type is never stringifiable,
 * so the caller keeps the raw buffer.
 */
function isContentTypeStringifiable(contentType: string): boolean {
    if (contentType) {
        for (const pattern of STRINGIFIABLE_CONTENT_TYPE_RXS) {
            if (pattern.test(contentType)) return true;
        }
    }

    return false;
}

/**
 * A body is decoded to text only when both its media type and its charset allow it.
 * The charset is not inspected when the media type already rules decoding out.
 */
function areDataStringifiable(contentType: string, charset: string): boolean {
    if (!isContentTypeStringifiable(contentType)) return false;

    return isCharsetStringifiable(charset);
}
1 change: 1 addition & 0 deletions packages/fs-storage/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
},
"dependencies": {
"@crawlee/types": "workspace:*",
"@crawlee/utils": "workspace:*",
"@sapphire/async-queue": "^1.5.5",
"@sapphire/shapeshift": "^4.0.0",
"content-type": "^1.0.5",
Expand Down
20 changes: 2 additions & 18 deletions packages/fs-storage/src/utils.ts
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { createHash } from 'node:crypto';

import type * as storage from '@crawlee/types';
import { s } from '@sapphire/shapeshift';
import { isBuffer, isStream } from '@crawlee/utils';

import { REQUEST_ID_LENGTH } from './consts.js';

Expand Down Expand Up @@ -31,23 +31,7 @@ export function uniqueKeyToRequestId(uniqueKey: string): string {
return str.length > REQUEST_ID_LENGTH ? str.slice(0, REQUEST_ID_LENGTH) : str;
}

/**
 * Tells whether `value` is binary data: a Node.js `Buffer`, an `ArrayBuffer`, or any typed array.
 *
 * Implemented with built-in checks instead of building a validation schema and catching its
 * exception on every call. `Buffer` is a `Uint8Array` subclass, so the typed-array check covers
 * it. `DataView` is a view but not a typed array and is therefore rejected.
 *
 * @param value Anything.
 * @returns `true` for Buffer / ArrayBuffer / typed-array inputs, `false` otherwise.
 */
export function isBuffer(value: unknown): boolean {
    if (value instanceof ArrayBuffer) return true;

    return ArrayBuffer.isView(value) && !(value instanceof DataView);
}

/**
 * Duck-type check for a stream-like object: a non-null object exposing both `on` and `pipe`
 * as functions.
 *
 * Always returns a strict boolean; `null` (whose `typeof` is `'object'`) yields `false`
 * rather than leaking `null` through the `&&` chain.
 *
 * @param value Anything.
 * @returns `true` when `value` looks like a stream, `false` otherwise.
 */
export function isStream(value: any): boolean {
    if (typeof value !== 'object' || value === null) return false;

    return ['on', 'pipe'].every((key) => key in value && typeof value[key] === 'function');
}
export { isBuffer, isStream };

/**
 * Alias for `BackgroundHandlerUpdateMetadataMessage`, currently the only member.
 * NOTE(review): presumably the set of messages the background handler accepts — inferred from
 * the name; the handler itself is outside the visible hunk.
 */
export type BackgroundHandlerReceivedMessage = BackgroundHandlerUpdateMetadataMessage;

Expand Down
1 change: 1 addition & 0 deletions packages/memory-storage/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
},
"dependencies": {
"@crawlee/types": "workspace:*",
"@crawlee/utils": "workspace:*",
"@sapphire/async-queue": "^1.5.5",
"@sapphire/shapeshift": "^4.0.0",
"content-type": "^1.0.5",
Expand Down
Loading
Loading