diff --git a/client/dive-common/use/stereo/README.md b/client/dive-common/use/stereo/README.md new file mode 100644 index 000000000..116c8f131 --- /dev/null +++ b/client/dive-common/use/stereo/README.md @@ -0,0 +1,65 @@ +# Client-side stereo transfer (ONNX) + +Warp a detection annotated on one camera onto the other camera, entirely in the +browser / Electron renderer — no backend — using VIAME's epipolar +template-matching model (stereo measurement "method 1") exported to ONNX and run +with `onnxruntime-web`. + +This is the client counterpart to the desktop backend stereo service: the +desktop `ViewerLoader` warps via native IPC (`stereoTransferLine` / +`stereoTransferPoints`); this module does the equivalent correspondence search +client-side so it also works on the web. + +## Modules + +| File | Role | +| --- | --- | +| `StereoOnnxMatcher.ts` | Loads the `match` ONNX model and warps source points → target points via NCC along the epipolar curve. | +| `calibration.ts` | `StereoRig` + loaders (`rigFromNpz`, `rigFromJson`) mirroring VIAME's `read_stereo_rig`; `invertRig` to swap the source/target camera. | +| `npz.ts` | Minimal `.npz`/`.npy` reader (calibration files are NumPy archives). | +| `image.ts` | RGBA → BT.601 grayscale (matches OpenCV `BGR2GRAY` used by the C++ NCC). | +| `frameSource.ts` | Pull full-resolution frame pixels from a GeoJS viewer / image element. | +| `useStereoOnnxTransfer.ts` | Platform-agnostic composable: on annotation complete, warp a box (corners) or head/tail line (keypoints) to the other camera and write the feature. | + +The web glue lives in `platform/web-girder/useStereoOnnxWeb.ts` and is bound to +the `Viewer`'s `stereo-annotation-complete` event in the web `ViewerLoader.vue`. + +## How it works + +Per warp: generate epipolar candidates from the calibration, then NCC +template-match the source patch along that curve in the target frame (this is +exactly the VIAME C++ `epipolar_template_matching` method, as a single ONNX +graph). 
The matcher returns the matched point + scores; the composable rebuilds +the box / head-tail feature on the other camera. + +The world frame is the left (calibration) camera. When the user annotates on the +rig's right camera, the rig is inverted (`invertRig`) so the annotated camera is +the source. + +## Setup (web) + +1. Export the model (small; method 1 has no learned weights): + ```bash + python plugins/onnx/export_stereo_mapping.py --model match \ + --out stereo_match.onnx --num-samples 1500 + ``` + Fewer `--num-samples` ⇒ faster client inference, slightly coarser depth + sampling. +2. Serve it as a static asset at `client/public/models/stereo_match.onnx` + (override the URL via `useStereoOnnxWeb({ modelUrl })`). +3. Load a stereo calibration file (`.npz`/`.json`) in the session, as usual for + multi-camera datasets. Transfer no-ops if calibration, the model, or a second + camera is missing. +4. The disparity search range defaults to `{ minDisparity: 2, maxDisparity: 512 }`; + tune per rig via the `range` option (should become a user setting). + +## Testing status + +- **Tested** (`__tests__/stereoOnnx.spec.ts`, runs under `npm test`): the core — + `.npz` calibration parsing, grayscale conversion, and `StereoOnnxMatcher` + warping points, validated against the VIAME C++/Python reference (matches to + ~0.25 px) using `onnxruntime-web` in Node. +- **Needs live testing**: the web glue (`useStereoOnnxWeb`, ViewerLoader + binding, and the GeoJS frame-pixel read in `frameSource.geoViewerToImageElement`) + is type-checked and lint-clean but has not been exercised in a running web + viewer with a real stereo dataset. 
diff --git a/client/dive-common/use/stereo/StereoOnnxMatcher.ts b/client/dive-common/use/stereo/StereoOnnxMatcher.ts new file mode 100644 index 000000000..1c92443d7 --- /dev/null +++ b/client/dive-common/use/stereo/StereoOnnxMatcher.ts @@ -0,0 +1,136 @@ +/** + * Client-side wrapper around the exported VIAME stereo "match" ONNX model + * (method 1: epipolar candidate generation + NCC template matching). Runs fully + * in the browser / Electron renderer via onnxruntime-web — no backend — so a + * detection annotated on one camera can be warped onto the other. + * + * The model and its conventions are produced by + * `plugins/onnx/export_stereo_mapping.py --model match`; see that plugin's + * README. This wrapper only feeds inputs and reads the matched points. + */ + +import * as ort from 'onnxruntime-web'; + +import { GrayImage } from './image'; +import { StereoRig, baseline } from './calibration'; + +/** Search-range specification (disparity is unit-independent; depth needs calib units). */ +export type SearchRange = + | { minDisparity: number; maxDisparity: number } + | { minDepth: number; maxDepth: number }; + +export interface WarpOptions { + range: SearchRange; + /** Minimum NCC score to accept a match (model default region). Default 0.2. */ + threshold?: number; + /** Reject if secondScore/score exceeds this (0 disables). Default 0.85. */ + uniquenessRatio?: number; +} + +export interface WarpResult { + /** Matched point in the right (target) image. */ + x: number; + y: number; + /** Best NCC score (TM_CCOEFF_NORMED). */ + score: number; + /** Best NCC score outside a template-size neighborhood (uniqueness check). */ + secondScore: number; + /** Passed the score threshold and uniqueness-ratio test. 
*/ + accepted: boolean; +} + +const IDENTITY_3X3 = Float32Array.from([1, 0, 0, 0, 1, 0, 0, 0, 1]); +const ZERO_3 = Float32Array.from([0, 0, 0]); + +/** Wrap a number as a rank-0 float32 tensor (used for the model's scalar depth inputs). */ +function scalar(v: number): ort.Tensor { + return new ort.Tensor('float32', Float32Array.from([v]), []); +} + +/** + * Resolve a {@link SearchRange} to `[minDepth, maxDepth]`. A disparity range is + * converted with depth = fx * baseline / disparity (fx from the left intrinsics); + * a depth range is returned unchanged. + * + * @throws RangeError if a disparity bound is not a finite positive number; the + * bounds are divisors here, so 0 would yield an infinite depth. + */ +function resolveDepthRange(rig: StereoRig, range: SearchRange): [number, number] { + if ('minDisparity' in range) { + const { minDisparity, maxDisparity } = range; + const usable = (d: number) => Number.isFinite(d) && d > 0; + if (!usable(minDisparity) || !usable(maxDisparity)) { + throw new RangeError('Disparity search range bounds must be finite and > 0'); + } + const fx = rig.Kl[0]; + const b = baseline(rig); + // min disparity <-> far (max depth); max disparity <-> near (min depth). + return [(fx * b) / maxDisparity, (fx * b) / minDisparity]; + } + return [range.minDepth, range.maxDepth]; +} + +export class StereoOnnxMatcher { + private session: ort.InferenceSession; + + private constructor(session: ort.InferenceSession) { + this.session = session; + } + + /** + * Create a matcher from a model URL or in-memory model bytes. By default the + * wasm backend runs single-threaded, which works without cross-origin + * isolation (SharedArrayBuffer); pass `threads` to override. Note that this + * writes the shared `ort.env.wasm` settings (numThreads, proxy). + */ + static async create( + model: string | ArrayBuffer | Uint8Array, + opts: { threads?: number } = {}, + ): Promise<StereoOnnxMatcher> { + ort.env.wasm.numThreads = opts.threads ?? 1; + ort.env.wasm.proxy = false; + // `as string` is a compile-time cast only; the URL or bytes are passed through as-is. + const session = await ort.InferenceSession.create(model as string, { + executionProviders: ['wasm'], + graphOptimizationLevel: 'all', + }); + return new StereoOnnxMatcher(session); + } + + /** + * Warp a set of source-image points onto the target image. `source`/`target` + * are grayscale frames; `rig` is the stereo calibration with `source` as the + * left camera. Returns one {@link WarpResult} per input point (an empty + * `points` list returns `[]` without running the model). + * + * @throws RangeError if `opts.range` is a disparity range with a non-positive bound. + */ + async warpPoints( + points: [number, number][], + source: GrayImage, + target: GrayImage, + rig: StereoRig, + opts: WarpOptions, + ): Promise<WarpResult[]> { + if (points.length === 0) return []; + const [minDepth, maxDepth] = resolveDepthRange(rig, opts.range); + const threshold = opts.threshold ?? 0.2; + const uniqueness = opts.uniquenessRatio ?? 
0.85; + + const pts = new Float32Array(points.length * 2); + points.forEach(([x, y], i) => { pts[i * 2] = x; pts[i * 2 + 1] = y; }); + + // The source camera is the world frame, so its pose is identity / zero. + const feeds: Record<string, ort.Tensor> = { + left_gray: new ort.Tensor('float32', source.data, [source.height, source.width]), + right_gray: new ort.Tensor('float32', target.data, [target.height, target.width]), + points_left: new ort.Tensor('float32', pts, [points.length, 2]), + K_left: new ort.Tensor('float32', rig.Kl, [3, 3]), + dist_left: new ort.Tensor('float32', rig.distl, [8]), + R_left: new ort.Tensor('float32', IDENTITY_3X3, [3, 3]), + t_left: new ort.Tensor('float32', ZERO_3, [3]), + K_right: new ort.Tensor('float32', rig.Kr, [3, 3]), + dist_right: new ort.Tensor('float32', rig.distr, [8]), + R_right: new ort.Tensor('float32', rig.R, [3, 3]), + t_right: new ort.Tensor('float32', rig.T, [3]), + min_depth: scalar(minDepth), + max_depth: scalar(maxDepth), + }; + + const out = await this.session.run(feeds); + const rp = out.right_points.data as Float32Array; + const best = out.best_score.data as Float32Array; + const second = out.second_score.data as Float32Array; + + return points.map((_, i) => { + const score = best[i]; + const secondScore = second[i]; + let accepted = score >= threshold; + // The ratio test is only applied when both scores are positive. + if (accepted && uniqueness > 0 && secondScore > 0 && score > 0) { + accepted = secondScore / score <= uniqueness; + } + return { + x: rp[i * 2], y: rp[i * 2 + 1], score, secondScore, accepted, + }; + }); + } +} diff --git a/client/dive-common/use/stereo/__tests__/fixtures/.gitattributes b/client/dive-common/use/stereo/__tests__/fixtures/.gitattributes new file mode 100644 index 000000000..da729d154 --- /dev/null +++ b/client/dive-common/use/stereo/__tests__/fixtures/.gitattributes @@ -0,0 +1,3 @@ +*.onnx binary +*.png binary +*.npz binary diff --git a/client/dive-common/use/stereo/__tests__/fixtures/calibration.npz b/client/dive-common/use/stereo/__tests__/fixtures/calibration.npz new file mode 100644 index 000000000..94ba13709 Binary files /dev/null and 
b/client/dive-common/use/stereo/__tests__/fixtures/calibration.npz differ diff --git a/client/dive-common/use/stereo/__tests__/fixtures/left.png b/client/dive-common/use/stereo/__tests__/fixtures/left.png new file mode 100644 index 000000000..02c675a12 Binary files /dev/null and b/client/dive-common/use/stereo/__tests__/fixtures/left.png differ diff --git a/client/dive-common/use/stereo/__tests__/fixtures/right.png b/client/dive-common/use/stereo/__tests__/fixtures/right.png new file mode 100644 index 000000000..e23315383 Binary files /dev/null and b/client/dive-common/use/stereo/__tests__/fixtures/right.png differ diff --git a/client/dive-common/use/stereo/__tests__/fixtures/stereo_match.onnx b/client/dive-common/use/stereo/__tests__/fixtures/stereo_match.onnx new file mode 100644 index 000000000..ad09998ec Binary files /dev/null and b/client/dive-common/use/stereo/__tests__/fixtures/stereo_match.onnx differ diff --git a/client/dive-common/use/stereo/__tests__/stereoOnnx.spec.ts b/client/dive-common/use/stereo/__tests__/stereoOnnx.spec.ts new file mode 100644 index 000000000..f7e17c07a --- /dev/null +++ b/client/dive-common/use/stereo/__tests__/stereoOnnx.spec.ts @@ -0,0 +1,77 @@ +import { readFileSync } from 'fs'; +import { fileURLToPath } from 'url'; +import { PNG } from 'pngjs'; +import { + describe, it, expect, +} from 'vitest'; +import { StereoOnnxMatcher } from '../StereoOnnxMatcher'; +import { rigFromNpz, baseline } from '../calibration'; +import { rgbaToGray, GrayImage } from '../image'; + +const fixture = (name: string) => fileURLToPath(new URL(`./fixtures/${name}`, import.meta.url)); + +function loadGray(name: string): GrayImage { + const png = PNG.sync.read(readFileSync(fixture(name))); + return rgbaToGray({ data: png.data, width: png.width, height: png.height }); +} + +describe('rgbaToGray', () => { + it('uses BT.601 luma weights', () => { + const red = rgbaToGray({ data: new Uint8ClampedArray([255, 0, 0, 255]), width: 1, height: 1 }); + 
expect(red.data[0]).toBeCloseTo(0.299 * 255, 2); + }); + it('passes through gray pixels', () => { + const g = rgbaToGray({ data: new Uint8ClampedArray([123, 123, 123, 255]), width: 1, height: 1 }); + expect(g.data[0]).toBeCloseTo(123, 4); + }); +}); + +describe('rigFromNpz', () => { + it('parses the calibration archive into a stereo rig', async () => { + const rig = await rigFromNpz(readFileSync(fixture('calibration.npz'))); + expect(rig.Kl).toHaveLength(9); + expect(rig.Kr).toHaveLength(9); + expect(rig.distl).toHaveLength(8); + expect(rig.R).toHaveLength(9); + expect(rig.T).toHaveLength(3); + expect(rig.Kl[0]).toBeGreaterThan(0); // fx + expect(rig.Kl[8]).toBeCloseTo(1, 6); // homogeneous 1 + expect(baseline(rig)).toBeGreaterThan(0); + }); + + it('handles a Uint8Array view with a non-zero byteOffset (pool-safe)', async () => { + // Node's readFileSync returns a Buffer backed by a shared pool, so a naive + // `.buffer` read sees bytes beyond the file. Mimic that here. + const file = readFileSync(fixture('calibration.npz')); + const backing = new Uint8Array(file.length + 64); + backing.set(file, 32); + const view = backing.subarray(32, 32 + file.length); + const rig = await rigFromNpz(view); + expect(rig.Kl).toHaveLength(9); + expect(baseline(rig)).toBeGreaterThan(0); + }); +}); + +describe('StereoOnnxMatcher.warpPoints', () => { + it('warps points to the right image matching the VIAME reference', async () => { + const rig = await rigFromNpz(readFileSync(fixture('calibration.npz'))); + const left = loadGray('left.png'); + const right = loadGray('right.png'); + const matcher = await StereoOnnxMatcher.create(fixture('stereo_match.onnx')); + + const pts: [number, number][] = [[330.43, 234.78], [361.74, 234.78]]; + const res = await matcher.warpPoints(pts, left, right, rig, { + range: { minDisparity: 8, maxDisparity: 700 }, + }); + + // Reference (VIAME C++ / Python ONNX): head -> (~309.6, ~235.5), + // tail -> (~340.5, ~235.5), both confident matches. 
+ expect(res[0].x).toBeCloseTo(309.6, 0); + expect(res[0].y).toBeCloseTo(235.5, 0); + expect(res[1].x).toBeCloseTo(340.5, 0); + expect(res[1].y).toBeCloseTo(235.5, 0); + expect(res[0].score).toBeGreaterThan(0.9); + expect(res[0].accepted).toBe(true); + expect(res[1].accepted).toBe(true); + }, 60000); +}); diff --git a/client/dive-common/use/stereo/calibration.ts b/client/dive-common/use/stereo/calibration.ts new file mode 100644 index 000000000..cfac66779 --- /dev/null +++ b/client/dive-common/use/stereo/calibration.ts @@ -0,0 +1,153 @@ +/** + * Client-side stereo calibration loading, mirroring VIAME's `read_stereo_rig` + * (`plugins/core/camera_rig_io.cxx`) and the Python `calibration_io.py`. + * + * The world frame is the left camera: the left camera is at (R = I, t = 0) and + * the right camera carries the rig's relative rotation R and translation T. + * Distortion uses the OpenCV/vital radial-tangential model + * [k1, k2, p1, p2, k3, k4, k5, k6], zero-padded to 8. + */ + +import { parseNpz, NpyArray } from './npz'; + +export interface StereoRig { + /** 3x3 left intrinsics, row-major. */ + Kl: Float32Array; + /** 8 distortion coefficients for the left camera. */ + distl: Float32Array; + Kr: Float32Array; + distr: Float32Array; + /** 3x3 rotation of the right camera (world = left camera), row-major. */ + R: Float32Array; + /** 3-vector translation of the right camera. 
*/ + T: Float32Array; +} + +/** Copy up to 8 distortion coefficients into a zero-padded length-8 array. */ +function pad8(arr: ArrayLike<number> | undefined): Float32Array { + const out = new Float32Array(8); + if (arr) { + const n = Math.min(8, arr.length); + for (let i = 0; i < n; i += 1) out[i] = arr[i]; + } + return out; +} + +/** Copy the first 9 values (a flattened 3x3 matrix); throws if fewer are given. */ +function as3x3(arr: ArrayLike<number>): Float32Array { + if (arr.length < 9) throw new Error('Expected a 3x3 matrix (9 values)'); + return Float32Array.from(Array.from({ length: 9 }, (_, i) => arr[i])); +} + +/** Copy the first 3 values (a translation vector); throws if fewer are given. */ +function as3(arr: ArrayLike<number>): Float32Array { + if (arr.length < 3) throw new Error('Expected a 3-vector (3 values)'); + return Float32Array.from([arr[0], arr[1], arr[2]]); +} + +/** Return the first array present under any of `names`, or undefined. */ +function pick(arrays: Record<string, NpyArray>, ...names: string[]): NpyArray | undefined { + return names.map((n) => arrays[n]).find((a) => a !== undefined); +} + +/** + * Build a {@link StereoRig} from a parsed `.npz` array map. + * + * @throws Error if R, T or a camera matrix is missing, or has too few values. + */ +export function rigFromNpzArrays(arrays: Record<string, NpyArray>): StereoRig { + const K1 = pick(arrays, 'cameraMatrixL', 'cameraMatrix1', 'M1'); + const K2 = pick(arrays, 'cameraMatrixR', 'cameraMatrix2', 'M2'); + const R = pick(arrays, 'R'); + const T = pick(arrays, 'T'); + if (!K1 || !K2 || !R || !T) { + throw new Error('NPZ missing required arrays (R, T, cameraMatrixL, cameraMatrixR)'); + } + const d1 = pick(arrays, 'distCoeffsL', 'distCoeffs1', 'D1'); + const d2 = pick(arrays, 'distCoeffsR', 'distCoeffs2', 'D2'); + return { + Kl: as3x3(K1.data), + distl: pad8(d1?.data), + Kr: as3x3(K2.data), + distr: pad8(d2?.data), + R: as3x3(R.data), + T: as3(T.data), + }; +} + +/** Parse a `.npz` calibration archive into a {@link StereoRig}. */ +export async function rigFromNpz(buffer: ArrayBuffer | Uint8Array): Promise<StereoRig> { + return rigFromNpzArrays(await parseNpz(buffer)); +} + +/** + * Build a {@link StereoRig} from a JSON calibration object, accepting either the + * VIAME `fx_left/cx_left/k1_left/.../R/T` schema or an explicit + * `{ cameraMatrixL, distCoeffsL, cameraMatrixR, distCoeffsR, R, T }` object. + * + * @throws Error if a camera matrix is missing, or R / T has too few values. + */ +export function rigFromJson(obj: Record<string, unknown>): StereoRig { + const num = (v: unknown): number => (typeof v === 'number' ? v : Number(v)); + const flat = (v: unknown): number[] => (Array.isArray(v) ? 
(v.flat(Infinity) as number[]).map(num) : []); + + if ('fx_left' in obj) { + const kFor = (side: string) => Float32Array.from([ + num(obj[`fx_${side}`]), 0, num(obj[`cx_${side}`]), + 0, num(obj[`fy_${side}`]), num(obj[`cy_${side}`]), + 0, 0, 1, + ]); + // Keep every coefficient in its fixed [k1, k2, p1, p2, k3] slot: an absent + // key becomes 0 rather than being dropped, which would shift later values. + const distFor = (side: string) => pad8( + ['k1', 'k2', 'p1', 'p2', 'k3'] + .map((k) => (`${k}_${side}` in obj ? num(obj[`${k}_${side}`]) : 0)), + ); + return { + Kl: kFor('left'), + distl: distFor('left'), + Kr: kFor('right'), + distr: distFor('right'), + R: as3x3(flat(obj.R)), + T: as3(flat(obj.T)), + }; + } + + const K1 = flat(obj.cameraMatrixL ?? obj.M1); + const K2 = flat(obj.cameraMatrixR ?? obj.M2); + if (!K1.length || !K2.length) { + throw new Error('JSON calibration missing camera matrices'); + } + return { + Kl: as3x3(K1), + distl: pad8(flat(obj.distCoeffsL ?? obj.D1)), + Kr: as3x3(K2), + distr: pad8(flat(obj.distCoeffsR ?? obj.D2)), + R: as3x3(flat(obj.R)), + T: as3(flat(obj.T)), + }; +} + +/** Stereo baseline (||T||) — the distance between the two camera centers. */ +export function baseline(rig: StereoRig): number { + const [x, y, z] = rig.T; + return Math.sqrt(x * x + y * y + z * z); +} + +/** + * Swap which physical camera is treated as "left" (the source). The matcher + * always warps the source camera (rig left) to the target (rig right); when the + * user annotates on the rig's right camera, invert the rig so the annotated + * camera becomes the source. + * + * New world frame = old right camera, so the new right camera (old left) has + * R' = Rᵀ and T' = -Rᵀ·T; intrinsics/distortion swap sides. 
+ */ +export function invertRig(rig: StereoRig): StereoRig { + const { R } = rig; + // R' = Rᵀ + const Rt = Float32Array.from([ + R[0], R[3], R[6], + R[1], R[4], R[7], + R[2], R[5], R[8], + ]); + const [tx, ty, tz] = rig.T; + // T' = -Rᵀ·T + const Tp = Float32Array.from([ + -(Rt[0] * tx + Rt[1] * ty + Rt[2] * tz), + -(Rt[3] * tx + Rt[4] * ty + Rt[5] * tz), + -(Rt[6] * tx + Rt[7] * ty + Rt[8] * tz), + ]); + return { + Kl: rig.Kr, + distl: rig.distr, + Kr: rig.Kl, + distr: rig.distl, + R: Rt, + T: Tp, + }; +} diff --git a/client/dive-common/use/stereo/frameSource.ts b/client/dive-common/use/stereo/frameSource.ts new file mode 100644 index 000000000..d3b871dc8 --- /dev/null +++ b/client/dive-common/use/stereo/frameSource.ts @@ -0,0 +1,56 @@ +/** + * Helpers to obtain full-resolution RGBA pixels for a camera's current frame, + * for {@link StereoOnnxMatcher}. The matcher needs the original image pixels + * (not the zoomed/panned GeoJS render), so we read the source image element. + * + * `imageElementToRgba` is the simple, correct path when you already hold the + * frame's HTMLImageElement. `geoViewerToImageElement` is a best-effort scan of a + * GeoJS viewer's quad/image features to find that element; GeoJS internals vary, + * so it may need adjustment for a given annotator setup. + */ + +import { RgbaImage } from './image'; + +/** Draw an image element to an offscreen canvas and read back RGBA pixels. 
*/ +export function imageElementToRgba( + img: HTMLImageElement | HTMLCanvasElement | ImageBitmap, +): RgbaImage { + // naturalWidth/Height exist only on image elements; canvases and bitmaps use width/height. + const width = (img as HTMLImageElement).naturalWidth + || (img as HTMLCanvasElement).width + || (img as ImageBitmap).width; + const height = (img as HTMLImageElement).naturalHeight + || (img as HTMLCanvasElement).height + || (img as ImageBitmap).height; + if (!width || !height) throw new Error('Image has no pixels (zero width or height)'); + const canvas = document.createElement('canvas'); + canvas.width = width; + canvas.height = height; + const ctx = canvas.getContext('2d'); + if (!ctx) throw new Error('Could not get 2d canvas context'); + ctx.drawImage(img as CanvasImageSource, 0, 0, width, height); + // Read `data` explicitly: ImageData exposes data/width/height as prototype + // accessors, and object spread copies own properties only, so spreading the + // ImageData can yield an object without the pixel buffer. + const { data } = ctx.getImageData(0, 0, width, height); + return { data, width, height }; +} + +/** + * Best-effort: find the source image element backing a GeoJS viewer's image + * quad. Returns null if none is found (the caller should then fall back to + * another source, e.g. fetching the frame image URL directly). + */ +// eslint-disable-next-line @typescript-eslint/no-explicit-any +export function geoViewerToImageElement(geoViewer: any): HTMLImageElement | null { + try { + // eslint-disable-next-line @typescript-eslint/no-explicit-any + const layers: any[] = geoViewer?.layers?.() ?? []; + // GeoJS quadFeature stores its quads (each with an `.image`) in data(). + const quads = layers + .flatMap((layer) => (layer.features?.() ?? [])) + // eslint-disable-next-line @typescript-eslint/no-explicit-any + .flatMap((feature: any) => (feature.data?.() ?? [])); + const quad = quads.find( + // eslint-disable-next-line @typescript-eslint/no-explicit-any + (d: any) => d?.image instanceof HTMLImageElement && d.image.naturalWidth, + ); + return quad ? 
(quad.image as HTMLImageElement) : null; + } catch { + return null; + } +} diff --git a/client/dive-common/use/stereo/image.ts b/client/dive-common/use/stereo/image.ts new file mode 100644 index 000000000..1944f6cb1 --- /dev/null +++ b/client/dive-common/use/stereo/image.ts @@ -0,0 +1,52 @@ +/** + * Image -> grayscale helpers for the stereo ONNX matcher. The NCC stage matches + * VIAME's C++, which derives grayscale from color with OpenCV's BGR2GRAY + * (BT.601 luma); we apply the same weights to RGBA pixel data. + */ + +export interface GrayImage { + /** Row-major grayscale, length width*height. */ + data: Float32Array; + width: number; + height: number; +} + +/** RGBA-ish pixel source (ImageData and Canvas getImageData both satisfy this). */ +export interface RgbaImage { + data: Uint8ClampedArray | Uint8Array; + width: number; + height: number; +} + +const R_W = 0.299; +const G_W = 0.587; +const B_W = 0.114; + +/** Convert RGBA pixel data to BT.601 grayscale (matches cv2 BGR2GRAY). */ +export function rgbaToGray(img: RgbaImage): GrayImage { + const { data, width, height } = img; + const out = new Float32Array(width * height); + for (let i = 0, p = 0; i < out.length; i += 1, p += 4) { + out[i] = R_W * data[p] + G_W * data[p + 1] + B_W * data[p + 2]; + } + return { data: out, width, height }; +} + +/** + * Draw an image source (HTMLImageElement / HTMLCanvasElement / + * ImageBitmap) to an offscreen canvas and return BT.601 grayscale. Browser / + * Electron-renderer only (needs a DOM canvas). 
+ */ +export function drawableToGray( + source: CanvasImageSource, + width: number, + height: number, +): GrayImage { + const canvas = document.createElement('canvas'); + canvas.width = width; + canvas.height = height; + const ctx = canvas.getContext('2d'); + if (!ctx) throw new Error('Could not get 2d canvas context'); + ctx.drawImage(source, 0, 0, width, height); + return rgbaToGray(ctx.getImageData(0, 0, width, height)); +} diff --git a/client/dive-common/use/stereo/index.ts b/client/dive-common/use/stereo/index.ts new file mode 100644 index 000000000..b38d28e6c --- /dev/null +++ b/client/dive-common/use/stereo/index.ts @@ -0,0 +1,10 @@ +export { StereoOnnxMatcher } from './StereoOnnxMatcher'; +export type { WarpOptions, WarpResult, SearchRange } from './StereoOnnxMatcher'; +export { + rigFromNpz, rigFromNpzArrays, rigFromJson, baseline, +} from './calibration'; +export type { StereoRig } from './calibration'; +export { parseNpz, parseNpy } from './npz'; +export type { NpyArray } from './npz'; +export { rgbaToGray, drawableToGray } from './image'; +export type { GrayImage, RgbaImage } from './image'; diff --git a/client/dive-common/use/stereo/npz.ts b/client/dive-common/use/stereo/npz.ts new file mode 100644 index 000000000..e1f20bffa --- /dev/null +++ b/client/dive-common/use/stereo/npz.ts @@ -0,0 +1,149 @@ +/** + * Minimal client-side reader for NumPy `.npz` archives (a ZIP of `.npy` arrays) + * and individual `.npy` buffers. Used to parse stereo calibration files + * (produced by `np.savez(... R, T, cameraMatrixL, ...)`) in the browser, so the + * stereo ONNX matcher needs no backend. + * + * Only what calibration needs is supported: little-endian numeric dtypes, + * C-order arrays, and ZIP entries that are either stored or DEFLATE-compressed + * (decompressed with the platform `DecompressionStream`, available in modern + * browsers, Electron, and Node 18+). + */ + +export interface NpyArray { + dtype: string; // numpy descr, e.g. 
'<f8' + shape: number[]; + data: Float64Array; +} + +// NOTE(review): the lines from `shape` to the dtypeReader signature were lost in +// extraction and are reconstructed from how parseNpy uses them — confirm upstream. +const TEXT = new TextDecoder(); + +/** + * Return a reader that decodes one little-endian element of `dtype` from a + * DataView at a byte offset, as a JS number (64-bit ints go through Number()). + * + * @throws Error for big-endian (`>`) or otherwise unsupported dtypes. + */ +function dtypeReader(dtype: string): (dv: DataView, o: number) => number { + // Every reader below is little-endian, so big-endian data would be silently + // misread if the `>` marker were just stripped: reject it instead. + if (dtype.startsWith('>')) { + throw new Error(`Unsupported npy dtype: ${dtype} (big-endian)`); + } + // Strip byte-order char; we only support little-endian (the numpy default). + const kind = dtype.replace(/^[<>=|]/, ''); + switch (kind) { + case 'f8': return (dv, o) => dv.getFloat64(o, true); + case 'f4': return (dv, o) => dv.getFloat32(o, true); + case 'i8': return (dv, o) => Number(dv.getBigInt64(o, true)); + case 'i4': return (dv, o) => dv.getInt32(o, true); + case 'i2': return (dv, o) => dv.getInt16(o, true); + case 'i1': return (dv, o) => dv.getInt8(o); + case 'u8': return (dv, o) => Number(dv.getBigUint64(o, true)); + case 'u4': return (dv, o) => dv.getUint32(o, true); + case 'u2': return (dv, o) => dv.getUint16(o, true); + case 'u1': return (dv, o) => dv.getUint8(o); + default: throw new Error(`Unsupported npy dtype: ${dtype}`); + } +} + +/** Element size in bytes, taken from the dtype's numeric suffix (byte-order char optional). */ +function dtypeSize(dtype: string): number { + return parseInt(dtype.replace(/^[<>=|]?[a-z]/i, ''), 10); +} + +/** + * Parse a single `.npy` buffer into an {@link NpyArray}. + * + * @throws Error if the magic is wrong, the header cannot be parsed, the array + * is Fortran-ordered, the dtype is unsupported, or the data is truncated. + */ +export function parseNpy(buf: Uint8Array): NpyArray { + if (buf[0] !== 0x93 || TEXT.decode(buf.subarray(1, 6)) !== 'NUMPY') { + throw new Error('Not a .npy file'); + } + const major = buf[6]; + let headerLen: number; + let headerStart: number; + const dv = new DataView(buf.buffer, buf.byteOffset, buf.byteLength); + // Format version 1 stores a 2-byte header length; later versions use 4 bytes. + if (major === 1) { + headerLen = dv.getUint16(8, true); + headerStart = 10; + } else { + headerLen = dv.getUint32(8, true); + headerStart = 12; + } + const header = TEXT.decode(buf.subarray(headerStart, headerStart + headerLen)); + const descr = (/'descr'\s*:\s*'([^']+)'/.exec(header) || [])[1]; + const shapeStr = (/'shape'\s*:\s*\(([^)]*)\)/.exec(header) || [])[1] ?? 
''; + if (!descr) throw new Error('Could not parse npy header descr'); + if (/'fortran_order'\s*:\s*True/.test(header)) { + throw new Error('Fortran-order npy arrays are not supported'); + } + const shape = shapeStr.split(',') + .map((s) => s.trim()).filter((s) => s.length) + .map((s) => parseInt(s, 10)); + + const dataStart = headerStart + headerLen; + const read = dtypeReader(descr); + const size = dtypeSize(descr); + const count = shape.reduce((a, b) => a * b, 1); + // Bound the view to this array's payload: `buf` may be a window into a larger + // buffer (a stored ZIP entry, a pooled Buffer), so an unbounded view would + // read neighbouring bytes instead of failing on a short file. + if (count * size > buf.byteLength - dataStart) { + throw new Error('Truncated .npy data'); + } + const data = new Float64Array(count); + const ddv = new DataView(buf.buffer, buf.byteOffset + dataStart, count * size); + for (let i = 0; i < count; i += 1) { + data[i] = read(ddv, i * size); + } + return { dtype: descr, shape, data }; +} + +/** Inflate a raw DEFLATE stream (a ZIP method-8 entry) into its bytes. */ +async function inflateRaw(bytes: Uint8Array): Promise<Uint8Array> { + // deflate-raw decompression, available in browsers / Electron / Node 18+. + const ds = new DecompressionStream('deflate-raw'); + const stream = new Blob([bytes]).stream().pipeThrough(ds); + const ab = await new Response(stream).arrayBuffer(); + return new Uint8Array(ab); +} + +/** + * Parse a `.npz` archive into a map of array name (without the `.npy` + * extension) to {@link NpyArray}. Reads the ZIP central directory so it works + * regardless of how the archive was written. + * + * @throws Error if the archive has no end-of-central-directory record, a + * corrupt central directory, a ZIP64-sized entry, or an entry compressed with + * anything other than stored / DEFLATE. + */ +export async function parseNpz( + buffer: ArrayBuffer | Uint8Array, +): Promise<Record<string, NpyArray>> { + // Accept a Uint8Array/Buffer too, honoring its byteOffset/length: Node's + // readFileSync returns a Buffer backed by a shared pool, so `.buffer` is + // larger than the file. Build the view from the exact bytes. + const u8 = buffer instanceof Uint8Array ? buffer : new Uint8Array(buffer); + const dv = new DataView(u8.buffer, u8.byteOffset, u8.byteLength); + + // Locate the End Of Central Directory record (scan back over its 22-byte + // fixed part plus any trailing comment). 
+ let eocd = -1; + for (let i = u8.length - 22; i >= 0; i -= 1) { + if (dv.getUint32(i, true) === 0x06054b50) { eocd = i; break; } + } + if (eocd < 0) throw new Error('Not a valid .npz (no ZIP end-of-central-directory)'); + + const entryCount = dv.getUint16(eocd + 10, true); + let p = dv.getUint32(eocd + 16, true); // central directory offset + const out: Record<string, NpyArray> = {}; + + for (let e = 0; e < entryCount; e += 1) { + if (dv.getUint32(p, true) !== 0x02014b50) { + throw new Error('Corrupt ZIP central directory'); + } + const method = dv.getUint16(p + 10, true); + const compSize = dv.getUint32(p + 20, true); + const nameLen = dv.getUint16(p + 28, true); + const extraLen = dv.getUint16(p + 30, true); + const commentLen = dv.getUint16(p + 32, true); + const localOff = dv.getUint32(p + 42, true); + const name = TEXT.decode(u8.subarray(p + 46, p + 46 + nameLen)); + // 0xffffffff means the real value lives in a ZIP64 extra field, which is not read here. + if (compSize === 0xffffffff || localOff === 0xffffffff) { + throw new Error(`ZIP64 entries are not supported (${name})`); + } + + // Local header: 30 fixed bytes + name + extra, then the file data. + const lNameLen = dv.getUint16(localOff + 26, true); + const lExtraLen = dv.getUint16(localOff + 28, true); + const dataOff = localOff + 30 + lNameLen + lExtraLen; + const raw = u8.subarray(dataOff, dataOff + compSize); + + let npyBytes: Uint8Array; + if (method === 0) { + npyBytes = raw; + } else if (method === 8) { + // eslint-disable-next-line no-await-in-loop + npyBytes = await inflateRaw(raw); + } else { + throw new Error(`Unsupported ZIP compression method ${method} for ${name}`); + } + + const key = name.replace(/\.npy$/, ''); + out[key] = parseNpy(npyBytes); + p += 46 + nameLen + extraLen + commentLen; + } + return out; +} diff --git a/client/dive-common/use/stereo/useStereoOnnxTransfer.ts b/client/dive-common/use/stereo/useStereoOnnxTransfer.ts new file mode 100644 index 000000000..bec72bbbb --- /dev/null +++ b/client/dive-common/use/stereo/useStereoOnnxTransfer.ts @@ -0,0 +1,170 @@ +/** + * Client-side stereo transfer: when a detection is annotated on one camera, + * warp it onto the other camera using the VIAME "match" ONNX model + 
* ({@link StereoOnnxMatcher}) — no backend, so it works in both the web and + * desktop DIVE builds. + * + * This mirrors the desktop backend stereo handler (ViewerLoader's + * `handleStereoAnnotationComplete`) but runs the correspondence search in the + * browser. It transfers bounding boxes (via their corners) and head/tail lines + * (keypoints); polygons and segmentation seeds are intentionally out of scope. + * + * Pixel access and calibration/model loading are injected so this stays + * platform-agnostic and unit-testable; the web ViewerLoader supplies the + * concrete providers. + */ + +import CameraStore from 'vue-media-annotator/CameraStore'; +import Track from 'vue-media-annotator/track'; +import { RectBounds } from 'vue-media-annotator/utils'; +import { HeadPointKey, TailPointKey } from 'dive-common/recipes/headtail'; +import type { StereoAnnotationCompleteParams } from '../useModeManager'; +import { StereoOnnxMatcher, SearchRange } from './StereoOnnxMatcher'; +import { StereoRig, invertRig } from './calibration'; +import { rgbaToGray, RgbaImage } from './image'; + +/** Providers and settings injected into the stereo transfer composable. */ +export interface StereoOnnxTransferConfig { + cameraStore: CameraStore; + /** Names of all cameras (transfer only runs when there are at least two). */ + getMultiCamList: () => string[]; + /** Which DIVE camera corresponds to the rig's left (calibration) camera. */ + getLeftCameraName: () => string; + /** Stereo calibration, or null if unavailable (transfer is then skipped). */ + getRig: () => Promise<StereoRig | null>; + /** The (lazily created / cached) ONNX matcher, or null if unavailable. */ + getMatcher: () => Promise<StereoOnnxMatcher | null>; + /** Full-resolution RGBA pixels for a camera's current frame, or null. */ + getFrame: (cameraName: string) => Promise<RgbaImage | null>; + /** Disparity- or depth-based search range for the correspondence search. */ + getRange: () => SearchRange; + /** Forwarded to the matcher as its `threshold` warp option (matcher default when omitted). */ + threshold?: number; + /** Forwarded to the matcher as its `uniquenessRatio` warp option (matcher default when omitted). */ + uniquenessRatio?: number; + /** Called after a feature is written so the host can persist the change. 
*/ + onChange?: (cameraName: string, track: Track) => void; +} + +const BOX_PAD = 0.10; + +export default function useStereoOnnxTransfer(config: StereoOnnxTransferConfig) { + const { + cameraStore, getMultiCamList, getLeftCameraName, + getRig, getMatcher, getFrame, getRange, + } = config; + + function getOrCreateTrack(trackId: number, sourceCamera: string, targetCamera: string, frameNum: number): Track | undefined { + let track = cameraStore.getPossibleTrack(trackId, targetCamera); + if (!track) { + const targetStore = cameraStore.camMap.value.get(targetCamera)?.trackStore; + const sourceTrack = cameraStore.getPossibleTrack(trackId, sourceCamera); + const trackType = sourceTrack?.confidencePairs?.[0]?.[0] || 'unknown'; + track = targetStore?.add(frameNum, trackType, undefined, trackId); + } + return track; + } + + function boundsFromPoints(pts: [number, number][], pad = 0): RectBounds { + const xs = pts.map((p) => p[0]); + const ys = pts.map((p) => p[1]); + const minX = Math.min(...xs); + const minY = Math.min(...ys); + const maxX = Math.max(...xs); + const maxY = Math.max(...ys); + const padX = (maxX - minX) * pad || (maxY - minY) * pad; + const padY = (maxY - minY) * pad || (maxX - minX) * pad; + return [minX - padX, minY - padY, maxX + padX, maxY + padY]; + } + + /** + * Warp the just-completed annotation onto the other camera. No-op unless + * there are two cameras, calibration + model are available, and the target + * camera does not already have this track's feature at this frame (so the + * initial warp happens once and manual corrections are never overwritten). 
+ */ + async function handleStereoAnnotationComplete( + params: StereoAnnotationCompleteParams, + ): Promise { + if (params.type !== 'box' && params.type !== 'line') return false; + + const cams = getMultiCamList(); + if (cams.length < 2) return false; + const otherCamera = cams.find((c) => c !== params.camera); + if (!otherCamera) return false; + + const existing = cameraStore.getPossibleTrack(params.trackId, otherCamera); + if (existing && existing.getFeature(params.frameNum)[0] !== null) return false; + + const [rig0, matcher] = await Promise.all([getRig(), getMatcher()]); + if (!rig0 || !matcher) return false; + // Warp from the annotated camera to the other; orient the rig so the + // annotated camera is the source ("left"). + const rig = params.camera === getLeftCameraName() ? rig0 : invertRig(rig0); + + const [srcFrame, tgtFrame] = await Promise.all([ + getFrame(params.camera), getFrame(otherCamera), + ]); + if (!srcFrame || !tgtFrame) return false; + const srcGray = rgbaToGray(srcFrame); + const tgtGray = rgbaToGray(tgtFrame); + + const warpOpts = { + range: getRange(), + threshold: config.threshold, + uniquenessRatio: config.uniquenessRatio, + }; + + if (params.type === 'box') { + const [x1, y1, x2, y2] = params.bounds; + const corners: [number, number][] = [[x1, y1], [x2, y1], [x2, y2], [x1, y2]]; + const res = await matcher.warpPoints(corners, srcGray, tgtGray, rig, warpOpts); + const ok = res.filter((r) => r.accepted).map((r) => [r.x, r.y] as [number, number]); + if (ok.length < 2) return false; // need enough confident corners for a box + const track = getOrCreateTrack(params.trackId, params.camera, otherCamera, params.frameNum); + if (!track) return false; + track.setFeature({ + frame: params.frameNum, + flick: 0, + keyframe: true, + interpolate: false, + bounds: boundsFromPoints(ok), + }); + config.onChange?.(otherCamera, track); + return true; + } + + // params.type === 'line' (head/tail keypoints) + const res = await matcher.warpPoints(params.line, 
srcGray, tgtGray, rig, warpOpts); + if (!res[0].accepted || !res[1].accepted) return false; + const p1: [number, number] = [res[0].x, res[0].y]; + const p2: [number, number] = [res[1].x, res[1].y]; + const geometry: GeoJSON.Feature[] = [ + { + type: 'Feature', + geometry: { type: 'LineString', coordinates: [p1, p2] }, + properties: { key: params.key }, + }, + { + type: 'Feature', + geometry: { type: 'Point', coordinates: p1 }, + properties: { key: HeadPointKey }, + }, + { + type: 'Feature', + geometry: { type: 'Point', coordinates: p2 }, + properties: { key: TailPointKey }, + }, + ]; + const track = getOrCreateTrack(params.trackId, params.camera, otherCamera, params.frameNum); + if (!track) return false; + track.setFeature({ + frame: params.frameNum, + flick: 0, + keyframe: true, + interpolate: false, + bounds: boundsFromPoints([p1, p2], BOX_PAD), + }, geometry as GeoJSON.Feature[] as never); + config.onChange?.(otherCamera, track); + return true; + } + + return { handleStereoAnnotationComplete }; +} diff --git a/client/package-lock.json b/client/package-lock.json index 17102f87f..1f3231c9d 100644 --- a/client/package-lock.json +++ b/client/package-lock.json @@ -23,6 +23,7 @@ "lodash": "^4.17.19", "moment": "^2.29.1", "mousetrap": "^1.6.5", + "onnxruntime-web": "^1.27.0", "pngjs": "^7.0.0", "semver": "^7.3.5", "vue": "^2.7.16", @@ -3219,9 +3220,6 @@ "arm" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -3243,9 +3241,6 @@ "arm" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -3267,9 +3262,6 @@ "arm64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -3291,9 +3283,6 @@ "arm64" ], "dev": true, - "libc": [ - "musl" - ], "license": "MIT", "optional": true, "os": [ @@ -3315,9 +3304,6 @@ "x64" ], "dev": true, - "libc": [ - "glibc" - ], "license": "MIT", "optional": true, "os": [ @@ -3339,9 +3325,6 @@ "x64" ], "dev": true, - "libc": [ - "musl" - ], 
"license": "MIT", "optional": true, "os": [ @@ -3457,6 +3440,63 @@ "node": ">=14" } }, + "node_modules/@protobufjs/aspromise": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/aspromise/-/aspromise-1.1.2.tgz", + "integrity": "sha512-j+gKExEuLmKwvz3OgROXtrJ2UG2x8Ch2YZUxahh+s1F2HZ+wAceUNLkvy6zKCPVRkU++ZWQrdxsUeQXmcg4uoQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/base64": { + "version": "1.1.2", + "resolved": "https://registry.npmjs.org/@protobufjs/base64/-/base64-1.1.2.tgz", + "integrity": "sha512-AZkcAA5vnN/v4PDqKyMR5lx7hZttPDgClv83E//FMNhR2TMcLUhfRUBHCmSl0oi9zMgDDqRUJkSxO3wm85+XLg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/codegen": { + "version": "2.0.5", + "resolved": "https://registry.npmjs.org/@protobufjs/codegen/-/codegen-2.0.5.tgz", + "integrity": "sha512-zgXFLzW3Ap33e6d0Wlj4MGIm6Ce8O89n/apUaGNB/jx+hw+ruWEp7EwGUshdLKVRCxZW12fp9r40E1mQrf/34g==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/eventemitter": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@protobufjs/eventemitter/-/eventemitter-1.1.1.tgz", + "integrity": "sha512-vW1GmwMZNnL+gMRaovlh9yZX74kc+TTU3FObkkurpMaRtBfLP3ldjS9KQWlwZgraRE0+dheEEoAxdzcJQ8eXZg==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/fetch": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@protobufjs/fetch/-/fetch-1.1.1.tgz", + "integrity": "sha512-GpptLrs57adMSuHi3VNj0mAF8dwh36LMaYF6XyJ6JMWlVsc+t42tm1HSEDmOs3A8fC9yyeisgLhsTVQokOZ0zw==", + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.1" + } + }, + "node_modules/@protobufjs/float": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/@protobufjs/float/-/float-1.0.2.tgz", + "integrity": "sha512-Ddb+kVXlXst9d+R9PfTIxh1EdNkgoRe5tOX6t01f1lYWOvJnSPDBlG241QLzcyPdoNTsblLUdujGSE4RzrTZGQ==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/path": { + "version": "1.1.2", + "resolved": 
"https://registry.npmjs.org/@protobufjs/path/-/path-1.1.2.tgz", + "integrity": "sha512-6JOcJ5Tm08dOHAbdR3GrvP+yUUfkjG5ePsHYczMFLq3ZmMkAD98cDgcT2iA1lJ9NVwFd4tH/iSSoe44YWkltEA==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/pool": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/@protobufjs/pool/-/pool-1.1.0.tgz", + "integrity": "sha512-0kELaGSIDBKvcgS4zkjz1PeddatrjYcmMWOlAuAPwAeccUrPHdUqo/J6LiymHHEiJT5NrF1UVwxY14f+fy4WQw==", + "license": "BSD-3-Clause" + }, + "node_modules/@protobufjs/utf8": { + "version": "1.1.1", + "resolved": "https://registry.npmjs.org/@protobufjs/utf8/-/utf8-1.1.1.tgz", + "integrity": "sha512-oOAWABowe8EAbMyWKM0tYDKi8Yaox52D+HWZhAIJqQXbqe0xI/GV7FhLWqlEKreMkfDjshR5FKgi3mnle0h6Eg==", + "license": "BSD-3-Clause" + }, "node_modules/@rollup/rollup-android-arm-eabi": { "version": "4.60.2", "resolved": "https://registry.npmjs.org/@rollup/rollup-android-arm-eabi/-/rollup-android-arm-eabi-4.60.2.tgz", @@ -4519,7 +4559,6 @@ "version": "25.6.0", "resolved": "https://registry.npmjs.org/@types/node/-/node-25.6.0.tgz", "integrity": "sha512-+qIYRKdNYJwY3vRCZMdJbPLJAtGjQBudzZzdzwQYkEPQd+PJGixUL5QfvCLDaULoLv+RhT3LDkwEfKaAkgSmNQ==", - "dev": true, "license": "MIT", "dependencies": { "undici-types": "~7.19.0" @@ -9505,6 +9544,12 @@ "node": "^10.12.0 || >=12.0.0" } }, + "node_modules/flatbuffers": { + "version": "25.9.23", + "resolved": "https://registry.npmjs.org/flatbuffers/-/flatbuffers-25.9.23.tgz", + "integrity": "sha512-MI1qs7Lo4Syw0EOzUl0xjs2lsoeqFku44KpngfIduHBYvzm8h2+7K8YMQh1JtVVVrUvhLpNwqVi4DERegUJhPQ==", + "license": "Apache-2.0" + }, "node_modules/flatted": { "version": "3.4.2", "resolved": "https://registry.npmjs.org/flatted/-/flatted-3.4.2.tgz", @@ -10031,6 +10076,12 @@ "dev": true, "license": "MIT" }, + "node_modules/guid-typescript": { + "version": "1.0.9", + "resolved": "https://registry.npmjs.org/guid-typescript/-/guid-typescript-1.0.9.tgz", + "integrity": 
"sha512-Y8T4vYhEfwJOTbouREvG+3XDsjr8E3kIr7uf+JZ0BYloFsttiHU0WfvANVsR7TxNUJa/WpCnw/Ino/p+DeBhBQ==", + "license": "ISC" + }, "node_modules/hammerjs": { "version": "2.0.8", "resolved": "https://registry.npmjs.org/hammerjs/-/hammerjs-2.0.8.tgz", @@ -11466,6 +11517,12 @@ "dev": true, "license": "MIT" }, + "node_modules/long": { + "version": "5.3.2", + "resolved": "https://registry.npmjs.org/long/-/long-5.3.2.tgz", + "integrity": "sha512-mNAgZ1GmyNhD7AuqnTG3/VQ26o760+ZYBPKjPvugO8+nLbYfX6TVpJPseBvopbdY+qpZ/lKUnmEc1LeZYS3QAA==", + "license": "Apache-2.0" + }, "node_modules/loose-envify": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/loose-envify/-/loose-envify-1.4.0.tgz", @@ -12197,6 +12254,26 @@ "wrappy": "1" } }, + "node_modules/onnxruntime-common": { + "version": "1.27.0", + "resolved": "https://registry.npmjs.org/onnxruntime-common/-/onnxruntime-common-1.27.0.tgz", + "integrity": "sha512-3KxL5wIVqa8Ex08jxSzncm9CMgw8CjOFyOQ7SxvG9o0cVLlhTNKXyIQuTbtX4tGPJEf73OER2xrjt4HJSBL4ow==", + "license": "MIT" + }, + "node_modules/onnxruntime-web": { + "version": "1.27.0", + "resolved": "https://registry.npmjs.org/onnxruntime-web/-/onnxruntime-web-1.27.0.tgz", + "integrity": "sha512-ogDLsqIozHZwifPuN37OproAo0byX6t43/bP8GzeZWBWD6MOGExswFAx3up4NS/vvWBOg2u2PXomDt3rMmdQSg==", + "license": "MIT", + "dependencies": { + "flatbuffers": "^25.1.24", + "guid-typescript": "^1.0.9", + "long": "^5.2.3", + "onnxruntime-common": "1.27.0", + "platform": "^1.3.6", + "protobufjs": "^7.2.4" + } + }, "node_modules/optionator": { "version": "0.9.4", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.4.tgz", @@ -12413,6 +12490,12 @@ "url": "https://github.com/sponsors/jonschlinkert" } }, + "node_modules/platform": { + "version": "1.3.6", + "resolved": "https://registry.npmjs.org/platform/-/platform-1.3.6.tgz", + "integrity": "sha512-fnWVljUchTro6RiCFvCXBbNhJc2NijN7oIQxbwsyL0buWJPG85v81ehlHI9fXrJsMNgTofEoWIQeClKpgxFLrg==", + "license": "MIT" + }, "node_modules/plimit-lit": { 
"version": "1.6.1", "resolved": "https://registry.npmjs.org/plimit-lit/-/plimit-lit-1.6.1.tgz", @@ -12632,6 +12715,29 @@ "dev": true, "license": "ISC" }, + "node_modules/protobufjs": { + "version": "7.6.4", + "resolved": "https://registry.npmjs.org/protobufjs/-/protobufjs-7.6.4.tgz", + "integrity": "sha512-RJJPTTpvFfHcWLkIa2JFWK4XvtSzS0yEWDmunqHXli1h3JlkbcQZXDZdcWxv+JK3Xsl5/UFDPZ0iGm7DAengYw==", + "hasInstallScript": true, + "license": "BSD-3-Clause", + "dependencies": { + "@protobufjs/aspromise": "^1.1.2", + "@protobufjs/base64": "^1.1.2", + "@protobufjs/codegen": "^2.0.5", + "@protobufjs/eventemitter": "^1.1.1", + "@protobufjs/fetch": "^1.1.1", + "@protobufjs/float": "^1.0.2", + "@protobufjs/path": "^1.1.2", + "@protobufjs/pool": "^1.1.0", + "@protobufjs/utf8": "^1.1.1", + "@types/node": ">=13.7.0", + "long": "^5.3.2" + }, + "engines": { + "node": ">=12.0.0" + } + }, "node_modules/proxy-addr": { "version": "2.0.7", "resolved": "https://registry.npmjs.org/proxy-addr/-/proxy-addr-2.0.7.tgz", @@ -14496,7 +14602,6 @@ "version": "7.19.2", "resolved": "https://registry.npmjs.org/undici-types/-/undici-types-7.19.2.tgz", "integrity": "sha512-qYVnV5OEm2AW8cJMCpdV20CDyaN3g0AjDlOGf1OW4iaDEx8MwdtChUp4zu4H0VP3nDRF/8RKWH+IPp9uW0YGZg==", - "dev": true, "license": "MIT" }, "node_modules/unicode-canonical-property-names-ecmascript": { diff --git a/client/package.json b/client/package.json index 7225ac7ff..b7fd01cb8 100644 --- a/client/package.json +++ b/client/package.json @@ -44,6 +44,7 @@ "lodash": "^4.17.19", "moment": "^2.29.1", "mousetrap": "^1.6.5", + "onnxruntime-web": "^1.27.0", "pngjs": "^7.0.0", "semver": "^7.3.5", "vue": "^2.7.16", diff --git a/client/platform/web-girder/useStereoOnnxWeb.ts b/client/platform/web-girder/useStereoOnnxWeb.ts new file mode 100644 index 000000000..6d16d5fa7 --- /dev/null +++ b/client/platform/web-girder/useStereoOnnxWeb.ts @@ -0,0 +1,121 @@ +/** + * Web wiring for client-side stereo transfer (warp a detection from one camera + * to the 
/**
 * Web wiring for client-side stereo transfer (warp a detection from one camera
 * to the other) using the VIAME "match" ONNX model. Assembles the platform
 * providers that {@link useStereoOnnxTransfer} needs:
 *   - calibration from the session's loaded stereo calibration file,
 *   - the ONNX matcher (lazily created from a served model asset),
 *   - per-camera frame pixels read from the GeoJS viewers.
 *
 * The exported model must be served as a static asset (default
 * `/models/stereo_match.onnx`; produce it with
 * `plugins/onnx/export_stereo_mapping.py --model match`). If no calibration or
 * model is available the transfer simply no-ops.
 *
 * NOTE: exercised by unit tests at the core layer (the matcher / calibration /
 * image modules); this DOM- and Girder-coupled glue needs live testing in a
 * running web viewer with a real stereo dataset.
 */

import useStereoOnnxTransfer from 'dive-common/use/stereo/useStereoOnnxTransfer';
import { StereoOnnxMatcher } from 'dive-common/use/stereo/StereoOnnxMatcher';
import type { SearchRange } from 'dive-common/use/stereo/StereoOnnxMatcher';
import {
  rigFromNpz, rigFromJson, StereoRig,
} from 'dive-common/use/stereo/calibration';
import { geoViewerToImageElement, imageElementToRgba } from 'dive-common/use/stereo/frameSource';
import type { RgbaImage } from 'dive-common/use/stereo/image';
import { getCalibrationFile, getLastCalibration } from './multicamFileRegistry';

const DEFAULT_MODEL_URL = '/models/stereo_match.onnx';
// Scene-dependent; should be surfaced as a user setting. These are permissive
// defaults that cover a wide disparity range.
const DEFAULT_RANGE: SearchRange = { minDisparity: 2, maxDisparity: 512 };

export interface StereoOnnxWebOptions {
  /** Returns the mounted Viewer instance (exposes cameraStore, multiCamList,
   * aggregateController). */
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  getViewer: () => any;
  modelUrl?: string;
  range?: SearchRange;
}

type StereoTransfer = ReturnType<typeof useStereoOnnxTransfer>;
type StereoTransferParams = Parameters<StereoTransfer['handleStereoAnnotationComplete']>[0];

/**
 * Return `maybeRef.value` when given a ref-like object, otherwise the value
 * itself.
 *
 * NOTE(review): properties read from a component instance may already be
 * unwrapped (plain array / object) rather than refs — confirm in live testing.
 * Accepting both shapes keeps the providers working either way.
 */
// eslint-disable-next-line @typescript-eslint/no-explicit-any
function unwrapRef(maybeRef: any): any {
  if (maybeRef !== null && typeof maybeRef === 'object' && 'value' in maybeRef) {
    return maybeRef.value;
  }
  return maybeRef;
}

/**
 * Create the web stereo-transfer handler.
 *
 * @param opts viewer accessor plus optional model URL and search range
 * @returns an object exposing `handleStereoAnnotationComplete`, suitable for
 *   binding to the Viewer's `stereo-annotation-complete` event
 */
export default function useStereoOnnxWeb(opts: StereoOnnxWebOptions) {
  const modelUrl = opts.modelUrl ?? DEFAULT_MODEL_URL;
  let matcherPromise: Promise<StereoOnnxMatcher | null> | null = null;
  let rig: StereoRig | null = null;
  let rigKey: string | null = null;

  /**
   * Load the ONNX matcher once. The in-flight promise is cached so concurrent
   * callers share the same load instead of seeing `null` while it is pending.
   * A failed load is logged and cached as `null` (no retry).
   */
  function getMatcher(): Promise<StereoOnnxMatcher | null> {
    if (!matcherPromise) {
      matcherPromise = (async () => {
        try {
          return await StereoOnnxMatcher.create(modelUrl);
        } catch (err) {
          // eslint-disable-next-line no-console
          console.warn('[StereoOnnx] failed to load model', modelUrl, err);
          return null;
        }
      })();
    }
    return matcherPromise;
  }

  /**
   * Parse the session's last calibration file into a rig, cached by file name.
   * Returns null when no calibration is registered or parsing fails; a failed
   * parse is logged and retried on the next call.
   */
  async function getRig(): Promise<StereoRig | null> {
    const name = await getLastCalibration();
    if (!name) return null;
    if (rig && rigKey === name) return rig;
    const file = getCalibrationFile(name);
    if (!file) return null;
    try {
      if (name.toLowerCase().endsWith('.json')) {
        rig = rigFromJson(JSON.parse(await file.text()));
      } else {
        rig = await rigFromNpz(await file.arrayBuffer());
      }
      rigKey = name;
    } catch (err) {
      // eslint-disable-next-line no-console
      console.warn('[StereoOnnx] failed to parse calibration', name, err);
      rig = null;
      rigKey = null;
    }
    return rig;
  }

  /** Camera names from the currently mounted viewer, or [] when unavailable. */
  function currentCameras(): string[] {
    const list = unwrapRef(opts.getViewer()?.multiCamList);
    return Array.isArray(list) ? list : [];
  }

  /**
   * Read the current frame's RGBA pixels for `cameraName` from its GeoJS
   * viewer. Best-effort: any failure is logged and reported as null so the
   * transfer no-ops instead of throwing.
   */
  async function getFrame(cameraName: string): Promise<RgbaImage | null> {
    try {
      const viewer = opts.getViewer();
      const controller = unwrapRef(viewer?.aggregateController)?.getController(cameraName);
      const geoViewer = unwrapRef(controller?.geoViewerRef);
      const img = geoViewer ? geoViewerToImageElement(geoViewer) : null;
      // Await inside the try so an asynchronous failure is also handled here.
      return img ? await imageElementToRgba(img) : null;
    } catch (err) {
      // eslint-disable-next-line no-console
      console.warn('[StereoOnnx] failed to read frame pixels for', cameraName, err);
      return null;
    }
  }

  // The Viewer mounts after this composable runs, so build the transfer lazily
  // on the first event, once cameraStore is available. It is rebuilt if the
  // viewer later exposes a different cameraStore (e.g. after a remount).
  let transfer: StereoTransfer | null = null;
  let transferStore: unknown = null;

  /**
   * Event handler for the Viewer's `stereo-annotation-complete` event.
   *
   * @returns true when a feature was written to the other camera, else false
   */
  async function handleStereoAnnotationComplete(
    params: StereoTransferParams,
  ): Promise<boolean> {
    const store = opts.getViewer()?.cameraStore;
    if (!store) return false;
    if (!transfer || transferStore !== store) {
      transfer = useStereoOnnxTransfer({
        cameraStore: store,
        // Resolve the viewer on every call rather than capturing one instance.
        getMultiCamList: currentCameras,
        // The first camera is treated as the rig's left (calibration) camera.
        getLeftCameraName: () => currentCameras()[0] ?? '',
        getRig,
        getMatcher,
        getFrame,
        getRange: () => opts.range ?? DEFAULT_RANGE,
      });
      transferStore = store;
    }
    return transfer.handleStereoAnnotationComplete(params);
  }

  return { handleStereoAnnotationComplete };
}
+ const { handleStereoAnnotationComplete } = useStereoOnnxWeb({ + getViewer: () => viewerRef.value, + }); const { brandData } = useBrand(); const { pipelinesEnabled } = useConfig(); const { meta: datasetMeta, loadDataset } = useDataset(); @@ -255,6 +262,7 @@ export default defineComponent({ timeFilter, pipelinesEnabled, webExcludedPipelineTerms, + handleStereoAnnotationComplete, }; }, }); @@ -271,6 +279,7 @@ export default defineComponent({ :comparison-sets="comparisonSets" @large-image-warning="largeImageWarning()" @update:set="routeSet" + @stereo-annotation-complete="handleStereoAnnotationComplete" >