Kitware · mattdawkins · Jun 22, 2026
diff --git a/client/dive-common/use/stereo/README.md b/client/dive-common/use/stereo/README.md
@@ -0,0 +1,65 @@
+# Client-side stereo transfer (ONNX)
+
+Warp a detection annotated on one camera onto the other camera, entirely in the
+browser / Electron renderer — no backend — using VIAME's epipolar
+template-matching model (stereo measurement "method 1") exported to ONNX and run
+with `onnxruntime-web`.
+
+This is the client counterpart to the desktop backend stereo service: the
+desktop `ViewerLoader` warps via native IPC (`stereoTransferLine` /
+`stereoTransferPoints`); this module does the equivalent correspondence search
+client-side so it also works on the web.
+
+## Modules
+
+| File | Role |
+| --- | --- |
+| `StereoOnnxMatcher.ts` | Loads the `match` ONNX model and warps source points → target points via NCC along the epipolar curve. |
+| `calibration.ts` | `StereoRig` + loaders (`rigFromNpz`, `rigFromJson`) mirroring VIAME's `read_stereo_rig`; `invertRig` to swap the source/target camera. |
+| `npz.ts` | Minimal `.npz`/`.npy` reader (calibration files are NumPy archives). |
+| `image.ts` | RGBA → BT.601 grayscale (matches OpenCV `BGR2GRAY` used by the C++ NCC). |
+| `frameSource.ts` | Pull full-resolution frame pixels from a GeoJS viewer / image element. |
+| `useStereoOnnxTransfer.ts` | Platform-agnostic composable: on annotation complete, warp a box (corners) or head/tail line (keypoints) to the other camera and write the feature. |
+
+The web glue lives in `platform/web-girder/useStereoOnnxWeb.ts` and is bound to
+the `Viewer`'s `stereo-annotation-complete` event in the web `ViewerLoader.vue`.
+
+## How it works
+
+Per warp: generate epipolar candidates from the calibration, then NCC
+template-match the source patch along that curve in the target frame (this is
+exactly the VIAME C++ `epipolar_template_matching` method, as a single ONNX
+graph). The matcher returns the matched point + scores; the composable rebuilds
+the box / head-tail feature on the other camera.
+
+The world frame is the left (calibration) camera. When the user annotates on the
+rig's right camera, the rig is inverted (`invertRig`) so the annotated camera is
+the source.
+
+## Setup (web)
+
+1. Export the model (small; method 1 has no learned weights):
+   ```bash
+   python plugins/onnx/export_stereo_mapping.py --model match \
+       --out stereo_match.onnx --num-samples 1500
+   ```
+   A smaller `--num-samples` value ⇒ faster client inference, at the cost of
+   slightly coarser depth sampling.
+2. Serve it as a static asset at `client/public/models/stereo_match.onnx`
+   (override the URL via `useStereoOnnxWeb({ modelUrl })`).
+3. Load a stereo calibration file (`.npz`/`.json`) in the session, as usual for
+   multi-camera datasets. Transfer no-ops if calibration, the model, or a second
+   camera is missing.
+4. The disparity search range defaults to `{ minDisparity: 2, maxDisparity: 512 }`;
+   tune per rig via the `range` option (should become a user setting).
+
+## Testing status
+
+- **Tested** (`__tests__/stereoOnnx.spec.ts`, runs under `npm test`): the core —
+  `.npz` calibration parsing, grayscale conversion, and `StereoOnnxMatcher`
+  warping points, validated against the VIAME C++/Python reference (matches to
+  ~0.25 px) using `onnxruntime-web` in Node.
+- **Needs live testing**: the web glue (`useStereoOnnxWeb`, ViewerLoader
+  binding, and the GeoJS frame-pixel read in `frameSource.geoViewerToImageElement`)
+  is type-checked and lint-clean but has not been exercised in a running web
+  viewer with a real stereo dataset.
diff --git a/client/dive-common/use/stereo/StereoOnnxMatcher.ts b/client/dive-common/use/stereo/StereoOnnxMatcher.ts
@@ -0,0 +1,136 @@
+/**
+ * Client-side wrapper around the exported VIAME stereo "match" ONNX model
+ * (method 1: epipolar candidate generation + NCC template matching). Runs fully
+ * in the browser / Electron renderer via onnxruntime-web — no backend — so a
+ * detection annotated on one camera can be warped onto the other.
+ *
+ * The model and its conventions are produced by
+ * `plugins/onnx/export_stereo_mapping.py --model match`; see that plugin's
+ * README. This wrapper only feeds inputs and reads the matched points.
+ */
+
+import * as ort from 'onnxruntime-web';
+
+import { GrayImage } from './image';
+import { StereoRig, baseline } from './calibration';
+
+/** Search range: a disparity interval (unit-independent) or a depth interval in calibration units. */
+export type SearchRange =
+  | { minDisparity: number; maxDisparity: number }
+  | { minDepth: number; maxDepth: number };
+
+export interface WarpOptions {
+  range: SearchRange; // disparity or depth interval; resolved to min/max depth for the model
+  /** Minimum best NCC score to accept a match, inclusive (model default region). Default 0.2. */
+  threshold?: number;
+  /** Reject if secondScore/score exceeds this (0 or less disables the test). Default 0.85. */
+  uniquenessRatio?: number;
+}
+
+export interface WarpResult {
+  /** Matched point (x, y) in the right (target) image, as output by the model. */
+  x: number;
+  y: number;
+  /** Best NCC score (TM_CCOEFF_NORMED). */
+  score: number;
+  /** Best NCC score outside a template-size neighborhood (uniqueness check). */
+  secondScore: number;
+  /** True when score >= threshold and, where applied, secondScore/score <= uniquenessRatio. */
+  accepted: boolean;
+}
+
+const IDENTITY_3X3 = new Float32Array([1, 0, 0, 0, 1, 0, 0, 0, 1]); // fed to the model as R_left
+const ZERO_3 = new Float32Array(3); // zero-filled; fed to the model as t_left
+
+function scalar(v: number): ort.Tensor { // wraps `v` in a float32 tensor with empty dims
+  return new ort.Tensor('float32', Float32Array.of(v), []);
+}
+
+function resolveDepthRange(rig: StereoRig, range: SearchRange): [number, number] {
+  // A depth range is used as given; only a disparity range needs converting.
+  if (!('minDisparity' in range)) {
+    return [range.minDepth, range.maxDepth];
+  }
+  // depth = fx * baseline / disparity: max disparity <-> near, min disparity <-> far.
+  const fxTimesBaseline = rig.Kl[0] * baseline(rig);
+  return [fxTimesBaseline / range.maxDisparity, fxTimesBaseline / range.minDisparity];
+}
+
+export class StereoOnnxMatcher {
+  private readonly session: ort.InferenceSession;
+
+  private constructor(session: ort.InferenceSession) {
+    this.session = session;
+  }
+
+  /**
+   * Create a matcher from a model URL or in-memory model bytes. Sets the global
+   * `ort.env.wasm` config: proxy off and, by default, a single thread (works
+   * without cross-origin isolation / SharedArrayBuffer); pass `threads` to override.
+   */
+  static async create(
+    model: string | ArrayBuffer | Uint8Array,
+    opts: { threads?: number } = {},
+  ): Promise<StereoOnnxMatcher> {
+    ort.env.wasm.numThreads = opts.threads ?? 1;
+    ort.env.wasm.proxy = false;
+    const session = await ort.InferenceSession.create(model as string, {
+      executionProviders: ['wasm'],
+      graphOptimizationLevel: 'all',
+    });
+    return new StereoOnnxMatcher(session);
+  }
+
+  /**
+   * Warp source-image points onto the target image. `source`/`target` are
+   * grayscale frames; `rig` is the stereo calibration with `source` as the left
+   * camera. Returns one {@link WarpResult} per input point (`[]` for no points).
+   */
+  async warpPoints(
+    points: [number, number][],
+    source: GrayImage,
+    target: GrayImage,
+    rig: StereoRig,
+    opts: WarpOptions,
+  ): Promise<WarpResult[]> {
+    if (points.length === 0) return []; // nothing to match: skip the inference run
+    const [minDepth, maxDepth] = resolveDepthRange(rig, opts.range);
+    const threshold = opts.threshold ?? 0.2;
+    const uniqueness = opts.uniquenessRatio ?? 0.85;
+    const pts = new Float32Array(points.length * 2); // interleaved x0, y0, x1, y1, ...
+    points.forEach(([x, y], i) => { pts[i * 2] = x; pts[i * 2 + 1] = y; });
+
+    const feeds: Record<string, ort.Tensor> = {
+      left_gray: new ort.Tensor('float32', source.data, [source.height, source.width]),
+      right_gray: new ort.Tensor('float32', target.data, [target.height, target.width]),
+      points_left: new ort.Tensor('float32', pts, [points.length, 2]),
+      K_left: new ort.Tensor('float32', rig.Kl, [3, 3]),
+      dist_left: new ort.Tensor('float32', rig.distl, [8]),
+      R_left: new ort.Tensor('float32', IDENTITY_3X3, [3, 3]), // source camera: identity rotation
+      t_left: new ort.Tensor('float32', ZERO_3, [3]), // source camera: zero translation
+      K_right: new ort.Tensor('float32', rig.Kr, [3, 3]),
+      dist_right: new ort.Tensor('float32', rig.distr, [8]),
+      R_right: new ort.Tensor('float32', rig.R, [3, 3]),
+      t_right: new ort.Tensor('float32', rig.T, [3]),
+      min_depth: scalar(minDepth),
+      max_depth: scalar(maxDepth),
+    };
+
+    const out = await this.session.run(feeds);
+    const rp = out.right_points.data as Float32Array;
+    const best = out.best_score.data as Float32Array;
+    const second = out.second_score.data as Float32Array;
+
+    return points.map((_, i) => {
+      const score = best[i];
+      const secondScore = second[i];
+      let accepted = score >= threshold;
+      if (accepted && uniqueness > 0 && secondScore > 0 && score > 0) { // ratio test is optional
+        accepted = secondScore / score <= uniqueness;
+      }
+      return {
+        x: rp[i * 2], y: rp[i * 2 + 1], score, secondScore, accepted,
+      };
+    });
+  }
+}
diff --git a/client/dive-common/use/stereo/__tests__/fixtures/.gitattributes b/client/dive-common/use/stereo/__tests__/fixtures/.gitattributes
@@ -0,0 +1,3 @@
+*.onnx binary
+*.png binary
+*.npz binary
diff --git a/client/dive-common/use/stereo/__tests__/fixtures/calibration.npz b/client/dive-common/use/stereo/__tests__/fixtures/calibration.npz
diff --git a/client/dive-common/use/stereo/__tests__/fixtures/left.png b/client/dive-common/use/stereo/__tests__/fixtures/left.png
diff --git a/client/dive-common/use/stereo/__tests__/fixtures/right.png b/client/dive-common/use/stereo/__tests__/fixtures/right.png
diff --git a/client/dive-common/use/stereo/__tests__/fixtures/stereo_match.onnx b/client/dive-common/use/stereo/__tests__/fixtures/stereo_match.onnx
diff --git a/client/dive-common/use/stereo/__tests__/stereoOnnx.spec.ts b/client/dive-common/use/stereo/__tests__/stereoOnnx.spec.ts
@@ -0,0 +1,77 @@
+import { readFileSync } from 'fs';
+import { fileURLToPath } from 'url';
+import { PNG } from 'pngjs';
+import {
+  describe, it, expect,
+} from 'vitest';
+import { StereoOnnxMatcher } from '../StereoOnnxMatcher';
+import { rigFromNpz, baseline } from '../calibration';
+import { rgbaToGray, GrayImage } from '../image';
+
+// Test helpers: resolve a fixture file's path, and decode a fixture PNG to grayscale.
+const fixture = (file: string) => fileURLToPath(new URL(`./fixtures/${file}`, import.meta.url));
+function loadGray(name: string): GrayImage {
+  const { data, width, height } = PNG.sync.read(readFileSync(fixture(name)));
+  return rgbaToGray({ data, width, height });
+}
+
+describe('rgbaToGray', () => {
+  // Gray value of a single opaque RGBA pixel, converted as a 1x1 image.
+  const grayOf = (r: number, g: number, b: number) => {
+    const pixel = Uint8ClampedArray.of(r, g, b, 255);
+    return rgbaToGray({ data: pixel, width: 1, height: 1 }).data[0];
+  };
+
+  it('uses BT.601 luma weights', () => expect(grayOf(255, 0, 0)).toBeCloseTo(0.299 * 255, 2));
+  it('passes through gray pixels', () => expect(grayOf(123, 123, 123)).toBeCloseTo(123, 4));
+});
+
+describe('rigFromNpz', () => {
+  it('parses the calibration archive into a stereo rig', async () => {
+    const rig = await rigFromNpz(readFileSync(fixture('calibration.npz')));
+    expect(rig.Kl).toHaveLength(9);
+    expect(rig.Kr).toHaveLength(9);
+    expect(rig.distl).toHaveLength(8);
+    expect(rig.distr).toHaveLength(8); // the matcher feeds both distortion vectors as [8]
+    expect(rig.R).toHaveLength(9);
+    expect(rig.T).toHaveLength(3);
+    expect(rig.Kl[0]).toBeGreaterThan(0); // fx
+    expect(rig.Kl[8]).toBeCloseTo(1, 6); // homogeneous 1
+    expect(baseline(rig)).toBeGreaterThan(0);
+  });
+
+  it('handles a Uint8Array view with a non-zero byteOffset (pool-safe)', async () => {
+    // Node's readFileSync Buffer may sit in a shared pool, so a naive `.buffer`
+    // read sees bytes beyond the file. Mimic that with a padded backing array.
+    const file = readFileSync(fixture('calibration.npz'));
+    const backing = new Uint8Array(file.length + 64);
+    backing.set(file, 32);
+    const rig = await rigFromNpz(backing.subarray(32, 32 + file.length));
+    expect(rig.Kl).toHaveLength(9);
+    expect(baseline(rig)).toBeGreaterThan(0);
+  });
+});
+
+describe('StereoOnnxMatcher.warpPoints', () => {
+  it('warps points to the right image matching the VIAME reference', async () => {
+    const calibration = await rigFromNpz(readFileSync(fixture('calibration.npz')));
+    const leftFrame = loadGray('left.png');
+    const rightFrame = loadGray('right.png');
+    const matcher = await StereoOnnxMatcher.create(fixture('stereo_match.onnx'));
+    const headTail: [number, number][] = [[330.43, 234.78], [361.74, 234.78]];
+    const range = { minDisparity: 8, maxDisparity: 700 };
+    const [head, tail] = await matcher.warpPoints(
+      headTail, leftFrame, rightFrame, calibration, { range },
+    );
+
+    // Expected values come from the VIAME C++ / Python ONNX reference:
+    // head -> (~309.6, ~235.5), tail -> (~340.5, ~235.5), both confident matches.
+    expect(head.x).toBeCloseTo(309.6, 0);
+    expect(head.y).toBeCloseTo(235.5, 0);
+    expect(tail.x).toBeCloseTo(340.5, 0);
+    expect(tail.y).toBeCloseTo(235.5, 0);
+    expect(head.score).toBeGreaterThan(0.9);
+    expect(head.accepted).toBe(true);
+    expect(tail.accepted).toBe(true);
+  }, 60000);
+});