Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 65 additions & 0 deletions client/dive-common/use/stereo/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
# Client-side stereo transfer (ONNX)

Warp a detection annotated on one camera onto the other camera, entirely in the
browser / Electron renderer — no backend — using VIAME's epipolar
template-matching model (stereo measurement "method 1") exported to ONNX and run
with `onnxruntime-web`.

This is the client counterpart to the desktop backend stereo service: the
desktop `ViewerLoader` warps via native IPC (`stereoTransferLine` /
`stereoTransferPoints`); this module does the equivalent correspondence search
client-side so it also works on the web.

## Modules

| File | Role |
| --- | --- |
| `StereoOnnxMatcher.ts` | Loads the `match` ONNX model and warps source points → target points via NCC along the epipolar curve. |
| `calibration.ts` | `StereoRig` + loaders (`rigFromNpz`, `rigFromJson`) mirroring VIAME's `read_stereo_rig`; `invertRig` to swap the source/target camera. |
| `npz.ts` | Minimal `.npz`/`.npy` reader (calibration files are NumPy archives). |
| `image.ts` | RGBA → BT.601 grayscale (matches OpenCV `BGR2GRAY` used by the C++ NCC). |
| `frameSource.ts` | Pull full-resolution frame pixels from a GeoJS viewer / image element. |
| `useStereoOnnxTransfer.ts` | Platform-agnostic composable: on annotation complete, warp a box (corners) or head/tail line (keypoints) to the other camera and write the feature. |

The web glue lives in `platform/web-girder/useStereoOnnxWeb.ts` and is bound to
the `Viewer`'s `stereo-annotation-complete` event in the web `ViewerLoader.vue`.

## How it works

Per warp: generate epipolar candidates from the calibration, then NCC
template-match the source patch along that curve in the target frame (this is
exactly the VIAME C++ `epipolar_template_matching` method, as a single ONNX
graph). The matcher returns the matched point + scores; the composable rebuilds
the box / head-tail feature on the other camera.

The world frame is the left (calibration) camera. When the user annotates on the
rig's right camera, the rig is inverted (`invertRig`) so the annotated camera is
the source.

## Setup (web)

1. Export the model (small; method 1 has no learned weights):
   ```bash
   python plugins/onnx/export_stereo_mapping.py --model match \
     --out stereo_match.onnx --num-samples 1500
   ```
   Fewer `--num-samples` ⇒ faster client inference, slightly coarser depth
   sampling.
2. Serve it as a static asset at `client/public/models/stereo_match.onnx`
(override the URL via `useStereoOnnxWeb({ modelUrl })`).
3. Load a stereo calibration file (`.npz`/`.json`) in the session, as usual for
multi-camera datasets. Transfer no-ops if calibration, the model, or a second
camera is missing.
4. The disparity search range defaults to `{ minDisparity: 2, maxDisparity: 512 }`;
tune it per rig via the `range` option (TODO: expose this as a user setting).

## Testing status

- **Tested** (`__tests__/stereoOnnx.spec.ts`, runs under `npm test`): the core —
`.npz` calibration parsing, grayscale conversion, and `StereoOnnxMatcher`
warping points, validated against the VIAME C++/Python reference (matches to
~0.25 px) using `onnxruntime-web` in Node.
- **Needs live testing**: the web glue (`useStereoOnnxWeb`, ViewerLoader
binding, and the GeoJS frame-pixel read in `frameSource.geoViewerToImageElement`)
is type-checked and lint-clean but has not been exercised in a running web
viewer with a real stereo dataset.
136 changes: 136 additions & 0 deletions client/dive-common/use/stereo/StereoOnnxMatcher.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/**
* Client-side wrapper around the exported VIAME stereo "match" ONNX model
* (method 1: epipolar candidate generation + NCC template matching). Runs fully
* in the browser / Electron renderer via onnxruntime-web — no backend — so a
* detection annotated on one camera can be warped onto the other.
*
* The model and its conventions are produced by
* `plugins/onnx/export_stereo_mapping.py --model match`; see that plugin's
* README. This wrapper only feeds inputs and reads the matched points.
*/

import * as ort from 'onnxruntime-web';

import { GrayImage } from './image';
import { StereoRig, baseline } from './calibration';

/**
* Search-range specification, given either as a disparity interval or as a
* depth interval (disparity is unit-independent; depth needs calib units).
* A disparity interval is converted to depth by `resolveDepthRange` as
* `fx * baseline / disparity`; a depth interval is used as given.
*/
export type SearchRange =
| { minDisparity: number; maxDisparity: number }
| { minDepth: number; maxDepth: number };

/** Options controlling the search interval and match acceptance in `warpPoints`. */
export interface WarpOptions {
/** Disparity or depth interval to search along the epipolar curve. */
range: SearchRange;
/** Minimum best NCC score for a match to be accepted. Defaults to 0.2 when omitted. */
threshold?: number;
/**
* Uniqueness test: a match is rejected when secondScore / score exceeds this
* ratio. The test is skipped when this is 0 or when either score is not
* positive. Defaults to 0.85 when omitted.
*/
uniquenessRatio?: number;
}

/** Outcome of warping one source point; `warpPoints` returns one per input point. */
export interface WarpResult {
/** Matched point in the right (target) image. */
x: number;
y: number;
/** Best NCC score (TM_CCOEFF_NORMED). */
score: number;
/** Best NCC score outside a template-size neighborhood (uniqueness check). */
secondScore: number;
/**
* True when `score` reached the threshold and, if the uniqueness test
* applies, `secondScore / score` did not exceed the uniqueness ratio.
*/
accepted: boolean;
}

// Extrinsics fed to the model for the left (source) camera as R_left / t_left:
// an identity rotation (row-major 3x3) and a zero translation.
const IDENTITY_3X3 = Float32Array.from([1, 0, 0, 0, 1, 0, 0, 0, 1]);
const ZERO_3 = Float32Array.from([0, 0, 0]);

/** Wrap a single number as a float32 ONNX tensor with empty dims (`[]`). */
function scalar(v: number): ort.Tensor {
  const payload = new Float32Array(1);
  payload[0] = v;
  return new ort.Tensor('float32', payload, []);
}

/**
 * Turn a {@link SearchRange} into a `[minDepth, maxDepth]` pair.
 *
 * A depth range is returned as given. A disparity range is converted with
 * `depth = fx * baseline / disparity`, where `fx` is `rig.Kl[0]`; the largest
 * disparity therefore yields the nearest depth and the smallest the farthest.
 *
 * @param rig Stereo calibration supplying the focal length and baseline.
 * @param range Disparity or depth interval to resolve.
 * @returns The `[minDepth, maxDepth]` tuple.
 */
function resolveDepthRange(rig: StereoRig, range: SearchRange): [number, number] {
  if (!('minDisparity' in range)) {
    return [range.minDepth, range.maxDepth];
  }
  const focalTimesBaseline = rig.Kl[0] * baseline(rig);
  const nearDepth = focalTimesBaseline / range.maxDisparity;
  const farDepth = focalTimesBaseline / range.minDisparity;
  return [nearDepth, farDepth];
}

/**
 * Runs the stereo "match" ONNX model through onnxruntime-web to warp points
 * from a source image onto a target image. Instances are obtained through
 * {@link StereoOnnxMatcher.create}; the constructor is private.
 */
export class StereoOnnxMatcher {
  private readonly session: ort.InferenceSession;

  private constructor(session: ort.InferenceSession) {
    this.session = session;
  }

  /**
   * Create a matcher from a model URL or in-memory model bytes. By default the
   * wasm backend runs single-threaded, which works without cross-origin
   * isolation (SharedArrayBuffer); pass `threads` to override.
   *
   * Note: this writes the global `ort.env.wasm` settings (`numThreads`,
   * `proxy`), so it affects every onnxruntime-web session in the page.
   *
   * @param model Model location (string) or the serialized model bytes.
   * @param opts Optional settings; `threads` sets the wasm thread count.
   * @returns A matcher wrapping the created inference session.
   */
  static async create(
    model: string | ArrayBuffer | Uint8Array,
    opts: { threads?: number } = {},
  ): Promise<StereoOnnxMatcher> {
    ort.env.wasm.numThreads = opts.threads ?? 1;
    ort.env.wasm.proxy = false;

    const sessionOptions: ort.InferenceSession.SessionOptions = {
      executionProviders: ['wasm'],
      graphOptimizationLevel: 'all',
    };

    // Narrow the union so each call matches a real overload instead of
    // asserting `model as string`.
    let session: ort.InferenceSession;
    if (typeof model === 'string') {
      session = await ort.InferenceSession.create(model, sessionOptions);
    } else {
      // An ArrayBuffer is wrapped in a view over the same memory (no copy).
      const bytes = model instanceof Uint8Array ? model : new Uint8Array(model);
      session = await ort.InferenceSession.create(bytes, sessionOptions);
    }
    return new StereoOnnxMatcher(session);
  }

  /**
   * Warp a set of source-image points onto the target image. `source`/`target`
   * are grayscale frames; `rig` is the stereo calibration with `source` as the
   * left camera. Returns one {@link WarpResult} per input point.
   *
   * The left camera is fed an identity rotation and zero translation, and the
   * right camera is fed `rig.R` / `rig.T`. The distortion vectors are passed
   * with shape `[8]`.
   *
   * A result is `accepted` when its best score is at least `opts.threshold`
   * (default 0.2) and, provided the uniqueness ratio (default 0.85) and both
   * scores are positive, `secondScore / score` does not exceed that ratio.
   *
   * @param points Source-image points as `[x, y]` pairs.
   * @param source Grayscale frame the points were annotated on.
   * @param target Grayscale frame to search.
   * @param rig Stereo calibration.
   * @param opts Search range and acceptance settings.
   * @returns Matched target points with scores, in input order; an empty
   *   array when `points` is empty.
   */
  async warpPoints(
    points: [number, number][],
    source: GrayImage,
    target: GrayImage,
    rig: StereoRig,
    opts: WarpOptions,
  ): Promise<WarpResult[]> {
    // Nothing to match: skip inference (and a zero-row input tensor) entirely.
    if (points.length === 0) {
      return [];
    }

    const [minDepth, maxDepth] = resolveDepthRange(rig, opts.range);
    const threshold = opts.threshold ?? 0.2;
    const uniqueness = opts.uniquenessRatio ?? 0.85;

    // Flatten the [x, y] pairs into a row-major (N, 2) buffer.
    const pts = new Float32Array(points.length * 2);
    points.forEach(([x, y], i) => {
      pts[i * 2] = x;
      pts[i * 2 + 1] = y;
    });

    const feeds: Record<string, ort.Tensor> = {
      left_gray: new ort.Tensor('float32', source.data, [source.height, source.width]),
      right_gray: new ort.Tensor('float32', target.data, [target.height, target.width]),
      points_left: new ort.Tensor('float32', pts, [points.length, 2]),
      K_left: new ort.Tensor('float32', rig.Kl, [3, 3]),
      dist_left: new ort.Tensor('float32', rig.distl, [8]),
      R_left: new ort.Tensor('float32', IDENTITY_3X3, [3, 3]),
      t_left: new ort.Tensor('float32', ZERO_3, [3]),
      K_right: new ort.Tensor('float32', rig.Kr, [3, 3]),
      dist_right: new ort.Tensor('float32', rig.distr, [8]),
      R_right: new ort.Tensor('float32', rig.R, [3, 3]),
      t_right: new ort.Tensor('float32', rig.T, [3]),
      min_depth: scalar(minDepth),
      max_depth: scalar(maxDepth),
    };

    const out = await this.session.run(feeds);
    const rp = out.right_points.data as Float32Array;
    const best = out.best_score.data as Float32Array;
    const second = out.second_score.data as Float32Array;

    return points.map((_, i) => {
      const score = best[i];
      const secondScore = second[i];
      let accepted = score >= threshold;
      // The ratio test only applies when both scores are positive.
      if (accepted && uniqueness > 0 && secondScore > 0 && score > 0) {
        accepted = secondScore / score <= uniqueness;
      }
      return {
        x: rp[i * 2], y: rp[i * 2 + 1], score, secondScore, accepted,
      };
    });
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.onnx binary
*.png binary
*.npz binary
Binary file not shown.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file not shown.
77 changes: 77 additions & 0 deletions client/dive-common/use/stereo/__tests__/stereoOnnx.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
import { readFileSync } from 'fs';
import { fileURLToPath } from 'url';
import { PNG } from 'pngjs';
import {
describe, it, expect,
} from 'vitest';
import { StereoOnnxMatcher } from '../StereoOnnxMatcher';
import { rigFromNpz, baseline } from '../calibration';
import { rgbaToGray, GrayImage } from '../image';

/** Resolve a file under `./fixtures/` (relative to this spec) to a filesystem path. */
function fixture(name: string): string {
  const location = new URL(`./fixtures/${name}`, import.meta.url);
  return fileURLToPath(location);
}

/** Decode a PNG fixture and convert its RGBA pixels to a grayscale image. */
function loadGray(name: string): GrayImage {
  const bytes = readFileSync(fixture(name));
  const { data, width, height } = PNG.sync.read(bytes);
  return rgbaToGray({ data, width, height });
}

describe('rgbaToGray', () => {
  // Convert one RGBA pixel and return the resulting grayscale sample.
  const grayOfPixel = (rgba: number[]) => {
    const pixel = new Uint8ClampedArray(rgba);
    return rgbaToGray({ data: pixel, width: 1, height: 1 }).data[0];
  };

  it('uses BT.601 luma weights', () => {
    // Pure red should map to the red luma weight times full intensity.
    expect(grayOfPixel([255, 0, 0, 255])).toBeCloseTo(0.299 * 255, 2);
  });

  it('passes through gray pixels', () => {
    // Equal R, G and B must come out unchanged.
    expect(grayOfPixel([123, 123, 123, 255])).toBeCloseTo(123, 4);
  });
});

describe('rigFromNpz', () => {
  it('parses the calibration archive into a stereo rig', async () => {
    const archive = readFileSync(fixture('calibration.npz'));
    const rig = await rigFromNpz(archive);
    const {
      Kl, Kr, distl, R, T,
    } = rig;

    // Array sizes of the parsed calibration fields.
    expect(Kl).toHaveLength(9);
    expect(Kr).toHaveLength(9);
    expect(distl).toHaveLength(8);
    expect(R).toHaveLength(9);
    expect(T).toHaveLength(3);

    // Sanity checks on the values themselves.
    expect(Kl[0]).toBeGreaterThan(0); // fx
    expect(Kl[8]).toBeCloseTo(1, 6); // homogeneous 1
    expect(baseline(rig)).toBeGreaterThan(0);
  });

  it('handles a Uint8Array view with a non-zero byteOffset (pool-safe)', async () => {
    // Node's readFileSync returns a Buffer backed by a shared pool, so a naive
    // `.buffer` read sees bytes beyond the file. Reproduce that layout by
    // embedding the file in a larger buffer with padding on both sides.
    const bytes = readFileSync(fixture('calibration.npz'));
    const padding = 32;
    const pool = new Uint8Array(padding + bytes.length + padding);
    pool.set(bytes, padding);
    const offsetView = pool.subarray(padding, padding + bytes.length);

    const rig = await rigFromNpz(offsetView);
    expect(rig.Kl).toHaveLength(9);
    expect(baseline(rig)).toBeGreaterThan(0);
  });
});

describe('StereoOnnxMatcher.warpPoints', () => {
  it('warps points to the right image matching the VIAME reference', async () => {
    const calibration = readFileSync(fixture('calibration.npz'));
    const rig = await rigFromNpz(calibration);
    const leftFrame = loadGray('left.png');
    const rightFrame = loadGray('right.png');
    const matcher = await StereoOnnxMatcher.create(fixture('stereo_match.onnx'));

    // Head and tail keypoints annotated on the left image.
    const sourcePoints: [number, number][] = [[330.43, 234.78], [361.74, 234.78]];
    const searchRange = { minDisparity: 8, maxDisparity: 700 };
    const [head, tail] = await matcher.warpPoints(
      sourcePoints,
      leftFrame,
      rightFrame,
      rig,
      { range: searchRange },
    );

    // Reference (VIAME C++ / Python ONNX): head -> (~309.6, ~235.5),
    // tail -> (~340.5, ~235.5), both confident matches.
    expect(head.x).toBeCloseTo(309.6, 0);
    expect(head.y).toBeCloseTo(235.5, 0);
    expect(tail.x).toBeCloseTo(340.5, 0);
    expect(tail.y).toBeCloseTo(235.5, 0);
    expect(head.score).toBeGreaterThan(0.9);
    expect(head.accepted).toBe(true);
    expect(tail.accepted).toBe(true);
  }, 60000);
});
Loading
Loading