browserbase · miguelg719 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026 · Jun 25, 2026
diff --git a/.changeset/odysseysbench-eval-suite.md b/.changeset/odysseysbench-eval-suite.md
@@ -0,0 +1,5 @@
+---
+"@browserbasehq/stagehand-evals": minor
+---
+
+Add OdysseysBench as a supported agent benchmark in the evals CLI. OdysseysBench is a 200-task web-agent benchmark (45 easy / 46 medium / 109 hard). Each task ships a weighted rubric, which is converted into the verifier's `precomputed_rubric` format so that both process and outcome are scored against the published criteria. Run it with `--eval-name agent/odysseysbench` (or via the `external_agent_benchmarks` category). The suite honors `EVAL_MAX_K`, `EVAL_ODYSSEYSBENCH_LIMIT`, `EVAL_ODYSSEYSBENCH_SAMPLE`, `EVAL_ODYSSEYSBENCH_LEVEL`, and `EVAL_ODYSSEYSBENCH_IDS`.
diff --git a/packages/evals/cli-legacy.ts b/packages/evals/cli-legacy.ts
@@ -100,6 +100,7 @@ const CATEGORY_OVERRIDES: Record<string, string[]> = {
   "agent/webvoyager": ["external_agent_benchmarks"],
   "agent/onlineMind2Web": ["external_agent_benchmarks"],
   "agent/webtailbench": ["external_agent_benchmarks"],
+  "agent/odysseysbench": ["external_agent_benchmarks"],
 };
 
 /**

diff --git a/packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl b/packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl
diff --git a/packages/evals/datasets/odysseysbench/source/tasks.json b/packages/evals/datasets/odysseysbench/source/tasks.json
diff --git a/packages/evals/index.eval.ts b/packages/evals/index.eval.ts
@@ -52,6 +52,7 @@ import { buildWebVoyagerTestcases } from "./suites/webvoyager.js";
 import { buildOnlineMind2WebTestcases } from "./suites/onlineMind2Web.js";
 import { endBrowserbaseSession } from "./browserbaseCleanup.js";
 import { buildWebTailBenchTestcases } from "./suites/webtailbench.js";
+import { buildOdysseysBenchTestcases } from "./suites/odysseysbench.js";
 import { getCurrentDirPath } from "./runtimePaths.js";
 
 import dotenv from "dotenv";
@@ -252,6 +253,25 @@ const generateFilteredTestcases = (): Testcase[] => {
     taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/webtailbench");
   }
 
+  // Special handling: fan out OdysseysBench dataset for agent/odysseysbench
+  const isOdysseysBenchTaskIncluded = taskNamesToRun.includes(
+    "agent/odysseysbench",
+  );
+
+  if (
+    isOdysseysBenchTaskIncluded &&
+    (!datasetFilter || datasetFilter === "odysseysbench")
+  ) {
+    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/odysseysbench");
+    allTestcases.push(...buildOdysseysBenchTestcases(currentModels));
+  } else if (
+    isOdysseysBenchTaskIncluded &&
+    datasetFilter &&
+    datasetFilter !== "odysseysbench"
+  ) {
+    taskNamesToRun = taskNamesToRun.filter((t) => t !== "agent/odysseysbench");
+  }
+
   // Create a list of all remaining testcases using the determined task names and models
   const isAgentCategory =
     effectiveCategory === "agent" ||

diff --git a/packages/evals/scripts/build-odysseysbench-dataset.ts b/packages/evals/scripts/build-odysseysbench-dataset.ts
@@ -0,0 +1,151 @@
+/**
+ * Build packages/evals/datasets/odysseysbench/OdysseysBench_data.jsonl from the
+ * published OdysseysBench task set.
+ *
+ * OdysseysBench (https://odysseysbench.com) is a 200-task web-agent benchmark
+ * (45 easy / 46 medium / 109 hard). Every task ships a weighted rubric whose
+ * weights sum to 1.0. This script converts each task's `rubrics` map into the
+ * verifier's `precomputed_rubric` shape ({ items: [{ criterion, description,
+ * max_points }] }) so the suite can hand it straight to V3Evaluator.verify()
+ * without generating a rubric.
+ *
+ * Source of truth is the committed snapshot at
+ *   packages/evals/datasets/odysseysbench/source/tasks.json
+ * (mirrored from https://odysseysbench.com/assets/data/tasks.json). Re-fetch
+ * with `--fetch` to refresh that snapshot before rebuilding.
+ *
+ * Run after pulling the branch (or whenever the source snapshot changes):
+ *   pnpm tsx packages/evals/scripts/build-odysseysbench-dataset.ts
+ *
+ * Idempotent — regenerates the JSONL deterministically from the snapshot.
+ */
+import fs from "node:fs/promises";
+import path from "node:path";
+
+/** Upstream location of the published task list; only requested when `--fetch` is passed. */
+const SOURCE_URL = "https://odysseysbench.com/assets/data/tasks.json";
+
+/** Dataset directory, resolved from the parent of this script's directory. */
+const DATASET_DIR = path.resolve(
+  import.meta.dirname,
+  "..",
+  "datasets",
+  "odysseysbench",
+);
+/** Snapshot of the upstream task list that the build reads from. */
+const SOURCE_PATH = path.join(DATASET_DIR, "source", "tasks.json");
+/** Generated dataset file, one JSON row per line. */
+const JSONL_PATH = path.join(DATASET_DIR, "OdysseysBench_data.jsonl");
+
+/** Difficulty labels carried by every task. */
+type DifficultyLevel = "easy" | "medium" | "hard";
+
+/** One entry of a source task's `rubrics` map. */
+interface SourceRubric {
+  /** Requirement text; becomes the rubric item's `criterion`. */
+  requirement: string;
+  /** Grader-facing verification text; appended to the item's `description`. */
+  verification: string;
+  /** Relative weight of this entry; scaled by 100 into `max_points`. */
+  weight: number;
+}
+
+/** A task as it appears in the source `tasks.json` snapshot. */
+interface SourceTask {
+  task_id: string;
+  confirmed_task: string;
+  website: string;
+  reference_length: number;
+  level: DifficultyLevel;
+  /** Rubric entries keyed by identifiers such as R1, R2, … R10. */
+  rubrics: Record<string, SourceRubric>;
+  categories?: string[];
+  num_categories?: number;
+}
+
+/** One item of the verifier's `precomputed_rubric`. */
+interface RubricItem {
+  criterion: string;
+  description: string;
+  max_points: number;
+}
+
+/**
+ * One line of the generated JSONL: the task fields copied through from the
+ * source, plus the rubric converted into `{ items: [...] }` shape.
+ */
+interface OutputRow
+  extends Pick<
+    SourceTask,
+    | "task_id"
+    | "confirmed_task"
+    | "website"
+    | "level"
+    | "reference_length"
+    | "categories"
+  > {
+  precomputed_rubric: { items: RubricItem[] };
+}
+
+/**
+ * Return a sorted copy of rubric keys, ordered by the number that follows
+ * their non-digit prefix (R1, R2, … R10) instead of plain string order.
+ *
+ * Two keys are compared numerically only when both yield a finite number and
+ * those numbers differ; every other pair falls back to `localeCompare`.
+ * The input array is left untouched.
+ */
+function sortRubricKeys(keys: string[]): string[] {
+  const ordinalOf = (key: string): number =>
+    Number.parseInt(key.replace(/^\D+/, ""), 10);
+
+  const compareKeys = (left: string, right: string): number => {
+    const leftOrdinal = ordinalOf(left);
+    const rightOrdinal = ordinalOf(right);
+    const bothNumeric =
+      Number.isFinite(leftOrdinal) && Number.isFinite(rightOrdinal);
+    return bothNumeric && leftOrdinal !== rightOrdinal
+      ? leftOrdinal - rightOrdinal
+      : left.localeCompare(right);
+  };
+
+  return keys.slice().sort(compareKeys);
+}
+
+/**
+ * Convert one OdysseysBench rubric entry into a verifier rubric item.
+ *
+ * `weight` (summing to 1.0 across a task) is scaled to integer points so the
+ * scoring model reasons over a natural 0–100 scale; the process score is a
+ * ratio, so the exact scale is immaterial. `max(1, …)` keeps every criterion
+ * worth at least one point even when a small weight rounds down to zero.
+ *
+ * @param key - Rubric key (e.g. `R1`); used to identify the entry in errors.
+ * @param r - Source rubric entry to convert.
+ * @returns The rubric item with `criterion`, `description` and `max_points`.
+ * @throws Error when `weight` is not a finite, non-negative number. Without
+ *   this check a missing or malformed weight would become `NaN`, which
+ *   `JSON.stringify` writes as `null` into the dataset.
+ */
+function toRubricItem(key: string, r: SourceRubric): RubricItem {
+  const weight: unknown = r.weight;
+  if (typeof weight !== "number" || !Number.isFinite(weight) || weight < 0) {
+    throw new Error(
+      `Rubric ${key} has an invalid weight: ${String(weight)} (expected a finite, non-negative number)`,
+    );
+  }
+  return {
+    criterion: r.requirement,
+    description: `${r.requirement}\n\nHow a grader verifies this: ${r.verification}`,
+    max_points: Math.max(1, Math.round(weight * 100)),
+  };
+}
+
+/**
+ * Load the OdysseysBench source tasks.
+ *
+ * Without `--fetch` on the command line, the snapshot at SOURCE_PATH is read
+ * and parsed. With `--fetch`, SOURCE_URL is downloaded, and the snapshot is
+ * overwritten only after the body has parsed as a non-empty JSON array, so a
+ * bad response (error page, truncated body) cannot clobber the snapshot.
+ *
+ * @returns The parsed task list (not validated field by field).
+ * @throws Error when the fetch fails, or the fetched body is not valid JSON
+ *   or not a non-empty array; also propagates file-system and parse errors
+ *   from reading the local snapshot.
+ */
+async function loadSource(): Promise<SourceTask[]> {
+  if (!process.argv.includes("--fetch")) {
+    const snapshot = await fs.readFile(SOURCE_PATH, "utf8");
+    return JSON.parse(snapshot) as SourceTask[];
+  }
+
+  const res = await fetch(SOURCE_URL);
+  if (!res.ok) {
+    throw new Error(`Failed to fetch ${SOURCE_URL}: ${res.status}`);
+  }
+  const text = await res.text();
+
+  // Validate before touching disk: the snapshot is the build's source of truth.
+  let parsed: unknown;
+  try {
+    parsed = JSON.parse(text);
+  } catch (err) {
+    const reason = err instanceof Error ? err.message : String(err);
+    throw new Error(
+      `Fetched ${SOURCE_URL} but the body is not valid JSON (${reason}); snapshot left unchanged`,
+    );
+  }
+  if (!Array.isArray(parsed) || parsed.length === 0) {
+    throw new Error(
+      `Fetched ${SOURCE_URL} but the body is not a non-empty array; snapshot left unchanged`,
+    );
+  }
+
+  await fs.mkdir(path.dirname(SOURCE_PATH), { recursive: true });
+  await fs.writeFile(SOURCE_PATH, text);
+  console.log(`Refreshed snapshot: ${SOURCE_PATH}`);
+  return parsed as SourceTask[];
+}
+
+/**
+ * Script entry point: load the source tasks, convert each one into a dataset
+ * row, write the rows to JSONL_PATH as newline-terminated JSON lines, and log
+ * a per-level tally.
+ *
+ * @throws Error when the source is empty or not an array, or when a task has
+ *   no rubric entries.
+ */
+async function main(): Promise<void> {
+  const sourceTasks = await loadSource();
+  if (!Array.isArray(sourceTasks) || sourceTasks.length === 0) {
+    throw new Error("Source tasks.json is empty or not an array");
+  }
+
+  const serializedRows = sourceTasks.map((task) => {
+    const orderedKeys = sortRubricKeys(Object.keys(task.rubrics ?? {}));
+    if (orderedKeys.length === 0) {
+      throw new Error(`Task ${task.task_id} has no rubrics`);
+    }
+    const rubricItems = orderedKeys.map((rubricKey) =>
+      toRubricItem(rubricKey, task.rubrics[rubricKey]),
+    );
+
+    // `categories` is only emitted when the source carries a non-empty list.
+    const categoryField =
+      Array.isArray(task.categories) && task.categories.length > 0
+        ? { categories: task.categories }
+        : {};
+
+    // Property order here is the key order of each emitted JSON line.
+    const row: OutputRow = {
+      task_id: task.task_id,
+      confirmed_task: task.confirmed_task,
+      website: task.website,
+      level: task.level,
+      reference_length: task.reference_length,
+      ...categoryField,
+      precomputed_rubric: { items: rubricItems },
+    };
+    return JSON.stringify(row);
+  });
+
+  await fs.writeFile(JSONL_PATH, `${serializedRows.join("\n")}\n`);
+
+  const levelCounts: Record<string, number> = {};
+  for (const { level } of sourceTasks) {
+    levelCounts[level] = (levelCounts[level] ?? 0) + 1;
+  }
+  console.log(
+    `Wrote ${serializedRows.length} rows to ${JSONL_PATH} (${JSON.stringify(levelCounts)})`,
+  );
+}
+
+// Run the build; on any rejection, print the error and exit with status 1.
+main().catch((failure: unknown) => {
+  console.error(failure);
+  process.exit(1);
+});
diff --git a/packages/evals/suites/odysseysbench.ts b/packages/evals/suites/odysseysbench.ts
@@ -0,0 +1,139 @@
+import type { Testcase, EvalInput, AgentModelEntry } from "../types/evals.js";
+import { normalizeRubric, type AvailableModel } from "@browserbasehq/stagehand";
+import { tasksConfig } from "../taskConfig.js";
+import { getPackageRootDir } from "../runtimePaths.js";
+import {
+  readJsonlFile,
+  parseJsonlRows,
+  applySampling,
+  normalizeAgentModelEntries,
+} from "../utils.js";
+
+/**
+ * Build OdysseysBench testcases.
+ *
+ * OdysseysBench (https://odysseysbench.com) is a 200-task web-agent benchmark
+ * spanning easy/medium/hard difficulty. Every task ships a weighted rubric
+ * (baked into `precomputed_rubric` by scripts/build-odysseysbench-dataset.ts),
+ * so the verifier scores against the published criteria directly rather than
+ * generating its own.
+ *
+ * Env knobs:
+ *   - EVAL_MAX_K / EVAL_ODYSSEYSBENCH_LIMIT — cap the number of tasks
+ *     (default 25). EVAL_MAX_K wins when both are set.
+ *   - EVAL_ODYSSEYSBENCH_SAMPLE — sample size handed to `applySampling`
+ *     (intended to override the limit cap; that logic lives in `applySampling`).
+ *   - EVAL_ODYSSEYSBENCH_LEVEL — comma-separated difficulty filter (easy,medium,hard).
+ *   - EVAL_ODYSSEYSBENCH_IDS — comma-separated task_ids to run exactly, in order
+ *     (ignores sampling / limit / level knobs).
+ *
+ * A numeric knob whose value is not a finite number is ignored with a warning
+ * instead of being passed on as NaN, and task ids that are not in the dataset
+ * are reported with a warning instead of being dropped silently.
+ *
+ * @param models - Model names or agent model entries; one testcase is built
+ *   per (model, task) pair.
+ * @returns The testcases, grouped by model in the order given.
+ */
+export const buildOdysseysBenchTestcases = (
+  models: string[] | AgentModelEntry[],
+): Testcase[] => {
+  const suiteName = "agent/odysseysbench";
+  const odysseysbenchFilePath =
+    getPackageRootDir() + "/datasets/odysseysbench/OdysseysBench_data.jsonl";
+
+  const lines = readJsonlFile(odysseysbenchFilePath);
+
+  /** Read a numeric env knob; unset, empty, or non-finite values yield undefined. */
+  const readNumericEnv = (name: string): number | undefined => {
+    const raw = process.env[name];
+    if (!raw) return undefined;
+    const value = Number(raw);
+    if (!Number.isFinite(value)) {
+      console.warn(
+        `[odysseysbench] Ignoring ${name}="${raw}": not a finite number`,
+      );
+      return undefined;
+    }
+    return value;
+  };
+
+  const maxCases =
+    readNumericEnv("EVAL_MAX_K") ??
+    readNumericEnv("EVAL_ODYSSEYSBENCH_LIMIT") ??
+    25;
+  const sampleCount = readNumericEnv("EVAL_ODYSSEYSBENCH_SAMPLE");
+
+  type OdysseysBenchRow = {
+    task_id: string;
+    confirmed_task: string;
+    website?: string;
+    level?: "easy" | "medium" | "hard";
+    reference_length?: number;
+    categories?: string[];
+    /**
+     * Per-task weighted rubric in verifier `{ items: [...] }` shape, produced
+     * from the published rubrics by scripts/build-odysseysbench-dataset.ts.
+     */
+    precomputed_rubric?: unknown;
+    [key: string]: unknown;
+  };
+
+  /** A row is usable when it is an object with string `task_id` and `confirmed_task`. */
+  function isOdysseysBenchRow(parsed: unknown): parsed is OdysseysBenchRow {
+    if (parsed === null || typeof parsed !== "object") return false;
+    const obj = parsed as Record<string, unknown>;
+    return (
+      typeof obj.task_id === "string" && typeof obj.confirmed_task === "string"
+    );
+  }
+
+  const candidates = parseJsonlRows(lines, isOdysseysBenchRow);
+
+  // EVAL_ODYSSEYSBENCH_IDS restricts the suite to exactly those task IDs,
+  // preserving the order given and ignoring sampling / limit / level knobs.
+  const explicitIds = process.env.EVAL_ODYSSEYSBENCH_IDS
+    ? process.env.EVAL_ODYSSEYSBENCH_IDS.split(",")
+        .map((s) => s.trim())
+        .filter(Boolean)
+    : null;
+
+  let rows: OdysseysBenchRow[];
+  if (explicitIds && explicitIds.length > 0) {
+    const byId = new Map<string, OdysseysBenchRow>(
+      candidates.map((r) => [r.task_id, r]),
+    );
+    // A typo in the id list would otherwise just shrink the run unnoticed.
+    const unknownIds = explicitIds.filter((id) => !byId.has(id));
+    if (unknownIds.length > 0) {
+      console.warn(
+        `[odysseysbench] EVAL_ODYSSEYSBENCH_IDS contains unknown task_ids (skipped): ${unknownIds.join(", ")}`,
+      );
+    }
+    rows = explicitIds.flatMap((id) => {
+      const match = byId.get(id);
+      return match ? [match] : [];
+    });
+  } else {
+    // Optional difficulty filter, applied before sampling.
+    const levelFilter = process.env.EVAL_ODYSSEYSBENCH_LEVEL
+      ? new Set(
+          process.env.EVAL_ODYSSEYSBENCH_LEVEL.split(",")
+            .map((s) => s.trim().toLowerCase())
+            .filter(Boolean),
+        )
+      : null;
+    const filtered = levelFilter
+      ? candidates.filter((r) => r.level && levelFilter.has(r.level))
+      : candidates;
+    rows = applySampling(filtered, sampleCount, maxCases);
+  }
+
+  // The suite name is fixed, so its categories are looked up once rather than
+  // once per (model, row) pair.
+  const taskCategories =
+    tasksConfig.find((t) => t.name === suiteName)?.categories ?? [];
+
+  const allTestcases: Testcase[] = [];
+  for (const modelEntry of normalizeAgentModelEntries(models)) {
+    for (const row of rows) {
+      const input: EvalInput = {
+        name: suiteName,
+        modelName: modelEntry.modelName as AvailableModel,
+        agentMode: modelEntry.mode,
+        isCUA: modelEntry.mode === "cua",
+        params: {
+          task_id: row.task_id,
+          confirmed_task: row.confirmed_task,
+          website: row.website,
+          level: row.level,
+          reference_length: row.reference_length,
+          precomputed_rubric: normalizeRubric(row.precomputed_rubric),
+        },
+      };
+      allTestcases.push({
+        input,
+        name: input.name,
+        tags: [modelEntry.modelName, modelEntry.mode, "odysseysbench"],
+        metadata: {
+          model: modelEntry.modelName as AvailableModel,
+          test: `${input.name}:${row.task_id}`,
+          tier: "bench",
+          task: input.name,
+          category: taskCategories[0] || "agent",
+          categories: taskCategories,
+          dataset: "odysseysbench",
+          task_id: row.task_id,
+          task_category: row.level,
+        },
+        expected: true,
+      });
+    }
+  }
+
+  return allTestcases;
+};
diff --git a/packages/evals/taskConfig.ts b/packages/evals/taskConfig.ts
@@ -139,6 +139,7 @@ const CATEGORY_OVERRIDES: Record<string, string[]> = {
   "agent/webvoyager": ["external_agent_benchmarks"],
   "agent/onlineMind2Web": ["external_agent_benchmarks"],
   "agent/webtailbench": ["external_agent_benchmarks"],
+  "agent/odysseysbench": ["external_agent_benchmarks"],
 };
 
 /**