From 18c0f0329219d2914dea9a3422bbd524ac10e256 Mon Sep 17 00:00:00 2001 From: yawbtng <154343001+yawbtng@users.noreply.github.com> Date: Tue, 30 Jun 2026 10:05:41 -0700 Subject: [PATCH] evals(cua): add deterministic CUA agent regression task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds agent/cua_amazon_checkout, a fixture-backed agent eval that passes only when the agent reaches an exact known URL on the pinned stagehand-eval-sites Amazon mirror. Unlike the rubric-graded agent benchmarks, the deterministic URL criterion makes a failure attributable to a real provider/plumbing regression rather than page drift or LLM-judge noise, and exercises the full computer-use loop (function-response decoding -> browser action) end to end — the path that broke in #2046 and #2035. Mirrors the existing agent/sign_in deterministic-URL pattern and reuses the act/amazon_add_to_cart fixture and expected sign-in URL. Records the model/agent-mode path and whether the agent left the start page so failures are easy to attribute. Closes a coverage gap tracked in #2188. --- .../tasks/bench/agent/cua_amazon_checkout.ts | 81 +++++++++++++++++++ 1 file changed, 81 insertions(+) create mode 100644 packages/evals/tasks/bench/agent/cua_amazon_checkout.ts diff --git a/packages/evals/tasks/bench/agent/cua_amazon_checkout.ts b/packages/evals/tasks/bench/agent/cua_amazon_checkout.ts new file mode 100644 index 000000000..624f462a7 --- /dev/null +++ b/packages/evals/tasks/bench/agent/cua_amazon_checkout.ts @@ -0,0 +1,81 @@ +import { defineBenchTask } from "../../../framework/defineTask.js"; + +/** + * Deterministic CUA agent regression task (see #2188). + * + * Unlike the rubric-graded agent benchmarks, this task runs against a pinned + * static fixture and passes only when the agent reaches an exact, known URL. + * A failure is therefore attributable to a real provider/plumbing regression + * rather than to page drift or LLM-judge noise. It exercises the full + * computer-use loop (provider function-response decoding -> browser action) + * end to end — the path that broke in #2046 (fixed by #2159) and #2035, and + * which is otherwise only covered transitively by the heavyweight + * WebVoyager / OnlineMind2Web suites. + * + * The task is mode-agnostic; point it at a CUA model to exercise the CUA path: + * evals run agent/cua_amazon_checkout --agent-mode cua \ + * --model google/gemini-2.5-computer-use-preview-10-2025 + * + * To keep failures easy to attribute (per review discussion on #2188), the + * result records the model/agent-mode path that ran and whether the agent ever + * left the start page — i.e. whether a failure occurred before or after the + * first browser action. Finer-grained path attribution (function-response vs + * browser-execution) lives in the per-step trajectory logged below. + */ +export default defineBenchTask( + { name: "agent/cua_amazon_checkout" }, + async ({ debugUrl, sessionUrl, logger, agent, v3, input, modelName }) => { + const startUrl = + "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/"; + const expectedUrl = + "https://browserbase.github.io/stagehand-eval-sites/sites/amazon/sign-in.html"; + + try { + if (!agent) { + throw new Error( + "agent/cua_amazon_checkout requires an agent instance — run it under an agent mode (e.g. --agent-mode cua).", + ); + } + + const page = v3.context.pages()[0]; + await page.goto(startUrl); + + const agentResult = await agent.execute({ + instruction: + "Add the product to the cart and proceed to checkout. Stop when you reach the sign-in page.", + maxSteps: Number(process.env.AGENT_EVAL_MAX_STEPS) || 10, + }); + logger.log(agentResult); + + const currentUrl = page.url(); + + return { + _success: currentUrl === expectedUrl, + currentUrl, + expectedUrl, + // Attribution context (see #2188): which provider/model path ran, and + // whether the agent got past the initial page before failing. + modelName, + agentMode: input.agentMode, + isCUA: input.isCUA, + leftStartPage: currentUrl !== startUrl, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } catch (error) { + return { + _success: false, + error, + modelName, + agentMode: input.agentMode, + isCUA: input.isCUA, + debugUrl, + sessionUrl, + logs: logger.getLogs(), + }; + } finally { + await v3.close(); + } + }, +);