Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 10 additions & 0 deletions vero/src/vero/harbor/build/compiler.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@

from __future__ import annotations

import dataclasses
import logging
import re
import shutil
Expand All @@ -17,6 +18,7 @@
from jinja2 import Environment, FileSystemLoader

from vero.harbor.build.config import BuildConfig
from vero.harbor.protocol import StatusSummary

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -278,6 +280,14 @@ def compile_task(
submit_enabled=config.submit_enabled,
eval_num_samples=None,
bake_inner_task=bool(config.inner_task),
# The free-baseline bullet may only render when the sidecar shipping in
# this same tree actually grants the free eval. The feature lives on a
# different PR chain than the compiler, and an instruction that promises
# the free eval when the sidecar does not grant it would send the agent
# to burn a metered eval on a commit auto_best cannot select.
# Introspecting the protocol keeps the instruction truthful under any
# merge order.
free_baseline="free_baseline_available"
in {f.name for f in dataclasses.fields(StatusSummary)},
Comment thread
greptile-apps[bot] marked this conversation as resolved.
Outdated
)
_render(jenv, "task.toml.j2", out / "task.toml", **ctx)
_render(jenv, "instruction.md.j2", out / "instruction.md", **ctx)
Expand Down
17 changes: 14 additions & 3 deletions vero/src/vero/harbor/build/templates/instruction.md.j2
Original file line number Diff line number Diff line change
Expand Up @@ -19,13 +19,24 @@ progress on the splits you *are* allowed to evaluate, within a fixed budget.
{% if submit_enabled %}5. When done, nominate your best commit: `vero harbor submit`.{% else %}
The best commit you evaluate on `{{ selection_split }}` is selected automatically and
scored on the hidden test split at the end. Only commits *other than the seeded
baseline* are selectable: evaluating the unmodified baseline spends budget without
creating a candidate, so make sure at least one eval is of a commit that contains
your changes.{% endif %}
baseline* are selectable: baseline evals create no candidate, so make sure at least
one eval is of a commit that contains your changes.{% endif %}

## Rules

- Budget is finite and metered per split — spend it wisely.
{% if free_baseline %}
- Your first eval of the seeded baseline (the commit you started from) is
budget-free: once per task, not once per split, and it stays free after you have
made commits (`vero harbor eval --commit <baseline-sha> ...`). Take it on
`{{ selection_split }}` before your first candidate eval: it is the
reference score you must beat, and without it you cannot tell an improvement
from a regression. Repeat baseline evals are metered. `vero harbor status`
shows the baseline sha and whether the free eval is still available.
{% endif %}
- Scores are noisy. Unspent budget is wasted: if you finish with evals left, spend
them re-measuring your best candidate to confirm its score, or trying one more
variant, rather than stopping early.
- The test split is hidden: you cannot evaluate it, and its labels never reach this
container. Trying to read it will fail.
- The scorer is locked. Only the eval sidecar scores.
59 changes: 55 additions & 4 deletions vero/tests/test_harbor_build.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from __future__ import annotations

import dataclasses
import json
import subprocess
import tomllib
Expand All @@ -13,8 +14,16 @@
import yaml

from vero.harbor.build import BuildConfig, compile_task
from vero.harbor.protocol import StatusSummary
from vero.harbor.serve import ServeConfig

# Feature probe: True when the StatusSummary dataclass in THIS tree declares a
# ``free_baseline_available`` field, i.e. the sidecar here grants the
# budget-free first baseline eval. The feature and the compiler live on
# different PR chains, so the instruction tests below run whichever arm
# matches the chains merged here.
_HAS_FREE_BASELINE = any(
    field.name == "free_baseline_available"
    for field in dataclasses.fields(StatusSummary)
)


def _stub_vero(root: Path) -> Path:
"""A minimal stand-in for the vero source tree (compiler just copies it)."""
Expand Down Expand Up @@ -175,12 +184,52 @@ def test_instruction_warns_baseline_not_selectable(built):
# baseline died with "no candidate experiments" at finalize).
text = (built / "instruction.md").read_text()
assert "other than the seeded" in text
assert "spends budget without" in text
assert "create no candidate" in text


@pytest.mark.skipif(
    not _HAS_FREE_BASELINE, reason="sidecar in this tree has no free baseline eval"
)
def test_instruction_advertises_free_baseline_eval(built):
    """The rendered instruction must advertise the free first baseline eval.

    Found live: an optimizer produced only regressing candidates and never
    learned where zero was, because the old wording told it baseline evals
    waste budget. If the instruction stays silent, the offer goes unclaimed.
    """
    instruction_path = built / "instruction.md"
    rendered = instruction_path.read_text()
    required_phrases = (
        # The offer itself, and why to take it.
        "budget-free",
        "reference score",
        # The freebie is scoped per task rather than per split; pinning the
        # wording keeps a multi-split task from assuming one per split.
        "once per task",
    )
    for phrase in required_phrases:
        assert phrase in rendered


@pytest.mark.skipif(
    _HAS_FREE_BASELINE, reason="sidecar in this tree grants the free baseline eval"
)
def test_instruction_omits_free_baseline_claim_when_unsupported(built):
    """Merge-order guard for the free-baseline bullet.

    If the compiler chain lands without the free-baseline chain, the
    instruction must not promise a freebie the sidecar will meter: acting on
    that promise burns a metered eval on a commit auto_best cannot select
    (fatal on a run_budget=1 task).
    """
    instruction_path = built / "instruction.md"
    rendered = instruction_path.read_text()
    assert "budget-free" not in rendered


def test_instruction_tells_agent_to_spend_whole_budget(built):
    """The rendered instruction must tell the agent to use its whole budget.

    Two live runs ended with nearly half the eval budget unspent, so the
    instruction has to state that unspent evals are wasted and that
    re-measuring the best candidate is a legitimate way to spend them.
    """
    instruction_path = built / "instruction.md"
    rendered = instruction_path.read_text()
    for phrase in ("Unspent budget is wasted", "re-measuring your best candidate"):
        assert phrase in rendered


def test_submit_mode_instruction_has_no_baseline_warning(tmp_path, monkeypatch):
# The warning belongs to the auto_best branch only; pin the conditional
# boundary so a template refactor cannot leak it into submit-mode tasks.
# The not-selectable warning belongs to the auto_best branch only; pin the
# conditional boundary so a template refactor cannot leak it into
# submit-mode tasks. The free-baseline and spend-the-budget rules are
# mode-agnostic (metering does not depend on the selection mode) and must
# survive in both.
monkeypatch.setenv("VERO_SKIP_SECRET_CHECK", "1")
config = BuildConfig(
name="vero/gsm8k-opt",
Expand All @@ -195,4 +244,6 @@ def test_submit_mode_instruction_has_no_baseline_warning(tmp_path, monkeypatch):
out = compile_task(config, tmp_path / "task", vero_root=_stub_vero(tmp_path))
text = (out / "instruction.md").read_text()
assert "other than the seeded" not in text
assert "spends budget without" not in text
assert "create no candidate" not in text
assert ("budget-free" in text) == _HAS_FREE_BASELINE
assert "Unspent budget is wasted" in text