scaleapi · shehabyasser-scale · Jul 3, 2026 · Jul 4, 2026
diff --git a/vero/src/vero/harbor/build/compiler.py b/vero/src/vero/harbor/build/compiler.py
@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+import dataclasses
 import logging
 import re
 import shutil
@@ -17,6 +18,7 @@
 from jinja2 import Environment, FileSystemLoader
 
 from vero.harbor.build.config import BuildConfig
+from vero.harbor.protocol import StatusSummary
 
 logger = logging.getLogger(__name__)
 
@@ -278,6 +280,14 @@ def compile_task(
         submit_enabled=config.submit_enabled,
         eval_num_samples=None,
         bake_inner_task=bool(config.inner_task),
+        # The free-baseline bullet may only render when the sidecar shipping in
+        # this same tree actually grants the free eval. The feature lives on a
+        # different PR chain than the compiler, and an instruction promising a
+        # free eval the sidecar does not grant would send the agent to burn a
+        # metered eval on a commit auto_best cannot select. Introspecting the
+        # protocol keeps the instruction truthful under any merge order.
+        free_baseline="free_baseline_available"
+        in {f.name for f in dataclasses.fields(StatusSummary)},
     )
     _render(jenv, "task.toml.j2", out / "task.toml", **ctx)
     _render(jenv, "instruction.md.j2", out / "instruction.md", **ctx)

diff --git a/vero/src/vero/harbor/build/templates/instruction.md.j2 b/vero/src/vero/harbor/build/templates/instruction.md.j2
@@ -19,13 +19,24 @@ progress on the splits you *are* allowed to evaluate, within a fixed budget.
 {% if submit_enabled %}5. When done, nominate your best commit: `vero harbor submit`.{% else %}
 The best commit you evaluate on `{{ selection_split }}` is selected automatically and
 scored on the hidden test split at the end. Only commits *other than the seeded
-baseline* are selectable: evaluating the unmodified baseline spends budget without
-creating a candidate, so make sure at least one eval is of a commit that contains
-your changes.{% endif %}
+baseline* are selectable: baseline evals create no candidate, so make sure at least
+one eval is of a commit that contains your changes.{% endif %}
 
 ## Rules
 
 - Budget is finite and metered per split — spend it wisely.
+{% if free_baseline %}
+- Your first eval of the seeded baseline (the commit you started from) is
+  budget-free: once per task, not once per split, and it stays free after you have
+  made commits (`vero harbor eval --commit <baseline-sha> ...`). Take it on
+  `{{ selection_split }}` before your first candidate eval: it is the
+  reference score you must beat, and without it you cannot tell an improvement
+  from a regression. Repeat baseline evals are metered. `vero harbor status`
+  shows the baseline sha and whether the free eval is still available.
+{% endif %}
+- Scores are noisy. Unspent budget is wasted: if you finish with evals left, spend
+  them re-measuring your best candidate to confirm its score, or trying one more
+  variant, rather than stopping early.
 - The test split is hidden: you cannot evaluate it, and its labels never reach this
   container. Trying to read it will fail.
 - The scorer is locked. Only the eval sidecar scores.
diff --git a/vero/tests/test_harbor_build.py b/vero/tests/test_harbor_build.py
@@ -4,6 +4,7 @@
 
 from __future__ import annotations
 
+import dataclasses
 import json
 import subprocess
 import tomllib
@@ -13,8 +14,16 @@
 import yaml
 
 from vero.harbor.build import BuildConfig, compile_task
+from vero.harbor.protocol import StatusSummary
 from vero.harbor.serve import ServeConfig
 
+# Whether the sidecar in THIS tree grants the budget-free first baseline eval.
+# The feature and the compiler live on different PR chains; the instruction
+# tests below run the arm that matches whichever chains are merged here.
+_HAS_FREE_BASELINE = "free_baseline_available" in {
+    f.name for f in dataclasses.fields(StatusSummary)
+}
+
 
 def _stub_vero(root: Path) -> Path:
     """A minimal stand-in for the vero source tree (compiler just copies it)."""
@@ -175,12 +184,52 @@ def test_instruction_warns_baseline_not_selectable(built):
     # baseline died with "no candidate experiments" at finalize).
     text = (built / "instruction.md").read_text()
     assert "other than the seeded" in text
-    assert "spends budget without" in text
+    assert "create no candidate" in text
+
+
+@pytest.mark.skipif(
+    not _HAS_FREE_BASELINE, reason="sidecar in this tree has no free baseline eval"
+)
+def test_instruction_advertises_free_baseline_eval(built):
+    # The sidecar gives the first baseline eval away free; the instruction must
+    # say so or the offer goes unclaimed (found live: an optimizer produced only
+    # regressing candidates and never learned the baseline score to beat,
+    # because the old wording told it baseline evals waste budget).
+    text = (built / "instruction.md").read_text()
+    assert "budget-free" in text
+    assert "reference score" in text
+    # ...and it must say the freebie is granted once per task (not once per
+    # split), so the agent of a multi-split task knows not to waste it.
+    assert "once per task" in text
+
+
+@pytest.mark.skipif(
+    _HAS_FREE_BASELINE, reason="sidecar in this tree grants the free baseline eval"
+)
+def test_instruction_omits_free_baseline_claim_when_unsupported(built):
+    # Merge-order guard: when the compiler PR chain lands without the
+    # free-baseline PR chain, the instruction must not promise a freebie that
+    # the sidecar will meter; acting on that promise burns a metered eval on
+    # a commit auto_best cannot select (fatal on a run_budget=1 task).
+    text = (built / "instruction.md").read_text()
+    assert "budget-free" not in text
+
+
+def test_instruction_tells_agent_to_spend_whole_budget(built):
+    # Two live runs ended with nearly half the eval budget unspent; the
+    # instruction must state that unspent evals are wasted and that
+    # re-measuring the best candidate is a legitimate way to spend them.
+    text = (built / "instruction.md").read_text()
+    assert "Unspent budget is wasted" in text
+    assert "re-measuring your best candidate" in text
 
 
 def test_submit_mode_instruction_has_no_baseline_warning(tmp_path, monkeypatch):
-    # The warning belongs to the auto_best branch only; pin the conditional
-    # boundary so a template refactor cannot leak it into submit-mode tasks.
+    # The not-selectable warning belongs to the auto_best branch only; pin the
+    # conditional boundary so a template refactor cannot leak it into
+    # submit-mode tasks. The free-baseline and spend-the-budget rules are
+    # mode-agnostic (metering does not depend on the selection mode) and must
+    # survive in both.
     monkeypatch.setenv("VERO_SKIP_SECRET_CHECK", "1")
     config = BuildConfig(
         name="vero/gsm8k-opt",
@@ -195,4 +244,6 @@ def test_submit_mode_instruction_has_no_baseline_warning(tmp_path, monkeypatch):
     out = compile_task(config, tmp_path / "task", vero_root=_stub_vero(tmp_path))
     text = (out / "instruction.md").read_text()
     assert "other than the seeded" not in text
-    assert "spends budget without" not in text
+    assert "create no candidate" not in text
+    assert ("budget-free" in text) == _HAS_FREE_BASELINE
+    assert "Unspent budget is wasted" in text