Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions vero/src/vero/harbor/serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ class ServeConfig(BaseModel):
# Total attempts for the finalize baseline eval (>=1): a transient nested-run
# failure once silently dropped the regression check.
baseline_score_attempts: int = 2
# auto_best never ships a candidate that fails to beat the untouched baseline
# on the selection split; it reverts to base_commit instead. The check is
# skipped when no base_commit is set, since there is nothing to compare against.
auto_best_baseline_floor: bool = True

# volumes / token
agent_volume: str
Expand Down Expand Up @@ -237,6 +240,7 @@ async def build_components(config: ServeConfig) -> tuple[EvaluationSidecar, Veri
selection_dataset_id=config.dataset_id,
score_baseline=config.score_baseline,
baseline_score_attempts=config.baseline_score_attempts,
auto_best_baseline_floor=config.auto_best_baseline_floor,
)

token = generate_token()
Expand Down
41 changes: 40 additions & 1 deletion vero/src/vero/harbor/verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ def __init__(
rescore_top_k: int = 3,
score_baseline: bool = False,
baseline_score_attempts: int = 2,
auto_best_baseline_floor: bool = True,
):
self.engine = engine
self.admin_volume = Path(admin_volume)
Expand All @@ -68,6 +69,12 @@ def __init__(
self.selection_dataset_id = selection_dataset_id
self.rescore_top_k = rescore_top_k
self.score_baseline = score_baseline
# auto_best selection floor: never ship a candidate that fails to beat the
# untouched baseline on the selection split. Without it, auto_best (which
# excludes base_commit from the candidate pool) selects the least-bad
# candidate even when every candidate regressed, shipping a regression
# (observed live: a weak inner model, every candidate below baseline).
self.auto_best_baseline_floor = auto_best_baseline_floor
# Baseline scoring is retried this many times total before its outcome is
# reported as an error; the nested eval can fail transiently (a nested
# harbor run crashing right after a large eval), and a single blip must
Expand Down Expand Up @@ -284,4 +291,36 @@ async def _best_from_db(self) -> str:
)
# Highest admin score wins; ties break to the earliest shortlist position.
rescored.sort(key=lambda t: (-t[0], t[1]))
return rescored[0][2]
best_score, _, best_commit = rescored[0]

# Selection floor: never ship a candidate that fails to beat the untouched
# baseline on the selection split. auto_best excludes base_commit from the
# candidate pool, so without this it selects the least-bad candidate even
# when every candidate regressed. Revert to the seed (base_commit) instead.
# The candidate must score strictly higher than the baseline: an exact tie
# also reverts, because if the optimizer cannot show an improvement, shipping
# the seed is the safe outcome. Skipped when base_commit is None (nothing to
# compare against); otherwise costs one extra admin eval on the selection split.
if self.auto_best_baseline_floor and self.base_commit is not None:
base_dataset_id = self.selection_dataset_id
if base_dataset_id is None:
base_dataset_id = shortlist.iloc[0].get("dataset_subset_dataset_id")
base_exp = await self.engine.evaluate_admin(
task=self.selection_task,
dataset_id=base_dataset_id,
split=self.selection_split,
commit=self.base_commit,
)
base_s = base_exp.result.score()
base_score = float(base_s) if base_s is not None else default_minimum_score
if best_score <= base_score:
logger.info(
"auto_best floor: best candidate %s (admin_score=%s) does not beat "
"baseline %s (admin_score=%s); reverting to base_commit.",
best_commit, best_score, self.base_commit, base_score,
)
return self.base_commit
logger.info(
"auto_best floor: best candidate %s (%s) beats baseline (%s); keeping it.",
best_commit, best_score, base_score,
)
return best_commit
172 changes: 170 additions & 2 deletions vero/tests/test_harbor_verifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,7 +111,9 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
assert rewards["reward"] == 0.95

@pytest.mark.asyncio
async def test_auto_best_excludes_baseline_after_rescore(self, tmp_path):
async def test_auto_best_excludes_baseline_from_ranking(self, tmp_path):
# base_commit is excluded from the candidate ranking pool. Floor off here so
# the test isolates ranking-exclusion (the floor is covered separately below).
engine = MagicMock()
engine.db.get_experiments_df.return_value = pd.DataFrame(
{
Expand All @@ -134,6 +136,7 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
selection_split="validation",
base_commit="base",
selection_task="math",
auto_best_baseline_floor=False,
targets=[VerificationTarget(task="math", dataset_id="ds1", split="test", reward_key="reward")],
)
await v.finalize()
Expand All @@ -143,6 +146,169 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"


class TestAutoBestBaselineFloor:
    """The auto_best selection floor: a candidate ships only if it beats the baseline.

    base_commit is left out of auto_best's candidate pool, so with no floor the
    least-bad candidate wins even when every candidate regressed (seen live: a
    weak inner model put every candidate below baseline and a -0.10 regression
    shipped although the baseline was available for free). With the floor on,
    selection falls back to the seed commit instead.
    """

    def _df(self):
        """Return experiment rows for the seed ('base') and one candidate ('agent')."""
        columns = {
            "dataset_subset_split": ["train", "train"],
            "dataset_subset_dataset_id": ["ds1", "ds1"],
            "candidate_commit": ["base", "agent"],
            # The agent's own recorded score is inflated; only admin scores count.
            "mean_score": [0.3, 0.9],
            "candidate_created_at": [1, 2],
        }
        return pd.DataFrame(columns)

    @staticmethod
    def _engine(experiments, score_for):
        """Build a mock engine over `experiments` whose admin evals return `score_for(commit, split)`."""

        # Keyword-only signature on purpose: an unexpected argument from the
        # verifier fails the test instead of being silently absorbed.
        async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
            value = score_for(commit, split)
            return MagicMock(result=MagicMock(score=MagicMock(return_value=value)))

        engine = MagicMock()
        engine.db.get_experiments_df.return_value = experiments
        engine.evaluate_admin = AsyncMock(side_effect=_admin)
        return engine

    @staticmethod
    def _verifier(engine, tmp_path, **extra):
        """Build an auto_best Verifier selecting on 'train' and targeting 'validation'.

        `extra` carries the per-test knobs (base_commit, auto_best_baseline_floor);
        anything not passed is left to the Verifier's own defaults.
        """
        return Verifier(
            engine=engine,
            admin_volume=tmp_path,
            reward_mode="auto_best",
            selection_split="train",
            selection_task="math",
            targets=[VerificationTarget(task="math", dataset_id="ds1", split="validation", reward_key="reward")],
            **extra,
        )

    @staticmethod
    def _commits(engine):
        """List the commit of every admin eval the engine was awaited with, in order."""
        return [call.kwargs["commit"] for call in engine.evaluate_admin.await_args_list]

    @pytest.mark.asyncio
    async def test_reverts_to_base_when_no_candidate_beats_baseline(self, tmp_path):
        # Admin scores: agent 0.2 and base 0.3 on the selection split, while the
        # reverted base gets 0.35 on the target split. The values are distinct so
        # the target eval can be told apart from the floor comparison.
        def score_for(commit, split):
            if commit != "base":
                return 0.2
            return 0.35 if split == "validation" else 0.3

        engine = self._engine(self._df(), score_for)
        verifier = self._verifier(engine, tmp_path, base_commit="base")

        outcome = await verifier.finalize()

        # The winner fell back to base, so the reward is the SEED's target-split
        # score rather than the regressed candidate's.
        assert outcome["rewards"] == {"reward": 0.35}
        # base was admin-scored for the floor comparison
        assert "base" in self._commits(engine)
        # The last eval is the target eval of the reverted commit (validation
        # split), not the floor comparison (train split).
        final_call = engine.evaluate_admin.await_args.kwargs
        assert final_call["commit"] == "base"
        assert final_call["split"] == "validation"

    @pytest.mark.asyncio
    async def test_exact_tie_reverts_to_base(self, tmp_path):
        # The floor compares with '<=', so equal scores revert: when the optimizer
        # cannot show an improvement, the seed is the safe thing to ship. This
        # pins the boundary so a refactor to '<' fails loudly.
        engine = self._engine(self._df(), lambda commit, split: 0.3)  # all equal
        verifier = self._verifier(engine, tmp_path, base_commit="base")

        await verifier.finalize()

        assert engine.evaluate_admin.await_args.kwargs["commit"] == "base"

    @pytest.mark.asyncio
    async def test_floor_noop_without_base_commit(self, tmp_path):
        # Floor left on (the default) with no base_commit given: the floor must
        # quietly do nothing, never evaluating commit=None, and the best
        # candidate ships.
        agent_only = pd.DataFrame(
            {
                "dataset_subset_split": ["train"],
                "dataset_subset_dataset_id": ["ds1"],
                "candidate_commit": ["agent"],
                "mean_score": [0.9],
                "candidate_created_at": [1],
            }
        )
        engine = self._engine(agent_only, lambda commit, split: 0.5)
        verifier = self._verifier(engine, tmp_path)

        await verifier.finalize()

        assert None not in self._commits(engine)
        assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"

    @pytest.mark.asyncio
    async def test_keeps_candidate_that_beats_baseline(self, tmp_path):
        def score_for(commit, split):
            # agent genuinely improves on the seed
            return 0.3 if commit == "base" else 0.6

        engine = self._engine(self._df(), score_for)
        verifier = self._verifier(engine, tmp_path, base_commit="base")

        await verifier.finalize()

        # 'agent' beats base, so it is the commit selected and target-scored
        assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"

    @pytest.mark.asyncio
    async def test_floor_off_ships_least_bad_candidate(self, tmp_path):
        # Floor disabled keeps the old behavior: the top candidate ships even
        # though it did not beat the baseline, and base is never scored.
        engine = self._engine(self._df(), lambda commit, split: 0.2)
        verifier = self._verifier(
            engine,
            tmp_path,
            base_commit="base",
            auto_best_baseline_floor=False,
        )

        await verifier.finalize()

        assert "base" not in self._commits(engine)
        assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"


class TestNoCandidateFallback:
"""finalize() floors rewards when the optimizer produced no candidate.

Expand Down Expand Up @@ -212,7 +378,8 @@ async def test_auto_best_missing_db_still_raises(self, tmp_path):

@pytest.mark.asyncio
async def test_candidates_present_keeps_normal_selection(self, tmp_path):
# Regression guard: the fallback must not swallow the normal path.
# Regression guard: the fallback must not swallow the normal path. Floor off
# so this isolates candidate selection (the floor is covered separately).
engine = MagicMock()
engine.db.get_experiments_df.return_value = pd.DataFrame(
{
Expand All @@ -234,6 +401,7 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
reward_mode="auto_best",
selection_split="train",
base_commit="base",
auto_best_baseline_floor=False,
targets=[VerificationTarget(task=None, dataset_id="ds1", split="validation", reward_key="accuracy")],
)
rewards = (await v.finalize())["rewards"]
Expand Down