scaleapi · shehabyasser-scale · Jul 4, 2026
diff --git a/vero/src/vero/harbor/serve.py b/vero/src/vero/harbor/serve.py
@@ -74,6 +74,9 @@ class ServeConfig(BaseModel):
     # Total attempts for the finalize baseline eval (>=1): a transient nested-run
     # failure once silently dropped the regression check.
     baseline_score_attempts: int = 2
+    # auto_best never ships a candidate that fails to beat the untouched baseline
+    # on the selection split; it reverts to base_commit instead (no-op if base_commit is None).
+    auto_best_baseline_floor: bool = True
 
     # volumes / token
     agent_volume: str
@@ -237,6 +240,7 @@ async def build_components(config: ServeConfig) -> tuple[EvaluationSidecar, Veri
         selection_dataset_id=config.dataset_id,
         score_baseline=config.score_baseline,
         baseline_score_attempts=config.baseline_score_attempts,
+        auto_best_baseline_floor=config.auto_best_baseline_floor,
     )
 
     token = generate_token()

diff --git a/vero/src/vero/harbor/verifier.py b/vero/src/vero/harbor/verifier.py
@@ -54,6 +54,7 @@ def __init__(
         rescore_top_k: int = 3,
         score_baseline: bool = False,
         baseline_score_attempts: int = 2,
+        auto_best_baseline_floor: bool = True,
     ):
         self.engine = engine
         self.admin_volume = Path(admin_volume)
@@ -68,6 +69,12 @@ def __init__(
         self.selection_dataset_id = selection_dataset_id
         self.rescore_top_k = rescore_top_k
         self.score_baseline = score_baseline
+        # auto_best selection floor: never ship a candidate that fails to beat the
+        # untouched baseline on the selection split. Without it, auto_best (which
+        # excludes base_commit from the candidate pool) selects the least-bad
+        # candidate even when every candidate regressed, shipping a regression
+        # (observed live: a weak inner model, every candidate below baseline).
+        self.auto_best_baseline_floor = auto_best_baseline_floor
         # Baseline scoring is retried this many times total before its outcome is
         # reported as an error; the nested eval can fail transiently (a nested
         # harbor run crashing right after a large eval), and a single blip must
@@ -284,4 +291,36 @@ async def _best_from_db(self) -> str:
             )
         # Highest admin score wins; ties break to the earliest shortlist position.
         rescored.sort(key=lambda t: (-t[0], t[1]))
-        return rescored[0][2]
+        best_score, _, best_commit = rescored[0]
+
+        # Selection floor: never ship a candidate that fails to beat the untouched
+        # baseline on the selection split. auto_best excludes base_commit from the
+        # candidate pool, so without this it selects the least-bad candidate even
+        # when every candidate regressed. Revert to the seed instead. Strict '>' so
+        # an exact score tie also reverts: if the optimizer cannot show an
+        # improvement, shipping the seed is the safe outcome. Needs a base_commit to
+        # compare against; costs one extra admin eval on the selection split.
+        if self.auto_best_baseline_floor and self.base_commit is not None:
+            base_dataset_id = self.selection_dataset_id
+            if base_dataset_id is None:
+                base_dataset_id = shortlist.iloc[0].get("dataset_subset_dataset_id")
+            base_exp = await self.engine.evaluate_admin(
+                task=self.selection_task,
+                dataset_id=base_dataset_id,
+                split=self.selection_split,
+                commit=self.base_commit,
+            )
+            base_s = base_exp.result.score()
+            base_score = float(base_s) if base_s is not None else default_minimum_score
+            if best_score <= base_score:
+                logger.info(
+                    "auto_best floor: best candidate %s (admin_score=%s) does not beat "
+                    "baseline %s (admin_score=%s); reverting to base_commit.",
+                    best_commit, best_score, self.base_commit, base_score,
+                )
+                return self.base_commit
+            logger.info(
+                "auto_best floor: best candidate %s (%s) beats baseline (%s); keeping it.",
+                best_commit, best_score, base_score,
+            )
+        return best_commit
diff --git a/vero/tests/test_harbor_verifier.py b/vero/tests/test_harbor_verifier.py
@@ -111,7 +111,9 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
         assert rewards["reward"] == 0.95
 
     @pytest.mark.asyncio
-    async def test_auto_best_excludes_baseline_after_rescore(self, tmp_path):
+    async def test_auto_best_excludes_baseline_from_ranking(self, tmp_path):
+        # base_commit is excluded from the candidate ranking pool. Floor off here so
+        # the test isolates ranking-exclusion (the floor is covered separately below).
         engine = MagicMock()
         engine.db.get_experiments_df.return_value = pd.DataFrame(
             {
@@ -134,6 +136,7 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
             selection_split="validation",
             base_commit="base",
             selection_task="math",
+            auto_best_baseline_floor=False,
             targets=[VerificationTarget(task="math", dataset_id="ds1", split="test", reward_key="reward")],
         )
         await v.finalize()
@@ -143,6 +146,169 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
         assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"
 
 
+class TestAutoBestBaselineFloor:
+    """auto_best never ships a candidate that fails to beat the baseline.
+
+    auto_best excludes base_commit from the candidate pool, so without a floor it
+    selects the least-bad candidate even when every candidate regressed (observed
+    live: a weak inner model, every candidate below baseline, shipped a -0.10
+    regression despite the free baseline being available). The floor reverts to the
+    seed instead.
+    """
+
+    def _df(self):
+        return pd.DataFrame(
+            {
+                "dataset_subset_split": ["train", "train"],
+                "dataset_subset_dataset_id": ["ds1", "ds1"],
+                "candidate_commit": ["base", "agent"],
+                "mean_score": [0.3, 0.9],  # agent inflated its own recorded score
+                "candidate_created_at": [1, 2],
+            }
+        )
+
+    @pytest.mark.asyncio
+    async def test_reverts_to_base_when_no_candidate_beats_baseline(self, tmp_path):
+        engine = MagicMock()
+        engine.db.get_experiments_df.return_value = self._df()
+
+        # agent admin-scores 0.2 on the selection split; base admin-scores 0.3;
+        # the reverted base scores 0.35 on the target split (distinct values so the
+        # assertions can tell the target eval apart from the floor comparison).
+        async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
+            if commit == "base":
+                score = 0.35 if split == "validation" else 0.3
+            else:
+                score = 0.2
+            return MagicMock(result=MagicMock(score=MagicMock(return_value=score)))
+
+        engine.evaluate_admin = AsyncMock(side_effect=_admin)
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="auto_best",
+            selection_split="train",
+            base_commit="base",
+            selection_task="math",
+            targets=[VerificationTarget(task="math", dataset_id="ds1", split="validation", reward_key="reward")],
+        )
+        result = await v.finalize()
+        # winner reverted to base -> the emitted reward is the SEED's target-split
+        # score, not the regressed candidate's
+        assert result["rewards"] == {"reward": 0.35}
+        rescored = [c.kwargs["commit"] for c in engine.evaluate_admin.await_args_list]
+        assert "base" in rescored  # base was admin-scored for the floor comparison
+        # the final call is the target eval of the reverted commit (validation split),
+        # not the floor comparison (train split)
+        assert engine.evaluate_admin.await_args.kwargs["commit"] == "base"
+        assert engine.evaluate_admin.await_args.kwargs["split"] == "validation"
+
+    @pytest.mark.asyncio
+    async def test_exact_tie_reverts_to_base(self, tmp_path):
+        # The floor reverts on best <= base: an exact tie reverts. If the optimizer cannot
+        # show an improvement, shipping the seed is the safe outcome. Pins the
+        # boundary so a refactor to '<' regresses loudly.
+        engine = MagicMock()
+        engine.db.get_experiments_df.return_value = self._df()
+
+        async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
+            return MagicMock(result=MagicMock(score=MagicMock(return_value=0.3)))  # all equal
+
+        engine.evaluate_admin = AsyncMock(side_effect=_admin)
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="auto_best",
+            selection_split="train",
+            base_commit="base",
+            selection_task="math",
+            targets=[VerificationTarget(task="math", dataset_id="ds1", split="validation", reward_key="reward")],
+        )
+        await v.finalize()
+        assert engine.evaluate_admin.await_args.kwargs["commit"] == "base"
+
+    @pytest.mark.asyncio
+    async def test_floor_noop_without_base_commit(self, tmp_path):
+        # floor on (default) but base_commit=None: the floor must silently no-op,
+        # never issuing an eval with commit=None, and the best candidate ships.
+        engine = MagicMock()
+        engine.db.get_experiments_df.return_value = pd.DataFrame(
+            {
+                "dataset_subset_split": ["train"],
+                "dataset_subset_dataset_id": ["ds1"],
+                "candidate_commit": ["agent"],
+                "mean_score": [0.9],
+                "candidate_created_at": [1],
+            }
+        )
+
+        async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
+            return MagicMock(result=MagicMock(score=MagicMock(return_value=0.5)))
+
+        engine.evaluate_admin = AsyncMock(side_effect=_admin)
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="auto_best",
+            selection_split="train",
+            selection_task="math",
+            targets=[VerificationTarget(task="math", dataset_id="ds1", split="validation", reward_key="reward")],
+        )
+        await v.finalize()
+        commits = [c.kwargs["commit"] for c in engine.evaluate_admin.await_args_list]
+        assert None not in commits
+        assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"
+
+    @pytest.mark.asyncio
+    async def test_keeps_candidate_that_beats_baseline(self, tmp_path):
+        engine = MagicMock()
+        engine.db.get_experiments_df.return_value = self._df()
+
+        async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
+            score = 0.3 if commit == "base" else 0.6  # agent genuinely improves
+            return MagicMock(result=MagicMock(score=MagicMock(return_value=score)))
+
+        engine.evaluate_admin = AsyncMock(side_effect=_admin)
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="auto_best",
+            selection_split="train",
+            base_commit="base",
+            selection_task="math",
+            targets=[VerificationTarget(task="math", dataset_id="ds1", split="validation", reward_key="reward")],
+        )
+        await v.finalize()
+        # 'agent' beats base -> it is selected and target-scored
+        assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"
+
+    @pytest.mark.asyncio
+    async def test_floor_off_ships_least_bad_candidate(self, tmp_path):
+        # With the floor disabled, the old behavior stands: the best candidate is
+        # shipped even if it did not beat the baseline (base is never scored).
+        engine = MagicMock()
+        engine.db.get_experiments_df.return_value = self._df()
+
+        async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
+            return MagicMock(result=MagicMock(score=MagicMock(return_value=0.2)))
+
+        engine.evaluate_admin = AsyncMock(side_effect=_admin)
+        v = Verifier(
+            engine=engine,
+            admin_volume=tmp_path,
+            reward_mode="auto_best",
+            selection_split="train",
+            base_commit="base",
+            selection_task="math",
+            auto_best_baseline_floor=False,
+            targets=[VerificationTarget(task="math", dataset_id="ds1", split="validation", reward_key="reward")],
+        )
+        await v.finalize()
+        rescored = [c.kwargs["commit"] for c in engine.evaluate_admin.await_args_list]
+        assert "base" not in rescored
+        assert engine.evaluate_admin.await_args.kwargs["commit"] == "agent"
+
+
 class TestNoCandidateFallback:
     """finalize() floors rewards when the optimizer produced no candidate.
 
@@ -212,7 +378,8 @@ async def test_auto_best_missing_db_still_raises(self, tmp_path):
 
     @pytest.mark.asyncio
     async def test_candidates_present_keeps_normal_selection(self, tmp_path):
-        # Regression guard: the fallback must not swallow the normal path.
+        # Regression guard: the fallback must not swallow the normal path. Floor off
+        # so this isolates candidate selection (the floor is covered separately).
         engine = MagicMock()
         engine.db.get_experiments_df.return_value = pd.DataFrame(
             {
@@ -234,6 +401,7 @@ async def _admin(*, task, dataset_id, split, commit, sample_ids=None):
             reward_mode="auto_best",
             selection_split="train",
             base_commit="base",
+            auto_best_baseline_floor=False,
             targets=[VerificationTarget(task=None, dataset_id="ds1", split="validation", reward_key="accuracy")],
         )
         rewards = (await v.finalize())["rewards"]