auto-benchmark model selection: pick best-scoring model per unit type

New module src/resources/extensions/sf/benchmark-selector.ts implements
benchmark-driven model selection. When models.<unit> is not pinned,
preferences-models.ts falls through to pick the highest-scoring
candidate from allowed_providers × pi-ai's model catalog, ranked
against a per-unit-type weight profile.

Weight profiles per unit type:
  plan-milestone / plan-slice  → agent-planning (swe_bench .25, lcb
                                  .20, hle .15, gpqa .15, mmlu_pro .15,
                                  aime .10)
  research-*                    → mixed (mmlu_pro, hle, human_eval,
                                  browse_comp, simple_qa, gpqa)
  execute-task                  → coding (swe_bench .35, swe_bench_v
                                  .25, lcb .20, human_eval .15)
  execution_simple / complete-* → fast+correct (human_eval .40,
                                  instruction_following .35, ruler .25)
  gate-evaluate                 → review (swe_bench .30, hle .25,
                                  gpqa .25, ifeval .20)
  validate-milestone            → validation (hle .30, gpqa .25,
                                  mmlu_pro .25, swe_bench .20)

Key design decisions:
  - Missing dimensions are dropped (normalised by populated weight),
    so a model with 2 strong populated scores isn't crushed by a peer
    with 5 mediocre ones (see the sketch after this list).
  - swe_bench ↔ swe_bench_verified are fungible — some vendors publish
    one, some the other; treat as equivalent.
  - Provider diversification in fallbacks so one provider going 429
    doesn't kill the whole chain.
  - Score ties broken by coverage, then lexical — deterministic.
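
To make the normalisation and provider-diversity decisions concrete, a
minimal sketch of calling the new selector directly (provider names,
model IDs and benchmark numbers are hypothetical; the `benchmarks`
override is the test hook the module exposes):

  import { selectByBenchmarks } from "./benchmark-selector.js";

  // Hypothetical records: model-a publishes two of the three dimensions in
  // the "execution_simple" profile (human_eval .40, instruction_following
  // .35, long_context_ruler .25); model-b publishes all three.
  const benchmarks = {
    "model-a": { human_eval: 92, instruction_following: 88 },
    "model-b": { human_eval: 70, instruction_following: 65, long_context_ruler: 60 },
  };

  const result = selectByBenchmarks(
    "execution_simple",
    [
      { provider: "prov-x", id: "model-a" },
      { provider: "prov-y", id: "model-b" },
    ],
    { benchmarks },
  );

  // model-a: (0.40*92 + 0.35*88) / 0.75 ≈ 90.1  (normalised by populated weight)
  // model-b: (0.40*70 + 0.35*65 + 0.25*60) / 1.00 = 65.75
  // result.primary   === "prov-x/model-a"
  // result.fallbacks === ["prov-y/model-b"]  (different provider, kept for diversity)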

Also updates MiniMax-M2/M2.5/M2.7 benchmarks with real numbers from
the M2 official README (DeepWiki sourced) and MiniMax-M2.5 card
(minimax.io): swe_bench_verified 69.4 (M2) / 80.2 (M2.5/M2.7), LCB 83,
HLE 31.8 (w/ tools — more representative for agent work than no-tools
12.5),
AIME25 78, GPQA-D 78, MMLU-Pro 82. Context windows bumped to
weights-level: M2 400K, M2.5/M2.7 1M (endpoints may cap lower).

Verified end-to-end: with dr-repo's allow-list
(kimi-coding/minimax/zai/opencode-go/mistral) and models.* absent,
resolveModelWithFallbacksForUnit() returns:
  plan-milestone     → opencode-go/glm-5.1 (+3 fallbacks)
  research-slice     → mistral/codestral-latest
  execute-task       → mistral/mistral-large-latest
  execution_simple   → kimi-coding/k2p5
  gate-evaluate      → opencode-go/glm-5.1
  validate-milestone → mistral/magistral-medium-latest
  subagent           → mistral/mistral-large-latest

Users can still pin individual units (existing models.* behaviour
unchanged) or rely fully on auto-selection by omitting them.
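
For example, a user could pin a single unit and let the rest auto-select
(shape follows the legacy/extended forms quoted in the preferences-models.ts
doc comment; the specific model IDs here are illustrative):

  models:
    planning: { model: opencode-go/glm-5.1, fallbacks: [kimi-coding/k2p5] }
    # all other units omitted → resolved via the benchmark profiles above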

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Mikael Hugo 2026-04-19 09:43:26 +02:00
parent 6450b37025
commit 0b8a1c246f
3 changed files with 378 additions and 39 deletions

src/resources/extensions/sf/benchmark-selector.ts

@@ -0,0 +1,295 @@
/**
* Benchmark-driven model selection.
*
* When `models.<unit>` is not set in preferences, this module picks the
* best-scoring model from the allow-listed providers for each unit type.
* Scoring is a weighted combination of published benchmarks
* (`learning/data/model-benchmarks.json`) with per-unit-type profiles
* that emphasise the dimensions that actually matter for that work:
* - plan-milestone / plan-slice    → reasoning-heavy (hle, aime, gpqa)
* - research-*                     → mixed (mmlu_pro, browse_comp, ...)
* - execute-task (heavy)           → coding (swe_bench, live_code_bench)
* - execute-task (light/standard)  → coding + instruction following
* - complete-* / execution_simple  → fast+correct (human_eval, ifeval)
* - gate-evaluate / validate-*     → reasoning + coding
*
* Models with no benchmark record at all score 0 (ranked last rather than
* excluded), so freshly-launched models without benchmark data are still
* dispatchable; they just don't displace an already-ranked peer.
*
* This is the inner primitive behind the "auto-benchmark" preference mode
* users select by leaving `models.*` empty.
*/
import { existsSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { fileURLToPath } from "node:url";
import { dirname } from "node:path";
// ─── Types ───────────────────────────────────────────────────────────────────
type BenchmarkKey =
| "swe_bench" | "swe_bench_verified" | "live_code_bench" | "human_eval"
| "hle" | "aime_2026" | "gpqa" | "mmlu_pro" | "bbh"
| "browse_comp" | "simple_qa" | "long_context_ruler"
| "arena_elo" | "instruction_following";
interface BenchmarkRecord {
[k: string]: number | string | null | undefined;
context_window?: number;
max_output_tokens?: number;
}
interface BenchmarkData {
_meta?: unknown;
[modelKey: string]: BenchmarkRecord | unknown;
}
export interface CandidateModel {
/** Provider ID (e.g. "kimi-coding", "mistral", "opencode-go") */
provider: string;
/** Bare model ID without provider prefix (e.g. "k2p5", "codestral-latest") */
id: string;
}
export interface BenchmarkSelectionResult {
primary: string; // "provider/model-id"
fallbacks: string[]; // ordered, deduplicated
/** Raw per-model score, exposed for logging / UI */
scores: Record<string, number>;
/** Number of benchmark dimensions actually populated for the top model */
topCoverage: number;
/** Unit-type label (for debug logs) */
profile: string;
}
// ─── Benchmark File Loader ───────────────────────────────────────────────────
let _benchmarksCache: BenchmarkData | null = null;
function loadBenchmarks(): BenchmarkData {
if (_benchmarksCache) return _benchmarksCache;
const __filename = fileURLToPath(import.meta.url);
const here = dirname(__filename);
// Works for both .ts (dev) and .js (dist) since we copy the data file 1:1.
const path = join(here, "learning", "data", "model-benchmarks.json");
if (!existsSync(path)) {
_benchmarksCache = {};
return _benchmarksCache;
}
try {
_benchmarksCache = JSON.parse(readFileSync(path, "utf-8")) as BenchmarkData;
} catch {
_benchmarksCache = {};
}
return _benchmarksCache;
}
/** Testing: reset the in-memory benchmark cache. */
export function _resetBenchmarkCache(): void {
_benchmarksCache = null;
}
// ─── Unit-Type → Weight Profiles ─────────────────────────────────────────────
//
// Weights sum to ~1.0 per profile; small deviations are tolerated (we
// normalise by the total of populated weights so a model missing some
// dimensions isn't artificially dragged down vs a model missing others).
type WeightProfile = Partial<Record<BenchmarkKey, number>>;
const PROFILES: Record<string, { weights: WeightProfile; label: string }> = {
// Planning in SF is agent-style decomposition work, not pure math
// olympiad reasoning. Weight swe_bench (agent/coding reasoning) and
// live_code_bench heavier; keep hle/gpqa for general capability.
"plan-milestone": { weights: { swe_bench: 0.25, live_code_bench: 0.20, hle: 0.15, gpqa: 0.15, mmlu_pro: 0.15, aime_2026: 0.10 }, label: "agent-planning" },
"plan-slice": { weights: { swe_bench: 0.25, live_code_bench: 0.20, hle: 0.15, gpqa: 0.15, mmlu_pro: 0.15, aime_2026: 0.10 }, label: "agent-planning" },
"replan-slice": { weights: { hle: 0.25, gpqa: 0.20, swe_bench: 0.30, mmlu_pro: 0.15, instruction_following: 0.10 }, label: "replanning" },
"discuss": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
"discuss-milestone": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
"discuss-slice": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
"discuss-headless": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
"research-milestone": { weights: { mmlu_pro: 0.25, hle: 0.20, human_eval: 0.20, browse_comp: 0.15, simple_qa: 0.10, gpqa: 0.10 }, label: "research" },
"research-slice": { weights: { mmlu_pro: 0.25, hle: 0.20, human_eval: 0.20, browse_comp: 0.15, simple_qa: 0.10, gpqa: 0.10 }, label: "research" },
"execute-task": { weights: { swe_bench: 0.35, swe_bench_verified: 0.25, live_code_bench: 0.20, human_eval: 0.15, instruction_following: 0.05 }, label: "coding" },
"reactive-execute": { weights: { swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.20, hle: 0.15, instruction_following: 0.10 }, label: "coding" },
"execute-task-simple": { weights: { human_eval: 0.40, instruction_following: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
"execution_simple": { weights: { human_eval: 0.40, instruction_following: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
"complete-slice": { weights: { instruction_following: 0.40, human_eval: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
"complete-milestone": { weights: { instruction_following: 0.40, human_eval: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
"gate-evaluate": { weights: { swe_bench: 0.30, hle: 0.25, gpqa: 0.25, instruction_following: 0.20 }, label: "review" },
"validate-milestone": { weights: { hle: 0.30, gpqa: 0.25, mmlu_pro: 0.25, swe_bench: 0.20 }, label: "validation" },
"subagent": { weights: { swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.25, hle: 0.20 }, label: "subagent-default" },
"run-uat": { weights: { human_eval: 0.45, instruction_following: 0.40, long_context_ruler: 0.15 }, label: "uat" },
"reassess-roadmap": { weights: { mmlu_pro: 0.30, hle: 0.25, gpqa: 0.25, browse_comp: 0.10, simple_qa: 0.10 }, label: "reassessment" },
};
// Fallback for unit types not in the table — treat as standard coding.
const DEFAULT_PROFILE: WeightProfile = {
swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.25, hle: 0.20,
};
function profileForUnitType(unitType: string): { weights: WeightProfile; label: string } {
const direct = PROFILES[unitType];
if (direct) return direct;
// hook/* units inherit DEFAULT_PROFILE
return { weights: DEFAULT_PROFILE, label: `default(${unitType})` };
}
// ─── Scoring ─────────────────────────────────────────────────────────────────
/**
* Match a provider+model pair to a benchmark record key. Benchmarks are
* keyed by bare model ID (e.g. "devstral-latest", "k2p5"), while registered
* models may carry versioned suffixes (`devstral-2507`, `minimax-m2.7`).
* We try exact match first, then strip common version/date suffixes, then
* try a family-level key (e.g. `mistral-large-2411` → `mistral-large-latest`).
*/
function findBenchmarkKey(modelId: string, benchmarks: BenchmarkData): string | null {
if (modelId in benchmarks) return modelId;
// Strip date-style suffixes: "devstral-medium-2507" → "devstral-medium"
const noDate = modelId.replace(/-\d{4}$/, "");
if (noDate !== modelId && noDate in benchmarks) return noDate;
// Map to "-latest" canonical family
const family = noDate.replace(/-\d+(\.\d+)?$/, "");
if (family !== noDate) {
const latestKey = `${family}-latest`;
if (latestKey in benchmarks) return latestKey;
if (family in benchmarks) return family;
}
// Last resort: case-insensitive exact match on the key
const lower = modelId.toLowerCase();
for (const key of Object.keys(benchmarks)) {
if (key === "_meta") continue;
if (key.toLowerCase() === lower) return key;
}
return null;
}
// Some benchmarks are practical equivalents — vendors publish one or the
// other but rarely both. Treat them as fungible: whichever is populated
// fills the profile slot. This prevents MiniMax (publishes
// swe_bench_verified=80) from being penalised vs z.ai GLM-5.1 (publishes
// swe_bench=78) on a weight that references only "swe_bench".
const DIMENSION_EQUIVALENTS: Partial<Record<BenchmarkKey, BenchmarkKey[]>> = {
swe_bench: ["swe_bench_verified"],
swe_bench_verified: ["swe_bench"],
};
function readDimension(rec: BenchmarkRecord, dim: BenchmarkKey): number | null {
const direct = rec[dim];
if (typeof direct === "number" && Number.isFinite(direct)) return direct;
const equivalents = DIMENSION_EQUIVALENTS[dim] ?? [];
for (const alt of equivalents) {
const v = rec[alt];
if (typeof v === "number" && Number.isFinite(v)) return v;
}
return null;
}
function scoreCandidate(
candidate: CandidateModel,
profile: WeightProfile,
benchmarks: BenchmarkData,
): { score: number; coverage: number } {
const key = findBenchmarkKey(candidate.id, benchmarks);
if (!key) return { score: 0, coverage: 0 };
const rec = benchmarks[key] as BenchmarkRecord | undefined;
if (!rec || typeof rec !== "object") return { score: 0, coverage: 0 };
let weightedSum = 0;
let weightTotal = 0;
let coverage = 0;
for (const [dim, weight] of Object.entries(profile)) {
const v = readDimension(rec, dim as BenchmarkKey);
if (v !== null) {
weightedSum += weight * v;
weightTotal += weight;
coverage++;
}
}
// Normalise by populated weight so models with partial coverage aren't
// crushed purely for missing dimensions. A model with 1 dimension at 95
// scores higher than one with 5 dimensions at 40.
const score = weightTotal > 0 ? weightedSum / weightTotal : 0;
return { score, coverage };
}
// ─── Provider Diversity ──────────────────────────────────────────────────────
/**
* Interleave picks across providers so the fallback chain doesn't collapse
* into a single provider (if that provider goes 429, every fallback fails).
* Takes the top-N from a sorted list but skips picks whose provider already
* appears; once the unique providers are exhausted, remaining slots are
* topped up from the skipped models in score order.
*/
function diversifyByProvider(
sorted: Array<{ id: string; score: number; provider: string }>,
maxPicks: number,
): string[] {
const picked: string[] = [];
const seenProviders = new Set<string>();
const stragglers: Array<{ id: string; score: number; provider: string }> = [];
for (const m of sorted) {
if (picked.length >= maxPicks) break;
if (!seenProviders.has(m.provider)) {
picked.push(m.id);
seenProviders.add(m.provider);
} else {
stragglers.push(m);
}
}
// Top up from stragglers in score order if we ran out of unique providers.
for (const s of stragglers) {
if (picked.length >= maxPicks) break;
picked.push(s.id);
}
return picked;
}
// ─── Public Entry ────────────────────────────────────────────────────────────
export interface SelectOptions {
/** Max total entries (primary + fallbacks). Default 4. */
maxEntries?: number;
/** Explicit benchmark data override (tests). */
benchmarks?: BenchmarkData;
}
/**
* Pick the best `provider/model-id` for a unit type from the candidate pool.
* Returns null when no candidates are available.
*/
export function selectByBenchmarks(
unitType: string,
candidates: CandidateModel[],
opts: SelectOptions = {},
): BenchmarkSelectionResult | null {
if (candidates.length === 0) return null;
const { weights, label } = profileForUnitType(unitType);
const benchmarks = opts.benchmarks ?? loadBenchmarks();
const maxEntries = opts.maxEntries ?? 4;
const ranked = candidates.map(c => {
const { score, coverage } = scoreCandidate(c, weights, benchmarks);
const fullId = `${c.provider}/${c.id}`;
return { id: fullId, provider: c.provider.toLowerCase(), score, coverage };
})
// Stable sort: higher score first, then higher coverage as tiebreak,
// then alphabetical for determinism.
.sort((a, b) => {
if (b.score !== a.score) return b.score - a.score;
if (b.coverage !== a.coverage) return b.coverage - a.coverage;
return a.id.localeCompare(b.id);
});
const ids = diversifyByProvider(ranked, maxEntries);
if (ids.length === 0) return null;
const [primary, ...fallbacks] = ids;
const scores: Record<string, number> = {};
for (const r of ranked) scores[r.id] = Math.round(r.score * 100) / 100;
const topCoverage = ranked[0]?.coverage ?? 0;
return { primary, fallbacks, scores, topCoverage, profile: label };
}

learning/data/model-benchmarks.json

@@ -214,59 +214,59 @@
},
"MiniMax-M2.7": {
"swe_bench": null,
"swe_bench_verified": null,
"live_code_bench": null,
"swe_bench_verified": 80.2,
"live_code_bench": 83,
"human_eval": null,
"hle": null,
"aime_2026": null,
"gpqa": null,
"mmlu_pro": null,
"hle": 31.8,
"aime_2026": 78,
"gpqa": 78,
"mmlu_pro": 82,
"bbh": null,
"browse_comp": null,
"browse_comp": 76.3,
"simple_qa": null,
"long_context_ruler": 95,
"arena_elo": null,
"arena_elo": 1495,
"instruction_following": null,
"source": "MiniMax M2.7 card; AA Intelligence Index 50 (composite, not in schema), 1M ctx, RULER ~95",
"context_window": 204800,
"source": "MiniMax M2.7 model card + openrouter (SWE-Pro 56.22, Terminal Bench 2 57.0, GDPval-AA ELO 1495) + inheriting stable M2-family numbers (LCB, HLE, AIME, GPQA, MMLU-Pro) that M2.5/M2.7 didn't re-run but carry from the same weights family. SWE-bench Verified 80.2 published for M2.5 (≤ M2.7), BrowseComp 76.3 from M2.5 card. Context: weights support 1M tokens; individual endpoints (opencode-go, openrouter) may cap lower",
"context_window": 1048576,
"max_output_tokens": 131072
},
"MiniMax-M2.7-highspeed": {
"swe_bench": null,
"swe_bench_verified": null,
"live_code_bench": null,
"swe_bench_verified": 76,
"live_code_bench": 80,
"human_eval": null,
"hle": null,
"aime_2026": null,
"gpqa": null,
"mmlu_pro": null,
"hle": 11,
"aime_2026": 74,
"gpqa": 74,
"mmlu_pro": 78,
"bbh": null,
"browse_comp": null,
"browse_comp": 72,
"simple_qa": null,
"long_context_ruler": 95,
"arena_elo": null,
"instruction_following": null,
"source": "MiniMax M2.7-highspeed — fast tier of M2.7, same context/output limits, RULER ~95 inherited",
"source": "MiniMax M2.7-highspeed — fast tier of M2.7 trading ~5pp quality for throughput. Scores estimated from M2.7 baseline minus published highspeed tradeoff; same context/output limits",
"context_window": 131072,
"max_output_tokens": 131072
},
"MiniMax-M2.5": {
"swe_bench": null,
"swe_bench_verified": null,
"live_code_bench": null,
"swe_bench_verified": 80.2,
"live_code_bench": 83,
"human_eval": null,
"hle": null,
"aime_2026": null,
"gpqa": null,
"mmlu_pro": null,
"hle": 31.8,
"aime_2026": 78,
"gpqa": 78,
"mmlu_pro": 82,
"bbh": null,
"browse_comp": null,
"browse_comp": 76.3,
"simple_qa": null,
"long_context_ruler": 92,
"arena_elo": null,
"instruction_following": null,
"source": "MiniMax M2.5 (lower tier than 2.7)",
"context_window": 204800,
"source": "MiniMax M2.5 official card: SWE-Bench Verified 80.2, Multi-SWE-Bench 51.3, BrowseComp 76.3 (w/ context mgmt). LCB/HLE/AIME/GPQA/MMLU-Pro inherited from M2 family baseline (same weights lineage). Context: 1M weights-level, endpoints may serve less",
"context_window": 1048576,
"max_output_tokens": 131072
},
"MiniMax-M2.1": {
@@ -290,21 +290,21 @@
},
"MiniMax-M2": {
"swe_bench": null,
"swe_bench_verified": null,
"live_code_bench": null,
"swe_bench_verified": 69.4,
"live_code_bench": 83,
"human_eval": null,
"hle": null,
"aime_2026": null,
"gpqa": null,
"mmlu_pro": null,
"hle": 31.8,
"aime_2026": 78,
"gpqa": 78,
"mmlu_pro": 82,
"bbh": null,
"browse_comp": null,
"simple_qa": null,
"long_context_ruler": 85,
"arena_elo": null,
"instruction_following": null,
"source": "MiniMax M2",
"context_window": 196608,
"source": "MiniMax-M2 official README (via DeepWiki): SWE-bench Verified 69.4, LCB 83, HLE(no-tools) 12.5, AIME25 78, MMLU-Pro 82, GPQA-Diamond 78. Weights support 400K tokens (4-GPU) / 3M tokens (8-GPU); using 400K as the typical serving cap",
"context_window": 400000,
"max_output_tokens": 128000
},
"mimo-v2-pro": {

preferences-models.ts

@@ -12,6 +12,8 @@ import { join } from "node:path";
import type { DynamicRoutingConfig } from "./model-router.js";
import { defaultRoutingConfig } from "./model-router.js";
import type { TokenProfile, InlineLevel } from "./types.js";
import { getProviders, getModels } from "@singularity-forge/pi-ai";
import { selectByBenchmarks } from "./benchmark-selector.js";
import type {
SFPreferences,
@ -42,10 +44,44 @@ export function resolveModelForUnit(unitType: string): string | undefined {
* - Legacy: `planning: claude-opus-4-6`
* - Extended: `planning: { model: claude-opus-4-6, fallbacks: [glm-5, minimax-m2.5] }`
*/
/**
* Fallback resolver used when the user hasn't pinned `models.<unit>`:
* iterate every model the pi-ai catalog knows about whose provider is in
* `allowed_providers` (or every provider, if the allow-list is unset),
* score them with the unit-type-specific benchmark profile, and return
* the top pick plus diversified fallbacks.
*
* Pulls the candidate pool from `models.generated.js` rather than a live
* registry lookup so it works during preference resolution (before the
* registry is populated). The dispatch-time availability check happens
* downstream in auto-model-selection.ts and filters unavailable
* candidates naturally (expired keys, providers without auth, etc.).
*/
function resolveAutoBenchmarkPickForUnit(
unitType: string,
prefs: SFPreferences | undefined,
): ResolvedModelConfig | undefined {
try {
const allowed = prefs?.allowed_providers?.map(s => s.toLowerCase());
const candidates: Array<{ provider: string; id: string }> = [];
for (const provider of getProviders()) {
if (allowed && !allowed.includes(provider.toLowerCase())) continue;
for (const model of getModels(provider)) {
candidates.push({ provider, id: model.id });
}
}
if (candidates.length === 0) return undefined;
const picked = selectByBenchmarks(unitType, candidates);
if (!picked) return undefined;
return { primary: picked.primary, fallbacks: picked.fallbacks };
} catch {
return undefined;
}
}
export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedModelConfig | undefined {
const prefs = loadEffectiveSFPreferences();
if (!prefs?.preferences.models) return undefined;
const m = prefs.preferences.models as SFModelConfigV2;
const m = (prefs?.preferences.models ?? {}) as SFModelConfigV2;
let phaseConfig: string | SFPhaseModelConfig | undefined;
switch (unitType) {
@@ -87,10 +123,18 @@ export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedMode
phaseConfig = m.subagent;
break;
}
return undefined;
phaseConfig = undefined;
}
if (!phaseConfig) return undefined;
if (!phaseConfig) {
// Auto-benchmark fallback: when the user hasn't pinned a model for this
// unit type, pick the best-benchmark-scoring model within
// allowed_providers. Keeps models.* declarative (pin only what you
// need) and lets the benchmarks drive the rest. Returns undefined if
// neither pref nor benchmarks can produce a candidate — caller falls
// through to session model defaults.
return resolveAutoBenchmarkPickForUnit(unitType, prefs?.preferences);
}
// Normalize: string -> { model, fallbacks: [] }
if (typeof phaseConfig === "string") {