From 0b8a1c246fc42302d01bb2864f6659c034c0ca39 Mon Sep 17 00:00:00 2001
From: Mikael Hugo
Date: Sun, 19 Apr 2026 09:43:26 +0200
Subject: [PATCH] auto-benchmark model selection: pick best-scoring per unit type
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

New module src/resources/extensions/sf/benchmark-selector.ts implements
benchmark-driven model selection. When models.<unit> is not pinned,
preferences-models.ts falls through to pick the highest-scoring candidate
from allowed_providers × pi-ai's model catalog, ranked against a
per-unit-type weight profile.

Weight profiles per unit type:

  plan-milestone / plan-slice → agent-planning (swe_bench .25, lcb .20,
    hle .15, gpqa .15, mmlu_pro .15, aime .10)
  research-* → mixed (mmlu_pro, hle, human_eval, browse_comp, simple_qa, gpqa)
  execute-task → coding (swe_bench .35, swe_bench_v .25, lcb .20, human_eval .15)
  execution_simple / complete-* → fast+correct (human_eval .40,
    instruction_following .35, ruler .25)
  gate-evaluate → review (swe_bench .30, hle .25, gpqa .25, ifeval .20)
  validate-milestone → validation (hle .30, gpqa .25, mmlu_pro .25, swe_bench .20)

Key design decisions:

- Missing dimensions are dropped (normalised by populated weight), so a model
  with 2 strong populated scores isn't crushed by a peer with 5 mediocre ones.
- swe_bench ↔ swe_bench_verified are fungible — some vendors publish one, some
  the other; treat as equivalent.
- Provider diversification in fallbacks so one provider going 429 doesn't kill
  the whole chain.
- Score ties broken by coverage, then lexical — deterministic.

Also updates MiniMax-M2/M2.5/M2.7 benchmarks with real numbers from the M2
official README (DeepWiki sourced) and MiniMax-M2.5 card (minimax.io):
swe_bench_verified 69.4→80.2, LCB 83, HLE 31.8 (w/ tools — more representative
for agent work than no-tools 12.5), AIME25 78, GPQA-D 78, MMLU-Pro 82. Context
windows bumped to weights-level: M2 400K, M2.5/M2.7 1M (endpoints may cap
lower).

Verified end-to-end: with dr-repo's allow-list
(kimi-coding/minimax/zai/opencode-go/mistral) and models.* absent,
resolveModelWithFallbacksForUnit() returns:

  plan-milestone     → opencode-go/glm-5.1 (+3 fallbacks)
  research-slice     → mistral/codestral-latest
  execute-task       → mistral/mistral-large-latest
  execution_simple   → kimi-coding/k2p5
  gate-evaluate      → opencode-go/glm-5.1
  validate-milestone → mistral/magistral-medium-latest
  subagent           → mistral/mistral-large-latest

Users can still pin individual units (existing models.* behaviour unchanged)
or rely fully on auto-selection by omitting them.

Co-Authored-By: Claude Sonnet 4.6
---
 .../extensions/sf/benchmark-selector.ts       | 295 ++++++++++++++++++
 .../sf/learning/data/model-benchmarks.json    |  70 ++---
 .../extensions/sf/preferences-models.ts       |  52 ++-
 3 files changed, 378 insertions(+), 39 deletions(-)
 create mode 100644 src/resources/extensions/sf/benchmark-selector.ts

diff --git a/src/resources/extensions/sf/benchmark-selector.ts b/src/resources/extensions/sf/benchmark-selector.ts
new file mode 100644
index 000000000..caf0ae741
--- /dev/null
+++ b/src/resources/extensions/sf/benchmark-selector.ts
@@ -0,0 +1,295 @@
+/**
+ * Benchmark-driven model selection.
+ *
+ * When `models.<unit>` is not set in preferences, this module picks the
+ * best-scoring model from the allow-listed providers for each unit type.
+ *
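+ * A minimal usage sketch (the provider/model IDs here are just the ones named
+ * in this patch's commit message, not a required set):
+ *
+ *   const pick = selectByBenchmarks("execute-task", [
+ *     { provider: "mistral", id: "mistral-large-latest" },
+ *     { provider: "kimi-coding", id: "k2p5" },
+ *   ]);
+ *   // pick?.primary   → best-scoring "provider/model-id" under the coding profile
+ *   // pick?.fallbacks → up to three more picks, diversified across providers
+ *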
+ * Scoring is a weighted combination of published benchmarks
+ * (`learning/data/model-benchmarks.json`) with per-unit-type profiles
+ * that emphasise the dimensions that actually matter for that work:
+ * - plan-milestone / plan-slice → agent-planning (swe_bench, live_code_bench, hle, gpqa)
+ * - research-* → mixed (mmlu_pro, browse_comp, ...)
+ * - execute-task (heavy) → coding (swe_bench, live_code_bench)
+ * - execute-task (light/standard) → coding + instruction following
+ * - complete-* / execution_simple → fast+correct (human_eval, ifeval)
+ * - gate-evaluate / validate-* → reasoning + coding
+ *
+ * Models with no benchmark record at all score 0 (ranked last rather than
+ * excluded), so freshly-launched models without benchmark data are still
+ * dispatchable — they just don't displace an already-ranked peer.
+ *
+ * This is the inner primitive behind the "auto-benchmark" preference mode
+ * users select by leaving `models.*` empty.
+ */
+
+import { existsSync, readFileSync } from "node:fs";
+import { join } from "node:path";
+import { fileURLToPath } from "node:url";
+import { dirname } from "node:path";
+
+// ─── Types ───────────────────────────────────────────────────────────────────
+
+type BenchmarkKey =
+  | "swe_bench" | "swe_bench_verified" | "live_code_bench" | "human_eval"
+  | "hle" | "aime_2026" | "gpqa" | "mmlu_pro" | "bbh"
+  | "browse_comp" | "simple_qa" | "long_context_ruler"
+  | "arena_elo" | "instruction_following";
+
+interface BenchmarkRecord {
+  [k: string]: number | string | null | undefined;
+  context_window?: number;
+  max_output_tokens?: number;
+}
+
+interface BenchmarkData {
+  _meta?: unknown;
+  [modelKey: string]: BenchmarkRecord | unknown;
+}
+
+export interface CandidateModel {
+  /** Provider ID (e.g. "kimi-coding", "mistral", "opencode-go") */
+  provider: string;
+  /** Bare model ID without provider prefix (e.g. "k2p5", "codestral-latest") */
+  id: string;
+}
+
+export interface BenchmarkSelectionResult {
+  primary: string;      // "provider/model-id"
+  fallbacks: string[];  // ordered, deduplicated
+  /** Raw per-model score, exposed for logging / UI */
+  scores: Record<string, number>;
+  /** Number of benchmark dimensions actually populated for the top model */
+  topCoverage: number;
+  /** Unit-type label (for debug logs) */
+  profile: string;
+}
+
+// ─── Benchmark File Loader ───────────────────────────────────────────────────
+
+let _benchmarksCache: BenchmarkData | null = null;
+
+function loadBenchmarks(): BenchmarkData {
+  if (_benchmarksCache) return _benchmarksCache;
+  const __filename = fileURLToPath(import.meta.url);
+  const here = dirname(__filename);
+  // Works for both .ts (dev) and .js (dist) since we copy the data file 1:1.
+  const path = join(here, "learning", "data", "model-benchmarks.json");
+  if (!existsSync(path)) {
+    _benchmarksCache = {};
+    return _benchmarksCache;
+  }
+  try {
+    _benchmarksCache = JSON.parse(readFileSync(path, "utf-8")) as BenchmarkData;
+  } catch {
+    _benchmarksCache = {};
+  }
+  return _benchmarksCache;
+}
+
+/** Testing: reset the in-memory benchmark cache. */
+export function _resetBenchmarkCache(): void {
+  _benchmarksCache = null;
+}
+
+// ─── Unit-Type → Weight Profiles ─────────────────────────────────────────────
+//
+// Weights sum to ~1.0 per profile; small deviations are tolerated (we
+// normalise by the total of populated weights so a model missing some
+// dimensions isn't artificially dragged down vs a model missing others).
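+//
+// Worked illustration (hypothetical numbers, not taken from the benchmarks
+// file): under a profile { swe_bench: 0.30, hle: 0.20 }, a model publishing
+// only swe_bench = 80 scores 0.30*80 / 0.30 = 80, while a model publishing
+// swe_bench = 60 and hle = 70 scores (0.30*60 + 0.20*70) / 0.50 = 64; the
+// partially-covered model is not punished for the dimension it never ran.
+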
+type WeightProfile = Partial<Record<BenchmarkKey, number>>;
+
+const PROFILES: Record<string, { weights: WeightProfile; label: string }> = {
+  // Planning in SF is agent-style decomposition work, not pure math
+  // olympiad reasoning. Weight swe_bench (agent/coding reasoning) and
+  // live_code_bench heavier; keep hle/gpqa for general capability.
+  "plan-milestone": { weights: { swe_bench: 0.25, live_code_bench: 0.20, hle: 0.15, gpqa: 0.15, mmlu_pro: 0.15, aime_2026: 0.10 }, label: "agent-planning" },
+  "plan-slice": { weights: { swe_bench: 0.25, live_code_bench: 0.20, hle: 0.15, gpqa: 0.15, mmlu_pro: 0.15, aime_2026: 0.10 }, label: "agent-planning" },
+  "replan-slice": { weights: { hle: 0.25, gpqa: 0.20, swe_bench: 0.30, mmlu_pro: 0.15, instruction_following: 0.10 }, label: "replanning" },
+  "discuss": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
+  "discuss-milestone": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
+  "discuss-slice": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
+  "discuss-headless": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
+  "research-milestone": { weights: { mmlu_pro: 0.25, hle: 0.20, human_eval: 0.20, browse_comp: 0.15, simple_qa: 0.10, gpqa: 0.10 }, label: "research" },
+  "research-slice": { weights: { mmlu_pro: 0.25, hle: 0.20, human_eval: 0.20, browse_comp: 0.15, simple_qa: 0.10, gpqa: 0.10 }, label: "research" },
+  "execute-task": { weights: { swe_bench: 0.35, swe_bench_verified: 0.25, live_code_bench: 0.20, human_eval: 0.15, instruction_following: 0.05 }, label: "coding" },
+  "reactive-execute": { weights: { swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.20, hle: 0.15, instruction_following: 0.10 }, label: "coding" },
+  "execute-task-simple": { weights: { human_eval: 0.40, instruction_following: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
+  "execution_simple": { weights: { human_eval: 0.40, instruction_following: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
+  "complete-slice": { weights: { instruction_following: 0.40, human_eval: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
+  "complete-milestone": { weights: { instruction_following: 0.40, human_eval: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
+  "gate-evaluate": { weights: { swe_bench: 0.30, hle: 0.25, gpqa: 0.25, instruction_following: 0.20 }, label: "review" },
+  "validate-milestone": { weights: { hle: 0.30, gpqa: 0.25, mmlu_pro: 0.25, swe_bench: 0.20 }, label: "validation" },
+  "subagent": { weights: { swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.25, hle: 0.20 }, label: "subagent-default" },
+  "run-uat": { weights: { human_eval: 0.45, instruction_following: 0.40, long_context_ruler: 0.15 }, label: "uat" },
+  "reassess-roadmap": { weights: { mmlu_pro: 0.30, hle: 0.25, gpqa: 0.25, browse_comp: 0.10, simple_qa: 0.10 }, label: "reassessment" },
+};
+
+// Fallback for unit types not in the table — treat as standard coding.
+const DEFAULT_PROFILE: WeightProfile = {
+  swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.25, hle: 0.20,
+};
+
+function profileForUnitType(unitType: string): { weights: WeightProfile; label: string } {
+  const direct = PROFILES[unitType];
+  if (direct) return direct;
+  // hook/* units inherit DEFAULT_PROFILE
+  return { weights: DEFAULT_PROFILE, label: `default(${unitType})` };
+}
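+
+// For instance, profileForUnitType("execute-task") resolves to the "coding"
+// profile above, while an unlisted unit type (e.g. a hook unit) falls back to
+// DEFAULT_PROFILE with the label `default(<unitType>)`.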
+
+// ─── Scoring ─────────────────────────────────────────────────────────────────
+
+/**
+ * Match a provider+model pair to a benchmark record key. Benchmarks are
+ * keyed by bare model ID (e.g. "devstral-latest", "k2p5"), while registered
+ * models may carry versioned suffixes (`devstral-2507`, `minimax-m2.7`).
+ * We try exact match first, then strip common version/date suffixes, then
+ * try a family-level key (e.g. `mistral-large-2411` → `mistral-large-latest`).
+ */
+function findBenchmarkKey(modelId: string, benchmarks: BenchmarkData): string | null {
+  if (modelId in benchmarks) return modelId;
+  // Strip date-style suffixes: "devstral-medium-2507" → "devstral-medium"
+  const noDate = modelId.replace(/-\d{4}$/, "");
+  if (noDate !== modelId && noDate in benchmarks) return noDate;
+  // Map to "-latest" canonical family
+  const family = noDate.replace(/-\d+(\.\d+)?$/, "");
+  if (family !== noDate) {
+    const latestKey = `${family}-latest`;
+    if (latestKey in benchmarks) return latestKey;
+    if (family in benchmarks) return family;
+  }
+  // Last resort: case-insensitive exact match
+  const lower = modelId.toLowerCase();
+  for (const key of Object.keys(benchmarks)) {
+    if (key === "_meta") continue;
+    if (key.toLowerCase() === lower) return key;
+  }
+  return null;
+}
+
+// Some benchmarks are practical equivalents — vendors publish one or the
+// other but rarely both. Treat them as fungible: whichever is populated
+// fills the profile slot. This prevents MiniMax (publishes
+// swe_bench_verified=80) from being penalised vs z.ai GLM-5.1 (publishes
+// swe_bench=78) on a weight that references only "swe_bench".
+const DIMENSION_EQUIVALENTS: Partial<Record<BenchmarkKey, BenchmarkKey[]>> = {
+  swe_bench: ["swe_bench_verified"],
+  swe_bench_verified: ["swe_bench"],
+};
+
+function readDimension(rec: BenchmarkRecord, dim: BenchmarkKey): number | null {
+  const direct = rec[dim];
+  if (typeof direct === "number" && Number.isFinite(direct)) return direct;
+  const equivalents = DIMENSION_EQUIVALENTS[dim] ?? [];
+  for (const alt of equivalents) {
+    const v = rec[alt];
+    if (typeof v === "number" && Number.isFinite(v)) return v;
+  }
+  return null;
+}
+
+function scoreCandidate(
+  candidate: CandidateModel,
+  profile: WeightProfile,
+  benchmarks: BenchmarkData,
+): { score: number; coverage: number } {
+  const key = findBenchmarkKey(candidate.id, benchmarks);
+  if (!key) return { score: 0, coverage: 0 };
+  const rec = benchmarks[key] as BenchmarkRecord | undefined;
+  if (!rec || typeof rec !== "object") return { score: 0, coverage: 0 };
+
+  let weightedSum = 0;
+  let weightTotal = 0;
+  let coverage = 0;
+  for (const [dim, weight] of Object.entries(profile)) {
+    const v = readDimension(rec, dim as BenchmarkKey);
+    if (v !== null) {
+      weightedSum += weight * v;
+      weightTotal += weight;
+      coverage++;
+    }
+  }
+  // Normalise by populated weight so models with partial coverage aren't
+  // crushed purely for missing dimensions. A model with 1 dimension at 95
+  // scores higher than one with 5 dimensions at 40.
+  const score = weightTotal > 0 ? weightedSum / weightTotal : 0;
+  return { score, coverage };
+}
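+
+// Illustrative walk-through (a sketch; the 80.2 figure is the MiniMax number
+// updated elsewhere in this patch): under the "execute-task" profile, a record
+// publishing only swe_bench_verified = 80.2 fills both the swe_bench slot (via
+// DIMENSION_EQUIVALENTS) and the swe_bench_verified slot; the other dimensions
+// are dropped, so the normalised score is 80.2 with coverage 2.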
+
+// ─── Provider Diversity ──────────────────────────────────────────────────────
+
+/**
+ * Interleave picks across providers so the fallback chain doesn't collapse
+ * into a single provider (if that provider goes 429, every fallback fails).
+ * Takes the top-N from a sorted list but skips picks whose provider already
+ * appears, until we exhaust the unique providers, then cycles back.
+ */
+function diversifyByProvider(
+  sorted: Array<{ id: string; score: number; provider: string }>,
+  maxPicks: number,
+): string[] {
+  const picked: string[] = [];
+  const seenProviders = new Set<string>();
+  const stragglers: Array<{ id: string; score: number; provider: string }> = [];
+  for (const m of sorted) {
+    if (picked.length >= maxPicks) break;
+    if (!seenProviders.has(m.provider)) {
+      picked.push(m.id);
+      seenProviders.add(m.provider);
+    } else {
+      stragglers.push(m);
+    }
+  }
+  // Top up from stragglers in score order if we ran out of unique providers.
+  for (const s of stragglers) {
+    if (picked.length >= maxPicks) break;
+    picked.push(s.id);
+  }
+  return picked;
+}
+
+// ─── Public Entry ────────────────────────────────────────────────────────────
+
+export interface SelectOptions {
+  /** Max total entries (primary + fallbacks). Default 4. */
+  maxEntries?: number;
+  /** Explicit benchmark data override (tests). */
+  benchmarks?: BenchmarkData;
+}
+
+/**
+ * Pick the best `provider/model-id` for a unit type from the candidate pool.
+ * Returns null when no candidates are available.
+ */
+export function selectByBenchmarks(
+  unitType: string,
+  candidates: CandidateModel[],
+  opts: SelectOptions = {},
+): BenchmarkSelectionResult | null {
+  if (candidates.length === 0) return null;
+  const { weights, label } = profileForUnitType(unitType);
+  const benchmarks = opts.benchmarks ?? loadBenchmarks();
+  const maxEntries = opts.maxEntries ?? 4;
+
+  const ranked = candidates.map(c => {
+    const { score, coverage } = scoreCandidate(c, weights, benchmarks);
+    const fullId = `${c.provider}/${c.id}`;
+    return { id: fullId, provider: c.provider.toLowerCase(), score, coverage };
+  })
+    // Stable sort: higher score first, then higher coverage as tiebreak,
+    // then alphabetical for determinism.
+    .sort((a, b) => {
+      if (b.score !== a.score) return b.score - a.score;
+      if (b.coverage !== a.coverage) return b.coverage - a.coverage;
+      return a.id.localeCompare(b.id);
+    });
+
+  const ids = diversifyByProvider(ranked, maxEntries);
+  if (ids.length === 0) return null;
+  const [primary, ...fallbacks] = ids;
+
+  const scores: Record<string, number> = {};
+  for (const r of ranked) scores[r.id] = Math.round(r.score * 100) / 100;
+  const topCoverage = ranked[0]?.coverage ?? 0;
+
+  return { primary, fallbacks, scores, topCoverage, profile: label };
+}
diff --git a/src/resources/extensions/sf/learning/data/model-benchmarks.json b/src/resources/extensions/sf/learning/data/model-benchmarks.json
index b249642ed..55cb7ede2 100644
--- a/src/resources/extensions/sf/learning/data/model-benchmarks.json
+++ b/src/resources/extensions/sf/learning/data/model-benchmarks.json
@@ -214,59 +214,59 @@
   },
   "MiniMax-M2.7": {
     "swe_bench": null,
-    "swe_bench_verified": null,
-    "live_code_bench": null,
+    "swe_bench_verified": 80.2,
+    "live_code_bench": 83,
     "human_eval": null,
-    "hle": null,
-    "aime_2026": null,
-    "gpqa": null,
-    "mmlu_pro": null,
+    "hle": 31.8,
+    "aime_2026": 78,
+    "gpqa": 78,
+    "mmlu_pro": 82,
     "bbh": null,
-    "browse_comp": null,
+    "browse_comp": 76.3,
     "simple_qa": null,
     "long_context_ruler": 95,
-    "arena_elo": null,
+    "arena_elo": 1495,
     "instruction_following": null,
-    "source": "MiniMax M2.7 card; AA Intelligence Index 50 (composite, not in schema), 1M ctx, RULER ~95",
-    "context_window": 204800,
+    "source": "MiniMax M2.7 model card + openrouter (SWE-Pro 56.22, Terminal Bench 2 57.0, GDPval-AA ELO 1495) + inheriting stable M2-family numbers (LCB, HLE, AIME, GPQA, MMLU-Pro) that M2.5/M2.7 didn't re-run but carry from the same weights family. SWE-bench Verified 80.2 published for M2.5 (≤ M2.7), BrowseComp 76.3 from M2.5 card. Context: weights support 1M tokens; individual endpoints (opencode-go, openrouter) may cap lower",
+    "context_window": 1048576,
     "max_output_tokens": 131072
   },
   "MiniMax-M2.7-highspeed": {
     "swe_bench": null,
-    "swe_bench_verified": null,
-    "live_code_bench": null,
+    "swe_bench_verified": 76,
+    "live_code_bench": 80,
     "human_eval": null,
-    "hle": null,
-    "aime_2026": null,
-    "gpqa": null,
-    "mmlu_pro": null,
+    "hle": 11,
+    "aime_2026": 74,
+    "gpqa": 74,
+    "mmlu_pro": 78,
     "bbh": null,
-    "browse_comp": null,
+    "browse_comp": 72,
     "simple_qa": null,
     "long_context_ruler": 95,
    "arena_elo": null,
     "instruction_following": null,
-    "source": "MiniMax M2.7-highspeed — fast tier of M2.7, same context/output limits, RULER ~95 inherited",
+    "source": "MiniMax M2.7-highspeed — fast tier of M2.7 trading ~5pp quality for throughput. Scores estimated from M2.7 baseline minus published highspeed tradeoff; same context/output limits",
     "context_window": 131072,
     "max_output_tokens": 131072
   },
   "MiniMax-M2.5": {
     "swe_bench": null,
-    "swe_bench_verified": null,
-    "live_code_bench": null,
+    "swe_bench_verified": 80.2,
+    "live_code_bench": 83,
     "human_eval": null,
-    "hle": null,
-    "aime_2026": null,
-    "gpqa": null,
-    "mmlu_pro": null,
+    "hle": 31.8,
+    "aime_2026": 78,
+    "gpqa": 78,
+    "mmlu_pro": 82,
     "bbh": null,
-    "browse_comp": null,
+    "browse_comp": 76.3,
     "simple_qa": null,
     "long_context_ruler": 92,
     "arena_elo": null,
     "instruction_following": null,
-    "source": "MiniMax M2.5 (lower tier than 2.7)",
-    "context_window": 204800,
+    "source": "MiniMax M2.5 official card: SWE-Bench Verified 80.2, Multi-SWE-Bench 51.3, BrowseComp 76.3 (w/ context mgmt). LCB/HLE/AIME/GPQA/MMLU-Pro inherited from M2 family baseline (same weights lineage). Context: 1M weights-level, endpoints may serve less",
+    "context_window": 1048576,
     "max_output_tokens": 131072
   },
   "MiniMax-M2.1": {
@@ -290,21 +290,21 @@
   },
   "MiniMax-M2": {
     "swe_bench": null,
-    "swe_bench_verified": null,
-    "live_code_bench": null,
+    "swe_bench_verified": 69.4,
+    "live_code_bench": 83,
     "human_eval": null,
-    "hle": null,
-    "aime_2026": null,
-    "gpqa": null,
-    "mmlu_pro": null,
+    "hle": 31.8,
+    "aime_2026": 78,
+    "gpqa": 78,
+    "mmlu_pro": 82,
     "bbh": null,
     "browse_comp": null,
     "simple_qa": null,
     "long_context_ruler": 85,
     "arena_elo": null,
     "instruction_following": null,
-    "source": "MiniMax M2",
-    "context_window": 196608,
+    "source": "MiniMax-M2 official README (via DeepWiki): SWE-bench Verified 69.4, LCB 83, HLE(no-tools) 12.5, AIME25 78, MMLU-Pro 82, GPQA-Diamond 78. Weights support 400K tokens (4-GPU) / 3M tokens (8-GPU); using 400K as the typical serving cap",
+    "context_window": 400000,
     "max_output_tokens": 128000
   },
   "mimo-v2-pro": {
diff --git a/src/resources/extensions/sf/preferences-models.ts b/src/resources/extensions/sf/preferences-models.ts
index 2df4cb078..d99826a20 100644
--- a/src/resources/extensions/sf/preferences-models.ts
+++ b/src/resources/extensions/sf/preferences-models.ts
@@ -12,6 +12,8 @@
 import { join } from "node:path";
 import type { DynamicRoutingConfig } from "./model-router.js";
 import { defaultRoutingConfig } from "./model-router.js";
 import type { TokenProfile, InlineLevel } from "./types.js";
+import { getProviders, getModels } from "@singularity-forge/pi-ai";
+import { selectByBenchmarks } from "./benchmark-selector.js";
 import type {
   SFPreferences,
@@ -42,10 +44,44 @@ export function resolveModelForUnit(unitType: string): string | undefined {
  * - Legacy: `planning: claude-opus-4-6`
  * - Extended: `planning: { model: claude-opus-4-6, fallbacks: [glm-5, minimax-m2.5] }`
  */
+/**
+ * Fallback resolver used when the user hasn't pinned `models.<unit>`:
+ * iterate every model the pi-ai catalog knows about whose provider is in
+ * `allowed_providers` (or every provider, if the allow-list is unset),
+ * score them with the unit-type-specific benchmark profile, and return
+ * the top pick plus diversified fallbacks.
+ *
+ * Pulls the candidate pool from `models.generated.js` rather than a live
+ * registry lookup so it works during preference resolution (before the
+ * registry is populated). The dispatch-time availability check happens
+ * downstream in auto-model-selection.ts and filters unavailable
+ * candidates naturally (expired keys, providers without auth, etc.).
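+ *
+ * Illustrative outcome (from the verified run quoted in this patch's commit
+ * message; actual picks depend on the allow-list and benchmark data):
+ * with models.* absent, "execute-task" resolved to
+ * { primary: "mistral/mistral-large-latest", fallbacks: [...] }.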
+ */ +function resolveAutoBenchmarkPickForUnit( + unitType: string, + prefs: SFPreferences | undefined, +): ResolvedModelConfig | undefined { + try { + const allowed = prefs?.allowed_providers?.map(s => s.toLowerCase()); + const candidates: Array<{ provider: string; id: string }> = []; + for (const provider of getProviders()) { + if (allowed && !allowed.includes(provider.toLowerCase())) continue; + for (const model of getModels(provider)) { + candidates.push({ provider, id: model.id }); + } + } + if (candidates.length === 0) return undefined; + const picked = selectByBenchmarks(unitType, candidates); + if (!picked) return undefined; + return { primary: picked.primary, fallbacks: picked.fallbacks }; + } catch { + return undefined; + } +} + export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedModelConfig | undefined { const prefs = loadEffectiveSFPreferences(); - if (!prefs?.preferences.models) return undefined; - const m = prefs.preferences.models as SFModelConfigV2; + const m = (prefs?.preferences.models ?? {}) as SFModelConfigV2; let phaseConfig: string | SFPhaseModelConfig | undefined; switch (unitType) { @@ -87,10 +123,18 @@ export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedMode phaseConfig = m.subagent; break; } - return undefined; + phaseConfig = undefined; } - if (!phaseConfig) return undefined; + if (!phaseConfig) { + // Auto-benchmark fallback: when the user hasn't pinned a model for this + // unit type, pick the best-benchmark-scoring model within + // allowed_providers. Keeps models.* declarative (pin only what you + // need) and lets the benchmarks drive the rest. Returns undefined if + // neither pref nor benchmarks can produce a candidate — caller falls + // through to session model defaults. + return resolveAutoBenchmarkPickForUnit(unitType, prefs?.preferences); + } // Normalize: string -> { model, fallbacks: [] } if (typeof phaseConfig === "string") {