auto-benchmark model selection: pick best-scoring model per unit type

New module src/resources/extensions/sf/benchmark-selector.ts implements
benchmark-driven model selection. When models.<unit> is not pinned,
preferences-models.ts falls through to pick the highest-scoring
candidate from allowed_providers × pi-ai's model catalog, ranked
against a per-unit-type weight profile.

Weight profiles per unit type:
  plan-milestone / plan-slice  → agent-planning (swe_bench .25, lcb
                                  .20, hle .15, gpqa .15, mmlu_pro .15,
                                  aime .10)
  research-*                    → mixed (mmlu_pro, hle, human_eval,
                                  browse_comp, simple_qa, gpqa)
  execute-task                  → coding (swe_bench .35, swe_bench_v
                                  .25, lcb .20, human_eval .15)
  execution_simple / complete-* → fast+correct (human_eval .40,
                                  instruction_following .35, ruler .25)
  gate-evaluate                 → review (swe_bench .30, hle .25,
                                  gpqa .25, ifeval .20)
  validate-milestone            → validation (hle .30, gpqa .25,
                                  mmlu_pro .25, swe_bench .20)

Key design decisions:
  - Missing dimensions are dropped (normalised by populated weight),
    so a model with 2 strong populated scores isn't crushed by a peer
    with 5 mediocre ones (see the sketch after this list).
  - swe_bench ↔ swe_bench_verified are fungible — some vendors publish
    one, some the other; treat as equivalent.
  - Provider diversification in fallbacks so one provider going 429
    doesn't kill the whole chain.
  - Score ties broken by coverage, then lexical — deterministic.
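
To make the normalisation and provider-diversity decisions concrete, a
minimal sketch of calling the new selector directly (provider names,
model IDs and benchmark numbers are hypothetical; the `benchmarks`
override is the test hook the module exposes):

  import { selectByBenchmarks } from "./benchmark-selector.js";

  // Hypothetical records: model-a publishes two of the three dimensions in
  // the "execution_simple" profile (human_eval .40, instruction_following
  // .35, long_context_ruler .25); model-b publishes all three.
  const benchmarks = {
    "model-a": { human_eval: 92, instruction_following: 88 },
    "model-b": { human_eval: 70, instruction_following: 65, long_context_ruler: 60 },
  };

  const result = selectByBenchmarks(
    "execution_simple",
    [
      { provider: "prov-x", id: "model-a" },
      { provider: "prov-y", id: "model-b" },
    ],
    { benchmarks },
  );

  // model-a: (0.40*92 + 0.35*88) / 0.75 ≈ 90.1  (normalised by populated weight)
  // model-b: (0.40*70 + 0.35*65 + 0.25*60) / 1.00 = 65.75
  // result.primary   === "prov-x/model-a"
  // result.fallbacks === ["prov-y/model-b"]  (different provider, kept for diversity)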

Also updates MiniMax-M2/M2.5/M2.7 benchmarks with real numbers from
the M2 official README (DeepWiki sourced) and MiniMax-M2.5 card
(minimax.io): swe_bench_verified 69.4 (M2) / 80.2 (M2.5/M2.7), LCB 83,
HLE 31.8 (w/ tools — more representative for agent work than no-tools
12.5),
AIME25 78, GPQA-D 78, MMLU-Pro 82. Context windows bumped to
weights-level: M2 400K, M2.5/M2.7 1M (endpoints may cap lower).

Verified end-to-end: with dr-repo's allow-list
(kimi-coding/minimax/zai/opencode-go/mistral) and models.* absent,
resolveModelWithFallbacksForUnit() returns:
  plan-milestone     → opencode-go/glm-5.1 (+3 fallbacks)
  research-slice     → mistral/codestral-latest
  execute-task       → mistral/mistral-large-latest
  execution_simple   → kimi-coding/k2p5
  gate-evaluate      → opencode-go/glm-5.1
  validate-milestone → mistral/magistral-medium-latest
  subagent           → mistral/mistral-large-latest

Users can still pin individual units (existing models.* behaviour
unchanged) or rely fully on auto-selection by omitting them.
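
For example, a user could pin a single unit and let the rest auto-select
(shape follows the legacy/extended forms quoted in the preferences-models.ts
doc comment; the specific model IDs here are illustrative):

  models:
    planning: { model: opencode-go/glm-5.1, fallbacks: [kimi-coding/k2p5] }
    # all other units omitted → resolved via the benchmark profiles above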

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
Mikael Hugo 2026-04-19 09:43:26 +02:00
parent 6450b37025
commit 0b8a1c246f
3 changed files with 378 additions and 39 deletions

src/resources/extensions/sf/benchmark-selector.ts

@@ -0,0 +1,295 @@
/**
* Benchmark-driven model selection.
*
* When `models.<unit>` is not set in preferences, this module picks the
* best-scoring model from the allow-listed providers for each unit type.
* Scoring is a weighted combination of published benchmarks
* (`learning/data/model-benchmarks.json`) with per-unit-type profiles
* that emphasise the dimensions that actually matter for that work:
* - plan-milestone / plan-slice    → reasoning-heavy (hle, aime, gpqa)
* - research-*                     → mixed (mmlu_pro, browse_comp, ...)
* - execute-task (heavy)           → coding (swe_bench, live_code_bench)
* - execute-task (light/standard)  → coding + instruction following
* - complete-* / execution_simple  → fast+correct (human_eval, ifeval)
* - gate-evaluate / validate-*     → reasoning + coding
*
* Models with no benchmark record at all score 0 (ranked last rather than
* excluded), so freshly-launched models without benchmark data are still
* dispatchable; they just don't displace an already-ranked peer.
*
* This is the inner primitive behind the "auto-benchmark" preference mode
* users select by leaving `models.*` empty.
*/
import { existsSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { fileURLToPath } from "node:url";
import { dirname } from "node:path";
// ─── Types ───────────────────────────────────────────────────────────────────
type BenchmarkKey =
| "swe_bench" | "swe_bench_verified" | "live_code_bench" | "human_eval"
| "hle" | "aime_2026" | "gpqa" | "mmlu_pro" | "bbh"
| "browse_comp" | "simple_qa" | "long_context_ruler"
| "arena_elo" | "instruction_following";
interface BenchmarkRecord {
[k: string]: number | string | null | undefined;
context_window?: number;
max_output_tokens?: number;
}
interface BenchmarkData {
_meta?: unknown;
[modelKey: string]: BenchmarkRecord | unknown;
}
export interface CandidateModel {
/** Provider ID (e.g. "kimi-coding", "mistral", "opencode-go") */
provider: string;
/** Bare model ID without provider prefix (e.g. "k2p5", "codestral-latest") */
id: string;
}
export interface BenchmarkSelectionResult {
primary: string; // "provider/model-id"
fallbacks: string[]; // ordered, deduplicated
/** Raw per-model score, exposed for logging / UI */
scores: Record<string, number>;
/** Number of benchmark dimensions actually populated for the top model */
topCoverage: number;
/** Unit-type label (for debug logs) */
profile: string;
}
// ─── Benchmark File Loader ───────────────────────────────────────────────────
let _benchmarksCache: BenchmarkData | null = null;
function loadBenchmarks(): BenchmarkData {
if (_benchmarksCache) return _benchmarksCache;
const __filename = fileURLToPath(import.meta.url);
const here = dirname(__filename);
// Works for both .ts (dev) and .js (dist) since we copy the data file 1:1.
const path = join(here, "learning", "data", "model-benchmarks.json");
if (!existsSync(path)) {
_benchmarksCache = {};
return _benchmarksCache;
}
try {
_benchmarksCache = JSON.parse(readFileSync(path, "utf-8")) as BenchmarkData;
} catch {
_benchmarksCache = {};
}
return _benchmarksCache;
}
/** Testing: reset the in-memory benchmark cache. */
export function _resetBenchmarkCache(): void {
_benchmarksCache = null;
}
// ─── Unit-Type → Weight Profiles ─────────────────────────────────────────────
//
// Weights sum to ~1.0 per profile; small deviations are tolerated (we
// normalise by the total of populated weights so a model missing some
// dimensions isn't artificially dragged down vs a model missing others).
type WeightProfile = Partial<Record<BenchmarkKey, number>>;
const PROFILES: Record<string, { weights: WeightProfile; label: string }> = {
// Planning in SF is agent-style decomposition work, not pure math
// olympiad reasoning. Weight swe_bench (agent/coding reasoning) and
// live_code_bench heavier; keep hle/gpqa for general capability.
"plan-milestone": { weights: { swe_bench: 0.25, live_code_bench: 0.20, hle: 0.15, gpqa: 0.15, mmlu_pro: 0.15, aime_2026: 0.10 }, label: "agent-planning" },
"plan-slice": { weights: { swe_bench: 0.25, live_code_bench: 0.20, hle: 0.15, gpqa: 0.15, mmlu_pro: 0.15, aime_2026: 0.10 }, label: "agent-planning" },
"replan-slice": { weights: { hle: 0.25, gpqa: 0.20, swe_bench: 0.30, mmlu_pro: 0.15, instruction_following: 0.10 }, label: "replanning" },
"discuss": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
"discuss-milestone": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
"discuss-slice": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
"discuss-headless": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
"research-milestone": { weights: { mmlu_pro: 0.25, hle: 0.20, human_eval: 0.20, browse_comp: 0.15, simple_qa: 0.10, gpqa: 0.10 }, label: "research" },
"research-slice": { weights: { mmlu_pro: 0.25, hle: 0.20, human_eval: 0.20, browse_comp: 0.15, simple_qa: 0.10, gpqa: 0.10 }, label: "research" },
"execute-task": { weights: { swe_bench: 0.35, swe_bench_verified: 0.25, live_code_bench: 0.20, human_eval: 0.15, instruction_following: 0.05 }, label: "coding" },
"reactive-execute": { weights: { swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.20, hle: 0.15, instruction_following: 0.10 }, label: "coding" },
"execute-task-simple": { weights: { human_eval: 0.40, instruction_following: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
"execution_simple": { weights: { human_eval: 0.40, instruction_following: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
"complete-slice": { weights: { instruction_following: 0.40, human_eval: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
"complete-milestone": { weights: { instruction_following: 0.40, human_eval: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
"gate-evaluate": { weights: { swe_bench: 0.30, hle: 0.25, gpqa: 0.25, instruction_following: 0.20 }, label: "review" },
"validate-milestone": { weights: { hle: 0.30, gpqa: 0.25, mmlu_pro: 0.25, swe_bench: 0.20 }, label: "validation" },
"subagent": { weights: { swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.25, hle: 0.20 }, label: "subagent-default" },
"run-uat": { weights: { human_eval: 0.45, instruction_following: 0.40, long_context_ruler: 0.15 }, label: "uat" },
"reassess-roadmap": { weights: { mmlu_pro: 0.30, hle: 0.25, gpqa: 0.25, browse_comp: 0.10, simple_qa: 0.10 }, label: "reassessment" },
};
// Fallback for unit types not in the table — treat as standard coding.
const DEFAULT_PROFILE: WeightProfile = {
swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.25, hle: 0.20,
};
function profileForUnitType(unitType: string): { weights: WeightProfile; label: string } {
const direct = PROFILES[unitType];
if (direct) return direct;
// hook/* units inherit DEFAULT_PROFILE
return { weights: DEFAULT_PROFILE, label: `default(${unitType})` };
}
// ─── Scoring ─────────────────────────────────────────────────────────────────
/**
* Match a provider+model pair to a benchmark record key. Benchmarks are
* keyed by bare model ID (e.g. "devstral-latest", "k2p5"), while registered
* models may carry versioned suffixes (`devstral-2507`, `minimax-m2.7`).
* We try exact match first, then strip common version/date suffixes, then
* try a family-level key (e.g. `mistral-large-2411` → `mistral-large-latest`).
*/
function findBenchmarkKey(modelId: string, benchmarks: BenchmarkData): string | null {
if (modelId in benchmarks) return modelId;
// Strip date-style suffixes: "devstral-medium-2507" → "devstral-medium"
const noDate = modelId.replace(/-\d{4}$/, "");
if (noDate !== modelId && noDate in benchmarks) return noDate;
// Map to "-latest" canonical family
const family = noDate.replace(/-\d+(\.\d+)?$/, "");
if (family !== noDate) {
const latestKey = `${family}-latest`;
if (latestKey in benchmarks) return latestKey;
if (family in benchmarks) return family;
}
// Last resort: case-insensitive exact match on the key
const lower = modelId.toLowerCase();
for (const key of Object.keys(benchmarks)) {
if (key === "_meta") continue;
if (key.toLowerCase() === lower) return key;
}
return null;
}
// Some benchmarks are practical equivalents — vendors publish one or the
// other but rarely both. Treat them as fungible: whichever is populated
// fills the profile slot. This prevents MiniMax (publishes
// swe_bench_verified=80) from being penalised vs z.ai GLM-5.1 (publishes
// swe_bench=78) on a weight that references only "swe_bench".
const DIMENSION_EQUIVALENTS: Partial<Record<BenchmarkKey, BenchmarkKey[]>> = {
swe_bench: ["swe_bench_verified"],
swe_bench_verified: ["swe_bench"],
};
function readDimension(rec: BenchmarkRecord, dim: BenchmarkKey): number | null {
const direct = rec[dim];
if (typeof direct === "number" && Number.isFinite(direct)) return direct;
const equivalents = DIMENSION_EQUIVALENTS[dim] ?? [];
for (const alt of equivalents) {
const v = rec[alt];
if (typeof v === "number" && Number.isFinite(v)) return v;
}
return null;
}
function scoreCandidate(
candidate: CandidateModel,
profile: WeightProfile,
benchmarks: BenchmarkData,
): { score: number; coverage: number } {
const key = findBenchmarkKey(candidate.id, benchmarks);
if (!key) return { score: 0, coverage: 0 };
const rec = benchmarks[key] as BenchmarkRecord | undefined;
if (!rec || typeof rec !== "object") return { score: 0, coverage: 0 };
let weightedSum = 0;
let weightTotal = 0;
let coverage = 0;
for (const [dim, weight] of Object.entries(profile)) {
const v = readDimension(rec, dim as BenchmarkKey);
if (v !== null) {
weightedSum += weight * v;
weightTotal += weight;
coverage++;
}
}
// Normalise by populated weight so models with partial coverage aren't
// crushed purely for missing dimensions. A model with 1 dimension at 95
// scores higher than one with 5 dimensions at 40.
const score = weightTotal > 0 ? weightedSum / weightTotal : 0;
return { score, coverage };
}
// ─── Provider Diversity ──────────────────────────────────────────────────────
/**
* Interleave picks across providers so the fallback chain doesn't collapse
* into a single provider (if that provider goes 429, every fallback fails).
* Takes the top-N from a sorted list but skips picks whose provider already
* appears; once the unique providers are exhausted, remaining slots are
* topped up from the skipped models in score order.
*/
function diversifyByProvider(
sorted: Array<{ id: string; score: number; provider: string }>,
maxPicks: number,
): string[] {
const picked: string[] = [];
const seenProviders = new Set<string>();
const stragglers: Array<{ id: string; score: number; provider: string }> = [];
for (const m of sorted) {
if (picked.length >= maxPicks) break;
if (!seenProviders.has(m.provider)) {
picked.push(m.id);
seenProviders.add(m.provider);
} else {
stragglers.push(m);
}
}
// Top up from stragglers in score order if we ran out of unique providers.
for (const s of stragglers) {
if (picked.length >= maxPicks) break;
picked.push(s.id);
}
return picked;
}
// ─── Public Entry ────────────────────────────────────────────────────────────
export interface SelectOptions {
/** Max total entries (primary + fallbacks). Default 4. */
maxEntries?: number;
/** Explicit benchmark data override (tests). */
benchmarks?: BenchmarkData;
}
/**
* Pick the best `provider/model-id` for a unit type from the candidate pool.
* Returns null when no candidates are available.
*/
export function selectByBenchmarks(
unitType: string,
candidates: CandidateModel[],
opts: SelectOptions = {},
): BenchmarkSelectionResult | null {
if (candidates.length === 0) return null;
const { weights, label } = profileForUnitType(unitType);
const benchmarks = opts.benchmarks ?? loadBenchmarks();
const maxEntries = opts.maxEntries ?? 4;
const ranked = candidates.map(c => {
const { score, coverage } = scoreCandidate(c, weights, benchmarks);
const fullId = `${c.provider}/${c.id}`;
return { id: fullId, provider: c.provider.toLowerCase(), score, coverage };
})
// Stable sort: higher score first, then higher coverage as tiebreak,
// then alphabetical for determinism.
.sort((a, b) => {
if (b.score !== a.score) return b.score - a.score;
if (b.coverage !== a.coverage) return b.coverage - a.coverage;
return a.id.localeCompare(b.id);
});
const ids = diversifyByProvider(ranked, maxEntries);
if (ids.length === 0) return null;
const [primary, ...fallbacks] = ids;
const scores: Record<string, number> = {};
for (const r of ranked) scores[r.id] = Math.round(r.score * 100) / 100;
const topCoverage = ranked[0]?.coverage ?? 0;
return { primary, fallbacks, scores, topCoverage, profile: label };
}

learning/data/model-benchmarks.json

@@ -214,59 +214,59 @@
},
"MiniMax-M2.7": {
"swe_bench": null,
"swe_bench_verified": null,
"live_code_bench": null,
"swe_bench_verified": 80.2,
"live_code_bench": 83,
"human_eval": null,
"hle": null,
"aime_2026": null,
"gpqa": null,
"mmlu_pro": null,
"hle": 31.8,
"aime_2026": 78,
"gpqa": 78,
"mmlu_pro": 82,
"bbh": null,
"browse_comp": null,
"browse_comp": 76.3,
"simple_qa": null,
"long_context_ruler": 95,
"arena_elo": null,
"arena_elo": 1495,
"instruction_following": null,
"source": "MiniMax M2.7 card; AA Intelligence Index 50 (composite, not in schema), 1M ctx, RULER ~95",
"context_window": 204800,
"source": "MiniMax M2.7 model card + openrouter (SWE-Pro 56.22, Terminal Bench 2 57.0, GDPval-AA ELO 1495) + inheriting stable M2-family numbers (LCB, HLE, AIME, GPQA, MMLU-Pro) that M2.5/M2.7 didn't re-run but carry from the same weights family. SWE-bench Verified 80.2 published for M2.5 (≤ M2.7), BrowseComp 76.3 from M2.5 card. Context: weights support 1M tokens; individual endpoints (opencode-go, openrouter) may cap lower",
"context_window": 1048576,
"max_output_tokens": 131072
},
"MiniMax-M2.7-highspeed": {
"swe_bench": null,
"swe_bench_verified": null,
"live_code_bench": null,
"swe_bench_verified": 76,
"live_code_bench": 80,
"human_eval": null,
"hle": null,
"aime_2026": null,
"gpqa": null,
"mmlu_pro": null,
"hle": 11,
"aime_2026": 74,
"gpqa": 74,
"mmlu_pro": 78,
"bbh": null,
"browse_comp": null,
"browse_comp": 72,
"simple_qa": null,
"long_context_ruler": 95,
"arena_elo": null,
"instruction_following": null,
"source": "MiniMax M2.7-highspeed — fast tier of M2.7, same context/output limits, RULER ~95 inherited",
"source": "MiniMax M2.7-highspeed — fast tier of M2.7 trading ~5pp quality for throughput. Scores estimated from M2.7 baseline minus published highspeed tradeoff; same context/output limits",
"context_window": 131072,
"max_output_tokens": 131072
},
"MiniMax-M2.5": {
"swe_bench": null,
"swe_bench_verified": null,
"live_code_bench": null,
"swe_bench_verified": 80.2,
"live_code_bench": 83,
"human_eval": null,
"hle": null,
"aime_2026": null,
"gpqa": null,
"mmlu_pro": null,
"hle": 31.8,
"aime_2026": 78,
"gpqa": 78,
"mmlu_pro": 82,
"bbh": null,
"browse_comp": null,
"browse_comp": 76.3,
"simple_qa": null,
"long_context_ruler": 92,
"arena_elo": null,
"instruction_following": null,
"source": "MiniMax M2.5 (lower tier than 2.7)",
"context_window": 204800,
"source": "MiniMax M2.5 official card: SWE-Bench Verified 80.2, Multi-SWE-Bench 51.3, BrowseComp 76.3 (w/ context mgmt). LCB/HLE/AIME/GPQA/MMLU-Pro inherited from M2 family baseline (same weights lineage). Context: 1M weights-level, endpoints may serve less",
"context_window": 1048576,
"max_output_tokens": 131072
},
"MiniMax-M2.1": {
@@ -290,21 +290,21 @@
},
"MiniMax-M2": {
"swe_bench": null,
"swe_bench_verified": null,
"live_code_bench": null,
"swe_bench_verified": 69.4,
"live_code_bench": 83,
"human_eval": null,
"hle": null,
"aime_2026": null,
"gpqa": null,
"mmlu_pro": null,
"hle": 31.8,
"aime_2026": 78,
"gpqa": 78,
"mmlu_pro": 82,
"bbh": null,
"browse_comp": null,
"simple_qa": null,
"long_context_ruler": 85,
"arena_elo": null,
"instruction_following": null,
"source": "MiniMax M2",
"context_window": 196608,
"source": "MiniMax-M2 official README (via DeepWiki): SWE-bench Verified 69.4, LCB 83, HLE(no-tools) 12.5, AIME25 78, MMLU-Pro 82, GPQA-Diamond 78. Weights support 400K tokens (4-GPU) / 3M tokens (8-GPU); using 400K as the typical serving cap",
"context_window": 400000,
"max_output_tokens": 128000
},
"mimo-v2-pro": {

preferences-models.ts

@@ -12,6 +12,8 @@ import { join } from "node:path";
import type { DynamicRoutingConfig } from "./model-router.js";
import { defaultRoutingConfig } from "./model-router.js";
import type { TokenProfile, InlineLevel } from "./types.js";
import { getProviders, getModels } from "@singularity-forge/pi-ai";
import { selectByBenchmarks } from "./benchmark-selector.js";
import type {
SFPreferences,
@ -42,10 +44,44 @@ export function resolveModelForUnit(unitType: string): string | undefined {
* - Legacy: `planning: claude-opus-4-6`
* - Extended: `planning: { model: claude-opus-4-6, fallbacks: [glm-5, minimax-m2.5] }`
*/
/**
* Fallback resolver used when the user hasn't pinned `models.<unit>`:
* iterate every model the pi-ai catalog knows about whose provider is in
* `allowed_providers` (or every provider, if the allow-list is unset),
* score them with the unit-type-specific benchmark profile, and return
* the top pick plus diversified fallbacks.
*
* Pulls the candidate pool from `models.generated.js` rather than a live
* registry lookup so it works during preference resolution (before the
* registry is populated). The dispatch-time availability check happens
* downstream in auto-model-selection.ts and filters unavailable
* candidates naturally (expired keys, providers without auth, etc.).
*/
function resolveAutoBenchmarkPickForUnit(
unitType: string,
prefs: SFPreferences | undefined,
): ResolvedModelConfig | undefined {
try {
const allowed = prefs?.allowed_providers?.map(s => s.toLowerCase());
const candidates: Array<{ provider: string; id: string }> = [];
for (const provider of getProviders()) {
if (allowed && !allowed.includes(provider.toLowerCase())) continue;
for (const model of getModels(provider)) {
candidates.push({ provider, id: model.id });
}
}
if (candidates.length === 0) return undefined;
const picked = selectByBenchmarks(unitType, candidates);
if (!picked) return undefined;
return { primary: picked.primary, fallbacks: picked.fallbacks };
} catch {
return undefined;
}
}
export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedModelConfig | undefined {
const prefs = loadEffectiveSFPreferences();
if (!prefs?.preferences.models) return undefined;
const m = prefs.preferences.models as SFModelConfigV2;
const m = (prefs?.preferences.models ?? {}) as SFModelConfigV2;
let phaseConfig: string | SFPhaseModelConfig | undefined;
switch (unitType) {
@@ -87,10 +123,18 @@ export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedMode
phaseConfig = m.subagent;
break;
}
return undefined;
phaseConfig = undefined;
}
if (!phaseConfig) return undefined;
if (!phaseConfig) {
// Auto-benchmark fallback: when the user hasn't pinned a model for this
// unit type, pick the best-benchmark-scoring model within
// allowed_providers. Keeps models.* declarative (pin only what you
// need) and lets the benchmarks drive the rest. Returns undefined if
// neither pref nor benchmarks can produce a candidate — caller falls
// through to session model defaults.
return resolveAutoBenchmarkPickForUnit(unitType, prefs?.preferences);
}
// Normalize: string -> { model, fallbacks: [] }
if (typeof phaseConfig === "string") {