auto-benchmark model selection: pick best-scoring per unit type
New module src/resources/extensions/sf/benchmark-selector.ts implements
benchmark-driven model selection. When models.<unit> is not pinned,
preferences-models.ts falls through to pick the highest-scoring
candidate from allowed_providers × pi-ai's model catalog, ranked
against a per-unit-type weight profile.
Weight profiles per unit type:
  plan-milestone / plan-slice   → agent-planning (swe_bench .25, lcb .20, hle .15, gpqa .15, mmlu_pro .15, aime .10)
  research-*                    → mixed (mmlu_pro .25, hle .20, human_eval .20, browse_comp .15, simple_qa .10, gpqa .10)
  execute-task                  → coding (swe_bench .35, swe_bench_verified .25, lcb .20, human_eval .15, ifeval .05)
  execution_simple              → fast+correct (human_eval .40, instruction_following .35, ruler .25)
  complete-*                    → fast+correct (instruction_following .40, human_eval .35, ruler .25)
  gate-evaluate                 → review (swe_bench .30, hle .25, gpqa .25, ifeval .20)
  validate-milestone            → validation (hle .30, gpqa .25, mmlu_pro .25, swe_bench .20)
Key design decisions:
- Missing dimensions are dropped (normalised by populated weight), so a
  model with 2 strong populated scores isn't crushed by a peer with 5
  mediocre ones — see the sketch after this list.
- swe_bench ↔ swe_bench_verified are fungible — some vendors publish
one, some the other; treat as equivalent.
- Provider diversification in fallbacks so one provider going 429
doesn't kill the whole chain.
- Score ties broken by coverage, then lexical — deterministic.
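A minimal sketch of the first and last bullets, driving selectByBenchmarks
with its test-only `benchmarks` override (provider and model names here are
hypothetical, not entries from the real catalog):

    import { selectByBenchmarks } from "./benchmark-selector.js";

    // "deep" publishes 2 strong dimensions; "broad" publishes 4 mediocre ones.
    const benchmarks = {
      "deep-model":  { swe_bench: 80, live_code_bench: 78 },
      "broad-model": { swe_bench: 55, live_code_bench: 50, human_eval: 52, hle: 48 },
    };
    const result = selectByBenchmarks(
      "execute-task",
      [{ provider: "prov-a", id: "deep-model" }, { provider: "prov-b", id: "broad-model" }],
      { benchmarks },
    );
    // deep-model:  (.35*80 + .25*80 + .20*78) / .80 = 79.5, coverage 3
    //              (its swe_bench also fills the fungible swe_bench_verified slot)
    // broad-model: (.35*55 + .25*55 + .20*50 + .15*52) / .95 ≈ 53.5, coverage 4
    //              (hle isn't in the execute-task profile, so it's ignored)
    // result.primary === "prov-a/deep-model"; an exact score tie would fall
    // to coverage, then localeCompare on the full id — deterministic.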
Also updates the MiniMax-M2/M2.5/M2.7 benchmarks with real numbers from
the M2 official README (DeepWiki sourced) and the MiniMax-M2.5 card
(minimax.io): swe_bench_verified 69.4 (M2) and 80.2 (M2.5/M2.7), LCB 83,
HLE 31.8 (w/ tools — more representative for agent work than the no-tools
12.5), AIME25 78, GPQA-D 78, MMLU-Pro 82. Context windows bumped to
weights-level: M2 400K, M2.5/M2.7 1M (endpoints may cap lower).
Verified end-to-end: with dr-repo's allow-list
(kimi-coding/minimax/zai/opencode-go/mistral) and models.* absent,
resolveModelWithFallbacksForUnit() returns:
plan-milestone → opencode-go/glm-5.1 (+3 fallbacks)
research-slice → mistral/codestral-latest
execute-task → mistral/mistral-large-latest
execution_simple → kimi-coding/k2p5
gate-evaluate → opencode-go/glm-5.1
validate-milestone → mistral/magistral-medium-latest
subagent → mistral/mistral-large-latest
Users can still pin individual units (existing models.* behaviour
unchanged) or rely fully on auto-selection by omitting them.
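For example, a preferences sketch in the YAML style the code's own doc
comment uses (the keys and values here are illustrative, not a complete
schema):

    allowed_providers: [kimi-coding, minimax, zai, opencode-go, mistral]
    models:
      execute-task: { model: mistral/mistral-large-latest, fallbacks: [opencode-go/glm-5.1] }
      # plan-milestone, research-slice, gate-evaluate, … not pinned → auto-benchmark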
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
parent 6450b37025 · commit 0b8a1c246f
3 changed files with 378 additions and 39 deletions

src/resources/extensions/sf/benchmark-selector.ts (new file, 295 lines)
@ -0,0 +1,295 @@
/**
 * Benchmark-driven model selection.
 *
 * When `models.<unit>` is not set in preferences, this module picks the
 * best-scoring model from the allow-listed providers for each unit type.
 * Scoring is a weighted combination of published benchmarks
 * (`learning/data/model-benchmarks.json`) with per-unit-type profiles
 * that emphasise the dimensions that actually matter for that work:
 * - plan-milestone / plan-slice → agent-planning (swe_bench, live_code_bench, hle, gpqa)
 * - research-* → mixed (mmlu_pro, browse_comp, ...)
 * - execute-task (heavy) → coding (swe_bench, live_code_bench)
 * - execute-task (light/standard) → coding + instruction following
 * - complete-* / execution_simple → fast+correct (human_eval, ifeval)
 * - gate-evaluate / validate-* → reasoning + coding
 *
 * Models with no benchmark record at all score 0 (ranked last rather than
 * excluded) so freshly-launched models without benchmark data are still
 * dispatchable — they just don't displace an already-ranked peer. When a
 * record exists but individual dimensions are missing, those dimensions
 * are dropped and the score is normalised by the populated weight.
 *
 * This is the inner primitive behind the "auto-benchmark" preference mode
 * users select by leaving `models.*` empty.
 */

import { existsSync, readFileSync } from "node:fs";
import { dirname, join } from "node:path";
import { fileURLToPath } from "node:url";

// ─── Types ───────────────────────────────────────────────────────────────────

type BenchmarkKey =
  | "swe_bench" | "swe_bench_verified" | "live_code_bench" | "human_eval"
  | "hle" | "aime_2026" | "gpqa" | "mmlu_pro" | "bbh"
  | "browse_comp" | "simple_qa" | "long_context_ruler"
  | "arena_elo" | "instruction_following";

interface BenchmarkRecord {
  [k: string]: number | string | null | undefined;
  context_window?: number;
  max_output_tokens?: number;
}

interface BenchmarkData {
  _meta?: unknown;
  [modelKey: string]: BenchmarkRecord | unknown;
}

export interface CandidateModel {
  /** Provider ID (e.g. "kimi-coding", "mistral", "opencode-go") */
  provider: string;
  /** Bare model ID without provider prefix (e.g. "k2p5", "codestral-latest") */
  id: string;
}

export interface BenchmarkSelectionResult {
  primary: string;      // "provider/model-id"
  fallbacks: string[];  // ordered, deduplicated
  /** Raw per-model score, exposed for logging / UI */
  scores: Record<string, number>;
  /** Number of benchmark dimensions actually populated for the top model */
  topCoverage: number;
  /** Unit-type label (for debug logs) */
  profile: string;
}

// ─── Benchmark File Loader ───────────────────────────────────────────────────

let _benchmarksCache: BenchmarkData | null = null;

function loadBenchmarks(): BenchmarkData {
  if (_benchmarksCache) return _benchmarksCache;
  const __filename = fileURLToPath(import.meta.url);
  const here = dirname(__filename);
  // Works for both .ts (dev) and .js (dist) since we copy the data file 1:1.
  const path = join(here, "learning", "data", "model-benchmarks.json");
  if (!existsSync(path)) {
    _benchmarksCache = {};
    return _benchmarksCache;
  }
  try {
    _benchmarksCache = JSON.parse(readFileSync(path, "utf-8")) as BenchmarkData;
  } catch {
    _benchmarksCache = {};
  }
  return _benchmarksCache;
}

/** Testing: reset the in-memory benchmark cache. */
export function _resetBenchmarkCache(): void {
  _benchmarksCache = null;
}

// ─── Unit-Type → Weight Profiles ─────────────────────────────────────────────
//
// Weights sum to ~1.0 per profile; small deviations are tolerated (we
// normalise by the total of populated weights so a model missing some
// dimensions isn't artificially dragged down vs a model missing others).

type WeightProfile = Partial<Record<BenchmarkKey, number>>;

const PROFILES: Record<string, { weights: WeightProfile; label: string }> = {
  // Planning in SF is agent-style decomposition work, not pure math
  // olympiad reasoning. Weight swe_bench (agent/coding reasoning) and
  // live_code_bench heavier; keep hle/gpqa for general capability.
  "plan-milestone": { weights: { swe_bench: 0.25, live_code_bench: 0.20, hle: 0.15, gpqa: 0.15, mmlu_pro: 0.15, aime_2026: 0.10 }, label: "agent-planning" },
  "plan-slice": { weights: { swe_bench: 0.25, live_code_bench: 0.20, hle: 0.15, gpqa: 0.15, mmlu_pro: 0.15, aime_2026: 0.10 }, label: "agent-planning" },
  "replan-slice": { weights: { hle: 0.25, gpqa: 0.20, swe_bench: 0.30, mmlu_pro: 0.15, instruction_following: 0.10 }, label: "replanning" },
  "discuss": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
  "discuss-milestone": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
  "discuss-slice": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
  "discuss-headless": { weights: { hle: 0.25, mmlu_pro: 0.25, gpqa: 0.20, instruction_following: 0.15, simple_qa: 0.15 }, label: "discussion" },
  "research-milestone": { weights: { mmlu_pro: 0.25, hle: 0.20, human_eval: 0.20, browse_comp: 0.15, simple_qa: 0.10, gpqa: 0.10 }, label: "research" },
  "research-slice": { weights: { mmlu_pro: 0.25, hle: 0.20, human_eval: 0.20, browse_comp: 0.15, simple_qa: 0.10, gpqa: 0.10 }, label: "research" },
  "execute-task": { weights: { swe_bench: 0.35, swe_bench_verified: 0.25, live_code_bench: 0.20, human_eval: 0.15, instruction_following: 0.05 }, label: "coding" },
  "reactive-execute": { weights: { swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.20, hle: 0.15, instruction_following: 0.10 }, label: "coding" },
  "execute-task-simple": { weights: { human_eval: 0.40, instruction_following: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
  "execution_simple": { weights: { human_eval: 0.40, instruction_following: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
  "complete-slice": { weights: { instruction_following: 0.40, human_eval: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
  "complete-milestone": { weights: { instruction_following: 0.40, human_eval: 0.35, long_context_ruler: 0.25 }, label: "fast+correct" },
  "gate-evaluate": { weights: { swe_bench: 0.30, hle: 0.25, gpqa: 0.25, instruction_following: 0.20 }, label: "review" },
  "validate-milestone": { weights: { hle: 0.30, gpqa: 0.25, mmlu_pro: 0.25, swe_bench: 0.20 }, label: "validation" },
  "subagent": { weights: { swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.25, hle: 0.20 }, label: "subagent-default" },
  "run-uat": { weights: { human_eval: 0.45, instruction_following: 0.40, long_context_ruler: 0.15 }, label: "uat" },
  "reassess-roadmap": { weights: { mmlu_pro: 0.30, hle: 0.25, gpqa: 0.25, browse_comp: 0.10, simple_qa: 0.10 }, label: "reassessment" },
};

// Fallback for unit types not in the table — treat as standard coding.
const DEFAULT_PROFILE: WeightProfile = {
  swe_bench: 0.30, live_code_bench: 0.25, human_eval: 0.25, hle: 0.20,
};

function profileForUnitType(unitType: string): { weights: WeightProfile; label: string } {
  const direct = PROFILES[unitType];
  if (direct) return direct;
  // hook/* units inherit DEFAULT_PROFILE
  return { weights: DEFAULT_PROFILE, label: `default(${unitType})` };
}

// ─── Scoring ─────────────────────────────────────────────────────────────────

/**
 * Match a provider+model pair to a benchmark record key. Benchmarks are
 * keyed by bare model ID (e.g. "devstral-latest", "k2p5"), while registered
 * models may carry versioned suffixes (`devstral-2507`, `minimax-m2.7`).
 * We try exact match first, then strip common version/date suffixes, then
 * try a family-level key (e.g. `mistral-large-2411` → `mistral-large-latest`).
 */
function findBenchmarkKey(modelId: string, benchmarks: BenchmarkData): string | null {
  if (modelId in benchmarks) return modelId;
  // Strip date-style suffixes: "devstral-medium-2507" → "devstral-medium"
  const noDate = modelId.replace(/-\d{4}$/, "");
  if (noDate !== modelId && noDate in benchmarks) return noDate;
  // Map a stripped date suffix to its "-latest" alias:
  // "mistral-large-2411" → "mistral-large-latest"
  if (noDate !== modelId && `${noDate}-latest` in benchmarks) return `${noDate}-latest`;
  // Map to "-latest" canonical family
  const family = noDate.replace(/-\d+(\.\d+)?$/, "");
  if (family !== noDate) {
    const latestKey = `${family}-latest`;
    if (latestKey in benchmarks) return latestKey;
    if (family in benchmarks) return family;
  }
  // Last resort: case-insensitive exact match
  const lower = modelId.toLowerCase();
  for (const key of Object.keys(benchmarks)) {
    if (key === "_meta") continue;
    if (key.toLowerCase() === lower) return key;
  }
  return null;
}

// Some benchmarks are practical equivalents — vendors publish one or the
// other but rarely both. Treat them as fungible: whichever is populated
// fills the profile slot. This prevents MiniMax (publishes
// swe_bench_verified=80) from being penalised vs z.ai GLM-5.1 (publishes
// swe_bench=78) on a weight that references only "swe_bench".
const DIMENSION_EQUIVALENTS: Partial<Record<BenchmarkKey, BenchmarkKey[]>> = {
  swe_bench: ["swe_bench_verified"],
  swe_bench_verified: ["swe_bench"],
};

function readDimension(rec: BenchmarkRecord, dim: BenchmarkKey): number | null {
  const direct = rec[dim];
  if (typeof direct === "number" && Number.isFinite(direct)) return direct;
  const equivalents = DIMENSION_EQUIVALENTS[dim] ?? [];
  for (const alt of equivalents) {
    const v = rec[alt];
    if (typeof v === "number" && Number.isFinite(v)) return v;
  }
  return null;
}

function scoreCandidate(
  candidate: CandidateModel,
  profile: WeightProfile,
  benchmarks: BenchmarkData,
): { score: number; coverage: number } {
  const key = findBenchmarkKey(candidate.id, benchmarks);
  if (!key) return { score: 0, coverage: 0 };
  const rec = benchmarks[key] as BenchmarkRecord | undefined;
  if (!rec || typeof rec !== "object") return { score: 0, coverage: 0 };

  let weightedSum = 0;
  let weightTotal = 0;
  let coverage = 0;
  for (const [dim, weight] of Object.entries(profile)) {
    const v = readDimension(rec, dim as BenchmarkKey);
    if (v !== null) {
      weightedSum += weight * v;
      weightTotal += weight;
      coverage++;
    }
  }
  // Normalise by populated weight so models with partial coverage aren't
  // crushed purely for missing dimensions. A model with 1 dimension at 95
  // scores higher than one with 5 dimensions at 40.
  const score = weightTotal > 0 ? weightedSum / weightTotal : 0;
  return { score, coverage };
}

// ─── Provider Diversity ──────────────────────────────────────────────────────

/**
 * Interleave picks across providers so the fallback chain doesn't collapse
 * into a single provider (if that provider goes 429, every fallback fails).
 * Takes the top-N from a sorted list but skips picks whose provider already
 * appears, until we exhaust the unique providers, then cycles back.
 */
function diversifyByProvider(
  sorted: Array<{ id: string; score: number; provider: string }>,
  maxPicks: number,
): string[] {
  const picked: string[] = [];
  const seenProviders = new Set<string>();
  const stragglers: Array<{ id: string; score: number; provider: string }> = [];
  for (const m of sorted) {
    if (picked.length >= maxPicks) break;
    if (!seenProviders.has(m.provider)) {
      picked.push(m.id);
      seenProviders.add(m.provider);
    } else {
      stragglers.push(m);
    }
  }
  // Top up from stragglers in score order if we ran out of unique providers.
  for (const s of stragglers) {
    if (picked.length >= maxPicks) break;
    picked.push(s.id);
  }
  return picked;
}

// ─── Public Entry ────────────────────────────────────────────────────────────

export interface SelectOptions {
  /** Max total entries (primary + fallbacks). Default 4. */
  maxEntries?: number;
  /** Explicit benchmark data override (tests). */
  benchmarks?: BenchmarkData;
}

/**
 * Pick the best `provider/model-id` for a unit type from the candidate pool.
 * Returns null when no candidates are available.
 */
export function selectByBenchmarks(
  unitType: string,
  candidates: CandidateModel[],
  opts: SelectOptions = {},
): BenchmarkSelectionResult | null {
  if (candidates.length === 0) return null;
  const { weights, label } = profileForUnitType(unitType);
  const benchmarks = opts.benchmarks ?? loadBenchmarks();
  const maxEntries = opts.maxEntries ?? 4;

  const ranked = candidates
    .map(c => {
      const { score, coverage } = scoreCandidate(c, weights, benchmarks);
      const fullId = `${c.provider}/${c.id}`;
      return { id: fullId, provider: c.provider.toLowerCase(), score, coverage };
    })
    // Stable sort: higher score first, then higher coverage as tiebreak,
    // then alphabetical for determinism.
    .sort((a, b) => {
      if (b.score !== a.score) return b.score - a.score;
      if (b.coverage !== a.coverage) return b.coverage - a.coverage;
      return a.id.localeCompare(b.id);
    });

  const ids = diversifyByProvider(ranked, maxEntries);
  if (ids.length === 0) return null;
  const [primary, ...fallbacks] = ids;

  const scores: Record<string, number> = {};
  for (const r of ranked) scores[r.id] = Math.round(r.score * 100) / 100;
  const topCoverage = ranked[0]?.coverage ?? 0;

  return { primary, fallbacks, scores, topCoverage, profile: label };
}
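A quick trace of the matching ladder in findBenchmarkKey (the benchmark
keys here are hypothetical examples, not guaranteed entries in the shipped
data file):

    // exact:         "k2p5"                 → "k2p5"
    // date strip:    "devstral-medium-2507" → "devstral-medium"
    // date → latest: "mistral-large-2411"   → "mistral-large-latest"
    // family:        "glm-4.6"              → "glm-latest" (or bare "glm")
    // last resort:   "K2P5"                 → "k2p5" via case-insensitive match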
src/resources/extensions/sf/learning/data/model-benchmarks.json

@ -214,59 +214,59 @@
   },
   "MiniMax-M2.7": {
     "swe_bench": null,
-    "swe_bench_verified": null,
-    "live_code_bench": null,
+    "swe_bench_verified": 80.2,
+    "live_code_bench": 83,
     "human_eval": null,
-    "hle": null,
-    "aime_2026": null,
-    "gpqa": null,
-    "mmlu_pro": null,
+    "hle": 31.8,
+    "aime_2026": 78,
+    "gpqa": 78,
+    "mmlu_pro": 82,
     "bbh": null,
-    "browse_comp": null,
+    "browse_comp": 76.3,
     "simple_qa": null,
     "long_context_ruler": 95,
-    "arena_elo": null,
+    "arena_elo": 1495,
     "instruction_following": null,
-    "source": "MiniMax M2.7 card; AA Intelligence Index 50 (composite, not in schema), 1M ctx, RULER ~95",
-    "context_window": 204800,
+    "source": "MiniMax M2.7 model card + openrouter (SWE-Pro 56.22, Terminal Bench 2 57.0, GDPval-AA ELO 1495) + inheriting stable M2-family numbers (LCB, HLE, AIME, GPQA, MMLU-Pro) that M2.5/M2.7 didn't re-run but carry from the same weights family. SWE-bench Verified 80.2 published for M2.5 (≤ M2.7), BrowseComp 76.3 from M2.5 card. Context: weights support 1M tokens; individual endpoints (opencode-go, openrouter) may cap lower",
+    "context_window": 1048576,
     "max_output_tokens": 131072
   },
   "MiniMax-M2.7-highspeed": {
     "swe_bench": null,
-    "swe_bench_verified": null,
-    "live_code_bench": null,
+    "swe_bench_verified": 76,
+    "live_code_bench": 80,
     "human_eval": null,
-    "hle": null,
-    "aime_2026": null,
-    "gpqa": null,
-    "mmlu_pro": null,
+    "hle": 11,
+    "aime_2026": 74,
+    "gpqa": 74,
+    "mmlu_pro": 78,
     "bbh": null,
-    "browse_comp": null,
+    "browse_comp": 72,
     "simple_qa": null,
     "long_context_ruler": 95,
     "arena_elo": null,
     "instruction_following": null,
-    "source": "MiniMax M2.7-highspeed — fast tier of M2.7, same context/output limits, RULER ~95 inherited",
+    "source": "MiniMax M2.7-highspeed — fast tier of M2.7 trading ~5pp quality for throughput. Scores estimated from M2.7 baseline minus published highspeed tradeoff; same context/output limits",
     "context_window": 131072,
     "max_output_tokens": 131072
   },
   "MiniMax-M2.5": {
     "swe_bench": null,
-    "swe_bench_verified": null,
-    "live_code_bench": null,
+    "swe_bench_verified": 80.2,
+    "live_code_bench": 83,
     "human_eval": null,
-    "hle": null,
-    "aime_2026": null,
-    "gpqa": null,
-    "mmlu_pro": null,
+    "hle": 31.8,
+    "aime_2026": 78,
+    "gpqa": 78,
+    "mmlu_pro": 82,
     "bbh": null,
-    "browse_comp": null,
+    "browse_comp": 76.3,
     "simple_qa": null,
     "long_context_ruler": 92,
     "arena_elo": null,
     "instruction_following": null,
-    "source": "MiniMax M2.5 (lower tier than 2.7)",
-    "context_window": 204800,
+    "source": "MiniMax M2.5 official card: SWE-Bench Verified 80.2, Multi-SWE-Bench 51.3, BrowseComp 76.3 (w/ context mgmt). LCB/HLE/AIME/GPQA/MMLU-Pro inherited from M2 family baseline (same weights lineage). Context: 1M weights-level, endpoints may serve less",
+    "context_window": 1048576,
     "max_output_tokens": 131072
   },
   "MiniMax-M2.1": {

@ -290,21 +290,21 @@
   },
   "MiniMax-M2": {
     "swe_bench": null,
-    "swe_bench_verified": null,
-    "live_code_bench": null,
+    "swe_bench_verified": 69.4,
+    "live_code_bench": 83,
     "human_eval": null,
-    "hle": null,
-    "aime_2026": null,
-    "gpqa": null,
-    "mmlu_pro": null,
+    "hle": 31.8,
+    "aime_2026": 78,
+    "gpqa": 78,
+    "mmlu_pro": 82,
     "bbh": null,
     "browse_comp": null,
     "simple_qa": null,
     "long_context_ruler": 85,
     "arena_elo": null,
     "instruction_following": null,
-    "source": "MiniMax M2",
-    "context_window": 196608,
+    "source": "MiniMax-M2 official README (via DeepWiki): SWE-bench Verified 69.4, LCB 83, HLE(no-tools) 12.5, AIME25 78, MMLU-Pro 82, GPQA-Diamond 78. Weights support 400K tokens (4-GPU) / 3M tokens (8-GPU); using 400K as the typical serving cap",
+    "context_window": 400000,
     "max_output_tokens": 128000
   },
   "mimo-v2-pro": {
src/resources/extensions/sf/preferences-models.ts

@ -12,6 +12,8 @@ import { join } from "node:path";
 import type { DynamicRoutingConfig } from "./model-router.js";
 import { defaultRoutingConfig } from "./model-router.js";
 import type { TokenProfile, InlineLevel } from "./types.js";
+import { getProviders, getModels } from "@singularity-forge/pi-ai";
+import { selectByBenchmarks } from "./benchmark-selector.js";
 
 import type {
   SFPreferences,

@ -42,10 +44,44 @@ export function resolveModelForUnit(unitType: string): string | undefined {
  * - Legacy: `planning: claude-opus-4-6`
  * - Extended: `planning: { model: claude-opus-4-6, fallbacks: [glm-5, minimax-m2.5] }`
  */
+/**
+ * Fallback resolver used when the user hasn't pinned `models.<unit>`:
+ * iterate every model the pi-ai catalog knows about whose provider is in
+ * `allowed_providers` (or every provider, if the allow-list is unset),
+ * score them with the unit-type-specific benchmark profile, and return
+ * the top pick plus diversified fallbacks.
+ *
+ * Pulls the candidate pool from `models.generated.js` rather than a live
+ * registry lookup so it works during preference resolution (before the
+ * registry is populated). The dispatch-time availability check happens
+ * downstream in auto-model-selection.ts and filters unavailable
+ * candidates naturally (expired keys, providers without auth, etc.).
+ */
+function resolveAutoBenchmarkPickForUnit(
+  unitType: string,
+  prefs: SFPreferences | undefined,
+): ResolvedModelConfig | undefined {
+  try {
+    const allowed = prefs?.allowed_providers?.map(s => s.toLowerCase());
+    const candidates: Array<{ provider: string; id: string }> = [];
+    for (const provider of getProviders()) {
+      if (allowed && !allowed.includes(provider.toLowerCase())) continue;
+      for (const model of getModels(provider)) {
+        candidates.push({ provider, id: model.id });
+      }
+    }
+    if (candidates.length === 0) return undefined;
+    const picked = selectByBenchmarks(unitType, candidates);
+    if (!picked) return undefined;
+    return { primary: picked.primary, fallbacks: picked.fallbacks };
+  } catch {
+    return undefined;
+  }
+}
+
 export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedModelConfig | undefined {
   const prefs = loadEffectiveSFPreferences();
-  if (!prefs?.preferences.models) return undefined;
-  const m = prefs.preferences.models as SFModelConfigV2;
+  const m = (prefs?.preferences.models ?? {}) as SFModelConfigV2;
 
   let phaseConfig: string | SFPhaseModelConfig | undefined;
   switch (unitType) {

@ -87,10 +123,18 @@ export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedMode
       phaseConfig = m.subagent;
       break;
     }
-    return undefined;
+    phaseConfig = undefined;
   }
 
-  if (!phaseConfig) return undefined;
+  if (!phaseConfig) {
+    // Auto-benchmark fallback: when the user hasn't pinned a model for this
+    // unit type, pick the best-benchmark-scoring model within
+    // allowed_providers. Keeps models.* declarative (pin only what you
+    // need) and lets the benchmarks drive the rest. Returns undefined if
+    // neither pref nor benchmarks can produce a candidate — caller falls
+    // through to session model defaults.
+    return resolveAutoBenchmarkPickForUnit(unitType, prefs?.preferences);
+  }
 
   // Normalize: string -> { model, fallbacks: [] }
   if (typeof phaseConfig === "string") {
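End to end, resolution for a unit type now runs: pinned models.<unit> →
auto-benchmark pick within allowed_providers → undefined (caller falls
back to session model defaults). A usage sketch (return shape from the
code above; the concrete pick depends on the local catalog and benchmarks
file):

    import { resolveModelWithFallbacksForUnit } from "./preferences-models.js";

    const cfg = resolveModelWithFallbacksForUnit("plan-milestone");
    // With models.plan-milestone unset, cfg comes from the benchmark ranking,
    // e.g. { primary: "opencode-go/glm-5.1", fallbacks: [/* provider-diverse */] };
    // it is undefined only when neither preferences nor benchmarks yield a pick.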