feat(benchmark-coverage): variant-fallback lookup (R050 step 1)
benchmark-coverage.js: new benchmarkLookupVariants() returns ordered
fallback keys for a model id, and computeBenchmarkCoverage tries each
variant before flagging uncovered. Patterns covered:
- date/version suffix strip ("mistral-medium-2505" → "mistral-medium")
- tier strip ("X-flash-lite" → "X-flash", "Y-lite" → "Y")
- "-latest" append for bare names ("mistral-medium" → "mistral-medium-latest")
The audit reports the matched variant via `matchedVia` so operators can
see when a model is covered only through a fallback key (rather than by a real benchmark entry of its own).
Verified: coverage 62/169 (36.7%) → 65/169 (38.5%). Sample fallback matches:
google-gemini-cli/gemini-2.5-flash-lite → gemini-2.5-flash
mistral/mistral-medium → mistral-medium-latest
mistral/magistral-small-2509 → magistral-small
R050 now active: full closure requires auto-benchmark of remaining
104 uncovered models via bulk-import of published scores or live eval.
This step shrinks the gap via cheap structural fallback; future work
adds the real scoring loop.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
bbce6827aa
commit
1dc7c2e278
2 changed files with 57 additions and 3 deletions
|
|
@ -623,3 +623,14 @@ ADR-0000 declares SF a **purpose-to-software compiler**. R036–R040 codify that
|
|||
- Supporting slices: none
|
||||
- Validation: unmapped
|
||||
- Notes: Builds on existing model-router.js scoring + R017's tool-failure demotion + R046's autonomous parallel dispatch. The new piece is the scheduler-level multi-model assignment per dispatch slot.
|
||||
|
||||
### R050 — Model Benchmark Coverage
|
||||
- Class: quality-attribute
|
||||
- Status: active
|
||||
- Description: Every model in the SF model-router registry must have a benchmark score (static or measured). Today (~/.sf/benchmark-coverage.json snapshot 2026-05-17): 57 of 154 models covered (37%). Notable uncovered: kimi-coding/kimi-for-coding (actively used by triage), mistral-medium-* family, gemini-2.5-flash-lite, gemini-3.1-flash-lite. Routing decisions (R017 demotion, R046 parallel scheduling, R049 multi-provider) are only as good as the scores they read; missing scores produce inconsistent routing.
|
||||
- Why it matters: Model selection compounds across thousands of dispatches in a 2-4 week autonomous run. Picking the wrong model wastes cost + time + produces lower quality work. Benchmark coverage is the foundation for routing intelligence.
|
||||
- Source: spec
|
||||
- Primary owning slice: unmapped (future "M037 Model Benchmark Coverage")
|
||||
- Supporting slices: none
|
||||
- Validation: unmapped
|
||||
- Notes: Two paths to scores: (a) bulk-import published scores from MMLU/HumanEval/SWE-bench for known models, (b) live-measure via SF's eval suite for unknown models (existing `.sf/evals/autonomous-solver/` framework). Doctor surfaces uncovered models; scheduler treats uncovered as "use cautiously, not for high-stakes units."
|
||||
|
|
|
|||
|
|
@ -103,6 +103,41 @@ export function normalizeForBenchmarkLookup(modelId) {
|
|||
return key;
|
||||
}
|
||||
|
||||
/**
 * #R050: produce an ordered list of fallback lookup keys for a model id, so a
 * model that is logically equivalent to one in the benchmark file can be
 * considered covered.
 *
 * The id is first passed through normalizeForBenchmarkLookup() (defined
 * elsewhere in this file); every variant is derived from that normalized key.
 * Variants, in order of preference (callers take the first match):
 *   1. the normalized key itself
 *   2. the key with a trailing date/version suffix removed
 *      ("mistral-medium-2505" → "mistral-medium", "x-2026-04" → "x")
 *   3. the key with a trailing "-lite" tier removed
 *      ("gemini-2.5-flash-lite" → "gemini-2.5-flash")
 *   4. for "-flash-lite-preview" keys: "-flash", then "-flash-preview"
 *   5. the key with "-latest" appended
 *      ("mistral-medium" → "mistral-medium-latest")
 *   6. the date-stripped key with "-latest" appended
 *      ("mistral-medium-2505" → "mistral-medium-latest")
 *
 * Not handled here: stepping back a minor version (e.g. "kimi-k2.7" →
 * "kimi-k2.6"); no such variant is generated.
 *
 * @param {string} modelId - Model id as it appears in the registry.
 * @returns {string[]} De-duplicated variant keys; index 0 is always the
 *   normalized primary key.
 */
export function benchmarkLookupVariants(modelId) {
  const primary = normalizeForBenchmarkLookup(modelId);
  const variants = [primary];

  // Strip date/version suffix like "-2505", "-2508", "-2026-04" etc.
  const dateStripped = primary.replace(/-\d{4,}(-\d{2,})?$/, "");
  const hasDateSuffix = dateStripped !== primary;
  if (hasDateSuffix) variants.push(dateStripped);

  // Strip the "-lite" tier suffix and try the non-lite variant. This also
  // covers "-flash-lite" → "-flash", so no dedicated branch is needed for it.
  if (primary.endsWith("-lite")) {
    variants.push(primary.slice(0, -"-lite".length));
  }
  if (primary.endsWith("-flash-lite-preview")) {
    const stem = primary.slice(0, -"-flash-lite-preview".length);
    variants.push(`${stem}-flash`, `${stem}-flash-preview`);
  }

  // Append -latest to bare names (e.g. mistral-medium → mistral-medium-latest)
  if (!primary.endsWith("-latest")) {
    variants.push(`${primary}-latest`);
  }
  // A dated id must also reach the "-latest" alias of its undated base
  // ("mistral-medium-2505" → "mistral-medium-latest"). It is appended last so
  // every earlier variant keeps its original precedence.
  if (
    hasDateSuffix &&
    dateStripped.length > 0 &&
    !dateStripped.endsWith("-latest")
  ) {
    variants.push(`${dateStripped}-latest`);
  }

  // Dedup while preserving order (Set iterates in insertion order).
  return [...new Set(variants)];
}
|
||||
|
||||
/**
|
||||
* Compute coverage stats for the user's dispatchable model set.
|
||||
*
|
||||
|
|
@ -137,9 +172,17 @@ export function computeBenchmarkCoverage(prefs) {
|
|||
) {
|
||||
continue;
|
||||
}
|
||||
const key = normalizeForBenchmarkLookup(entry.id);
|
||||
const bucket = benchmarkKeys.has(key) ? covered : uncovered;
|
||||
bucket.push({ provider: entry.provider, id: entry.id });
|
||||
// #R050: try the primary normalized key first, then fall back to
|
||||
// well-known variants (date-stripped, lite→non-lite, +-latest, ...).
|
||||
// Covered if ANY variant is in the benchmark file.
|
||||
const variants = benchmarkLookupVariants(entry.id);
|
||||
const matched = variants.find((v) => benchmarkKeys.has(v));
|
||||
const bucket = matched ? covered : uncovered;
|
||||
bucket.push({
|
||||
provider: entry.provider,
|
||||
id: entry.id,
|
||||
...(matched && matched !== variants[0] ? { matchedVia: matched } : {}),
|
||||
});
|
||||
}
|
||||
|
||||
const total = covered.length + uncovered.length;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue