diff --git a/.sf/REQUIREMENTS.md b/.sf/REQUIREMENTS.md index 9169de044..f844fc1f4 100644 --- a/.sf/REQUIREMENTS.md +++ b/.sf/REQUIREMENTS.md @@ -623,3 +623,14 @@ ADR-0000 declares SF a **purpose-to-software compiler**. R036–R040 codify that - Supporting slices: none - Validation: unmapped - Notes: Builds on existing model-router.js scoring + R017's tool-failure demotion + R046's autonomous parallel dispatch. The new piece is the scheduler-level multi-model assignment per dispatch slot. + +### R050 — Model Benchmark Coverage +- Class: quality-attribute +- Status: active +- Description: Every model in the SF model-router registry must have a benchmark score (static or measured). Today (~/.sf/benchmark-coverage.json snapshot 2026-05-17): 57 of 154 models covered (37%). Notable uncovered: kimi-coding/kimi-for-coding (actively used by triage), mistral-medium-* family, gemini-2.5-flash-lite, gemini-3.1-flash-lite. Routing decisions (R017 demotion, R046 parallel scheduling, R049 multi-provider) are only as good as the scores they read; missing scores produce inconsistent routing. +- Why it matters: Model selection compounds across thousands of dispatches in a 2-4 week autonomous run. Picking the wrong model wastes cost and time and produces lower-quality work. Benchmark coverage is the foundation for routing intelligence. +- Source: spec +- Primary owning slice: unmapped (future "M037 Model Benchmark Coverage") +- Supporting slices: none +- Validation: unmapped +- Notes: Two paths to scores: (a) bulk-import published scores from MMLU/HumanEval/SWE-bench for known models, (b) live-measure via SF's eval suite for unknown models (existing `.sf/evals/autonomous-solver/` framework). Doctor surfaces uncovered models; scheduler treats uncovered as "use cautiously, not for high-stakes units." 
diff --git a/src/resources/extensions/sf/benchmark-coverage.js b/src/resources/extensions/sf/benchmark-coverage.js
index 23e890ff4..ab4bc762b 100644
--- a/src/resources/extensions/sf/benchmark-coverage.js
+++ b/src/resources/extensions/sf/benchmark-coverage.js
@@ -103,6 +103,48 @@ export function normalizeForBenchmarkLookup(modelId) {
 	return key;
 }
 
+/**
+ * #R050: produce the ordered list of benchmark lookup keys to try for a model
+ * id, so a model that is logically equivalent to one in the benchmark file is
+ * considered covered. The first entry is always the primary normalized key;
+ * the rest are fallbacks:
+ *   - date/version suffix stripped: "mistral-medium-2505" → "mistral-medium"
+ *   - "-lite" tier dropped: "gemini-2.5-flash-lite" → "gemini-2.5-flash"
+ *   - "-flash-lite-preview" → "-flash", then "-flash-preview"
+ *   - "-latest" appended to the primary key and to the date-stripped key:
+ *     "mistral-medium" / "mistral-medium-2505" → "mistral-medium-latest"
+ * Minor-version rollback (e.g. "kimi-k2.7" → "kimi-k2.6") is NOT generated here.
+ * The audit treats the model as covered if ANY variant key is in the benchmark
+ * file. Variants are listed in order of preference; first match wins.
+ *
+ * @param {string} modelId - Model id as listed in the registry.
+ * @returns {string[]} De-duplicated lookup keys, primary key first.
+ */
+export function benchmarkLookupVariants(modelId) {
+	const primary = normalizeForBenchmarkLookup(modelId);
+	const variants = [primary];
+	// Strip date/version suffix like "-2505", "-2508", "-2026-04" etc.
+	const dateStripped = primary.replace(/-\d{4,}(-\d{2,})?$/, "");
+	if (dateStripped !== primary) variants.push(dateStripped);
+	// Strip "-lite" tier suffix and try the non-lite variant. This also covers
+	// "-flash-lite" → "-flash", so no separate branch is needed for it.
+	if (primary.endsWith("-lite")) {
+		variants.push(primary.slice(0, -"-lite".length));
+	}
+	if (primary.endsWith("-flash-lite-preview")) {
+		variants.push(primary.replace(/-flash-lite-preview$/, "-flash"));
+		variants.push(primary.replace(/-flash-lite-preview$/, "-flash-preview"));
+	}
+	// Append -latest to bare names (e.g. mistral-medium → mistral-medium-latest).
+	// The date-stripped base gets the same treatment, otherwise a dated id such
+	// as "mistral-medium-2505" would only yield "mistral-medium-2505-latest".
+	for (const base of [primary, dateStripped]) {
+		if (!base.endsWith("-latest")) variants.push(`${base}-latest`);
+	}
+	// Dedup while preserving order
+	return [...new Set(variants)];
+}
+
 /**
  * Compute coverage stats for the user's dispatchable model set.
  *
@@ -137,9 +179,17 @@ export function computeBenchmarkCoverage(prefs) {
 		) {
 			continue;
 		}
-		const key = normalizeForBenchmarkLookup(entry.id);
-		const bucket = benchmarkKeys.has(key) ? covered : uncovered;
-		bucket.push({ provider: entry.provider, id: entry.id });
+		// #R050: try the primary normalized key first, then fall back to
+		// well-known variants (date-stripped, lite→non-lite, +-latest, ...).
+		// Covered if ANY variant is in the benchmark file.
+		const variants = benchmarkLookupVariants(entry.id);
+		const matched = variants.find((v) => benchmarkKeys.has(v));
+		const bucket = matched ? covered : uncovered;
+		bucket.push({
+			provider: entry.provider,
+			id: entry.id,
+			...(matched && matched !== variants[0] ? { matchedVia: matched } : {}),
+		});
 	}
 
 	const total = covered.length + uncovered.length;