From 7a273262f158f7fade433e14e45369338262ed71 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sun, 17 May 2026 03:52:29 +0200 Subject: [PATCH] fix(benchmark-coverage): tier-strip fallbacks downgraded to 'approx' proxy MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit User caught: flash-lite ≠ flash (different model tier, different scores). Previous fix counted flash-lite as fully covered via flash proxy, which overstated coverage and could mislead routing. benchmarkLookupVariants now tags variants with kind: - 'exact' → date/version strip + -latest alias (same model line) - 'approx' → tier strip (flash-lite→flash, X-lite→X) — different model computeBenchmarkCoverage promotes 'exact' matches to covered; 'approx' matches stay in uncovered with `approximatedBy` field so operators see when a real benchmark is still needed. Honest report: 64 exact covered / 1 proxy-only / 104 genuine uncovered (was 65/0/104 with the overcount). R049 + R050 added to traceability (M036/M037 future milestones). 
Co-Authored-By: Claude Opus 4.7 (1M context) --- .sf/REQUIREMENTS.md | 10 +- .../extensions/sf/benchmark-coverage.js | 94 +++++++++++++------ 2 files changed, 69 insertions(+), 35 deletions(-) diff --git a/.sf/REQUIREMENTS.md b/.sf/REQUIREMENTS.md index f844fc1f4..fddfc34f8 100644 --- a/.sf/REQUIREMENTS.md +++ b/.sf/REQUIREMENTS.md @@ -500,14 +500,16 @@ The next group enforces ADR-0000's contract: **purpose is the driver**, not work | R046 | differentiator | active | M033/S02 | M033/S01, M033/S03 | unmapped | | R047 | quality-attribute | active | M034/S02 | M034/S01, M034/S03, M034/S04 | unmapped | | R048 | core-capability | active | M035/S02 | M035/S01, M035/S03, M035/S04 | unmapped | +| R049 | differentiator | active | unmapped (M036 future) | none | unmapped | +| R050 | quality-attribute | active | unmapped (M037 future) | none | partial — variant-fallback shipped in benchmark-coverage.js | ## Coverage Summary -- Active requirements: 48 -- Mapped to slices: **48 (all)** +- Active requirements: 50 +- Mapped to slices: **48** - Validated: 0 -- Unmapped active requirements: **0** -- Owning milestones: M003 (R001-R006), M005 (R007-R010), M010 (R013-R015, R020), M011 (R011-R012), M012 (R016), M013 (R017), M014 (R018), M015 (R019), M016-M030 (R021-R040), M031 (R041-R044), M032 (R045), M033 (R046), M034 (R047), M035 (R048) +- Unmapped active requirements: **2** (R049 — multi-provider parallel routing; R050 — auto-benchmark uncovered models) +- Owning milestones: M003 (R001-R006), M005 (R007-R010), M010 (R013-R015, R020), M011 (R011-R012), M012 (R016), M013 (R017), M014 (R018), M015 (R019), M016-M030 (R021-R040), M031 (R041-R044), M032 (R045), M033 (R046), M034 (R047), M035 (R048), [pending] M036-M037 (R049-R050) ## Purpose Anchor diff --git a/src/resources/extensions/sf/benchmark-coverage.js b/src/resources/extensions/sf/benchmark-coverage.js index ab4bc762b..1f0dd77af 100644 --- a/src/resources/extensions/sf/benchmark-coverage.js +++ 
b/src/resources/extensions/sf/benchmark-coverage.js @@ -104,38 +104,58 @@ export function normalizeForBenchmarkLookup(modelId) { } /** - * #R050: produce a list of fallback lookup keys for a model id, so a model - * that's logically equivalent to one in the benchmark file is considered - * covered. Common patterns: - * - "mistral-medium" / "mistral-medium-2505" → fallback to "mistral-medium-latest" - * - "gemini-2.5-flash-lite" → fallback to "gemini-2.5-flash" - * - "kimi-k2.7" → fallback to "kimi-k2.6" (one minor version back) - * The audit treats the model as covered if ANY variant key is in the benchmark file. - * Variants are listed in order of preference; first match wins. + * #R050: produce a list of fallback lookup keys for a model id. Each variant + * carries a `kind` flag — "exact" for date/version aliases (e.g. + * "mistral-medium-2505" → "mistral-medium-latest" — same model line, different + * release tag) versus "approx" for tier proxies (e.g. "X-flash-lite" → "X-flash" + * — DIFFERENT model size, scores not equivalent). + * + * The audit uses "exact" matches to count toward full coverage; "approx" matches + * stay in the uncovered list, tagged with `approximatedBy`, so operators know a + * proxy is being used rather than a real benchmark. */ export function benchmarkLookupVariants(modelId) { const primary = normalizeForBenchmarkLookup(modelId); - const variants = [primary]; - // Strip date/version suffix like "-2505", "-2508", "-2026-04" etc. 
+ const variants = [{ key: primary, kind: "exact" }]; + // Date/version suffix strip is "exact" — same model line, different release tag const dateStripped = primary.replace(/-\d{4,}(-\d{2,})?$/, ""); - if (dateStripped !== primary) variants.push(dateStripped); - // Strip "-lite" tier suffix and try the non-lite variant + if (dateStripped !== primary) { + variants.push({ key: dateStripped, kind: "exact" }); + } + // "-latest" alias is "exact" — points to the same line's current release + if (!primary.endsWith("-latest")) { + variants.push({ key: `${primary}-latest`, kind: "exact" }); + } + // Tier strips are "approx" — flash-lite vs flash are DIFFERENT models, + // different parameters, different scores. Proxy at best. if (primary.endsWith("-lite")) { - variants.push(primary.slice(0, -"-lite".length)); + variants.push({ key: primary.slice(0, -"-lite".length), kind: "approx" }); } if (primary.endsWith("-flash-lite")) { - variants.push(primary.replace(/-flash-lite$/, "-flash")); + variants.push({ + key: primary.replace(/-flash-lite$/, "-flash"), + kind: "approx", + }); } if (primary.endsWith("-flash-lite-preview")) { - variants.push(primary.replace(/-flash-lite-preview$/, "-flash")); - variants.push(primary.replace(/-flash-lite-preview$/, "-flash-preview")); + variants.push({ + key: primary.replace(/-flash-lite-preview$/, "-flash"), + kind: "approx", + }); + variants.push({ + key: primary.replace(/-flash-lite-preview$/, "-flash-preview"), + kind: "approx", + }); } - // Append -latest to bare names (e.g. 
mistral-medium → mistral-medium-latest) - if (!primary.endsWith("-latest")) { - variants.push(`${primary}-latest`); + // Dedup by key, preserving the first (most-preferred) kind + const seen = new Set(); + const out = []; + for (const v of variants) { + if (seen.has(v.key)) continue; + seen.add(v.key); + out.push(v); } - // Dedup while preserving order - return [...new Set(variants)]; + return out; } /** @@ -172,17 +192,29 @@ export function computeBenchmarkCoverage(prefs) { ) { continue; } - // #R050: try the primary normalized key first, then fall back to - // well-known variants (date-stripped, lite→non-lite, +-latest, ...). - // Covered if ANY variant is in the benchmark file. + // #R050: try each variant in order. Exact matches count as full coverage; + // approx matches (tier proxies like flash-lite→flash) are tracked + // separately so operators see when a real benchmark is still missing. const variants = benchmarkLookupVariants(entry.id); - const matched = variants.find((v) => benchmarkKeys.has(v)); - const bucket = matched ? covered : uncovered; - bucket.push({ - provider: entry.provider, - id: entry.id, - ...(matched && matched !== variants[0] ? { matchedVia: matched } : {}), - }); + const matched = variants.find((v) => benchmarkKeys.has(v.key)); + if (!matched) { + uncovered.push({ provider: entry.provider, id: entry.id }); + } else if (matched.kind === "exact") { + covered.push({ + provider: entry.provider, + id: entry.id, + ...(matched.key !== variants[0].key ? { matchedVia: matched.key } : {}), + }); + } else { + // approx — proxy match; still counts as uncovered for honest reporting, + // but the audit lists the proxy so operators can decide to add a real entry. + uncovered.push({ + provider: entry.provider, + id: entry.id, + approximatedBy: matched.key, + note: "covered via approx proxy — different model tier; real benchmark would be more accurate", + }); + } } const total = covered.length + uncovered.length;