fix(benchmark-coverage): tier-strip fallbacks downgraded to 'approx' proxy

User caught: flash-lite ≠ flash (different model tier, different scores). The previous fix counted flash-lite as fully covered via the flash proxy, which overstated coverage and could mislead routing.

benchmarkLookupVariants now tags each variant with a `kind`:
- 'exact' → date/version strip + -latest alias (same model line)
- 'approx' → tier strip (flash-lite→flash, X-lite→X) — different model

computeBenchmarkCoverage promotes 'exact' matches to covered; 'approx' matches stay in uncovered with an `approximatedBy` field so operators see when a real benchmark is still needed.

Honest report: 64 exact covered / 1 proxy-only / 104 genuine uncovered (was 65/0/104 with the overcount).

R049 + R050 added to traceability (M036/M037 future milestones).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-17 03:52:29 +02:00 · 2026-05-17 03:52:29 +02:00 · 7a273262f1
commit 7a273262f1
parent 1dc7c2e278
2 changed files with 69 additions and 35 deletions
--- a/.sf/REQUIREMENTS.md
+++ b/.sf/REQUIREMENTS.md
@ -500,14 +500,16 @@ The next group enforces ADR-0000's contract: **purpose is the driver**, not work
 | R046 | differentiator | active | M033/S02 | M033/S01, M033/S03 | unmapped |
 | R047 | quality-attribute | active | M034/S02 | M034/S01, M034/S03, M034/S04 | unmapped |
 | R048 | core-capability | active | M035/S02 | M035/S01, M035/S03, M035/S04 | unmapped |
+| R049 | differentiator | active | unmapped (M036 future) | none | unmapped |
+| R050 | quality-attribute | active | unmapped (M037 future) | none | partial — variant-fallback shipped in benchmark-coverage.js |

 ## Coverage Summary

- Active requirements: 48
- Mapped to slices: **48 (all)**
+- Active requirements: 50
+- Mapped to slices: **48**
 - Validated: 0
- Unmapped active requirements: **0**
- Owning milestones: M003 (R001-R006), M005 (R007-R010), M010 (R013-R015, R020), M011 (R011-R012), M012 (R016), M013 (R017), M014 (R018), M015 (R019), M016-M030 (R021-R040), M031 (R041-R044), M032 (R045), M033 (R046), M034 (R047), M035 (R048)
+- Unmapped active requirements: **2** (R049 — multi-provider parallel routing; R050 — auto-benchmark uncovered models)
+- Owning milestones: M003 (R001-R006), M005 (R007-R010), M010 (R013-R015, R020), M011 (R011-R012), M012 (R016), M013 (R017), M014 (R018), M015 (R019), M016-M030 (R021-R040), M031 (R041-R044), M032 (R045), M033 (R046), M034 (R047), M035 (R048), [pending] M036-M037 (R049-R050)

 ## Purpose Anchor

--- a/src/resources/extensions/sf/benchmark-coverage.js
+++ b/src/resources/extensions/sf/benchmark-coverage.js
@ -104,38 +104,58 @@ export function normalizeForBenchmarkLookup(modelId) {
 }

 /**
- * #R050: produce a list of fallback lookup keys for a model id, so a model
- * that's logically equivalent to one in the benchmark file is considered
- * covered. Common patterns:
- *   - "mistral-medium" / "mistral-medium-2505" → fallback to "mistral-medium-latest"
- *   - "gemini-2.5-flash-lite" → fallback to "gemini-2.5-flash"
- *   - "kimi-k2.7" → fallback to "kimi-k2.6" (one minor version back)
- * The audit treats the model as covered if ANY variant key is in the benchmark file.
- * Variants are listed in order of preference; first match wins.
+ * #R050: produce a list of fallback lookup keys for a model id. Each variant
+ * carries a `kind` flag — "exact" for date/version aliases (e.g.
+ * "mistral-medium-2505" → "mistral-medium-latest" — same model line, different
+ * release tag) versus "approx" for tier proxies (e.g. "X-flash-lite" → "X-flash"
+ * — DIFFERENT model size, scores not equivalent).
+ *
+ * The audit uses "exact" matches to count toward full coverage; "approx" matches
+ * stay in the uncovered list, tagged with an `approximatedBy` field, so operators
+ * know a proxy is being used rather than a real benchmark.
 */
 export function benchmarkLookupVariants(modelId) {
 	const primary = normalizeForBenchmarkLookup(modelId);
-	const variants = [primary];
-	// Strip date/version suffix like "-2505", "-2508", "-2026-04" etc.
+	const variants = [{ key: primary, kind: "exact" }];
+	// Date/version suffix strip is "exact" — same model line, different release tag
 	const dateStripped = primary.replace(/-\d{4,}(-\d{2,})?$/, "");
-	if (dateStripped !== primary) variants.push(dateStripped);
-	// Strip "-lite" tier suffix and try the non-lite variant
+	if (dateStripped !== primary) {
+		variants.push({ key: dateStripped, kind: "exact" });
+	}
+	// "-latest" alias is "exact" — points to the same line's current release
+	if (!primary.endsWith("-latest")) {
+		variants.push({ key: `${primary}-latest`, kind: "exact" });
+	}
+	// Tier strips are "approx" — flash-lite vs flash are DIFFERENT models,
+	// different parameters, different scores. Proxy at best.
 	if (primary.endsWith("-lite")) {
-		variants.push(primary.slice(0, -"-lite".length));
+		variants.push({ key: primary.slice(0, -"-lite".length), kind: "approx" });
 	}
 	if (primary.endsWith("-flash-lite")) {
-		variants.push(primary.replace(/-flash-lite$/, "-flash"));
+		variants.push({
+			key: primary.replace(/-flash-lite$/, "-flash"),
+			kind: "approx",
+		});
 	}
 	if (primary.endsWith("-flash-lite-preview")) {
-		variants.push(primary.replace(/-flash-lite-preview$/, "-flash"));
-		variants.push(primary.replace(/-flash-lite-preview$/, "-flash-preview"));
+		variants.push({
+			key: primary.replace(/-flash-lite-preview$/, "-flash"),
+			kind: "approx",
+		});
+		variants.push({
+			key: primary.replace(/-flash-lite-preview$/, "-flash-preview"),
+			kind: "approx",
+		});
 	}
-	// Append -latest to bare names (e.g. mistral-medium → mistral-medium-latest)
-	if (!primary.endsWith("-latest")) {
-		variants.push(`${primary}-latest`);
+	// Dedup by key, preserving the first (most-preferred) kind
+	const seen = new Set();
+	const out = [];
+	for (const v of variants) {
+		if (seen.has(v.key)) continue;
+		seen.add(v.key);
+		out.push(v);
 	}
-	// Dedup while preserving order
-	return [...new Set(variants)];
+	return out;
 }

 /**
@ -172,17 +192,29 @@ export function computeBenchmarkCoverage(prefs) {
 		) {
 			continue;
 		}
-		// #R050: try the primary normalized key first, then fall back to
-		// well-known variants (date-stripped, lite→non-lite, +-latest, ...).
-		// Covered if ANY variant is in the benchmark file.
+		// #R050: try each variant in order. Exact matches count as full coverage;
+		// approx matches (tier proxies like flash-lite→flash) are tracked
+		// separately so operators see when a real benchmark is still missing.
 		const variants = benchmarkLookupVariants(entry.id);
-		const matched = variants.find((v) => benchmarkKeys.has(v));
-		const bucket = matched ? covered : uncovered;
-		bucket.push({
-			provider: entry.provider,
-			id: entry.id,
-			...(matched && matched !== variants[0] ? { matchedVia: matched } : {}),
-		});
+		const matched = variants.find((v) => benchmarkKeys.has(v.key));
+		if (!matched) {
+			uncovered.push({ provider: entry.provider, id: entry.id });
+		} else if (matched.kind === "exact") {
+			covered.push({
+				provider: entry.provider,
+				id: entry.id,
+				...(matched.key !== variants[0].key ? { matchedVia: matched.key } : {}),
+			});
+		} else {
+			// approx — proxy match; still counts as uncovered for honest reporting,
+			// but the audit lists the proxy so operators can decide to add a real entry.
+			uncovered.push({
+				provider: entry.provider,
+				id: entry.id,
+				approximatedBy: matched.key,
+				note: "covered via approx proxy — different model tier; real benchmark would be more accurate",
+			});
+		}
 	}

 	const total = covered.length + uncovered.length;