fix(benchmark-coverage): tier-strip fallbacks downgraded to 'approx' proxy
User caught: flash-lite ≠ flash (different model tier, different scores). The previous fix counted flash-lite as fully covered via the flash proxy, which overstated coverage and could mislead routing. benchmarkLookupVariants now tags each variant with a `kind`: 'exact' → date/version strip + -latest alias (same model line); 'approx' → tier strip (flash-lite→flash, X-lite→X), i.e. a different model. computeBenchmarkCoverage promotes 'exact' matches to covered; 'approx' matches stay in uncovered with an `approximatedBy` field so operators see when a real benchmark is still needed. Honest report: 64 exact covered / 1 proxy-only / 104 genuine uncovered (was 65/0/104 with the overcount). R049 + R050 added to traceability (M036/M037 future milestones). Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
1dc7c2e278
commit
7a273262f1
2 changed files with 69 additions and 35 deletions
|
|
@ -500,14 +500,16 @@ The next group enforces ADR-0000's contract: **purpose is the driver**, not work
|
|||
| R046 | differentiator | active | M033/S02 | M033/S01, M033/S03 | unmapped |
|
||||
| R047 | quality-attribute | active | M034/S02 | M034/S01, M034/S03, M034/S04 | unmapped |
|
||||
| R048 | core-capability | active | M035/S02 | M035/S01, M035/S03, M035/S04 | unmapped |
|
||||
| R049 | differentiator | active | unmapped (M036 future) | none | unmapped |
|
||||
| R050 | quality-attribute | active | unmapped (M037 future) | none | partial — variant-fallback shipped in benchmark-coverage.js |
|
||||
|
||||
## Coverage Summary
|
||||
|
||||
- Active requirements: 48
|
||||
- Mapped to slices: **48 (all)**
|
||||
- Active requirements: 50
|
||||
- Mapped to slices: **48**
|
||||
- Validated: 0
|
||||
- Unmapped active requirements: **0**
|
||||
- Owning milestones: M003 (R001-R006), M005 (R007-R010), M010 (R013-R015, R020), M011 (R011-R012), M012 (R016), M013 (R017), M014 (R018), M015 (R019), M016-M030 (R021-R040), M031 (R041-R044), M032 (R045), M033 (R046), M034 (R047), M035 (R048)
|
||||
- Unmapped active requirements: **2** (R049 — multi-provider parallel routing; R050 — auto-benchmark uncovered models)
|
||||
- Owning milestones: M003 (R001-R006), M005 (R007-R010), M010 (R013-R015, R020), M011 (R011-R012), M012 (R016), M013 (R017), M014 (R018), M015 (R019), M016-M030 (R021-R040), M031 (R041-R044), M032 (R045), M033 (R046), M034 (R047), M035 (R048), [pending] M036-M037 (R049-R050)
|
||||
|
||||
## Purpose Anchor
|
||||
|
||||
|
|
|
|||
|
|
@ -104,38 +104,58 @@ export function normalizeForBenchmarkLookup(modelId) {
|
|||
}
|
||||
|
||||
/**
 * #R050: build the ordered list of benchmark lookup keys to try for a model id.
 *
 * Every variant is an object `{ key, kind }`:
 *   - kind "exact"  — the normalized id itself, the id with a trailing
 *     date/version tag removed ("mistral-medium-2505" → "mistral-medium"),
 *     and the "-latest" alias of either form ("mistral-medium-latest").
 *     Same model line, different release tag.
 *   - kind "approx" — tier proxies ("X-flash-lite" → "X-flash", "X-lite" → "X").
 *     DIFFERENT model size, scores not equivalent.
 *
 * computeBenchmarkCoverage counts only "exact" matches as covered; a model
 * whose first match is "approx" stays in the uncovered list and is tagged with
 * `approximatedBy`, so operators know a proxy is being used rather than a real
 * benchmark.
 *
 * All "exact" variants are emitted before any "approx" variant, so a
 * first-match scan over the result always prefers an exact key. Duplicate keys
 * are dropped, keeping the first (most-preferred) occurrence.
 *
 * @param {string} modelId - Model id; it is passed through
 *   normalizeForBenchmarkLookup before any variant is derived.
 * @returns {Array<{key: string, kind: "exact" | "approx"}>} Lookup variants in
 *   order of preference; index 0 is always the normalized id itself.
 */
export function benchmarkLookupVariants(modelId) {
  const primary = normalizeForBenchmarkLookup(modelId);
  const variants = [{ key: primary, kind: "exact" }];

  // Date/version suffix strip ("-2505", "-2026-04", ...) is "exact" — same
  // model line, different release tag.
  const dateStripped = primary.replace(/-\d{4,}(-\d{2,})?$/, "");
  const hasDateSuffix = dateStripped !== primary;
  if (hasDateSuffix) {
    variants.push({ key: dateStripped, kind: "exact" });
  }

  // "-latest" alias is "exact" — points to the same line's current release.
  if (!primary.endsWith("-latest")) {
    variants.push({ key: `${primary}-latest`, kind: "exact" });
  }
  // Also alias the date-stripped base. Without this, "mistral-medium-2505"
  // only tried "mistral-medium-2505-latest" and never the documented
  // "mistral-medium-latest".
  if (hasDateSuffix && !dateStripped.endsWith("-latest")) {
    variants.push({ key: `${dateStripped}-latest`, kind: "exact" });
  }

  // Tier strips are "approx" — flash-lite vs flash are DIFFERENT models,
  // different parameters, different scores. Proxy at best.
  if (primary.endsWith("-lite")) {
    variants.push({ key: primary.slice(0, -"-lite".length), kind: "approx" });
  }
  if (primary.endsWith("-flash-lite")) {
    // Yields the same key as the generic "-lite" strip above; the dedup pass
    // below collapses it.
    variants.push({
      key: primary.replace(/-flash-lite$/, "-flash"),
      kind: "approx",
    });
  }
  if (primary.endsWith("-flash-lite-preview")) {
    variants.push({
      key: primary.replace(/-flash-lite-preview$/, "-flash"),
      kind: "approx",
    });
    variants.push({
      key: primary.replace(/-flash-lite-preview$/, "-flash-preview"),
      kind: "approx",
    });
  }

  // Dedup by key, preserving the first (most-preferred) kind.
  const seen = new Set();
  const out = [];
  for (const v of variants) {
    if (seen.has(v.key)) continue;
    seen.add(v.key);
    out.push(v);
  }
  return out;
}
|
||||
|
||||
/**
|
||||
|
|
@ -172,17 +192,29 @@ export function computeBenchmarkCoverage(prefs) {
|
|||
) {
|
||||
continue;
|
||||
}
|
||||
// #R050: try the primary normalized key first, then fall back to
|
||||
// well-known variants (date-stripped, lite→non-lite, +-latest, ...).
|
||||
// Covered if ANY variant is in the benchmark file.
|
||||
// #R050: try each variant in order. Exact matches count as full coverage;
|
||||
// approx matches (tier proxies like flash-lite→flash) are tracked
|
||||
// separately so operators see when a real benchmark is still missing.
|
||||
const variants = benchmarkLookupVariants(entry.id);
|
||||
const matched = variants.find((v) => benchmarkKeys.has(v));
|
||||
const bucket = matched ? covered : uncovered;
|
||||
bucket.push({
|
||||
provider: entry.provider,
|
||||
id: entry.id,
|
||||
...(matched && matched !== variants[0] ? { matchedVia: matched } : {}),
|
||||
});
|
||||
const matched = variants.find((v) => benchmarkKeys.has(v.key));
|
||||
if (!matched) {
|
||||
uncovered.push({ provider: entry.provider, id: entry.id });
|
||||
} else if (matched.kind === "exact") {
|
||||
covered.push({
|
||||
provider: entry.provider,
|
||||
id: entry.id,
|
||||
...(matched.key !== variants[0].key ? { matchedVia: matched.key } : {}),
|
||||
});
|
||||
} else {
|
||||
// approx — proxy match; still counts as uncovered for honest reporting,
|
||||
// but the audit lists the proxy so operators can decide to add a real entry.
|
||||
uncovered.push({
|
||||
provider: entry.provider,
|
||||
id: entry.id,
|
||||
approximatedBy: matched.key,
|
||||
note: "covered via approx proxy — different model tier; real benchmark would be more accurate",
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
const total = covered.length + uncovered.length;
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue