feat(model-router): explicit agentic score for every capability profile

Sweep MODEL_CAPABILITY_PROFILES so all 82 entries declare an explicit
agentic score; the agentic=50 fallback in scoreModel was silently
giving untouched profiles a generous default and letting weak agentic
models slip through execute-task routing. Anchors per the entry's
suggestedFix: coding-only ~25-40, very small/older ~30-40, older
generations ~55-70, frontier agentic ~85-95.

Adds an invariant test that asserts no profile relies on the default.

Closes sf-mp37p9u2-80f2gz.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-05-14 06:28:06 +02:00
parent 48e793c003
commit 0694803df3
2 changed files with 60 additions and 0 deletions

View file

@ -174,6 +174,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95,
longContext: 40,
instruction: 65,
agentic: 35,
},
"claude-3-opus-latest": {
agentic: 88,
@ -194,6 +195,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 65,
longContext: 70,
instruction: 80,
agentic: 65,
},
"gpt-4o-mini": {
coding: 55,
@ -203,6 +205,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 90,
longContext: 45,
instruction: 70,
agentic: 50,
},
"gpt-4-turbo": {
coding: 78,
@ -212,6 +215,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 50,
longContext: 65,
instruction: 78,
agentic: 60,
},
"gpt-4.1": {
coding: 82,
@ -221,6 +225,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 62,
longContext: 72,
instruction: 82,
agentic: 70,
},
"gpt-4.1-mini": {
coding: 58,
@ -230,6 +235,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 88,
longContext: 48,
instruction: 72,
agentic: 55,
},
"gpt-4.1-nano": {
coding: 40,
@ -239,6 +245,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95,
longContext: 30,
instruction: 60,
agentic: 35,
},
"gpt-5": {
coding: 92,
@ -259,6 +266,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 88,
longContext: 52,
instruction: 74,
agentic: 75,
},
"gpt-5-nano": {
coding: 42,
@ -268,6 +276,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95,
longContext: 32,
instruction: 62,
agentic: 60,
},
"gpt-5-pro": {
coding: 94,
@ -393,6 +402,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 20,
longContext: 65,
instruction: 82,
agentic: 65,
},
o3: {
coding: 80,
@ -402,6 +412,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 25,
longContext: 70,
instruction: 85,
agentic: 72,
},
"o4-mini": {
coding: 75,
@ -411,6 +422,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 60,
longContext: 65,
instruction: 80,
agentic: 70,
},
"o4-mini-deep-research": {
coding: 75,
@ -420,6 +432,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 30,
longContext: 80,
instruction: 80,
agentic: 65,
},
// ── Google ─────────────────────────────────────────────────────────────────
"gemini-2.5-pro": {
@ -430,6 +443,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 55,
longContext: 90,
instruction: 75,
agentic: 70,
},
"gemini-3.1-pro-preview": {
coding: 82,
@ -472,6 +486,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 96,
longContext: 85,
instruction: 68,
agentic: 60,
},
"gemini-2.5-flash": {
coding: 60,
@ -481,6 +496,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 92,
longContext: 85,
instruction: 70,
agentic: 60,
},
"gemini-2.5-flash-lite": {
coding: 52,
@ -490,6 +506,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 97,
longContext: 78,
instruction: 65,
agentic: 50,
},
"gemini-2.0-flash": {
coding: 50,
@ -499,6 +516,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95,
longContext: 60,
instruction: 65,
agentic: 55,
},
"gemini-flash-2.0": {
coding: 50,
@ -508,6 +526,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95,
longContext: 60,
instruction: 65,
agentic: 55,
},
// ── DeepSeek ───────────────────────────────────────────────────────────────
"deepseek-chat": {
@ -518,6 +537,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 70,
longContext: 55,
instruction: 65,
agentic: 55,
},
// ── Mistral AI ─────────────────────────────────────────────────────────────
"mistral-large-latest": {
@ -528,6 +548,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 50,
longContext: 75,
instruction: 85,
agentic: 60,
},
"mistral-large-2411": {
coding: 85,
@ -537,6 +558,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 50,
longContext: 75,
instruction: 85,
agentic: 55,
},
"mistral-large-2512": {
coding: 88,
@ -546,6 +568,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 52,
longContext: 78,
instruction: 88,
agentic: 65,
},
"pixtral-large-latest": {
coding: 85,
@ -555,6 +578,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 45,
longContext: 80,
instruction: 85,
agentic: 50,
},
"mistral-medium-latest": {
coding: 75,
@ -564,6 +588,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 60,
longContext: 65,
instruction: 75,
agentic: 55,
},
"mistral-medium-2505": {
coding: 75,
@ -573,6 +598,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 60,
longContext: 65,
instruction: 75,
agentic: 50,
},
"mistral-medium-2508": {
coding: 78,
@ -582,6 +608,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 62,
longContext: 68,
instruction: 78,
agentic: 55,
},
"mistral-small-latest": {
coding: 65,
@ -591,6 +618,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 80,
longContext: 55,
instruction: 70,
agentic: 40,
},
"mistral-small-2506": {
coding: 65,
@ -600,6 +628,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 80,
longContext: 55,
instruction: 70,
agentic: 40,
},
"mistral-small-2603": {
coding: 68,
@ -609,6 +638,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 82,
longContext: 58,
instruction: 72,
agentic: 40,
},
"codestral-latest": {
coding: 85,
@ -631,6 +661,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 90,
longContext: 45,
instruction: 70,
agentic: 30,
},
"ministral-3b-latest": {
coding: 45,
@ -640,6 +671,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95,
longContext: 35,
instruction: 60,
agentic: 25,
},
"open-mixtral-8x22b": {
coding: 75,
@ -649,6 +681,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 40,
longContext: 70,
instruction: 75,
agentic: 40,
},
"pixtral-12b": {
coding: 60,
@ -658,6 +691,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 75,
longContext: 60,
instruction: 65,
agentic: 35,
},
"mistral-nemo": {
coding: 60,
@ -667,6 +701,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 85,
longContext: 60,
instruction: 65,
agentic: 35,
},
"magistral-medium-latest": {
coding: 80,
@ -676,6 +711,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 55,
longContext: 75,
instruction: 80,
agentic: 65,
},
"magistral-small": {
coding: 70,
@ -685,6 +721,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 75,
longContext: 65,
instruction: 70,
agentic: 50,
},
"devstral-2512": {
coding: 82,
@ -757,6 +794,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 35,
longContext: 80,
instruction: 88,
agentic: 75,
},
"glm-5-turbo": {
coding: 85,
@ -766,6 +804,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 65,
longContext: 75,
instruction: 85,
agentic: 70,
},
"glm-5.1": {
coding: 92,
@ -775,6 +814,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 38,
longContext: 82,
instruction: 89,
agentic: 75,
},
"glm-5v-turbo": {
coding: 82,
@ -784,6 +824,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 60,
longContext: 75,
instruction: 82,
agentic: 65,
},
"glm-4.7": {
coding: 80,
@ -793,6 +834,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 60,
longContext: 70,
instruction: 80,
agentic: 55,
},
"glm-4.7-flash": {
coding: 50,
@ -802,6 +844,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 95,
longContext: 50,
instruction: 65,
agentic: 50,
},
"glm-4.7-flashx": {
coding: 45,
@ -811,6 +854,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 98,
longContext: 45,
instruction: 60,
agentic: 50,
},
// ── Qwen / Ollama Cloud compatible tags ──────────────────────────────────
"qwen3-coder:480b": {
@ -842,6 +886,7 @@ export const MODEL_CAPABILITY_PROFILES = {
speed: 62,
longContext: 86,
instruction: 74,
agentic: 55,
},
// ── Moonshot / Kimi ───────────────────────────────────────────────────────
"kimi-k2.6": {

View file

@ -115,6 +115,21 @@ describe("agentic capability axis (ADR-0079)", () => {
expect(newScore).toBeGreaterThan(oldScore);
});
test("every profile has an explicit agentic score (no defaulting to 50)", () => {
  // Table-wide invariant for sf-mp37p9u2-80f2gz. Profiles that omit
  // `agentic` were picking up the agentic=50 fallback in scoreModel and
  // so were never penalized for poor tool-use reliability. Requiring a
  // numeric value on every entry keeps each score a deliberate choice
  // and lets the router differentiate the whole table.
  const missingAgentic = Object.entries(MODEL_CAPABILITY_PROFILES)
    .filter(([, capabilities]) => typeof capabilities.agentic !== "number")
    .map(([modelId]) => modelId);

  // The failure message names every offending profile id so a red run
  // points straight at the entries that need a score.
  const failureHint = `profiles missing explicit agentic: ${missingAgentic.join(", ")}`;
  expect(missingAgentic, failureHint).toEqual([]);
});
test("known agentic-frontier models all have agentic >= 85", () => {
const agenticFrontier = [
"claude-opus-4-6",