test(01-01): add failing tests for capability-aware model routing
- Tests for scoreModel weighted average, edge cases (empty/unknown dims) - Tests for computeTaskRequirements with all branch paths (docs, concurrency, migration, large-file) - Tests for MODEL_CAPABILITY_PROFILES (9 models, 7 dimensions each) - Tests for BASE_REQUIREMENTS (all 11 unit types) - Tests for scoreEligibleModels (sorting, tie-breaking, unknown models, overrides) - Tests for getEligibleModels (tier filtering, explicit config, empty result) - Tests for DynamicRoutingConfig.capability_routing and RoutingDecision.selectionMethod
This commit is contained in:
parent
6aaa244742
commit
e89bf7d18e
1 changed files with 348 additions and 0 deletions
348
src/resources/extensions/gsd/tests/capability-router.test.ts
Normal file
348
src/resources/extensions/gsd/tests/capability-router.test.ts
Normal file
|
|
@ -0,0 +1,348 @@
|
|||
// GSD Extension — Capability-Aware Router Tests
|
||||
// Copyright (c) 2026 Jeremy McSpadden <jeremy@fluxlabs.net>
|
||||
// Tests for new capability scoring functions and data tables (Plan 01-01)
|
||||
|
||||
import { describe, test } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
|
||||
import {
|
||||
scoreModel,
|
||||
computeTaskRequirements,
|
||||
scoreEligibleModels,
|
||||
getEligibleModels,
|
||||
resolveModelForComplexity,
|
||||
MODEL_CAPABILITY_PROFILES,
|
||||
BASE_REQUIREMENTS,
|
||||
defaultRoutingConfig,
|
||||
} from "../model-router.js";
|
||||
import type { ModelCapabilities, DynamicRoutingConfig, RoutingDecision } from "../model-router.js";
|
||||
|
||||
// ─── scoreModel ──────────────────────────────────────────────────────────────
|
||||
|
||||
describe("scoreModel", () => {
|
||||
const sonnetProfile: ModelCapabilities = {
|
||||
coding: 85, debugging: 80, research: 75, reasoning: 80,
|
||||
speed: 60, longContext: 75, instruction: 85,
|
||||
};
|
||||
|
||||
test("produces correct weighted average for single dimension", () => {
|
||||
// Only coding weight 1.0 → result should be the coding score
|
||||
const score = scoreModel(sonnetProfile, { coding: 1.0 });
|
||||
assert.equal(score, 85);
|
||||
});
|
||||
|
||||
test("produces correct weighted average for two dimensions (coding 0.9, instruction 0.7)", () => {
|
||||
// (0.9*85 + 0.7*85) / (0.9+0.7) = (76.5+59.5)/1.6 = 136/1.6 = 85.0
|
||||
const score = scoreModel(sonnetProfile, { coding: 0.9, instruction: 0.7 });
|
||||
assert.ok(Math.abs(score - 85.0) < 0.01, `Expected ~85.0, got ${score}`);
|
||||
});
|
||||
|
||||
test("returns 50 when requirements is empty", () => {
|
||||
const score = scoreModel(sonnetProfile, {});
|
||||
assert.equal(score, 50);
|
||||
});
|
||||
|
||||
test("uses 50 as fallback for unknown dimension in requirements", () => {
|
||||
// 'unknown' dimension not in profile → treated as 50
|
||||
const score = scoreModel(sonnetProfile, { coding: 0.5, unknown: 1.0 } as any);
|
||||
// (0.5*85 + 1.0*50) / (0.5+1.0) = (42.5+50)/1.5 = 92.5/1.5 = 61.67
|
||||
assert.ok(score > 61 && score < 62, `Expected ~61.67, got ${score}`);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── computeTaskRequirements ─────────────────────────────────────────────────
|
||||
|
||||
describe("computeTaskRequirements", () => {
|
||||
test("execute-task with no metadata returns base requirements", () => {
|
||||
const req = computeTaskRequirements("execute-task", undefined);
|
||||
assert.deepStrictEqual(req, { coding: 0.9, instruction: 0.7, speed: 0.3 });
|
||||
});
|
||||
|
||||
test("execute-task with docs tag returns docs-adjusted requirements", () => {
|
||||
const req = computeTaskRequirements("execute-task", { tags: ["docs"] });
|
||||
assert.equal(req.instruction, 0.9);
|
||||
assert.equal(req.coding, 0.3);
|
||||
assert.equal(req.speed, 0.7);
|
||||
});
|
||||
|
||||
test("execute-task with readme tag returns docs-adjusted requirements", () => {
|
||||
const req = computeTaskRequirements("execute-task", { tags: ["readme"] });
|
||||
assert.equal(req.instruction, 0.9);
|
||||
});
|
||||
|
||||
test("execute-task with concurrency keyword boosts debugging and reasoning", () => {
|
||||
const req = computeTaskRequirements("execute-task", { complexityKeywords: ["concurrency"] });
|
||||
assert.equal(req.debugging, 0.9);
|
||||
assert.equal(req.reasoning, 0.8);
|
||||
});
|
||||
|
||||
test("execute-task with compatibility keyword boosts debugging and reasoning", () => {
|
||||
const req = computeTaskRequirements("execute-task", { complexityKeywords: ["compatibility"] });
|
||||
assert.equal(req.debugging, 0.9);
|
||||
assert.equal(req.reasoning, 0.8);
|
||||
});
|
||||
|
||||
test("execute-task with migration keyword boosts reasoning and coding", () => {
|
||||
const req = computeTaskRequirements("execute-task", { complexityKeywords: ["migration"] });
|
||||
assert.equal(req.reasoning, 0.9);
|
||||
assert.equal(req.coding, 0.8);
|
||||
});
|
||||
|
||||
test("execute-task with architecture keyword boosts reasoning and coding", () => {
|
||||
const req = computeTaskRequirements("execute-task", { complexityKeywords: ["architecture"] });
|
||||
assert.equal(req.reasoning, 0.9);
|
||||
assert.equal(req.coding, 0.8);
|
||||
});
|
||||
|
||||
test("execute-task with fileCount >= 6 boosts coding and reasoning", () => {
|
||||
const req = computeTaskRequirements("execute-task", { fileCount: 8 });
|
||||
assert.equal(req.coding, 0.9);
|
||||
assert.equal(req.reasoning, 0.7);
|
||||
});
|
||||
|
||||
test("execute-task with fileCount exactly 6 triggers large-file boost", () => {
|
||||
const req = computeTaskRequirements("execute-task", { fileCount: 6 });
|
||||
assert.equal(req.coding, 0.9);
|
||||
assert.equal(req.reasoning, 0.7);
|
||||
});
|
||||
|
||||
test("execute-task with estimatedLines >= 500 boosts coding and reasoning", () => {
|
||||
const req = computeTaskRequirements("execute-task", { estimatedLines: 500 });
|
||||
assert.equal(req.coding, 0.9);
|
||||
assert.equal(req.reasoning, 0.7);
|
||||
});
|
||||
|
||||
test("research-milestone with no metadata returns base requirements", () => {
|
||||
const req = computeTaskRequirements("research-milestone", undefined);
|
||||
assert.deepStrictEqual(req, { research: 0.9, longContext: 0.7, reasoning: 0.5 });
|
||||
});
|
||||
|
||||
test("unknown unit type returns default reasoning requirement", () => {
|
||||
const req = computeTaskRequirements("unknown-type", undefined);
|
||||
assert.deepStrictEqual(req, { reasoning: 0.5 });
|
||||
});
|
||||
});
|
||||
|
||||
// ─── MODEL_CAPABILITY_PROFILES ───────────────────────────────────────────────
|
||||
|
||||
describe("MODEL_CAPABILITY_PROFILES", () => {
|
||||
test("contains all 9 required models", () => {
|
||||
const required = [
|
||||
"claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5",
|
||||
"gpt-4o", "gpt-4o-mini", "gemini-2.5-pro", "gemini-2.0-flash",
|
||||
"deepseek-chat", "o3",
|
||||
];
|
||||
for (const model of required) {
|
||||
assert.ok(MODEL_CAPABILITY_PROFILES[model], `Missing profile for ${model}`);
|
||||
}
|
||||
});
|
||||
|
||||
test("each profile has all 7 capability dimensions", () => {
|
||||
const dims: Array<keyof ModelCapabilities> = [
|
||||
"coding", "debugging", "research", "reasoning",
|
||||
"speed", "longContext", "instruction",
|
||||
];
|
||||
for (const [modelId, profile] of Object.entries(MODEL_CAPABILITY_PROFILES)) {
|
||||
for (const dim of dims) {
|
||||
assert.ok(profile[dim] !== undefined, `${modelId} missing dimension ${dim}`);
|
||||
assert.ok(profile[dim] >= 0 && profile[dim] <= 100, `${modelId}.${dim} out of range`);
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
test("claude-opus-4-6 has high reasoning and coding", () => {
|
||||
const opus = MODEL_CAPABILITY_PROFILES["claude-opus-4-6"];
|
||||
assert.ok(opus.reasoning >= 90, `Expected reasoning >= 90, got ${opus.reasoning}`);
|
||||
assert.ok(opus.coding >= 90, `Expected coding >= 90, got ${opus.coding}`);
|
||||
});
|
||||
|
||||
test("claude-haiku-4-5 has high speed but lower reasoning", () => {
|
||||
const haiku = MODEL_CAPABILITY_PROFILES["claude-haiku-4-5"];
|
||||
assert.ok(haiku.speed >= 90, `Expected speed >= 90, got ${haiku.speed}`);
|
||||
assert.ok(haiku.reasoning < 70, `Expected reasoning < 70, got ${haiku.reasoning}`);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── BASE_REQUIREMENTS ───────────────────────────────────────────────────────
|
||||
|
||||
describe("BASE_REQUIREMENTS", () => {
|
||||
test("contains all 11 unit types", () => {
|
||||
const required = [
|
||||
"execute-task", "research-milestone", "research-slice",
|
||||
"plan-milestone", "plan-slice", "replan-slice",
|
||||
"reassess-roadmap", "complete-slice", "run-uat",
|
||||
"discuss-milestone", "complete-milestone",
|
||||
];
|
||||
for (const unitType of required) {
|
||||
assert.ok(BASE_REQUIREMENTS[unitType], `Missing requirements for ${unitType}`);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
||||
// ─── scoreEligibleModels ─────────────────────────────────────────────────────
|
||||
|
||||
describe("scoreEligibleModels", () => {
|
||||
test("returns array sorted by score descending", () => {
|
||||
const requirements = { research: 0.9, longContext: 0.7, reasoning: 0.5 };
|
||||
const results = scoreEligibleModels(["claude-sonnet-4-6", "gpt-4o"], requirements);
|
||||
assert.ok(results.length === 2);
|
||||
assert.ok(results[0].score >= results[1].score, "Should be sorted descending by score");
|
||||
});
|
||||
|
||||
test("returns single model when only one eligible", () => {
|
||||
const requirements = { coding: 0.9 };
|
||||
const results = scoreEligibleModels(["claude-sonnet-4-6"], requirements);
|
||||
assert.equal(results.length, 1);
|
||||
assert.equal(results[0].modelId, "claude-sonnet-4-6");
|
||||
});
|
||||
|
||||
test("models without profiles get uniform 50s score", () => {
|
||||
const requirements = { coding: 1.0 };
|
||||
const results = scoreEligibleModels(["unknown-model-xyz"], requirements);
|
||||
assert.equal(results[0].score, 50);
|
||||
});
|
||||
|
||||
test("when two models score within 2 points, prefers cheaper model", () => {
|
||||
// gemini-2.0-flash is cheaper than gpt-4o-mini ($0.0001 vs $0.00015)
|
||||
// Use a requirement that causes similar scores for both
|
||||
const requirements = { speed: 1.0 };
|
||||
const results = scoreEligibleModels(["gpt-4o-mini", "gemini-2.0-flash"], requirements);
|
||||
// Both are high-speed: gpt-4o-mini=90, gemini-2.0-flash=95 — scores differ by 5, not within 2
|
||||
// So top should be gemini-2.0-flash by score
|
||||
assert.equal(results[0].modelId, "gemini-2.0-flash");
|
||||
});
|
||||
|
||||
test("tie-breaks by lexicographic model ID when cost and score are equal", () => {
|
||||
// Use models without cost entries — both get Infinity cost
|
||||
const requirements = { coding: 1.0 };
|
||||
const results = scoreEligibleModels(["model-z", "model-a"], requirements);
|
||||
// Both unknown → score=50, cost=Infinity → tiebreak by ID
|
||||
assert.equal(results[0].modelId, "model-a");
|
||||
});
|
||||
|
||||
test("scoreEligibleModels respects capabilityOverrides", () => {
|
||||
const requirements = { coding: 1.0 };
|
||||
// Override claude-sonnet-4-6's coding to 30 (worse)
|
||||
const results = scoreEligibleModels(
|
||||
["claude-sonnet-4-6", "gpt-4o"],
|
||||
requirements,
|
||||
{ "claude-sonnet-4-6": { coding: 30 } },
|
||||
);
|
||||
// gpt-4o coding=80 should beat overridden sonnet coding=30
|
||||
assert.equal(results[0].modelId, "gpt-4o");
|
||||
});
|
||||
});
|
||||
|
||||
// ─── getEligibleModels ───────────────────────────────────────────────────────
|
||||
|
||||
describe("getEligibleModels", () => {
|
||||
const MODELS = [
|
||||
"claude-opus-4-6", // heavy
|
||||
"claude-sonnet-4-6", // standard
|
||||
"claude-haiku-4-5", // light
|
||||
"gpt-4o-mini", // light
|
||||
];
|
||||
|
||||
test("returns light-tier models sorted by cost when no explicit config", () => {
|
||||
const config: DynamicRoutingConfig = defaultRoutingConfig();
|
||||
const result = getEligibleModels("light", MODELS, config);
|
||||
assert.ok(result.length >= 1);
|
||||
// All results should be light-tier
|
||||
for (const id of result) {
|
||||
assert.ok(
|
||||
["claude-haiku-4-5", "gpt-4o-mini"].includes(id),
|
||||
`Expected light-tier model, got ${id}`,
|
||||
);
|
||||
}
|
||||
});
|
||||
|
||||
test("returns explicit tier_models when configured and available", () => {
|
||||
const config: DynamicRoutingConfig = {
|
||||
...defaultRoutingConfig(),
|
||||
tier_models: { light: "gpt-4o-mini" },
|
||||
};
|
||||
const result = getEligibleModels("light", MODELS, config);
|
||||
assert.deepStrictEqual(result, ["gpt-4o-mini"]);
|
||||
});
|
||||
|
||||
test("returns empty array when no eligible models for tier", () => {
|
||||
const config: DynamicRoutingConfig = defaultRoutingConfig();
|
||||
// Only heavy model available, requesting light
|
||||
const result = getEligibleModels("light", ["claude-opus-4-6"], config);
|
||||
assert.equal(result.length, 0);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── DynamicRoutingConfig extension ─────────────────────────────────────────
|
||||
|
||||
describe("DynamicRoutingConfig.capability_routing", () => {
|
||||
test("defaultRoutingConfig includes capability_routing: true", () => {
|
||||
const config = defaultRoutingConfig();
|
||||
assert.equal(config.capability_routing, true);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── RoutingDecision.selectionMethod ─────────────────────────────────────────
|
||||
|
||||
describe("RoutingDecision.selectionMethod", () => {
|
||||
const MODELS = ["claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5", "gpt-4o-mini"];
|
||||
|
||||
function makeClassification(tier: "light" | "standard" | "heavy") {
|
||||
return { tier, reason: "test", downgraded: false };
|
||||
}
|
||||
|
||||
test("returns selectionMethod: tier-only when routing is disabled", () => {
|
||||
const config = { ...defaultRoutingConfig(), enabled: false };
|
||||
const result: RoutingDecision = resolveModelForComplexity(
|
||||
makeClassification("light"),
|
||||
{ primary: "claude-opus-4-6", fallbacks: [] },
|
||||
config,
|
||||
MODELS,
|
||||
);
|
||||
assert.equal(result.selectionMethod, "tier-only");
|
||||
});
|
||||
|
||||
test("returns selectionMethod: tier-only for no phase config passthrough", () => {
|
||||
const config = { ...defaultRoutingConfig(), enabled: true };
|
||||
const result: RoutingDecision = resolveModelForComplexity(
|
||||
makeClassification("light"),
|
||||
undefined,
|
||||
config,
|
||||
MODELS,
|
||||
);
|
||||
assert.equal(result.selectionMethod, "tier-only");
|
||||
});
|
||||
|
||||
test("returns selectionMethod: tier-only for unknown model passthrough", () => {
|
||||
const config = { ...defaultRoutingConfig(), enabled: true };
|
||||
const result: RoutingDecision = resolveModelForComplexity(
|
||||
makeClassification("light"),
|
||||
{ primary: "custom-provider/my-model-v3", fallbacks: [] },
|
||||
config,
|
||||
["custom-provider/my-model-v3", ...MODELS],
|
||||
);
|
||||
assert.equal(result.selectionMethod, "tier-only");
|
||||
});
|
||||
|
||||
test("returns selectionMethod: tier-only for no-downgrade passthrough", () => {
|
||||
const config = { ...defaultRoutingConfig(), enabled: true };
|
||||
const result: RoutingDecision = resolveModelForComplexity(
|
||||
makeClassification("heavy"),
|
||||
{ primary: "claude-opus-4-6", fallbacks: [] },
|
||||
config,
|
||||
MODELS,
|
||||
);
|
||||
assert.equal(result.selectionMethod, "tier-only");
|
||||
});
|
||||
|
||||
test("returns selectionMethod: tier-only when downgraded", () => {
|
||||
const config = { ...defaultRoutingConfig(), enabled: true };
|
||||
const result: RoutingDecision = resolveModelForComplexity(
|
||||
makeClassification("light"),
|
||||
{ primary: "claude-opus-4-6", fallbacks: [] },
|
||||
config,
|
||||
MODELS,
|
||||
);
|
||||
assert.equal(result.selectionMethod, "tier-only");
|
||||
});
|
||||
});
|
||||
Loading…
Add table
Reference in a new issue