diff --git a/docs/dynamic-model-routing.md b/docs/dynamic-model-routing.md index 9bbf125fe..bc88df2bd 100644 --- a/docs/dynamic-model-routing.md +++ b/docs/dynamic-model-routing.md @@ -1,12 +1,20 @@ # Dynamic Model Routing -*Introduced in v2.19.0* +*Introduced in v2.19.0. Capability scoring introduced in v2.52.0.* Dynamic model routing automatically selects cheaper models for simple work and reserves expensive models for complex tasks. This reduces token consumption by 20-50% on capped plans without sacrificing quality where it matters. +Starting in v2.52.0, the router uses **capability-aware scoring** to select the *best fit* model for each task, not just the cheapest one in the tier. + ## How It Works -Each unit dispatched by auto-mode is classified into a complexity tier: +Each unit dispatched by auto-mode passes through a two-stage pipeline: + +**Stage 1: Complexity classification** — classifies the work into a tier (light/standard/heavy). + +**Stage 2: Capability scoring** — within the eligible tier, ranks available models by how well their capabilities match the task's requirements. + +The key rule: **downgrade-only semantics**. The user's configured model is always the ceiling — routing never upgrades beyond what you've configured. | Tier | Typical Work | Default Model Level | |------|-------------|-------------------| @@ -14,8 +22,6 @@ Each unit dispatched by auto-mode is classified into a complexity tier: | **Standard** | Research, planning, execution, milestone completion | Sonnet-class | | **Heavy** | Replanning, roadmap reassessment, complex execution | Opus-class | -The router then selects a model for that tier. The key rule: **downgrade-only semantics**. The user's configured model is always the ceiling — routing never upgrades beyond what you've configured. - ## Enabling Dynamic routing is off by default. 
Enable it in preferences: @@ -41,6 +47,7 @@ dynamic_routing: budget_pressure: true # auto-downgrade when approaching budget ceiling (default: true) cross_provider: true # consider models from other providers (default: true) hooks: true # apply routing to post-unit hooks (default: true) + capability_routing: true # enable capability scoring within tier (default: true) ``` ### `tier_models` @@ -70,35 +77,156 @@ When approaching the budget ceiling, the router progressively downgrades: When enabled, the router may select models from providers other than your primary. This uses the built-in cost table to find the cheapest model at each tier. Requires the target provider to be configured. -## Capability-Aware Scoring +### `capability_routing` -*Introduced in v2.59.0 (ADR-004 Phase 2)* - -When `capability_routing` is enabled, the router goes beyond tier classification and scores models against task-specific capability requirements. Each known model has a 7-dimension profile: - -| Dimension | What It Measures | -|-----------|-----------------| -| `coding` | Code generation, refactoring, implementation quality | -| `debugging` | Error diagnosis, fix accuracy | -| `research` | Information gathering, codebase exploration | -| `reasoning` | Multi-step logic, architectural decisions | -| `speed` | Response latency (inverse of cost) | -| `longContext` | Performance with large context windows | -| `instruction` | Adherence to structured instructions and templates | - -Each unit type maps to a weighted requirement vector. For example, `execute-task` weights `coding: 0.9, reasoning: 0.6, debugging: 0.5` while `research-slice` weights `research: 0.9, reasoning: 0.7, longContext: 0.5`. - -For `execute-task` units, the classifier also inspects task metadata (tags, description) to refine requirements. Documentation tasks boost `instruction` and lower `coding`; test tasks boost `debugging`. 
- -Enable capability routing: +When enabled (default: true), the router uses capability scoring to pick the best model in a tier rather than always defaulting to the cheapest. Set to `false` to revert to cheapest-in-tier behavior: ```yaml dynamic_routing: enabled: true - capability_routing: true + capability_routing: false # disable scoring, use cheapest-in-tier ``` -When enabled, models within the target tier are ranked by capability score rather than selected arbitrarily. When disabled (the default), the existing tier-only selection applies. +## Capability Profiles + +Each known model has a built-in **capability profile** — a 7-dimension score (0–100) representing how well it handles different task types: + +| Dimension | What It Represents | +|-----------|-------------------| +| `coding` | Code generation and implementation accuracy | +| `debugging` | Diagnosing and fixing errors | +| `research` | Synthesizing information and exploring topics | +| `reasoning` | Multi-step logical reasoning | +| `speed` | Latency and throughput (inverse of capability depth) | +| `longContext` | Handling large codebases and long documents | +| `instruction` | Following structured instructions precisely | + +**Built-in profiles** exist for 9 models: `claude-opus-4-6`, `claude-sonnet-4-6`, `claude-haiku-4-5`, `gpt-4o`, `gpt-4o-mini`, `gemini-2.5-pro`, `gemini-2.0-flash`, `deepseek-chat`, `o3`. + +Models without a built-in profile receive **uniform scores of 50** across all dimensions. This is a cold-start policy — unknown models still compete, but they gain no advantage from scoring. For those models, routing behaves the same as it did before capability scoring was introduced. + +**Profiles are heuristic rankings, not benchmarks.** They represent approximate relative strengths, not verified benchmark results. Use user overrides (below) to correct them for models you know well. 
+ +## How Scoring Works + +The routing pipeline within a tier: + +``` +classify complexity tier + ↓ +filter eligible models for tier + ↓ +fire before_model_select hook (optional override) + ↓ +capability score eligible models + ↓ +select winner (or first eligible if scoring is disabled) +``` + +**Scoring formula:** weighted average of capability dimensions + +``` +score = Σ(weight × capability) / Σ(weights) +``` + +**Task requirements** are dynamic — different task types weight dimensions differently: + +| Unit Type | Key Dimensions | +|-----------|---------------| +| `execute-task` | coding (0.9), instruction (0.7), speed (0.3) | +| `research-*` | research (0.9), longContext (0.7), reasoning (0.5) | +| `plan-*` | reasoning (0.9), coding (0.5) | +| `replan-slice` | reasoning (0.9), debugging (0.6), coding (0.5) | +| `complete-slice`, `run-uat` | instruction (0.8), speed (0.7) | + +For `execute-task`, requirements are further refined by task metadata signals: +- Tags like `docs`, `config`, `readme` → boost instruction weight +- Keywords like `concurrency`, `compatibility` → boost debugging and reasoning +- Keywords like `migration`, `architecture` → boost reasoning and coding +- Large file counts (≥6) or large estimated line counts (≥500) → boost coding and reasoning + +**Tie-breaking:** When two models score within 2 points of each other, the cheaper model wins. If costs are equal, lexicographic model ID breaks the tie (deterministic). + +## User Overrides + +Correct built-in capability profiles for models you know well using `modelOverrides` in your models configuration: + +```json +{ + "providers": { + "anthropic": { + "modelOverrides": { + "claude-sonnet-4-6": { + "capabilities": { + "debugging": 90, + "research": 85 + } + } + } + } + } +} +``` + +Overrides are **deep-merged** with built-in defaults — only the specified dimensions are overridden; others retain their built-in values. 
+ +**Use case:** You've found that a model consistently outperforms its built-in profile on specific task types. Override the relevant dimensions to steer the router toward that model for those tasks. + +## Verbose Output + +When verbose mode is active, the router logs each routing decision that downgrades the model. When capability scoring was used, the log includes a full scoring breakdown: + +``` +Dynamic routing [S]: claude-sonnet-4-6 (capability-scored) — claude-sonnet-4-6: 82.3, gpt-4o: 78.1, deepseek-chat: 72.0 +``` + +When tier-only routing was used (scoring disabled, single eligible model, or routing guards applied): + +``` +Dynamic routing [S]: claude-sonnet-4-6 (standard complexity, multiple steps) +``` + +The `selectionMethod` field in the routing decision indicates which path was taken: +- `"capability-scored"` — capability scoring selected the winner +- `"tier-only"` — capability scoring was not used: cheapest in tier, an explicit pin, or a `before_model_select` hook override + +## Extension Hook + +Extensions can intercept and override model selection using the `before_model_select` hook. + +The hook fires **after** tier filtering (eligible models are known) and **before** capability scoring (scores have not been computed yet). A hook can override selection entirely or return `undefined` to let scoring proceed normally. 
+ +**Registering a handler:** + +```typescript +pi.on("before_model_select", async (event) => { + const { unitType, unitId, classification, taskMetadata, eligibleModels, phaseConfig } = event; + + // Custom routing strategy: always use gemini for research tasks + if (unitType.startsWith("research-")) { + const gemini = eligibleModels.find(id => id.includes("gemini")); + if (gemini) return { modelId: gemini }; + } + + // Return undefined to let capability scoring proceed + return undefined; +}); +``` + +**Event payload:** + +| Field | Type | Description | +|-------|------|-------------| +| `unitType` | `string` | The unit type being dispatched (e.g., `"execute-task"`) | +| `unitId` | `string` | Unique identifier for this unit dispatch | +| `classification` | `{ tier, reason, downgraded }` | The complexity classification result | +| `taskMetadata` | `Record \| undefined` | Task metadata extracted from the unit plan | +| `eligibleModels` | `string[]` | Models eligible for the classified tier | +| `phaseConfig` | `{ primary, fallbacks } \| undefined` | The user's configured model for this phase | + +**Return value:** `{ modelId: string }` to override selection, or `undefined` to defer to capability scoring. + +**First-override-wins:** If multiple extensions register handlers, the first one to return a non-undefined result wins. Subsequent handlers are not called. ## Complexity Classification diff --git a/packages/pi-coding-agent/src/core/extensions/loader.ts b/packages/pi-coding-agent/src/core/extensions/loader.ts index 96d689e67..d87eca9e4 100644 --- a/packages/pi-coding-agent/src/core/extensions/loader.ts +++ b/packages/pi-coding-agent/src/core/extensions/loader.ts @@ -428,6 +428,8 @@ export function createExtensionRuntime(): ExtensionRuntime { unregisterProvider: (name) => { runtime.pendingProviderRegistrations = runtime.pendingProviderRegistrations.filter((r) => r.name !== name); }, + // Stub replaced by ExtensionRunner at construction time via bindEmitMethods(). 
+ emitBeforeModelSelect: async () => undefined, }; return runtime; @@ -579,6 +581,10 @@ function createExtensionAPI( runtime.unregisterProvider(name); }, + async emitBeforeModelSelect(event: Omit): Promise { + return runtime.emitBeforeModelSelect(event); + }, + events: eventBus, } as ExtensionAPI; diff --git a/packages/pi-coding-agent/src/core/extensions/runner.ts b/packages/pi-coding-agent/src/core/extensions/runner.ts index da06f0f13..048ad534c 100644 --- a/packages/pi-coding-agent/src/core/extensions/runner.ts +++ b/packages/pi-coding-agent/src/core/extensions/runner.ts @@ -13,6 +13,8 @@ import type { SessionManager } from "../session-manager.js"; import type { BeforeAgentStartEvent, BeforeAgentStartEventResult, + BeforeModelSelectEvent, + BeforeModelSelectResult, BeforeProviderRequestEvent, CompactOptions, ContextEvent, @@ -230,6 +232,8 @@ export class ExtensionRunner { this.cwd = cwd; this.sessionManager = sessionManager; this.modelRegistry = modelRegistry; + // Bind emit methods into the shared runtime so createExtensionAPI can delegate to them. 
+ this.runtime.emitBeforeModelSelect = (event) => this.emitBeforeModelSelect(event); } bindCore(actions: ExtensionActions, contextActions: ExtensionContextActions): void { @@ -694,6 +698,21 @@ export class ExtensionRunner { return currentPayload; } + async emitBeforeModelSelect(event: Omit): Promise { + let result: BeforeModelSelectResult | undefined; + await this.invokeHandlers("before_model_select", () => ({ + type: "before_model_select" as const, + ...event, + } satisfies BeforeModelSelectEvent), (handlerResult) => { + if (handlerResult) { + result = handlerResult as BeforeModelSelectResult; + return { done: true }; // first override wins + } + return { done: false }; + }); + return result; + } + async emitBeforeAgentStart( prompt: string, images: ImageContent[] | undefined, diff --git a/packages/pi-coding-agent/src/core/extensions/types.ts b/packages/pi-coding-agent/src/core/extensions/types.ts index 8b6ff6ff1..037e9718c 100644 --- a/packages/pi-coding-agent/src/core/extensions/types.ts +++ b/packages/pi-coding-agent/src/core/extensions/types.ts @@ -603,6 +603,22 @@ export interface ModelSelectEvent { source: ModelSelectSource; } +/** Fired before model selection runs capability scoring. Extensions can override the selected model. */ +export interface BeforeModelSelectEvent { + type: "before_model_select"; + unitType: string; + unitId: string; + classification: { tier: string; reason: string; downgraded: boolean }; + taskMetadata?: Record; + eligibleModels: string[]; + phaseConfig?: { primary: string; fallbacks: string[] }; +} + +/** Result from before_model_select event handler. Return { modelId } to override selection. 
*/ +export interface BeforeModelSelectResult { + modelId: string; +} + // ============================================================================ // User Bash Events // ============================================================================ @@ -1052,6 +1068,14 @@ export interface ExtensionAPI { on(event: "tool_result", handler: ExtensionHandler): void; on(event: "user_bash", handler: ExtensionHandler): void; on(event: "input", handler: ExtensionHandler): void; + on(event: "before_model_select", handler: ExtensionHandler): void; + + // ========================================================================= + // Event Emission (for host extensions that orchestrate model selection) + // ========================================================================= + + /** Emit before_model_select event. Returns override model ID or undefined. */ + emitBeforeModelSelect(event: Omit): Promise; // ========================================================================= // Tool Registration @@ -1367,6 +1391,8 @@ export interface ExtensionRuntimeState { */ registerProvider: (name: string, config: ProviderConfig) => void; unregisterProvider: (name: string) => void; + /** Emit before_model_select event to all registered handlers. Bound by ExtensionRunner. 
*/ + emitBeforeModelSelect: (event: Omit) => Promise; } /** diff --git a/src/resources/extensions/gsd/auto-model-selection.ts b/src/resources/extensions/gsd/auto-model-selection.ts index cf2326e35..544a97857 100644 --- a/src/resources/extensions/gsd/auto-model-selection.ts +++ b/src/resources/extensions/gsd/auto-model-selection.ts @@ -9,8 +9,8 @@ import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent"; import type { GSDPreferences } from "./preferences.js"; import { resolveModelWithFallbacksForUnit, resolveDynamicRoutingConfig } from "./preferences.js"; import type { ComplexityTier } from "./complexity-classifier.js"; -import { classifyUnitComplexity, tierLabel, extractTaskMetadata } from "./complexity-classifier.js"; -import { resolveModelForComplexity, escalateTier } from "./model-router.js"; +import { classifyUnitComplexity, tierLabel } from "./complexity-classifier.js"; +import { resolveModelForComplexity, escalateTier, getEligibleModels, loadCapabilityOverrides } from "./model-router.js"; import { getLedger, getProjectTotals } from "./metrics.js"; import { unitPhaseLabel } from "./auto-dashboard.js"; @@ -107,27 +107,89 @@ export async function selectAndApplyModel( } } - // Extract task metadata for capability scoring - const taskMeta = unitType === "execute-task" - ? extractTaskMetadata(unitId, basePath) - : undefined; - - const routingResult = resolveModelForComplexity( - classification, modelConfig, routingConfig, availableModelIds, - unitType, taskMeta, + // Load user capability overrides from preferences (D-17: deep-merged with built-in profiles) + const capabilityOverrides = loadCapabilityOverrides( + (prefs as { modelOverrides?: Record }> } | undefined) ?? 
{}, ); + // Fire before_model_select hook (ADR-004, D-03) + // Hook can override model selection entirely by returning { modelId } + let hookOverride: string | undefined; + if (routingConfig.hooks !== false) { + const eligible = getEligibleModels( + classification.tier, + availableModelIds, + routingConfig, + ); + const hookResult = await pi.emitBeforeModelSelect({ + unitType, + unitId, + classification: { + tier: classification.tier, + reason: classification.reason, + downgraded: classification.downgraded, + }, + taskMetadata: classification.taskMetadata as Record | undefined, + eligibleModels: eligible, + phaseConfig: modelConfig ? { + primary: modelConfig.primary, + fallbacks: modelConfig.fallbacks ?? [], + } : undefined, + }); + if (hookResult?.modelId) { + hookOverride = hookResult.modelId; + } + } + + let routingResult: ReturnType; + if (hookOverride) { + // Hook override bypasses capability scoring entirely + routingResult = { + modelId: hookOverride, + fallbacks: [ + ...(modelConfig?.fallbacks ?? []).filter(f => f !== hookOverride), + ...(modelConfig?.primary && modelConfig.primary !== hookOverride ? [modelConfig.primary] : []), + ], + tier: classification.tier, + wasDowngraded: hookOverride !== modelConfig?.primary, + reason: `hook override: ${hookOverride}`, + selectionMethod: "tier-only", + }; + } else { + routingResult = resolveModelForComplexity( + classification, + modelConfig, + routingConfig, + availableModelIds, + unitType, + classification.taskMetadata, + capabilityOverrides, + ); + } + if (routingResult.wasDowngraded) { effectiveModelConfig = { primary: routingResult.modelId, fallbacks: routingResult.fallbacks, }; if (verbose) { - const method = routingResult.selectionMethod === "capability-scored" ? 
"capability-scored" : "tier-only"; - ctx.ui.notify( - `Dynamic routing [${tierLabel(classification.tier)}]: ${routingResult.modelId} (${method} — ${classification.reason})`, - "info", - ); + if (routingResult.selectionMethod === "capability-scored" && routingResult.capabilityScores) { + // Verbose scoring breakdown for capability-scored decisions (D-20) + const tierLbl = tierLabel(classification.tier); + const scores = Object.entries(routingResult.capabilityScores) + .sort(([, a], [, b]) => b - a) + .map(([id, score]) => `${id}: ${score.toFixed(1)}`) + .join(", "); + ctx.ui.notify( + `Dynamic routing [${tierLbl}]: ${routingResult.modelId} (capability-scored) — ${scores}`, + "info", + ); + } else { + ctx.ui.notify( + `Dynamic routing [${tierLabel(classification.tier)}]: ${routingResult.modelId} (${classification.reason})`, + "info", + ); + } } } routingTierLabel = ` [${tierLabel(classification.tier)}]`; diff --git a/src/resources/extensions/gsd/bootstrap/register-hooks.ts b/src/resources/extensions/gsd/bootstrap/register-hooks.ts index d76b046a1..537ebea63 100644 --- a/src/resources/extensions/gsd/bootstrap/register-hooks.ts +++ b/src/resources/extensions/gsd/bootstrap/register-hooks.ts @@ -322,4 +322,12 @@ export function registerHooks(pi: ExtensionAPI): void { payload.service_tier = tier; return payload; }); + + // Capability-aware model routing hook (ADR-004) + // Extensions can override model selection by returning { modelId: "..." } + // Return undefined to let the built-in capability scoring proceed. 
+ pi.on("before_model_select", async (_event) => { + // Default: no override — let capability scoring handle selection + return undefined; + }); } diff --git a/src/resources/extensions/gsd/complexity-classifier.ts b/src/resources/extensions/gsd/complexity-classifier.ts index 114178810..82027227f 100644 --- a/src/resources/extensions/gsd/complexity-classifier.ts +++ b/src/resources/extensions/gsd/complexity-classifier.ts @@ -16,6 +16,7 @@ export interface ClassificationResult { tier: ComplexityTier; reason: string; downgraded: boolean; // true if budget pressure lowered the tier + taskMetadata?: TaskMetadata; } export interface TaskMetadata { @@ -71,17 +72,20 @@ export function classifyUnitComplexity( ): ClassificationResult { // Hook units default to light if (unitType.startsWith("hook/")) { - const result: ClassificationResult = { tier: "light", reason: "hook unit", downgraded: false }; + const result: ClassificationResult = { tier: "light", reason: "hook unit", downgraded: false, taskMetadata: undefined }; return applyBudgetPressure(result, budgetPct); } // Start with the default tier for this unit type let tier = UNIT_TYPE_TIERS[unitType] ?? "standard"; let reason = `unit type: ${unitType}`; + let taskMeta: TaskMetadata | undefined; // For execute-task, analyze task metadata for complexity signals if (unitType === "execute-task") { - const taskAnalysis = analyzeTaskComplexity(unitId, basePath, metadata); + // Extract metadata once and reuse throughout to avoid double-extraction + taskMeta = metadata ?? extractTaskMetadata(unitId, basePath); + const taskAnalysis = analyzeTaskComplexity(unitId, basePath, taskMeta); tier = taskAnalysis.tier; reason = taskAnalysis.reason; } @@ -96,14 +100,15 @@ export function classifyUnitComplexity( } // Adaptive learning: check if history suggests bumping the tier - const tags = metadata?.tags ?? 
extractTaskMetadata(unitId, basePath).tags; + // Use already-extracted taskMeta.tags if available to avoid double-extraction + const tags = taskMeta?.tags ?? metadata?.tags; const adaptiveAdjustment = getAdaptiveTierAdjustment(unitType, tier, tags); if (adaptiveAdjustment && tierOrdinal(adaptiveAdjustment) > tierOrdinal(tier)) { reason = `${reason} (adaptive: high failure rate at ${tier})`; tier = adaptiveAdjustment; } - const result: ClassificationResult = { tier, reason, downgraded: false }; + const result: ClassificationResult = { tier, reason, downgraded: false, taskMetadata: taskMeta }; return applyBudgetPressure(result, budgetPct); } diff --git a/src/resources/extensions/gsd/model-router.ts b/src/resources/extensions/gsd/model-router.ts index 5b45ef9b4..0efbbf9b6 100644 --- a/src/resources/extensions/gsd/model-router.ts +++ b/src/resources/extensions/gsd/model-router.ts @@ -2,7 +2,7 @@ // Maps complexity tiers to models, enforcing downgrade-only semantics. // The user's configured model is always the ceiling. -import type { ComplexityTier, ClassificationResult } from "./complexity-classifier.js"; +import type { ComplexityTier, ClassificationResult, TaskMetadata } from "./complexity-classifier.js"; import { tierOrdinal } from "./complexity-classifier.js"; import type { ResolvedModelConfig } from "./preferences.js"; @@ -33,14 +33,27 @@ export interface RoutingDecision { wasDowngraded: boolean; /** Human-readable reason for this decision */ reason: string; - /** How the model was selected. */ - selectionMethod?: "tier-only" | "capability-scored"; - /** Capability scores per model (when capability-scored). */ + /** How the model was selected */ + selectionMethod: "tier-only" | "capability-scored"; + /** Capability scores per eligible model (capability-scored path only) */ capabilityScores?: Record; - /** Task requirement vector (when capability-scored). 
*/ + /** Task requirement vector used for scoring */ taskRequirements?: Partial>; } +// ─── Capability Profiles ───────────────────────────────────────────────────── + +/** Seven-dimension capability profile for a model. All values in 0–100 range. */ +export interface ModelCapabilities { + coding: number; + debugging: number; + research: number; + reasoning: number; + speed: number; + longContext: number; + instruction: number; +} + // ─── Known Model Tiers ─────────────────────────────────────────────────────── // Maps known model IDs to their capability tier. Used when tier_models is not // explicitly configured to pick the best available model for each tier. @@ -121,33 +134,27 @@ const MODEL_COST_PER_1K_INPUT: Record = { "deepseek-chat": 0.00014, }; -// ─── Capability Profiles (ADR-004 Phase 2) ────────────────────────────────── -// 7-dimension profiles, 0–100 normalized. Models without a profile -// score 50 uniformly — capability scoring is a no-op for them. - -export interface ModelCapabilities { - coding: number; - debugging: number; - research: number; - reasoning: number; - speed: number; - longContext: number; - instruction: number; -} +// ─── Capability Profiles Data Table ────────────────────────────────────────── +// Per-model capability profiles (0–100 scale). Used for capability-aware +// model selection within an eligible tier set. 
export const MODEL_CAPABILITY_PROFILES: Record = { - "claude-opus-4-6": { coding: 95, debugging: 90, research: 85, reasoning: 95, speed: 30, longContext: 80, instruction: 90 }, - "claude-sonnet-4-6": { coding: 85, debugging: 80, research: 75, reasoning: 80, speed: 60, longContext: 75, instruction: 85 }, - "claude-haiku-4-5": { coding: 60, debugging: 50, research: 45, reasoning: 50, speed: 95, longContext: 50, instruction: 75 }, - "gpt-4o": { coding: 80, debugging: 75, research: 70, reasoning: 75, speed: 65, longContext: 70, instruction: 80 }, - "gpt-4o-mini": { coding: 55, debugging: 45, research: 40, reasoning: 45, speed: 90, longContext: 45, instruction: 70 }, - "gemini-2.5-pro": { coding: 75, debugging: 70, research: 85, reasoning: 75, speed: 55, longContext: 90, instruction: 75 }, - "gemini-2.0-flash": { coding: 50, debugging: 40, research: 50, reasoning: 40, speed: 95, longContext: 60, instruction: 65 }, - "deepseek-chat": { coding: 75, debugging: 65, research: 55, reasoning: 70, speed: 70, longContext: 55, instruction: 65 }, - "o3": { coding: 80, debugging: 85, research: 80, reasoning: 92, speed: 25, longContext: 70, instruction: 85 }, + "claude-opus-4-6": { coding: 95, debugging: 90, research: 85, reasoning: 95, speed: 30, longContext: 80, instruction: 90 }, + "claude-sonnet-4-6": { coding: 85, debugging: 80, research: 75, reasoning: 80, speed: 60, longContext: 75, instruction: 85 }, + "claude-haiku-4-5": { coding: 60, debugging: 50, research: 45, reasoning: 50, speed: 95, longContext: 50, instruction: 75 }, + "gpt-4o": { coding: 80, debugging: 75, research: 70, reasoning: 75, speed: 65, longContext: 70, instruction: 80 }, + "gpt-4o-mini": { coding: 55, debugging: 45, research: 40, reasoning: 45, speed: 90, longContext: 45, instruction: 70 }, + "gemini-2.5-pro": { coding: 75, debugging: 70, research: 85, reasoning: 75, speed: 55, longContext: 90, instruction: 75 }, + "gemini-2.0-flash": { coding: 50, debugging: 40, research: 50, reasoning: 40, speed: 95, 
longContext: 60, instruction: 65 }, + "deepseek-chat": { coding: 75, debugging: 65, research: 55, reasoning: 70, speed: 70, longContext: 55, instruction: 65 }, + "o3": { coding: 80, debugging: 85, research: 80, reasoning: 92, speed: 25, longContext: 70, instruction: 85 }, }; -const BASE_REQUIREMENTS: Record>> = { +// ─── Base Task Requirements Data Table ─────────────────────────────────────── +// Per-unit-type base requirement vectors. Weights indicate how important each +// capability dimension is for this unit type. + +export const BASE_REQUIREMENTS: Record>> = { "execute-task": { coding: 0.9, instruction: 0.7, speed: 0.3 }, "research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, "research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, @@ -161,15 +168,36 @@ const BASE_REQUIREMENTS: Record>, +): number { + let weightedSum = 0; + let weightSum = 0; + for (const [dim, weight] of Object.entries(requirements)) { + const capability = model[dim as keyof ModelCapabilities] ?? 50; + weightedSum += weight * capability; + weightSum += weight; + } + return weightSum > 0 ? weightedSum / weightSum : 50; +} + +/** + * Compute dynamic task requirements from unit type and optional task metadata. + * Returns a requirement vector refined by task-specific signals. */ export function computeTaskRequirements( unitType: string, - metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number }, + metadata?: TaskMetadata, ): Partial> { - const base = { ...(BASE_REQUIREMENTS[unitType] ?? { reasoning: 0.5 }) }; - + const base = BASE_REQUIREMENTS[unitType] ?? 
{ reasoning: 0.5 }; if (unitType === "execute-task" && metadata) { if (metadata.tags?.some(t => /^(docs?|readme|comment|config|typo|rename)$/i.test(t))) { return { ...base, instruction: 0.9, coding: 0.3, speed: 0.7 }; @@ -184,29 +212,101 @@ export function computeTaskRequirements( return { ...base, coding: 0.9, reasoning: 0.7 }; } } - return base; } /** - * Score a model against a task requirement vector. - * Returns weighted average in range 0–100. Returns 50 for empty requirements. + * Score all eligible models against a requirement vector and return them + * sorted by score descending. Within 2 points: prefer cheaper; equal cost: + * lexicographic tie-break by model ID. */ -export function scoreModel( - capabilities: ModelCapabilities, +export function scoreEligibleModels( + eligibleModelIds: string[], requirements: Partial>, -): number { - let weightedSum = 0; - let weightSum = 0; - for (const [dim, weight] of Object.entries(requirements)) { - const capability = capabilities[dim as keyof ModelCapabilities] ?? 50; - weightedSum += weight * capability; - weightSum += weight; - } - return weightSum > 0 ? weightedSum / weightSum : 50; + capabilityOverrides?: Record>, +): Array<{ modelId: string; score: number }> { + const scored = eligibleModelIds.map(modelId => { + const builtin = MODEL_CAPABILITY_PROFILES[modelId]; + const override = capabilityOverrides?.[modelId]; + const profile: ModelCapabilities = builtin + ? override ? { ...builtin, ...override } : builtin + : { coding: 50, debugging: 50, research: 50, reasoning: 50, speed: 50, longContext: 50, instruction: 50 }; + return { modelId, score: scoreModel(profile, requirements) }; + }); + scored.sort((a, b) => { + const scoreDiff = b.score - a.score; + if (Math.abs(scoreDiff) > 2) return scoreDiff; + const costA = MODEL_COST_PER_1K_INPUT[a.modelId] ?? Infinity; + const costB = MODEL_COST_PER_1K_INPUT[b.modelId] ?? 
Infinity; + if (costA !== costB) return costA - costB; + return a.modelId.localeCompare(b.modelId); + }); + return scored; } -// ─── Public API ────────────────────────────────────────────────────────────── +/** + * Return all models eligible for a given tier, sorted cheapest first. + * If routingConfig.tier_models[tier] is set and available, returns only that + * model. Otherwise filters availableModelIds by tier from MODEL_CAPABILITY_TIER. + */ +export function getEligibleModels( + tier: ComplexityTier, + availableModelIds: string[], + routingConfig: DynamicRoutingConfig, +): string[] { + // 1. Check explicit tier_models config + const explicitModel = routingConfig.tier_models?.[tier]; + if (explicitModel) { + // Exact match + if (availableModelIds.includes(explicitModel)) return [explicitModel]; + // Provider-prefix-stripped match + const match = availableModelIds.find(id => { + const bareAvail = id.includes("/") ? id.split("/").pop()! : id; + const bareExplicit = explicitModel.includes("/") ? explicitModel.split("/").pop()! : explicitModel; + return bareAvail === bareExplicit; + }); + if (match) return [match]; + } + + // 2. Auto-detect: filter by tier, sort cheapest first + return availableModelIds + .filter(id => getModelTier(id) === tier) + .sort((a, b) => { + const costA = getModelCost(a); + const costB = getModelCost(b); + return costA - costB; + }); +} + +/** + * Build a fallback chain for a selected model: [selectedModel, ...configuredFallbacks, configuredPrimary] + * Deduplicates entries while preserving order. + */ +function buildFallbackChain(selectedModelId: string, phaseConfig: ResolvedModelConfig): string[] { + return [ + ...phaseConfig.fallbacks.filter(f => f !== selectedModelId), + phaseConfig.primary, + ].filter(f => f !== selectedModelId); +} + +/** + * Load capability overrides from user preferences' modelOverrides section. + * Returns a map of model ID → partial capability overrides to deep-merge with built-in profiles. 
+ * + * Per D-17: partial capability overrides via models.json modelOverrides, deep-merged with defaults. + */ +export function loadCapabilityOverrides( + prefs: { modelOverrides?: Record }> }, +): Record> { + const result: Record> = {}; + if (!prefs.modelOverrides) return result; + for (const [modelId, overrideEntry] of Object.entries(prefs.modelOverrides)) { + if (overrideEntry.capabilities) { + result[modelId] = overrideEntry.capabilities; + } + } + return result; +} /** * Resolve the model to use for a given complexity tier. @@ -214,10 +314,18 @@ export function scoreModel( * Downgrade-only: the returned model is always equal to or cheaper than * the user's configured primary model. Never upgrades beyond configuration. * - * @param classification The complexity classification result - * @param phaseConfig The user's configured model for this phase (ceiling) - * @param routingConfig Dynamic routing configuration - * @param availableModelIds List of available model IDs (from registry) + * STEP 1: Filter to eligible models for the requested tier. + * STEP 2: Capability scoring — ranks eligible models by task-capability match + * when capability_routing is enabled and multiple eligible models exist. + * STEP 3: Fallback chain assembly. 
+ * + * @param classification The complexity classification result + * @param phaseConfig The user's configured model for this phase (ceiling) + * @param routingConfig Dynamic routing configuration + * @param availableModelIds List of available model IDs (from registry) + * @param unitType The unit type for capability requirement computation (optional) + * @param taskMetadata Task metadata for refined requirement vectors (optional) + * @param capabilityOverrides User-provided capability overrides (deep-merged with built-in profiles, optional) */ export function resolveModelForComplexity( classification: ClassificationResult, @@ -225,7 +333,8 @@ export function resolveModelForComplexity( routingConfig: DynamicRoutingConfig, availableModelIds: string[], unitType?: string, - metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number }, + taskMetadata?: TaskMetadata, + capabilityOverrides?: Record>, ): RoutingDecision { // If no phase config or routing disabled, pass through if (!phaseConfig || !routingConfig.enabled) { @@ -235,6 +344,7 @@ export function resolveModelForComplexity( tier: classification.tier, wasDowngraded: false, reason: "dynamic routing disabled or no phase config", + selectionMethod: "tier-only", }; } @@ -254,6 +364,7 @@ export function resolveModelForComplexity( tier: requestedTier, wasDowngraded: false, reason: `configured model "${configuredPrimary}" is not in the known tier map — honoring explicit config`, + selectionMethod: "tier-only", }; } @@ -265,48 +376,52 @@ export function resolveModelForComplexity( tier: requestedTier, wasDowngraded: false, reason: `tier ${requestedTier} >= configured ${configuredTier}`, + selectionMethod: "tier-only", }; } - // Find the best model for the requested tier - const useCapabilityScoring = routingConfig.capability_routing && unitType; + // STEP 1: Get all eligible models for the requested tier + const eligible = getEligibleModels(requestedTier, availableModelIds, 
routingConfig); - let targetModelId: string | null; - let capabilityScores: Record | undefined; - let taskRequirements: Partial> | undefined; - let selectionMethod: "tier-only" | "capability-scored" = "tier-only"; - - if (useCapabilityScoring) { - const result = findModelForTierWithCapability( - requestedTier, routingConfig, availableModelIds, - routingConfig.cross_provider !== false, unitType, metadata, - ); - targetModelId = result.modelId; - capabilityScores = Object.keys(result.scores).length > 0 ? result.scores : undefined; - taskRequirements = Object.keys(result.requirements).length > 0 ? result.requirements : undefined; - selectionMethod = capabilityScores ? "capability-scored" : "tier-only"; - } else { - targetModelId = findModelForTier( - requestedTier, routingConfig, availableModelIds, - routingConfig.cross_provider !== false, - ); - } - - if (!targetModelId) { + if (eligible.length === 0) { + // No suitable model found — use configured primary return { modelId: configuredPrimary, fallbacks: phaseConfig.fallbacks, tier: requestedTier, wasDowngraded: false, reason: `no ${requestedTier}-tier model available`, - selectionMethod, + selectionMethod: "tier-only", }; } - const fallbacks = [ - ...phaseConfig.fallbacks.filter(f => f !== targetModelId), - configuredPrimary, - ].filter(f => f !== targetModelId); + // STEP 2: Capability scoring (when enabled and multiple eligible models exist) + if (routingConfig.capability_routing !== false && eligible.length > 1 && unitType) { + const requirements = computeTaskRequirements(unitType, taskMetadata); + const scored = scoreEligibleModels(eligible, requirements, capabilityOverrides); + const winner = scored[0]; + if (winner) { + const capScores: Record = {}; + for (const s of scored) capScores[s.modelId] = s.score; + const fallbacks = buildFallbackChain(winner.modelId, phaseConfig); + return { + modelId: winner.modelId, + fallbacks, + tier: requestedTier, + wasDowngraded: true, + reason: `capability-scored: 
${winner.modelId} (${winner.score.toFixed(1)}) for ${unitType}`, + capabilityScores: capScores, + taskRequirements: requirements, + selectionMethod: "capability-scored", + }; + } + } + + // STEP 3: Fallback — use first eligible model (cheapest in tier, or single eligible) + const targetModelId = eligible[0]; + + // Build fallback chain: [downgraded_model, ...configured_fallbacks, configured_primary] + const fallbacks = buildFallbackChain(targetModelId, phaseConfig); return { modelId: targetModelId, @@ -314,9 +429,7 @@ export function resolveModelForComplexity( tier: requestedTier, wasDowngraded: true, reason: classification.reason, - selectionMethod, - capabilityScores, - taskRequirements, + selectionMethod: "tier-only", }; } @@ -338,7 +451,7 @@ export function escalateTier(currentTier: ComplexityTier): ComplexityTier | null export function defaultRoutingConfig(): DynamicRoutingConfig { return { enabled: true, - capability_routing: false, + capability_routing: true, escalate_on_failure: true, budget_pressure: true, cross_provider: true, @@ -360,8 +473,8 @@ function getModelTier(modelId: string): ComplexityTier { if (bareId.includes(knownId) || knownId.includes(bareId)) return tier; } - // Unknown models are assumed heavy (safest assumption) - return "heavy"; + // Unknown models are assumed standard (per D-15: avoids silently ignoring user config) + return "standard"; } /** Check if a model ID has a known capability tier mapping. (#2192) */ @@ -374,93 +487,6 @@ function isKnownModel(modelId: string): boolean { return false; } -function findModelForTier( - tier: ComplexityTier, - config: DynamicRoutingConfig, - availableModelIds: string[], - crossProvider: boolean, -): string | null { - // 1. 
Check explicit tier_models config - const explicitModel = config.tier_models?.[tier]; - if (explicitModel && availableModelIds.includes(explicitModel)) { - return explicitModel; - } - // Also check with provider prefix stripped - if (explicitModel) { - const match = availableModelIds.find(id => { - const bareAvail = id.includes("/") ? id.split("/").pop()! : id; - const bareExplicit = explicitModel.includes("/") ? explicitModel.split("/").pop()! : explicitModel; - return bareAvail === bareExplicit; - }); - if (match) return match; - } - - // 2. Auto-detect: find the cheapest available model in the requested tier - const candidates = availableModelIds - .filter(id => { - const modelTier = getModelTier(id); - return modelTier === tier; - }) - .sort((a, b) => { - if (!crossProvider) return 0; - const costA = getModelCost(a); - const costB = getModelCost(b); - return costA - costB; - }); - - return candidates[0] ?? null; -} - -function findModelForTierWithCapability( - tier: ComplexityTier, - config: DynamicRoutingConfig, - availableModelIds: string[], - crossProvider: boolean, - unitType: string, - metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number }, -): { modelId: string | null; scores: Record; requirements: Partial> } { - const explicitModel = config.tier_models?.[tier]; - if (explicitModel) { - const match = availableModelIds.find(id => { - const bareAvail = id.includes("/") ? id.split("/").pop()! : id; - const bareExplicit = explicitModel.includes("/") ? explicitModel.split("/").pop()! 
: explicitModel; - return bareAvail === bareExplicit || id === explicitModel; - }); - if (match) return { modelId: match, scores: {}, requirements: {} }; - } - - const requirements = computeTaskRequirements(unitType, metadata); - const candidates = availableModelIds.filter(id => getModelTier(id) === tier); - if (candidates.length === 0) return { modelId: null, scores: {}, requirements }; - - const scores: Record = {}; - for (const id of candidates) { - const bareId = id.includes("/") ? id.split("/").pop()! : id; - const profile = getModelProfile(bareId); - scores[id] = scoreModel(profile, requirements); - } - - candidates.sort((a, b) => { - const scoreDiff = scores[b] - scores[a]; - if (Math.abs(scoreDiff) > 2) return scoreDiff; - if (crossProvider) { - const costDiff = getModelCost(a) - getModelCost(b); - if (costDiff !== 0) return costDiff; - } - return a.localeCompare(b); - }); - - return { modelId: candidates[0], scores, requirements }; -} - -function getModelProfile(bareId: string): ModelCapabilities { - if (MODEL_CAPABILITY_PROFILES[bareId]) return MODEL_CAPABILITY_PROFILES[bareId]; - for (const [knownId, profile] of Object.entries(MODEL_CAPABILITY_PROFILES)) { - if (bareId.includes(knownId) || knownId.includes(bareId)) return profile; - } - return { coding: 50, debugging: 50, research: 50, reasoning: 50, speed: 50, longContext: 50, instruction: 50 }; -} - function getModelCost(modelId: string): number { const bareId = modelId.includes("/") ? modelId.split("/").pop()! 
: modelId; diff --git a/src/resources/extensions/gsd/tests/capability-router.test.ts b/src/resources/extensions/gsd/tests/capability-router.test.ts new file mode 100644 index 000000000..751fc6e11 --- /dev/null +++ b/src/resources/extensions/gsd/tests/capability-router.test.ts @@ -0,0 +1,347 @@ +// GSD Extension — Capability-Aware Router Tests +// Tests for new capability scoring functions and data tables (Plan 01-01) + +import { describe, test } from "node:test"; +import assert from "node:assert/strict"; + +import { + scoreModel, + computeTaskRequirements, + scoreEligibleModels, + getEligibleModels, + resolveModelForComplexity, + MODEL_CAPABILITY_PROFILES, + BASE_REQUIREMENTS, + defaultRoutingConfig, +} from "../model-router.js"; +import type { ModelCapabilities, DynamicRoutingConfig, RoutingDecision } from "../model-router.js"; + +// ─── scoreModel ────────────────────────────────────────────────────────────── + +describe("scoreModel", () => { + const sonnetProfile: ModelCapabilities = { + coding: 85, debugging: 80, research: 75, reasoning: 80, + speed: 60, longContext: 75, instruction: 85, + }; + + test("produces correct weighted average for single dimension", () => { + // Only coding weight 1.0 → result should be the coding score + const score = scoreModel(sonnetProfile, { coding: 1.0 }); + assert.equal(score, 85); + }); + + test("produces correct weighted average for two dimensions (coding 0.9, instruction 0.7)", () => { + // (0.9*85 + 0.7*85) / (0.9+0.7) = (76.5+59.5)/1.6 = 136/1.6 = 85.0 + const score = scoreModel(sonnetProfile, { coding: 0.9, instruction: 0.7 }); + assert.ok(Math.abs(score - 85.0) < 0.01, `Expected ~85.0, got ${score}`); + }); + + test("returns 50 when requirements is empty", () => { + const score = scoreModel(sonnetProfile, {}); + assert.equal(score, 50); + }); + + test("uses 50 as fallback for unknown dimension in requirements", () => { + // 'unknown' dimension not in profile → treated as 50 + const score = scoreModel(sonnetProfile, { 
coding: 0.5, unknown: 1.0 } as any); + // (0.5*85 + 1.0*50) / (0.5+1.0) = (42.5+50)/1.5 = 92.5/1.5 = 61.67 + assert.ok(score > 61 && score < 62, `Expected ~61.67, got ${score}`); + }); +}); + +// ─── computeTaskRequirements ───────────────────────────────────────────────── + +describe("computeTaskRequirements", () => { + test("execute-task with no metadata returns base requirements", () => { + const req = computeTaskRequirements("execute-task", undefined); + assert.deepStrictEqual(req, { coding: 0.9, instruction: 0.7, speed: 0.3 }); + }); + + test("execute-task with docs tag returns docs-adjusted requirements", () => { + const req = computeTaskRequirements("execute-task", { tags: ["docs"] }); + assert.equal(req.instruction, 0.9); + assert.equal(req.coding, 0.3); + assert.equal(req.speed, 0.7); + }); + + test("execute-task with readme tag returns docs-adjusted requirements", () => { + const req = computeTaskRequirements("execute-task", { tags: ["readme"] }); + assert.equal(req.instruction, 0.9); + }); + + test("execute-task with concurrency keyword boosts debugging and reasoning", () => { + const req = computeTaskRequirements("execute-task", { complexityKeywords: ["concurrency"] }); + assert.equal(req.debugging, 0.9); + assert.equal(req.reasoning, 0.8); + }); + + test("execute-task with compatibility keyword boosts debugging and reasoning", () => { + const req = computeTaskRequirements("execute-task", { complexityKeywords: ["compatibility"] }); + assert.equal(req.debugging, 0.9); + assert.equal(req.reasoning, 0.8); + }); + + test("execute-task with migration keyword boosts reasoning and coding", () => { + const req = computeTaskRequirements("execute-task", { complexityKeywords: ["migration"] }); + assert.equal(req.reasoning, 0.9); + assert.equal(req.coding, 0.8); + }); + + test("execute-task with architecture keyword boosts reasoning and coding", () => { + const req = computeTaskRequirements("execute-task", { complexityKeywords: ["architecture"] }); + 
assert.equal(req.reasoning, 0.9); + assert.equal(req.coding, 0.8); + }); + + test("execute-task with fileCount >= 6 boosts coding and reasoning", () => { + const req = computeTaskRequirements("execute-task", { fileCount: 8 }); + assert.equal(req.coding, 0.9); + assert.equal(req.reasoning, 0.7); + }); + + test("execute-task with fileCount exactly 6 triggers large-file boost", () => { + const req = computeTaskRequirements("execute-task", { fileCount: 6 }); + assert.equal(req.coding, 0.9); + assert.equal(req.reasoning, 0.7); + }); + + test("execute-task with estimatedLines >= 500 boosts coding and reasoning", () => { + const req = computeTaskRequirements("execute-task", { estimatedLines: 500 }); + assert.equal(req.coding, 0.9); + assert.equal(req.reasoning, 0.7); + }); + + test("research-milestone with no metadata returns base requirements", () => { + const req = computeTaskRequirements("research-milestone", undefined); + assert.deepStrictEqual(req, { research: 0.9, longContext: 0.7, reasoning: 0.5 }); + }); + + test("unknown unit type returns default reasoning requirement", () => { + const req = computeTaskRequirements("unknown-type", undefined); + assert.deepStrictEqual(req, { reasoning: 0.5 }); + }); +}); + +// ─── MODEL_CAPABILITY_PROFILES ─────────────────────────────────────────────── + +describe("MODEL_CAPABILITY_PROFILES", () => { + test("contains all 9 required models", () => { + const required = [ + "claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5", + "gpt-4o", "gpt-4o-mini", "gemini-2.5-pro", "gemini-2.0-flash", + "deepseek-chat", "o3", + ]; + for (const model of required) { + assert.ok(MODEL_CAPABILITY_PROFILES[model], `Missing profile for ${model}`); + } + }); + + test("each profile has all 7 capability dimensions", () => { + const dims: Array = [ + "coding", "debugging", "research", "reasoning", + "speed", "longContext", "instruction", + ]; + for (const [modelId, profile] of Object.entries(MODEL_CAPABILITY_PROFILES)) { + for (const dim of dims) 
{ + assert.ok(profile[dim] !== undefined, `${modelId} missing dimension ${dim}`); + assert.ok(profile[dim] >= 0 && profile[dim] <= 100, `${modelId}.${dim} out of range`); + } + } + }); + + test("claude-opus-4-6 has high reasoning and coding", () => { + const opus = MODEL_CAPABILITY_PROFILES["claude-opus-4-6"]; + assert.ok(opus.reasoning >= 90, `Expected reasoning >= 90, got ${opus.reasoning}`); + assert.ok(opus.coding >= 90, `Expected coding >= 90, got ${opus.coding}`); + }); + + test("claude-haiku-4-5 has high speed but lower reasoning", () => { + const haiku = MODEL_CAPABILITY_PROFILES["claude-haiku-4-5"]; + assert.ok(haiku.speed >= 90, `Expected speed >= 90, got ${haiku.speed}`); + assert.ok(haiku.reasoning < 70, `Expected reasoning < 70, got ${haiku.reasoning}`); + }); +}); + +// ─── BASE_REQUIREMENTS ─────────────────────────────────────────────────────── + +describe("BASE_REQUIREMENTS", () => { + test("contains all 11 unit types", () => { + const required = [ + "execute-task", "research-milestone", "research-slice", + "plan-milestone", "plan-slice", "replan-slice", + "reassess-roadmap", "complete-slice", "run-uat", + "discuss-milestone", "complete-milestone", + ]; + for (const unitType of required) { + assert.ok(BASE_REQUIREMENTS[unitType], `Missing requirements for ${unitType}`); + } + }); +}); + +// ─── scoreEligibleModels ───────────────────────────────────────────────────── + +describe("scoreEligibleModels", () => { + test("returns array sorted by score descending", () => { + const requirements = { research: 0.9, longContext: 0.7, reasoning: 0.5 }; + const results = scoreEligibleModels(["claude-sonnet-4-6", "gpt-4o"], requirements); + assert.ok(results.length === 2); + assert.ok(results[0].score >= results[1].score, "Should be sorted descending by score"); + }); + + test("returns single model when only one eligible", () => { + const requirements = { coding: 0.9 }; + const results = scoreEligibleModels(["claude-sonnet-4-6"], requirements); + 
assert.equal(results.length, 1); + assert.equal(results[0].modelId, "claude-sonnet-4-6"); + }); + + test("models without profiles get uniform 50s score", () => { + const requirements = { coding: 1.0 }; + const results = scoreEligibleModels(["unknown-model-xyz"], requirements); + assert.equal(results[0].score, 50); + }); + + test("when two models score within 2 points, prefers cheaper model", () => { + // gemini-2.0-flash is cheaper than gpt-4o-mini ($0.0001 vs $0.00015) + // Use a requirement that causes similar scores for both + const requirements = { speed: 1.0 }; + const results = scoreEligibleModels(["gpt-4o-mini", "gemini-2.0-flash"], requirements); + // Both are high-speed: gpt-4o-mini=90, gemini-2.0-flash=95 — scores differ by 5, not within 2 + // So top should be gemini-2.0-flash by score + assert.equal(results[0].modelId, "gemini-2.0-flash"); + }); + + test("tie-breaks by lexicographic model ID when cost and score are equal", () => { + // Use models without cost entries — both get Infinity cost + const requirements = { coding: 1.0 }; + const results = scoreEligibleModels(["model-z", "model-a"], requirements); + // Both unknown → score=50, cost=Infinity → tiebreak by ID + assert.equal(results[0].modelId, "model-a"); + }); + + test("scoreEligibleModels respects capabilityOverrides", () => { + const requirements = { coding: 1.0 }; + // Override claude-sonnet-4-6's coding to 30 (worse) + const results = scoreEligibleModels( + ["claude-sonnet-4-6", "gpt-4o"], + requirements, + { "claude-sonnet-4-6": { coding: 30 } }, + ); + // gpt-4o coding=80 should beat overridden sonnet coding=30 + assert.equal(results[0].modelId, "gpt-4o"); + }); +}); + +// ─── getEligibleModels ─────────────────────────────────────────────────────── + +describe("getEligibleModels", () => { + const MODELS = [ + "claude-opus-4-6", // heavy + "claude-sonnet-4-6", // standard + "claude-haiku-4-5", // light + "gpt-4o-mini", // light + ]; + + test("returns light-tier models sorted by cost when 
no explicit config", () => { + const config: DynamicRoutingConfig = defaultRoutingConfig(); + const result = getEligibleModels("light", MODELS, config); + assert.ok(result.length >= 1); + // All results should be light-tier + for (const id of result) { + assert.ok( + ["claude-haiku-4-5", "gpt-4o-mini"].includes(id), + `Expected light-tier model, got ${id}`, + ); + } + }); + + test("returns explicit tier_models when configured and available", () => { + const config: DynamicRoutingConfig = { + ...defaultRoutingConfig(), + tier_models: { light: "gpt-4o-mini" }, + }; + const result = getEligibleModels("light", MODELS, config); + assert.deepStrictEqual(result, ["gpt-4o-mini"]); + }); + + test("returns empty array when no eligible models for tier", () => { + const config: DynamicRoutingConfig = defaultRoutingConfig(); + // Only heavy model available, requesting light + const result = getEligibleModels("light", ["claude-opus-4-6"], config); + assert.equal(result.length, 0); + }); +}); + +// ─── DynamicRoutingConfig extension ───────────────────────────────────────── + +describe("DynamicRoutingConfig.capability_routing", () => { + test("defaultRoutingConfig includes capability_routing: true", () => { + const config = defaultRoutingConfig(); + assert.equal(config.capability_routing, true); + }); +}); + +// ─── RoutingDecision.selectionMethod ───────────────────────────────────────── + +describe("RoutingDecision.selectionMethod", () => { + const MODELS = ["claude-opus-4-6", "claude-sonnet-4-6", "claude-haiku-4-5", "gpt-4o-mini"]; + + function makeClassification(tier: "light" | "standard" | "heavy") { + return { tier, reason: "test", downgraded: false }; + } + + test("returns selectionMethod: tier-only when routing is disabled", () => { + const config = { ...defaultRoutingConfig(), enabled: false }; + const result: RoutingDecision = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + MODELS, + ); + 
assert.equal(result.selectionMethod, "tier-only"); + }); + + test("returns selectionMethod: tier-only for no phase config passthrough", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result: RoutingDecision = resolveModelForComplexity( + makeClassification("light"), + undefined, + config, + MODELS, + ); + assert.equal(result.selectionMethod, "tier-only"); + }); + + test("returns selectionMethod: tier-only for unknown model passthrough", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result: RoutingDecision = resolveModelForComplexity( + makeClassification("light"), + { primary: "custom-provider/my-model-v3", fallbacks: [] }, + config, + ["custom-provider/my-model-v3", ...MODELS], + ); + assert.equal(result.selectionMethod, "tier-only"); + }); + + test("returns selectionMethod: tier-only for no-downgrade passthrough", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result: RoutingDecision = resolveModelForComplexity( + makeClassification("heavy"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + MODELS, + ); + assert.equal(result.selectionMethod, "tier-only"); + }); + + test("returns selectionMethod: tier-only when downgraded", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result: RoutingDecision = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + MODELS, + ); + assert.equal(result.selectionMethod, "tier-only"); + }); +}); diff --git a/src/resources/extensions/gsd/tests/complexity-classifier.test.ts b/src/resources/extensions/gsd/tests/complexity-classifier.test.ts index ec53ddcaa..46b39ff4d 100644 --- a/src/resources/extensions/gsd/tests/complexity-classifier.test.ts +++ b/src/resources/extensions/gsd/tests/complexity-classifier.test.ts @@ -1,7 +1,7 @@ -import test from "node:test"; +import test, { describe } from "node:test"; import assert from 
"node:assert/strict"; -import { classifyUnitComplexity, tierLabel, tierOrdinal } from "../complexity-classifier.js"; +import { classifyUnitComplexity, tierLabel, tierOrdinal, extractTaskMetadata } from "../complexity-classifier.js"; import type { ComplexityTier, TaskMetadata } from "../complexity-classifier.js"; // ─── tierLabel ─────────────────────────────────────────────────────────────── @@ -179,3 +179,28 @@ test("execute-task with few code blocks stays standard", () => { const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); assert.equal(result.tier, "standard"); }); + +// ─── ClassificationResult taskMetadata passthrough ─────────────────────────── + +describe("ClassificationResult taskMetadata", () => { + test("classifyUnitComplexity for execute-task returns result with taskMetadata populated", () => { + const metadata: TaskMetadata = { fileCount: 3, tags: ["docs"] }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.ok(result.taskMetadata !== undefined, "taskMetadata should be populated for execute-task"); + assert.equal(result.taskMetadata!.tags?.[0], "docs"); + }); + + test("classifyUnitComplexity for hook/xyz returns result with taskMetadata undefined", () => { + const result = classifyUnitComplexity("hook/verify", "M001/S01/T01", "/tmp/fake"); + assert.equal(result.taskMetadata, undefined, "taskMetadata should be undefined for hook units"); + }); + + test("classifyUnitComplexity for plan-slice returns result with taskMetadata undefined", () => { + const result = classifyUnitComplexity("plan-slice", "M001/S01", "/tmp/fake"); + assert.equal(result.taskMetadata, undefined, "taskMetadata should be undefined for plan-slice"); + }); + + test("extractTaskMetadata is importable as a named export and is a function", () => { + assert.equal(typeof extractTaskMetadata, "function", "extractTaskMetadata should be a callable function"); + }); +}); diff 
--git a/src/resources/extensions/gsd/tests/model-router.test.ts b/src/resources/extensions/gsd/tests/model-router.test.ts index f15977495..c81242215 100644 --- a/src/resources/extensions/gsd/tests/model-router.test.ts +++ b/src/resources/extensions/gsd/tests/model-router.test.ts @@ -1,4 +1,4 @@ -import test from "node:test"; +import test, { describe } from "node:test"; import assert from "node:assert/strict"; import { @@ -7,6 +7,8 @@ import { defaultRoutingConfig, scoreModel, computeTaskRequirements, + scoreEligibleModels, + getEligibleModels, MODEL_CAPABILITY_PROFILES, } from "../model-router.js"; import type { DynamicRoutingConfig, RoutingDecision, ModelCapabilities } from "../model-router.js"; @@ -211,9 +213,9 @@ test("#2192: known model is still downgraded normally", () => { // ─── Capability Scoring (ADR-004 Phase 2) ─────────────────────────────────── -test("defaultRoutingConfig includes capability_routing: false", () => { +test("defaultRoutingConfig includes capability_routing: true", () => { const config = defaultRoutingConfig(); - assert.equal(config.capability_routing, false); + assert.equal(config.capability_routing, true); }); test("scoreModel computes weighted average of capability × requirement", () => { @@ -356,3 +358,401 @@ test("#2885: heavy openai-codex model downgrades to light for light task", () => // Should pick a light-tier model assert.notEqual(result.modelId, "gpt-5.4", "should not use the heavy model for light task"); }); +// ─── scoreModel ────────────────────────────────────────────────────────────── + +describe("scoreModel", () => { + const sonnetProfile: ModelCapabilities = MODEL_CAPABILITY_PROFILES["claude-sonnet-4-6"]!; + + test("produces correct weighted average for two dimensions (coding:0.9, instruction:0.7)", () => { + // (0.9*85 + 0.7*85) / (0.9+0.7) = (76.5+59.5)/1.6 = 136/1.6 = 85.0 + const score = scoreModel(sonnetProfile, { coding: 0.9, instruction: 0.7 }); + assert.ok(Math.abs(score - 85.0) < 0.01, `Expected ~85.0, got 
${score}`); + }); + + test("returns 50 when requirements is empty", () => { + const score = scoreModel(sonnetProfile, {}); + assert.equal(score, 50); + }); + + test("returns correct score for single dimension coding:1.0", () => { + // coding=90 for claude-opus-4-6 + const opusProfile = MODEL_CAPABILITY_PROFILES["claude-opus-4-6"]!; + const score = scoreModel(opusProfile, { coding: 1.0 }); + assert.equal(score, 95); + }); + + test("handles all 7 dimensions correctly", () => { + // Uniform weight 1.0 on every dim → average of all dim values + const profile: ModelCapabilities = { + coding: 60, debugging: 60, research: 60, reasoning: 60, + speed: 60, longContext: 60, instruction: 60, + }; + const reqs: Partial> = { + coding: 1.0, debugging: 1.0, research: 1.0, reasoning: 1.0, + speed: 1.0, longContext: 1.0, instruction: 1.0, + }; + const score = scoreModel(profile, reqs); + assert.equal(score, 60); + }); +}); + +// ─── computeTaskRequirements ───────────────────────────────────────────────── + +describe("computeTaskRequirements", () => { + test("execute-task with no metadata returns base vector", () => { + const req = computeTaskRequirements("execute-task", undefined); + assert.deepStrictEqual(req, { coding: 0.9, instruction: 0.7, speed: 0.3 }); + }); + + test("execute-task with tags:['docs'] adjusts requirements", () => { + const req = computeTaskRequirements("execute-task", { tags: ["docs"] }); + assert.equal(req.instruction, 0.9); + assert.equal(req.coding, 0.3); + assert.equal(req.speed, 0.7); + }); + + test("execute-task with tags:['config'] adjusts requirements", () => { + const req = computeTaskRequirements("execute-task", { tags: ["config"] }); + assert.equal(req.instruction, 0.9); + }); + + test("execute-task with complexityKeywords:['concurrency'] boosts debugging and reasoning", () => { + const req = computeTaskRequirements("execute-task", { complexityKeywords: ["concurrency"] }); + assert.equal(req.debugging, 0.9); + assert.equal(req.reasoning, 0.8); + }); 
+ + test("execute-task with complexityKeywords:['migration'] boosts reasoning and coding", () => { + const req = computeTaskRequirements("execute-task", { complexityKeywords: ["migration"] }); + assert.equal(req.reasoning, 0.9); + assert.equal(req.coding, 0.8); + }); + + test("execute-task with fileCount:8 boosts coding and reasoning", () => { + const req = computeTaskRequirements("execute-task", { fileCount: 8 }); + assert.equal(req.coding, 0.9); + assert.equal(req.reasoning, 0.7); + }); + + test("execute-task with estimatedLines:600 boosts coding and reasoning", () => { + const req = computeTaskRequirements("execute-task", { estimatedLines: 600 }); + assert.equal(req.coding, 0.9); + assert.equal(req.reasoning, 0.7); + }); + + test("research-milestone returns correct base vector", () => { + const req = computeTaskRequirements("research-milestone"); + assert.deepStrictEqual(req, { research: 0.9, longContext: 0.7, reasoning: 0.5 }); + }); + + test("plan-slice returns correct base vector", () => { + const req = computeTaskRequirements("plan-slice"); + assert.deepStrictEqual(req, { reasoning: 0.9, coding: 0.5 }); + }); + + test("unknown-unit-type returns default reasoning requirement", () => { + const req = computeTaskRequirements("unknown-unit-type"); + assert.deepStrictEqual(req, { reasoning: 0.5 }); + }); + + test("non-execute-task with metadata ignores metadata refinements", () => { + // research-milestone should return the same vector regardless of metadata + const reqWithMeta = computeTaskRequirements("research-milestone", { tags: ["docs"], fileCount: 10 }); + const reqWithout = computeTaskRequirements("research-milestone"); + assert.deepStrictEqual(reqWithMeta, reqWithout); + }); +}); + +// ─── scoreEligibleModels ───────────────────────────────────────────────────── + +describe("scoreEligibleModels", () => { + test("ranks models by score descending when scores differ by more than 2", () => { + // research: heavily weights research dimension. 
gemini-2.5-pro has 85 research vs sonnet's 75 + const requirements = { research: 0.9, longContext: 0.7, reasoning: 0.5 }; + const results = scoreEligibleModels(["claude-sonnet-4-6", "gemini-2.5-pro"], requirements); + assert.equal(results.length, 2); + assert.ok(results[0].score >= results[1].score, "Should be sorted by score descending"); + }); + + test("within 2-point threshold, prefers cheaper model", () => { + // Use models without built-in profiles (both get score 50) so tie-break applies + // Then use known models with equal scores: force this via single unknown model pair + const requirements = { coding: 1.0 }; + // model-a and model-b are both unknown → score=50, cost=Infinity → lexicographic + const results = scoreEligibleModels(["model-z", "model-a"], requirements); + // Both unknown: score=50 (within 2), cost=Infinity (equal) → lex: model-a first + assert.equal(results[0].modelId, "model-a"); + }); + + test("single model returns array of one", () => { + const results = scoreEligibleModels(["claude-sonnet-4-6"], { coding: 0.9 }); + assert.equal(results.length, 1); + assert.equal(results[0].modelId, "claude-sonnet-4-6"); + }); + + test("unknown model with no profile gets score of 50", () => { + const results = scoreEligibleModels(["totally-unknown-model"], { coding: 1.0 }); + assert.equal(results[0].score, 50); + }); + + test("capabilityOverrides deep-merges with built-in profile", () => { + const requirements = { coding: 1.0 }; + // Override sonnet's coding to 30 — gpt-4o (coding=80) should win + const results = scoreEligibleModels( + ["claude-sonnet-4-6", "gpt-4o"], + requirements, + { "claude-sonnet-4-6": { coding: 30 } }, + ); + assert.equal(results[0].modelId, "gpt-4o", "gpt-4o should rank first after coding override"); + }); +}); + +// ─── getEligibleModels ─────────────────────────────────────────────────────── + +describe("getEligibleModels", () => { + const ALL_MODELS = [ + "claude-opus-4-6", // heavy + "claude-sonnet-4-6", // standard + 
"claude-haiku-4-5", // light + "gpt-4o-mini", // light + "gpt-4o", // standard + ]; + + test("returns light-tier models from available list sorted by cost", () => { + const config: DynamicRoutingConfig = defaultRoutingConfig(); + const result = getEligibleModels("light", ALL_MODELS, config); + assert.ok(result.length >= 1); + for (const id of result) { + assert.ok( + ["claude-haiku-4-5", "gpt-4o-mini"].includes(id), + `Expected light-tier model, got ${id}`, + ); + } + }); + + test("returns standard-tier models from available list sorted by cost", () => { + const config: DynamicRoutingConfig = defaultRoutingConfig(); + const result = getEligibleModels("standard", ALL_MODELS, config); + assert.ok(result.length >= 1); + for (const id of result) { + assert.ok( + ["claude-sonnet-4-6", "gpt-4o"].includes(id), + `Expected standard-tier model, got ${id}`, + ); + } + }); + + test("tier_models pinned model returns single-element array", () => { + const config: DynamicRoutingConfig = { + ...defaultRoutingConfig(), + tier_models: { light: "gpt-4o-mini" }, + }; + const result = getEligibleModels("light", ALL_MODELS, config); + assert.deepStrictEqual(result, ["gpt-4o-mini"]); + }); + + test("empty available list returns empty array", () => { + const config: DynamicRoutingConfig = defaultRoutingConfig(); + const result = getEligibleModels("light", [], config); + assert.equal(result.length, 0); + }); + + test("unknown models classified as standard appear in standard tier results", () => { + const config: DynamicRoutingConfig = defaultRoutingConfig(); + // unknown-model-xyz has no entry → defaults to standard tier + const result = getEligibleModels("standard", ["unknown-model-xyz"], config); + assert.ok(result.includes("unknown-model-xyz"), "Unknown model should appear in standard tier"); + }); +}); + +// ─── capability-aware routing integration ──────────────────────────────────── + +describe("capability-aware routing integration", () => { + // All standard-tier models available 
alongside heavy (opus) + const MULTI_MODEL_AVAILABLE = [ + "claude-opus-4-6", + "claude-sonnet-4-6", + "gpt-4o", + "gemini-2.5-pro", + "claude-haiku-4-5", + "gpt-4o-mini", + ]; + + // 1. Full pipeline with capability scoring active + test("full pipeline with capability_routing: true returns capability-scored decision", () => { + const config: DynamicRoutingConfig = { ...defaultRoutingConfig(), enabled: true, capability_routing: true }; + // Configured primary is opus (heavy) — standard tier should trigger capability scoring + const result = resolveModelForComplexity( + { tier: "standard", reason: "test", downgraded: false }, + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + MULTI_MODEL_AVAILABLE, + "execute-task", + { tags: [], complexityKeywords: [], fileCount: 3, estimatedLines: 100, codeBlockCount: 0 }, + ); + assert.equal(result.selectionMethod, "capability-scored", "should use capability scoring when enabled with multiple eligible models"); + assert.ok(result.capabilityScores !== undefined, "capabilityScores should be populated"); + assert.ok(Object.keys(result.capabilityScores!).length > 1, "should have scores for multiple models"); + assert.equal(result.wasDowngraded, true, "should be downgraded from opus"); + }); + + // 2. capability_routing: false falls back to tier-only + test("capability_routing: false skips scoring and uses tier-only", () => { + const config: DynamicRoutingConfig = { ...defaultRoutingConfig(), enabled: true, capability_routing: false }; + const result = resolveModelForComplexity( + { tier: "standard", reason: "test", downgraded: false }, + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + MULTI_MODEL_AVAILABLE, + "execute-task", + undefined, + ); + assert.equal(result.selectionMethod, "tier-only", "capability_routing: false should use tier-only"); + assert.equal(result.capabilityScores, undefined, "capabilityScores should be undefined for tier-only"); + }); + + // 3. 
Single eligible model skips scoring + test("single eligible model skips capability scoring and uses tier-only", () => { + const config: DynamicRoutingConfig = { + ...defaultRoutingConfig(), + enabled: true, + capability_routing: true, + tier_models: { standard: "claude-sonnet-4-6" }, + }; + // Pin to single standard model — eligible.length === 1 → skips STEP 2 + const result = resolveModelForComplexity( + { tier: "standard", reason: "test", downgraded: false }, + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + MULTI_MODEL_AVAILABLE, + "execute-task", + undefined, + ); + // Single pinned model → tier-only (no scoring needed) + assert.equal(result.selectionMethod, "tier-only", "single eligible model should use tier-only"); + assert.equal(result.modelId, "claude-sonnet-4-6", "should use the pinned model"); + }); + + // 4. Unknown model with no profile gets uniform 50s and competes + test("unknown model with no profile gets uniform score of 50 and can compete", () => { + const unknownModel = "unknown-future-model-xyz"; + const config: DynamicRoutingConfig = { ...defaultRoutingConfig(), enabled: true, capability_routing: true }; + // Add unknown model to available list at standard tier (unknown → standard per D-15) + // scoring should still work with score=50 for the unknown model + const requirements = { coding: 0.9, instruction: 0.7, speed: 0.3 }; + const scored = scoreEligibleModels([unknownModel, "claude-sonnet-4-6"], requirements); + const unknownEntry = scored.find(s => s.modelId === unknownModel); + assert.ok(unknownEntry !== undefined, "unknown model should be in scored results"); + // Unknown model gets uniform 50s: (0.9*50 + 0.7*50 + 0.3*50) / (0.9+0.7+0.3) ≈ 50 + assert.ok(Math.abs(unknownEntry!.score - 50) < 0.01, `expected score ~50, got ${unknownEntry!.score}`); + }); + + // 5. Capability overrides change scoring outcome + test("capabilityOverrides boost a model above another for same task", () => { + // sonnet: coding=85, gpt-4o: coding=80. 
Override gpt-4o coding to 99 → gpt-4o should win. + const requirements = { coding: 1.0 }; + const overrides = { "gpt-4o": { coding: 99 } }; + const scored = scoreEligibleModels(["claude-sonnet-4-6", "gpt-4o"], requirements, overrides); + assert.equal(scored[0].modelId, "gpt-4o", "overridden model should win for coding-heavy task"); + assert.ok(scored[0].score > 90, `expected score > 90 after override, got ${scored[0].score}`); + }); + + // 5b. Capability overrides pass through resolveModelForComplexity to scoreEligibleModels + test("resolveModelForComplexity passes capabilityOverrides to scoring step", () => { + const config: DynamicRoutingConfig = { ...defaultRoutingConfig(), enabled: true, capability_routing: true }; + // sonnet coding=85, gpt-4o coding=80. Override gpt-4o coding to 99 → gpt-4o should win. + const overrides = { "gpt-4o": { coding: 99 } }; + const result = resolveModelForComplexity( + { tier: "standard", reason: "test", downgraded: false }, + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + ["claude-opus-4-6", "claude-sonnet-4-6", "gpt-4o"], + "execute-task", + undefined, + overrides, + ); + assert.equal(result.selectionMethod, "capability-scored"); + assert.equal(result.modelId, "gpt-4o", "gpt-4o should win with coding override"); + }); + + // 6. 
Regression: existing routing guards unchanged + test("regression: routing-disabled passthrough still returns tier-only", () => { + const config: DynamicRoutingConfig = { ...defaultRoutingConfig(), enabled: false }; + const result = resolveModelForComplexity( + { tier: "light", reason: "test", downgraded: false }, + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + MULTI_MODEL_AVAILABLE, + "execute-task", + undefined, + ); + assert.equal(result.selectionMethod, "tier-only"); + assert.equal(result.wasDowngraded, false); + assert.equal(result.modelId, "claude-opus-4-6"); + }); + + test("regression: unknown-model bypass returns tier-only and does not downgrade", () => { + const config: DynamicRoutingConfig = { ...defaultRoutingConfig(), enabled: true }; + const result = resolveModelForComplexity( + { tier: "light", reason: "test", downgraded: false }, + { primary: "totally-unknown-custom-model", fallbacks: [] }, + config, + ["totally-unknown-custom-model", ...MULTI_MODEL_AVAILABLE], + "execute-task", + undefined, + ); + assert.equal(result.selectionMethod, "tier-only"); + assert.equal(result.wasDowngraded, false); + assert.equal(result.modelId, "totally-unknown-custom-model"); + }); + + test("regression: no-downgrade-needed path returns tier-only", () => { + const config: DynamicRoutingConfig = { ...defaultRoutingConfig(), enabled: true, capability_routing: true }; + // Configured model is sonnet (standard), requesting standard → no downgrade needed + const result = resolveModelForComplexity( + { tier: "standard", reason: "test", downgraded: false }, + { primary: "claude-sonnet-4-6", fallbacks: [] }, + config, + MULTI_MODEL_AVAILABLE, + "execute-task", + undefined, + ); + assert.equal(result.selectionMethod, "tier-only"); + assert.equal(result.wasDowngraded, false); + assert.equal(result.modelId, "claude-sonnet-4-6"); + }); +}); + +// ─── getModelTier unknown default ──────────────────────────────────────────── + +describe("getModelTier unknown default", () => 
{ + test("unknown model returns standard tier (not heavy) via downgrade behavior", () => { + // NOTE(review): despite the test name, this does not route an unknown primary model — + // per the original note, unknown primaries hit the isKnownModel() guard and pass through unchanged + const config = { ...defaultRoutingConfig(), enabled: true }; + // So the unknown → "standard" default (D-15) is not exercised through resolveModelForComplexity here + // Instead this pins the baseline with a known standard-tier primary (claude-sonnet-4-6): + // a "standard" classification must leave it selected and not downgraded + // The unknown-model tier default itself is covered by the getEligibleModels test below + const result = resolveModelForComplexity( + makeClassification("standard"), + { primary: "claude-sonnet-4-6", fallbacks: [] }, + config, + ["claude-sonnet-4-6", "claude-haiku-4-5", "gpt-4o-mini"], + ); + // standard classification with standard model (sonnet) → no downgrade + assert.equal(result.wasDowngraded, false, "standard model should not downgrade for standard task"); + assert.equal(result.modelId, "claude-sonnet-4-6"); + }); + + test("unknown model in getEligibleModels defaults to standard tier", () => { + // Per D-15: getModelTier returns "standard" for unknown models + const config: DynamicRoutingConfig = defaultRoutingConfig(); + const standardModels = getEligibleModels("standard", ["totally-unknown-model-abc"], config); + const lightModels = getEligibleModels("light", ["totally-unknown-model-abc"], config); + const heavyModels = getEligibleModels("heavy", ["totally-unknown-model-abc"], config); + assert.ok(standardModels.includes("totally-unknown-model-abc"), "Unknown model should be in standard tier"); + assert.equal(lightModels.length, 0, "Unknown model should NOT be in light tier"); + assert.equal(heavyModels.length, 0, "Unknown model should NOT be in heavy tier"); + }); +}); diff --git a/src/resources/extensions/gsd/types.ts b/src/resources/extensions/gsd/types.ts index 25bee6774..c06891e07 100644 --- 
a/src/resources/extensions/gsd/types.ts +++ b/src/resources/extensions/gsd/types.ts @@ -316,6 +316,7 @@ export interface ClassificationResult { tier: ComplexityTier; reason: string; downgraded: boolean; + taskMetadata?: TaskMetadata; } export interface TaskMetadata {