diff --git a/src/resources/extensions/gsd/model-router.ts b/src/resources/extensions/gsd/model-router.ts index 5b45ef9b4..5436bd667 100644 --- a/src/resources/extensions/gsd/model-router.ts +++ b/src/resources/extensions/gsd/model-router.ts @@ -2,7 +2,7 @@ // Maps complexity tiers to models, enforcing downgrade-only semantics. // The user's configured model is always the ceiling. -import type { ComplexityTier, ClassificationResult } from "./complexity-classifier.js"; +import type { ComplexityTier, ClassificationResult, TaskMetadata } from "./complexity-classifier.js"; import { tierOrdinal } from "./complexity-classifier.js"; import type { ResolvedModelConfig } from "./preferences.js"; @@ -33,14 +33,27 @@ export interface RoutingDecision { wasDowngraded: boolean; /** Human-readable reason for this decision */ reason: string; - /** How the model was selected. */ - selectionMethod?: "tier-only" | "capability-scored"; - /** Capability scores per model (when capability-scored). */ + /** How the model was selected */ + selectionMethod: "tier-only" | "capability-scored"; + /** Capability scores per eligible model (capability-scored path only) */ capabilityScores?: Record; - /** Task requirement vector (when capability-scored). */ + /** Task requirement vector used for scoring */ taskRequirements?: Partial>; } +// ─── Capability Profiles ───────────────────────────────────────────────────── + +/** Seven-dimension capability profile for a model. All values in 0–100 range. */ +export interface ModelCapabilities { + coding: number; + debugging: number; + research: number; + reasoning: number; + speed: number; + longContext: number; + instruction: number; +} + // ─── Known Model Tiers ─────────────────────────────────────────────────────── // Maps known model IDs to their capability tier. Used when tier_models is not // explicitly configured to pick the best available model for each tier. @@ -121,33 +134,27 @@ const MODEL_COST_PER_1K_INPUT: Record = { "deepseek-chat": 0.00014, }; -// ─── Capability Profiles (ADR-004 Phase 2) ────────────────────────────────── -// 7-dimension profiles, 0–100 normalized. Models without a profile -// score 50 uniformly — capability scoring is a no-op for them. - -export interface ModelCapabilities { - coding: number; - debugging: number; - research: number; - reasoning: number; - speed: number; - longContext: number; - instruction: number; -} +// ─── Capability Profiles Data Table ────────────────────────────────────────── +// Per-model capability profiles (0–100 scale). Used for capability-aware +// model selection within an eligible tier set. export const MODEL_CAPABILITY_PROFILES: Record = { - "claude-opus-4-6": { coding: 95, debugging: 90, research: 85, reasoning: 95, speed: 30, longContext: 80, instruction: 90 }, - "claude-sonnet-4-6": { coding: 85, debugging: 80, research: 75, reasoning: 80, speed: 60, longContext: 75, instruction: 85 }, - "claude-haiku-4-5": { coding: 60, debugging: 50, research: 45, reasoning: 50, speed: 95, longContext: 50, instruction: 75 }, - "gpt-4o": { coding: 80, debugging: 75, research: 70, reasoning: 75, speed: 65, longContext: 70, instruction: 80 }, - "gpt-4o-mini": { coding: 55, debugging: 45, research: 40, reasoning: 45, speed: 90, longContext: 45, instruction: 70 }, - "gemini-2.5-pro": { coding: 75, debugging: 70, research: 85, reasoning: 75, speed: 55, longContext: 90, instruction: 75 }, - "gemini-2.0-flash": { coding: 50, debugging: 40, research: 50, reasoning: 40, speed: 95, longContext: 60, instruction: 65 }, - "deepseek-chat": { coding: 75, debugging: 65, research: 55, reasoning: 70, speed: 70, longContext: 55, instruction: 65 }, - "o3": { coding: 80, debugging: 85, research: 80, reasoning: 92, speed: 25, longContext: 70, instruction: 85 }, + "claude-opus-4-6": { coding: 95, debugging: 90, research: 85, reasoning: 95, speed: 30, longContext: 80, instruction: 90 }, + "claude-sonnet-4-6": { coding: 85, debugging: 80, research: 75, reasoning: 80, speed: 60, longContext: 75, instruction: 85 }, + "claude-haiku-4-5": { coding: 60, debugging: 50, research: 45, reasoning: 50, speed: 95, longContext: 50, instruction: 75 }, + "gpt-4o": { coding: 80, debugging: 75, research: 70, reasoning: 75, speed: 65, longContext: 70, instruction: 80 }, + "gpt-4o-mini": { coding: 55, debugging: 45, research: 40, reasoning: 45, speed: 90, longContext: 45, instruction: 70 }, + "gemini-2.5-pro": { coding: 75, debugging: 70, research: 85, reasoning: 75, speed: 55, longContext: 90, instruction: 75 }, + "gemini-2.0-flash": { coding: 50, debugging: 40, research: 50, reasoning: 40, speed: 95, longContext: 60, instruction: 65 }, + "deepseek-chat": { coding: 75, debugging: 65, research: 55, reasoning: 70, speed: 70, longContext: 55, instruction: 65 }, + "o3": { coding: 80, debugging: 85, research: 80, reasoning: 92, speed: 25, longContext: 70, instruction: 85 }, }; -const BASE_REQUIREMENTS: Record>> = { +// ─── Base Task Requirements Data Table ─────────────────────────────────────── +// Per-unit-type base requirement vectors. Weights indicate how important each +// capability dimension is for this unit type. + +export const BASE_REQUIREMENTS: Record>> = { "execute-task": { coding: 0.9, instruction: 0.7, speed: 0.3 }, "research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, "research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, @@ -161,15 +168,36 @@ const BASE_REQUIREMENTS: Record>, +): number { + let weightedSum = 0; + let weightSum = 0; + for (const [dim, weight] of Object.entries(requirements)) { + const capability = model[dim as keyof ModelCapabilities] ?? 50; + weightedSum += weight * capability; + weightSum += weight; + } + return weightSum > 0 ? weightedSum / weightSum : 50; +} + +/** + * Compute dynamic task requirements from unit type and optional task metadata. + * Returns a requirement vector refined by task-specific signals. */ export function computeTaskRequirements( unitType: string, - metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number }, + metadata?: TaskMetadata, ): Partial> { - const base = { ...(BASE_REQUIREMENTS[unitType] ?? { reasoning: 0.5 }) }; - + const base = BASE_REQUIREMENTS[unitType] ?? { reasoning: 0.5 }; if (unitType === "execute-task" && metadata) { if (metadata.tags?.some(t => /^(docs?|readme|comment|config|typo|rename)$/i.test(t))) { return { ...base, instruction: 0.9, coding: 0.3, speed: 0.7 }; @@ -184,29 +212,71 @@ export function computeTaskRequirements( return { ...base, coding: 0.9, reasoning: 0.7 }; } } - return base; } /** - * Score a model against a task requirement vector. - * Returns weighted average in range 0–100. Returns 50 for empty requirements. + * Score all eligible models against a requirement vector and return them + * sorted by score descending. Within 2 points: prefer cheaper; equal cost: + * lexicographic tie-break by model ID. */ -export function scoreModel( - capabilities: ModelCapabilities, +export function scoreEligibleModels( + eligibleModelIds: string[], requirements: Partial>, -): number { - let weightedSum = 0; - let weightSum = 0; - for (const [dim, weight] of Object.entries(requirements)) { - const capability = capabilities[dim as keyof ModelCapabilities] ?? 50; - weightedSum += weight * capability; - weightSum += weight; - } - return weightSum > 0 ? weightedSum / weightSum : 50; + capabilityOverrides?: Record>, +): Array<{ modelId: string; score: number }> { + const scored = eligibleModelIds.map(modelId => { + const builtin = MODEL_CAPABILITY_PROFILES[modelId]; + const override = capabilityOverrides?.[modelId]; + const profile: ModelCapabilities = builtin + ? override ? { ...builtin, ...override } : builtin + : { coding: 50, debugging: 50, research: 50, reasoning: 50, speed: 50, longContext: 50, instruction: 50 }; + return { modelId, score: scoreModel(profile, requirements) }; + }); + scored.sort((a, b) => { + const scoreDiff = b.score - a.score; + if (Math.abs(scoreDiff) > 2) return scoreDiff; + const costA = MODEL_COST_PER_1K_INPUT[a.modelId] ?? Infinity; + const costB = MODEL_COST_PER_1K_INPUT[b.modelId] ?? Infinity; + if (costA !== costB) return costA - costB; + return a.modelId.localeCompare(b.modelId); + }); + return scored; } -// ─── Public API ────────────────────────────────────────────────────────────── +/** + * Return all models eligible for a given tier, sorted cheapest first. + * If routingConfig.tier_models[tier] is set and available, returns only that + * model. Otherwise filters availableModelIds by tier from MODEL_CAPABILITY_TIER. + */ +export function getEligibleModels( + tier: ComplexityTier, + availableModelIds: string[], + routingConfig: DynamicRoutingConfig, +): string[] { + // 1. Check explicit tier_models config + const explicitModel = routingConfig.tier_models?.[tier]; + if (explicitModel) { + // Exact match + if (availableModelIds.includes(explicitModel)) return [explicitModel]; + // Provider-prefix-stripped match + const match = availableModelIds.find(id => { + const bareAvail = id.includes("/") ? id.split("/").pop()! : id; + const bareExplicit = explicitModel.includes("/") ? explicitModel.split("/").pop()! : explicitModel; + return bareAvail === bareExplicit; + }); + if (match) return [match]; + } + + // 2. Auto-detect: filter by tier, sort cheapest first + return availableModelIds + .filter(id => getModelTier(id) === tier) + .sort((a, b) => { + const costA = getModelCost(a); + const costB = getModelCost(b); + return costA - costB; + }); +} /** * Resolve the model to use for a given complexity tier. @@ -235,6 +305,7 @@ export function resolveModelForComplexity( tier: classification.tier, wasDowngraded: false, reason: "dynamic routing disabled or no phase config", + selectionMethod: "tier-only", }; } @@ -254,6 +325,7 @@ export function resolveModelForComplexity( tier: requestedTier, wasDowngraded: false, reason: `configured model "${configuredPrimary}" is not in the known tier map — honoring explicit config`, + selectionMethod: "tier-only", }; } @@ -265,6 +337,7 @@ export function resolveModelForComplexity( tier: requestedTier, wasDowngraded: false, reason: `tier ${requestedTier} >= configured ${configuredTier}`, + selectionMethod: "tier-only", }; } @@ -360,8 +433,8 @@ function getModelTier(modelId: string): ComplexityTier { if (bareId.includes(knownId) || knownId.includes(bareId)) return tier; } - // Unknown models are assumed heavy (safest assumption) - return "heavy"; + // Unknown models are assumed standard (per D-15: avoids silently ignoring user config) + return "standard"; } /** Check if a model ID has a known capability tier mapping. (#2192) */