From 77309207ce966ce201b1cbc4b308836e18229671 Mon Sep 17 00:00:00 2001 From: Flux Labs Date: Mon, 16 Mar 2026 08:53:53 -0500 Subject: [PATCH] feat: dynamic model routing for token consumption optimization (#579) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * feat: dynamic model routing for token consumption optimization (#575) Add complexity-based model routing that classifies units into light/standard/heavy tiers and routes to cheaper models when appropriate. Reduces token consumption by 20-50% for users on capped plans. - Complexity classifier with heuristic-based tier assignment (no LLM call) - Model router with downgrade-only semantics (user's config is ceiling) - Budget-pressure-aware routing (more aggressive as budget fills) - Cross-provider cost comparison via bundled cost table - Hook classification support - Escalation on failure (light → standard → heavy) - Full preference validation and merge support - Metrics tracking with tier and downgrade fields - 40 new tests (classifier, router, cost table) Closes #575 * feat: phases 2-4 — dashboard, adaptive learning, task introspection Phase 2 — Observability & Dashboard: - Tier badge [L]/[S]/[H] displayed in progress widget next to phase label - Dynamic routing savings summary shown in footer when units have been downgraded - Tier and modelDowngraded fields passed through snapshotUnitMetrics Phase 3 — Adaptive Learning: - New routing-history.ts: tracks success/failure per tier per unit-type pattern - Rolling window of 50 entries per pattern to prevent stale data - User feedback support (over/under/ok) with 2x weight vs automatic - Failure rate >20% auto-bumps tier for that pattern - Tag-specific patterns (e.g. 
execute-task:docs) for granular learning - History persists to .gsd/routing-history.json - Classifier consults adaptive history before finalizing tier Phase 4 — Task Plan Introspection: - Code block counting in task plans (5+ blocks → heavy) - Complexity keyword detection: migration, architecture, security, performance, concurrency, compatibility - Multiple complexity keywords (2+) → heavy, single → standard - New codeBlockCount and complexityKeywords fields in TaskMetadata Tests: 16 new tests (routing history + introspection), 419 total passing --- .plans/issue-575-dynamic-model-routing.md | 364 ++++++++++++++++++ .../extensions/gsd/auto-dashboard.ts | 14 +- src/resources/extensions/gsd/auto.ts | 104 ++++- .../extensions/gsd/complexity-classifier.ts | 322 ++++++++++++++++ src/resources/extensions/gsd/metrics.ts | 48 +++ .../extensions/gsd/model-cost-table.ts | 65 ++++ src/resources/extensions/gsd/model-router.ts | 256 ++++++++++++ src/resources/extensions/gsd/preferences.ts | 71 ++++ .../gsd/tests/complexity-classifier.test.ts | 181 +++++++++ .../gsd/tests/model-cost-table.test.ts | 69 ++++ .../extensions/gsd/tests/model-router.test.ts | 167 ++++++++ .../gsd/tests/routing-history.test.ts | 265 ++++++++++--- 12 files changed, 1851 insertions(+), 75 deletions(-) create mode 100644 .plans/issue-575-dynamic-model-routing.md create mode 100644 src/resources/extensions/gsd/complexity-classifier.ts create mode 100644 src/resources/extensions/gsd/model-cost-table.ts create mode 100644 src/resources/extensions/gsd/model-router.ts create mode 100644 src/resources/extensions/gsd/tests/complexity-classifier.test.ts create mode 100644 src/resources/extensions/gsd/tests/model-cost-table.test.ts create mode 100644 src/resources/extensions/gsd/tests/model-router.test.ts diff --git a/.plans/issue-575-dynamic-model-routing.md b/.plans/issue-575-dynamic-model-routing.md new file mode 100644 index 000000000..c68eab6bf --- /dev/null +++ b/.plans/issue-575-dynamic-model-routing.md @@ 
-0,0 +1,364 @@ +# Plan: Dynamic Model Routing for Token Optimization + +**Issue:** #575 — Token Consumption Optimization through Dynamic Model Selection +**Status:** Draft +**Date:** 2025-03-15 + +## Problem Statement + +Users on capped plans (e.g., Claude Pro) exhaust weekly token limits in 15-20 hours of GSD usage. Currently, GSD uses a single model per phase (research/planning/execution/completion), configured statically in preferences. Simple tasks consume the same tokens as complex ones. + +## Current Architecture + +### What Exists +- **Phase-based model config:** Users can set different models per phase via `preferences.md` (research, planning, execution, completion) +- **Fallback chains:** Each phase supports `fallbacks: [model1, model2]` for error recovery +- **Pre-dispatch hooks:** `PreDispatchResult` has a `model` field but it's **never applied** in `auto.ts` — this is a ready-made extension point +- **Model registry:** `ModelRegistry.getAvailable()` provides all configured models with metadata +- **Per-unit metrics:** Token counts (input/output/cacheRead/cacheWrite), cost, and model tracked per unit +- **Budget enforcement:** Real-time cost tracking with alerts at 75%/90%/100% + +### Key Files +| File | Role | +|------|------| +| `src/resources/extensions/gsd/auto.ts` | Dispatch logic, model switching (lines 1791-1879) | +| `src/resources/extensions/gsd/preferences.ts` | Model resolution, `resolveModelWithFallbacksForUnit()` | +| `src/resources/extensions/gsd/post-unit-hooks.ts` | Pre-dispatch hooks (model field defined but unused) | +| `src/resources/extensions/gsd/types.ts` | Type definitions for hooks and model config | +| `src/resources/extensions/gsd/metrics.ts` | Token tracking, aggregation, cost projection | +| `src/resources/extensions/gsd/auto-prompts.ts` | Prompt builders per unit type | +| `packages/pi-coding-agent/src/core/model-registry.ts` | Model availability and metadata | + +## Proposed Design + +### Core Concept: Task Complexity 
Classification + +Before each unit dispatch, classify the task into a complexity tier and route to an appropriate model. This sits between preference resolution and model dispatch — it can **downgrade** but never **upgrade** beyond the user's configured model. + +### Complexity Tiers + +| Tier | Complexity | Example Tasks | Default Model | +|------|-----------|---------------|---------------| +| **Tier 1 — Light** | Low cognitive load, structured output | File reads, search aggregation, simple summaries, completion/summary units | Haiku / cheapest available | +| **Tier 2 — Standard** | Moderate reasoning, some creativity | Research synthesis, plan formatting, routine code generation, UAT checks | Sonnet / mid-tier | +| **Tier 3 — Heavy** | Complex reasoning, architecture, novel code | Complex execution tasks, replanning, multi-file refactors, debugging | Opus / user's configured model | + +### Classification Signals + +The classifier uses **heuristic signals** available before dispatch (no LLM call needed): + +1. **Unit type** (strongest signal): + - `complete-slice`, `run-uat` → Tier 1 (structured summarization) + - `research-milestone`, `research-slice` → Tier 2 (synthesis) + - `plan-milestone`, `plan-slice` → Tier 2-3 (depends on scope) + - `execute-task` → Tier 2-3 (depends on task complexity) + - `replan-slice` → Tier 3 (requires understanding of failure) + +2. **Task metadata** (for execution units): + - Lines of code estimated to change (from task plan) + - Number of files involved + - Dependency count + - Whether task involves new file creation vs. modification + - Tags/labels if present (e.g., "refactor", "test", "docs") + +3. 
**Historical performance** (adaptive, Phase 3): + - If a Tier 2 model failed and escalated on similar tasks before, default to Tier 3 + - Track success rate per tier per unit-type pattern + +### Architecture + +``` +User Preferences (phase → model) + │ + ▼ +resolveModelWithFallbacksForUnit() ← existing + │ + ▼ +classifyUnitComplexity() ← NEW: returns Tier 1/2/3 + │ + ▼ +resolveModelForTier() ← NEW: maps tier → model from available set + │ + ▼ +maybeDowngradeModel() ← NEW: only downgrades from user's configured model + │ + ▼ +Model dispatch (existing auto.ts logic) +``` + +### Key Design Decisions + +1. **Downgrade-only:** The classifier can select a cheaper model than configured, never a more expensive one. The user's preference is the ceiling. + +2. **Opt-in with easy override:** New preference key `dynamic_routing.enabled: true|false` (default: `false`). Users who want token savings enable it explicitly. + +3. **Escalation on failure:** If a lower-tier model fails (tool errors, incomplete output, exceeds retries), automatically escalate to the next tier and retry the unit. + +4. **No LLM call for classification:** Uses heuristics only — adding an LLM call to save tokens would be counterproductive. + +5. **Respects existing fallback chains:** Dynamic routing integrates with existing `fallbacks` — if the dynamically selected model fails, it tries the fallback chain before escalating tiers. + +6. **Transparent to user:** Dashboard shows which model was selected and why (tier badge in progress widget). + +## Implementation Phases + +### Phase 1: Foundation — Complexity Classifier & Routing (Core) + +**Goal:** Build the classification and routing system, wire it into dispatch. + +#### 1a. 
Define types and configuration + +**File:** `src/resources/extensions/gsd/types.ts` +- Add `ComplexityTier` type: `'light' | 'standard' | 'heavy'` +- Add `DynamicRoutingConfig` interface: + ```typescript + interface DynamicRoutingConfig { + enabled: boolean; + tier_models?: { + light?: string; // model ID for light tasks + standard?: string; // model ID for standard tasks + heavy?: string; // model ID for heavy tasks (default: user's configured model) + }; + escalate_on_failure?: boolean; // default: true + } + ``` + +**File:** `src/resources/extensions/gsd/preferences.ts` +- Add `dynamic_routing` to preference schema +- Add validation for the new config +- Add `loadDynamicRoutingConfig()` function + +#### 1b. Build complexity classifier + +**New file:** `src/resources/extensions/gsd/complexity-classifier.ts` +- `classifyUnitComplexity(unitType, unitId, metadata?)` → `ComplexityTier` +- Heuristic rules: + - Unit type mapping (see Tiers table above) + - Task plan analysis: parse task plan file for file count, estimated scope + - Dependency analysis: tasks with 3+ dependencies → bump to heavy +- Export `getClassificationReason()` for dashboard display + +#### 1c. Build model router + +**New file:** `src/resources/extensions/gsd/model-router.ts` +- `resolveModelForComplexity(tier, phaseConfig, availableModels)` → `ResolvedModelConfig` +- Logic: + 1. Get user's configured model for phase (ceiling) + 2. If `tier_models` configured, use tier-specific model + 3. If not configured, use smart defaults from available models (cheapest for light, mid for standard, configured for heavy) + 4. Validate selected model is available + 5. Return with fallback chain: `[tier_model, ...configured_fallbacks, configured_primary]` + +#### 1d. Wire into dispatch + +**File:** `src/resources/extensions/gsd/auto.ts` +- In the model resolution block (lines 1791-1879): + 1. After `resolveModelWithFallbacksForUnit()`, call classifier + 2. 
If dynamic routing enabled, call router to potentially downgrade + 3. Log tier and model selection to metrics + 4. On unit failure: if using downgraded model, escalate tier and retry + +#### 1e. Wire the unused pre-dispatch hook model field + +**File:** `src/resources/extensions/gsd/auto.ts` +- Apply `preDispatchResult.model` when returned — this is already defined but unused +- Allows hooks to override dynamic routing decisions + +#### Tests + +**New file:** `src/resources/extensions/gsd/tests/complexity-classifier.test.ts` +- Test tier assignment for each unit type +- Test metadata-based adjustments (file count, dependency count) +- Test edge cases (missing metadata, unknown unit types) + +**New file:** `src/resources/extensions/gsd/tests/model-router.test.ts` +- Test downgrade-only behavior (never exceeds configured model) +- Test tier-to-model mapping with various available model sets +- Test fallback chain construction +- Test when dynamic routing is disabled (passthrough) + +**New file:** `src/resources/extensions/gsd/tests/dynamic-routing-integration.test.ts` +- Test full flow: unit → classify → route → dispatch +- Test escalation on failure +- Test preference loading and validation + +--- + +### Phase 2: Observability & Dashboard + +**Goal:** Make routing decisions visible to users. + +#### 2a. Metrics tracking + +**File:** `src/resources/extensions/gsd/metrics.ts` +- Add `tier` field to `UnitMetrics` +- Add `model_downgraded: boolean` field +- Add `escalation_count` field +- Add `aggregateByTier()` function +- Add `formatTierSavings()` — show estimated savings from downgrades + +#### 2b. 
Dashboard integration + +**File:** `src/resources/extensions/gsd/auto-dashboard.ts` +- Add tier badge to unit progress display (e.g., `[L]`, `[S]`, `[H]`) +- Add savings summary to completion stats: "Dynamic routing saved ~$X.XX (N units downgraded)" +- Color-code tier in token widget + +#### Tests +- Test metrics aggregation by tier +- Test savings calculation +- Test dashboard formatting + +--- + +### Phase 3: Adaptive Learning (Future) + +**Goal:** Improve classification accuracy over time based on outcomes. + +#### 3a. Outcome tracking + +**File:** `src/resources/extensions/gsd/complexity-classifier.ts` +- Track success/failure per tier per unit-type pattern +- Store in `.gsd/routing-history.json` (project-level) +- Simple structure: `{ "execute-task:docs": { light: { success: 12, fail: 1 }, ... } }` + +#### 3b. Adaptive thresholds + +- If a tier has >20% failure rate for a pattern, auto-bump default tier +- Decay old data (rolling window of last 50 units) +- User can reset learning: `dynamic_routing_reset: true` in preferences + +#### Tests +- Test learning updates on success/failure +- Test threshold bumping +- Test decay logic +- Test reset behavior + +--- + +### Phase 4: Task Plan Introspection (Future) + +**Goal:** Deeper classification using task plan content analysis. + +- Parse task plan markdown for complexity signals: + - "Create new file" vs. 
"modify existing" + - Number of code blocks in plan + - Presence of keywords: "refactor", "migration", "architecture", "test", "docs", "config" + - Estimated lines of change (if specified) +- Weight these signals alongside unit-type heuristics + +--- + +## Preference Configuration (User-Facing) + +```yaml +--- +version: 1 +models: + research: claude-sonnet-4-6 + planning: claude-opus-4-6 + execution: claude-sonnet-4-6 + completion: claude-sonnet-4-6 +dynamic_routing: + enabled: true + tier_models: + light: claude-haiku-4-5 + standard: claude-sonnet-4-6 + # heavy: inherits from phase config (ceiling) + escalate_on_failure: true +--- +``` + +## Risk Mitigation + +| Risk | Mitigation | +|------|-----------| +| Cheaper model produces low-quality output | Downgrade-only design; escalation on failure; user can disable | +| Classification overhead adds latency | Heuristics-only, no LLM call; <1ms classification time | +| Complex preferences confuse users | Disabled by default; works with zero config if enabled (uses smart defaults) | +| Model not available in user's provider | Validation at preference load; falls back to configured model | +| Escalation loops | Max 1 escalation per unit; after that, use configured model | + +## Estimated Token Savings + +Based on typical GSD session patterns: +- ~30% of units are completion/summary (Tier 1 candidates) +- ~40% are research/standard planning (Tier 2 candidates) +- ~30% are complex execution (Tier 3, no downgrade) + +If Haiku is ~10x cheaper than Opus and Sonnet is ~5x cheaper: +- **Conservative estimate:** 20-30% cost reduction with dynamic routing enabled +- **Aggressive estimate:** 40-50% for projects with many small tasks + +## Resolved Design Decisions + +All four open questions resolved as **yes** — folded into the plan as additional scope: + +### 1. Post-unit hook classification — YES +Hooks get their own complexity classification. Most hooks are lightweight (validation, file checks) and should default to Tier 1. 
The existing `model` field on `PostUnitHookConfig` becomes the ceiling, same as phase models for units. + +**Implementation:** Add to Phase 1d — extend `classifyUnitComplexity()` to accept hook metadata. Wire into hook dispatch at `auto.ts` lines 936-946. + +### 2. Budget-pressure-aware routing — YES +As budget usage increases, the classifier becomes more aggressive about downgrading: +- **<50% budget used:** Normal classification +- **50-75% budget used:** Bump Tier 2 candidates down to Tier 1 where possible +- **75-90% budget used:** Only Tier 3 tasks get the configured model; everything else goes to cheapest available +- **>90% budget used:** Everything except `replan-slice` gets downgraded to cheapest + +**Implementation:** Add to Phase 1b — `classifyUnitComplexity()` takes `budgetPct` parameter from existing `getBudgetAlertLevel()` logic. New function `applyBudgetPressure(tier, budgetPct)` adjusts the tier. + +### 3. Multi-provider cost routing — YES +When multiple providers are configured, the router should consider cost differences. If a user has both Anthropic and OpenRouter, pick the cheapest option for the resolved tier. + +**Implementation:** +- Add `cost_per_1k_tokens` metadata to model registry (or maintain a lookup table for known models) +- New file: `src/resources/extensions/gsd/model-cost-table.ts` — static cost table for known models, updatable via preferences +- `resolveModelForComplexity()` ranks available models by cost within a tier's capability range +- Preference key: `dynamic_routing.cross_provider: true|false` (default: true when enabled) + +**Risk:** Cost data goes stale. Mitigate with a bundled cost table that gets updated with GSD releases + user override capability. + +### 4. User feedback loop — YES +After each unit completes, users can flag the output quality to improve future classification. 
+ +**Implementation (Phase 3 — Adaptive Learning):** +- Post-unit prompt option: user can react with `/gsd:rate-unit [over|under|ok]` + - `over` = "this could have used a simpler model" → records downgrade signal + - `under` = "this needed a better model" → records upgrade signal + - `ok` = confirms current tier was appropriate +- Feedback stored alongside outcome data in `.gsd/routing-history.json` +- Classifier weights feedback signals 2x vs. automatic success/failure detection +- Skill: `gsd:rate-unit` — simple command that tags the last completed unit + +### Updated Preference Configuration + +```yaml +--- +version: 1 +models: + research: claude-sonnet-4-6 + planning: claude-opus-4-6 + execution: claude-sonnet-4-6 + completion: claude-sonnet-4-6 +dynamic_routing: + enabled: true + tier_models: + light: claude-haiku-4-5 + standard: claude-sonnet-4-6 + # heavy: inherits from phase config (ceiling) + escalate_on_failure: true + budget_pressure: true # more aggressive downgrading as budget fills + cross_provider: true # consider cost across providers + hooks: true # classify hooks too +--- +``` + +### Updated Phase Summary + +| Phase | Scope | Includes | +|-------|-------|----------| +| **1 — Foundation** | Classifier, router, dispatch, hook classification, budget pressure | Decisions 1 & 2 | +| **2 — Observability** | Dashboard, tier badges, savings tracking, cost table | Decision 3 | +| **3 — Adaptive Learning** | Outcome tracking, user feedback (`/gsd:rate-unit`), adaptive thresholds | Decision 4 | +| **4 — Task Introspection** | Parse task plans for deeper complexity signals | — | diff --git a/src/resources/extensions/gsd/auto-dashboard.ts b/src/resources/extensions/gsd/auto-dashboard.ts index c2d9e41af..c0031ff13 100644 --- a/src/resources/extensions/gsd/auto-dashboard.ts +++ b/src/resources/extensions/gsd/auto-dashboard.ts @@ -10,7 +10,7 @@ import type { ExtensionContext, ExtensionCommandContext } from "@gsd/pi-coding-a import type { GSDState } from 
"./types.js"; import { getCurrentBranch } from "./worktree.js"; import { getActiveHook } from "./post-unit-hooks.js"; -import { getLedger, getProjectTotals, formatCost, formatTokenCount } from "./metrics.js"; +import { getLedger, getProjectTotals, formatCost, formatTokenCount, formatTierSavings } from "./metrics.js"; import { resolveMilestoneFile, resolveSliceFile, @@ -239,6 +239,7 @@ export function updateProgressWidget( unitId: string, state: GSDState, accessors: WidgetStateAccessors, + tierBadge?: string, ): void { if (!ctx.hasUI) return; @@ -319,7 +320,8 @@ export function updateProgressWidget( const target = task ? `${task.id}: ${task.title}` : unitId; const actionLeft = `${pad}${theme.fg("accent", "▸")} ${theme.fg("accent", verb)} ${theme.fg("text", target)}`; - const phaseBadge = theme.fg("dim", phaseLabel); + const tierTag = tierBadge ? theme.fg("dim", `[${tierBadge}] `) : ""; + const phaseBadge = `${tierTag}${theme.fg("dim", phaseLabel)}`; lines.push(rightAlign(actionLeft, phaseBadge, width)); lines.push(""); @@ -414,6 +416,14 @@ export function updateProgressWidget( ? 
`${modelPhase}${theme.fg("dim", modelDisplay)}` : ""; lines.push(rightAlign(`${pad}${sLeft}`, sRight, width)); + + // Dynamic routing savings summary + if (mLedger && mLedger.units.some(u => u.tier)) { + const savings = formatTierSavings(mLedger.units); + if (savings) { + lines.push(truncateToWidth(theme.fg("dim", `${pad}${savings}`), width)); + } + } } const hintParts: string[] = []; diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index cc925871b..fc51a7c19 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -39,9 +39,12 @@ import { readUnitRuntimeRecord, writeUnitRuntimeRecord, } from "./unit-runtime.js"; -import { resolveAutoSupervisorConfig, resolveModelWithFallbacksForUnit, loadEffectiveGSDPreferences, resolveSkillDiscoveryMode } from "./preferences.js"; +import { resolveAutoSupervisorConfig, resolveModelWithFallbacksForUnit, loadEffectiveGSDPreferences, resolveSkillDiscoveryMode, resolveDynamicRoutingConfig } from "./preferences.js"; import { sendDesktopNotification } from "./notifications.js"; import type { GSDPreferences } from "./preferences.js"; +import { classifyUnitComplexity, tierLabel } from "./complexity-classifier.js"; +import { resolveModelForComplexity } from "./model-router.js"; +import { initRoutingHistory, resetRoutingHistory, recordOutcome } from "./routing-history.js"; import { checkPostUnitHooks, getActiveHook, @@ -233,6 +236,9 @@ let autoStartTime: number = 0; let completedUnits: { type: string; id: string; startedAt: number; finishedAt: number }[] = []; let currentUnit: { type: string; id: string; startedAt: number } | null = null; +/** Track dynamic routing decision for the current unit (for metrics) */ +let currentUnitRouting: { tier: string; modelDowngraded: boolean } | null = null; + /** Track current milestone to detect transitions */ let currentMilestoneId: string | null = null; let lastBudgetAlertLevel: BudgetAlertLevel = 0; @@ -504,6 +510,7 @@ 
export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi } resetMetrics(); + resetRoutingHistory(); resetHookState(); if (basePath) clearPersistedHookState(basePath); active = false; @@ -809,6 +816,9 @@ export async function startAuto( // Initialize metrics — loads existing ledger from disk initMetrics(base); + // Initialize routing history for adaptive learning + initRoutingHistory(base); + // Snapshot installed skills so we can detect new ones after research if (resolveSkillDiscoveryMode() !== "off") { snapshotSkills(); @@ -1011,7 +1021,7 @@ export async function handleAgentEnd( const hookStartedAt = Date.now(); if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); } currentUnit = { type: hookUnit.unitType, id: hookUnit.unitId, startedAt: hookStartedAt }; @@ -1227,7 +1237,10 @@ function updateProgressWidget( unitId: string, state: GSDState, ): void { - _updateProgressWidget(ctx, unitType, unitId, state, widgetStateAccessors); + const badge = currentUnitRouting?.tier + ? ({ light: "L", standard: "S", heavy: "H" }[currentUnitRouting.tier] ?? undefined) + : undefined; + _updateProgressWidget(ctx, unitType, unitId, state, widgetStateAccessors, badge); } /** State accessors for the widget — closures over module globals. */ @@ -1395,7 +1408,7 @@ async function dispatchNextUnit( // Save final session before stopping if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? 
undefined); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); } sendDesktopNotification("GSD", "All milestones complete!", "success", "milestone"); @@ -1423,7 +1436,7 @@ async function dispatchNextUnit( if (!mid || !midTitle) { if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); } await stopAuto(ctx, pi); @@ -1438,7 +1451,7 @@ async function dispatchNextUnit( if (state.phase === "complete") { if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); } // Clear completed-units.json for the finished milestone so it doesn't grow unbounded. @@ -1508,7 +1521,7 @@ async function dispatchNextUnit( if (state.phase === "blocked") { if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); } await stopAuto(ctx, pi); @@ -1616,7 +1629,7 @@ async function dispatchNextUnit( if (dispatchResult.action === "stop") { if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? 
undefined); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); } await stopAuto(ctx, pi); @@ -1726,7 +1739,7 @@ async function dispatchNextUnit( if (lifetimeCount > MAX_LIFETIME_DISPATCHES) { if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); } saveActivityLog(ctx, basePath, unitType, unitId); const expected = diagnoseExpectedArtifact(unitType, unitId, basePath); @@ -1740,7 +1753,7 @@ async function dispatchNextUnit( if (prevCount >= MAX_UNIT_DISPATCHES) { if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); } saveActivityLog(ctx, basePath, unitType, unitId); @@ -1898,9 +1911,19 @@ async function dispatchNextUnit( // The session still holds the previous unit's data (newSession hasn't fired yet). if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); + // Record routing outcome for adaptive learning + if (currentUnitRouting) { + const isRetry = currentUnit.type === unitType && currentUnit.id === unitId; + recordOutcome( + currentUnit.type, + currentUnitRouting.tier as "light" | "standard" | "heavy", + !isRetry, // success = not being retried + ); + } + // Only mark the previous unit as completed if: // 1. We're not about to re-dispatch the same unit (retry scenario) // 2. 
The expected artifact actually exists on disk @@ -2003,7 +2026,54 @@ async function dispatchNextUnit( const modelConfig = resolveModelWithFallbacksForUnit(unitType); if (modelConfig) { const availableModels = ctx.modelRegistry.getAvailable(); - const modelsToTry = [modelConfig.primary, ...modelConfig.fallbacks]; + + // ─── Dynamic Model Routing ───────────────────────────────────────── + // If enabled, classify unit complexity and potentially downgrade to a + // cheaper model. The user's configured model is the ceiling. + const routingConfig = resolveDynamicRoutingConfig(); + let effectiveModelConfig = modelConfig; + let routingTierLabel = ""; + currentUnitRouting = null; + + if (routingConfig.enabled) { + // Compute budget pressure if budget ceiling is set + let budgetPct: number | undefined; + if (routingConfig.budget_pressure !== false) { + const budgetCeiling = prefs?.budget_ceiling; + if (budgetCeiling !== undefined && budgetCeiling > 0) { + const currentLedger = getLedger(); + const totalCost = currentLedger ? 
getProjectTotals(currentLedger.units).cost : 0; + budgetPct = totalCost / budgetCeiling; + } + } + + // Classify complexity (hook routing controlled by config.hooks) + const isHook = unitType.startsWith("hook/"); + const shouldClassify = !isHook || routingConfig.hooks !== false; + + if (shouldClassify) { + const classification = classifyUnitComplexity(unitType, unitId, basePath, budgetPct); + const availableModelIds = availableModels.map(m => m.id); + const routing = resolveModelForComplexity(classification, modelConfig, routingConfig, availableModelIds); + + if (routing.wasDowngraded) { + effectiveModelConfig = { + primary: routing.modelId, + fallbacks: routing.fallbacks, + }; + if (verbose) { + ctx.ui.notify( + `Dynamic routing [${tierLabel(classification.tier)}]: ${routing.modelId} (${classification.reason})`, + "info", + ); + } + } + routingTierLabel = ` [${tierLabel(classification.tier)}]`; + currentUnitRouting = { tier: classification.tier, modelDowngraded: routing.wasDowngraded }; + } + } + + const modelsToTry = [effectiveModelConfig.primary, ...effectiveModelConfig.fallbacks]; let modelSet = false; for (const modelId of modelsToTry) { @@ -2068,11 +2138,11 @@ async function dispatchNextUnit( const ok = await pi.setModel(model, { persist: false }); if (ok) { - const fallbackNote = modelId === modelConfig.primary + const fallbackNote = modelId === effectiveModelConfig.primary ? "" - : ` (fallback from ${modelConfig.primary})`; + : ` (fallback from ${effectiveModelConfig.primary})`; const phase = unitPhaseLabel(unitType); - ctx.ui.notify(`Model [${phase}]: ${model.provider}/${model.id}${fallbackNote}`, "info"); + ctx.ui.notify(`Model [${phase}]${routingTierLabel}: ${model.provider}/${model.id}${fallbackNote}`, "info"); modelSet = true; break; } else { @@ -2151,7 +2221,7 @@ async function dispatchNextUnit( if (currentUnit) { const modelId = ctx.model?.id ?? 
"unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); } saveActivityLog(ctx, basePath, unitType, unitId); @@ -2177,7 +2247,7 @@ async function dispatchNextUnit( timeoutAt: Date.now(), }); const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId, currentUnitRouting ?? undefined); } saveActivityLog(ctx, basePath, unitType, unitId); diff --git a/src/resources/extensions/gsd/complexity-classifier.ts b/src/resources/extensions/gsd/complexity-classifier.ts new file mode 100644 index 000000000..03ca0049e --- /dev/null +++ b/src/resources/extensions/gsd/complexity-classifier.ts @@ -0,0 +1,322 @@ +// GSD Extension — Complexity Classifier +// Classifies unit complexity for dynamic model routing. +// Pure heuristics + adaptive learning — no LLM calls. Sub-millisecond classification. 
+ +import { existsSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import { gsdRoot } from "./paths.js"; +import { getAdaptiveTierAdjustment } from "./routing-history.js"; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +export type ComplexityTier = "light" | "standard" | "heavy"; + +export interface ClassificationResult { + tier: ComplexityTier; + reason: string; + downgraded: boolean; // true if budget pressure lowered the tier +} + +export interface TaskMetadata { + fileCount?: number; + dependencyCount?: number; + isNewFile?: boolean; + tags?: string[]; + estimatedLines?: number; + codeBlockCount?: number; // number of fenced code blocks in plan + complexityKeywords?: string[]; // detected complexity signals +} + +// ─── Unit Type → Default Tier Mapping ──────────────────────────────────────── + +const UNIT_TYPE_TIERS: Record = { + // Tier 1 — Light: structured summaries, completion, UAT + "complete-slice": "light", + "run-uat": "light", + + // Tier 2 — Standard: research, routine planning + "research-milestone": "standard", + "research-slice": "standard", + "plan-milestone": "standard", + "plan-slice": "standard", + + // Tier 3 — Heavy: execution, replanning (requires deep reasoning) + "execute-task": "standard", // default standard, upgraded by metadata + "replan-slice": "heavy", + "reassess-roadmap": "heavy", +}; + +// ─── Public API ────────────────────────────────────────────────────────────── + +/** + * Classify unit complexity to determine which model tier to use. + * + * @param unitType The type of unit being dispatched + * @param unitId The unit ID (e.g. 
"M001/S01/T01") + * @param basePath Project base path (for reading task plans) + * @param budgetPct Current budget usage as fraction (0.0-1.0+), or undefined if no budget + * @param metadata Optional pre-parsed task metadata + */ +export function classifyUnitComplexity( + unitType: string, + unitId: string, + basePath: string, + budgetPct?: number, + metadata?: TaskMetadata, +): ClassificationResult { + // Hook units default to light + if (unitType.startsWith("hook/")) { + const result: ClassificationResult = { tier: "light", reason: "hook unit", downgraded: false }; + return applyBudgetPressure(result, budgetPct); + } + + // Start with the default tier for this unit type + let tier = UNIT_TYPE_TIERS[unitType] ?? "standard"; + let reason = `unit type: ${unitType}`; + + // For execute-task, analyze task metadata for complexity signals + if (unitType === "execute-task") { + const taskAnalysis = analyzeTaskComplexity(unitId, basePath, metadata); + tier = taskAnalysis.tier; + reason = taskAnalysis.reason; + } + + // For plan-slice, check if the slice has many tasks (complex planning) + if (unitType === "plan-slice" || unitType === "plan-milestone") { + const planAnalysis = analyzePlanComplexity(unitId, basePath); + if (planAnalysis) { + tier = planAnalysis.tier; + reason = planAnalysis.reason; + } + } + + // Adaptive learning: check if history suggests bumping the tier + const tags = metadata?.tags ?? extractTaskMetadata(unitId, basePath).tags; + const adaptiveAdjustment = getAdaptiveTierAdjustment(unitType, tier, tags); + if (adaptiveAdjustment && tierOrdinal(adaptiveAdjustment) > tierOrdinal(tier)) { + reason = `${reason} (adaptive: high failure rate at ${tier})`; + tier = adaptiveAdjustment; + } + + const result: ClassificationResult = { tier, reason, downgraded: false }; + return applyBudgetPressure(result, budgetPct); +} + +/** + * Get a short label for the tier (for dashboard display). 
+ */ +export function tierLabel(tier: ComplexityTier): string { + switch (tier) { + case "light": return "L"; + case "standard": return "S"; + case "heavy": return "H"; + } +} + +/** + * Get the tier ordering value (for comparison). + */ +export function tierOrdinal(tier: ComplexityTier): number { + switch (tier) { + case "light": return 0; + case "standard": return 1; + case "heavy": return 2; + } +} + +// ─── Task Complexity Analysis ──────────────────────────────────────────────── + +interface TaskAnalysis { + tier: ComplexityTier; + reason: string; +} + +function analyzeTaskComplexity( + unitId: string, + basePath: string, + metadata?: TaskMetadata, +): TaskAnalysis { + // Try to read task plan for complexity signals + const meta = metadata ?? extractTaskMetadata(unitId, basePath); + + // Heavy signals + if (meta.dependencyCount && meta.dependencyCount >= 3) { + return { tier: "heavy", reason: `${meta.dependencyCount} dependencies` }; + } + if (meta.fileCount && meta.fileCount >= 6) { + return { tier: "heavy", reason: `${meta.fileCount} files to modify` }; + } + if (meta.estimatedLines && meta.estimatedLines >= 500) { + return { tier: "heavy", reason: `~${meta.estimatedLines} lines estimated` }; + } + + // Heavy signals from complexity keywords (Phase 4) + if (meta.complexityKeywords && meta.complexityKeywords.length >= 2) { + return { tier: "heavy", reason: `complex: ${meta.complexityKeywords.join(", ")}` }; + } + if (meta.codeBlockCount && meta.codeBlockCount >= 5) { + return { tier: "heavy", reason: `${meta.codeBlockCount} code blocks in plan` }; + } + + // Standard signals from single complexity keyword + if (meta.complexityKeywords && meta.complexityKeywords.length === 1) { + return { tier: "standard", reason: `${meta.complexityKeywords[0]} task` }; + } + + // Light signals (simple tasks) + if (meta.tags?.some(t => /^(docs?|readme|comment|config|typo|rename)$/i.test(t))) { + return { tier: "light", reason: `simple task: ${meta.tags.join(", ")}` }; + } + if 
(meta.fileCount !== undefined && meta.fileCount <= 1 && !meta.isNewFile) { + return { tier: "light", reason: "single file modification" }; + } + + // Standard by default + return { tier: "standard", reason: "standard execution task" }; +} + +function analyzePlanComplexity( + unitId: string, + basePath: string, +): TaskAnalysis | null { + // Check if this is a milestone-level plan (more complex) vs single slice + const parts = unitId.split("/"); + if (parts.length === 1) { + // Milestone-level planning is always at least standard + return { tier: "standard", reason: "milestone-level planning" }; + } + + // For slice planning, try to read the context/research to gauge complexity + // If research exists and is large, bump to heavy + const [mid, sid] = parts; + const researchPath = join(gsdRoot(basePath), mid, "slices", sid, "RESEARCH.md"); + try { + if (existsSync(researchPath)) { + const content = readFileSync(researchPath, "utf-8"); + const lineCount = content.split("\n").length; + if (lineCount > 200) { + return { tier: "heavy", reason: `complex slice: ${lineCount}-line research` }; + } + } + } catch { + // Non-fatal + } + + return null; // Use default tier +} + +/** + * Extract task metadata from the task plan file on disk. 
+ */ +function extractTaskMetadata(unitId: string, basePath: string): TaskMetadata { + const meta: TaskMetadata = {}; + const parts = unitId.split("/"); + if (parts.length !== 3) return meta; + + const [mid, sid, tid] = parts; + const taskPlanPath = join(gsdRoot(basePath), mid, "slices", sid, "tasks", `${tid}-PLAN.md`); + + try { + if (!existsSync(taskPlanPath)) return meta; + const content = readFileSync(taskPlanPath, "utf-8"); + const lines = content.split("\n"); + + // Count files mentioned in "Files:" or "- Files:" lines + const fileLines = lines.filter(l => /^\s*-?\s*files?\s*:/i.test(l)); + if (fileLines.length > 0) { + // Count comma-separated or bullet-pointed files + const allFiles = new Set(); + for (const line of fileLines) { + const filesStr = line.replace(/^\s*-?\s*files?\s*:\s*/i, ""); + const files = filesStr.split(/[,;]/).map(f => f.trim()).filter(Boolean); + files.forEach(f => allFiles.add(f)); + } + meta.fileCount = allFiles.size; + } + + // Check for "new file" or "create" keywords + meta.isNewFile = lines.some(l => /\b(create|new file|scaffold|bootstrap)\b/i.test(l)); + + // Look for tags/labels in frontmatter or content + const tags: string[] = []; + if (content.match(/\b(refactor|migration|architect)/i)) tags.push("refactor"); + if (content.match(/\b(test|spec|coverage)\b/i)) tags.push("test"); + if (content.match(/\b(doc|readme|comment|jsdoc)\b/i)) tags.push("docs"); + if (content.match(/\b(config|env|setting)\b/i)) tags.push("config"); + if (content.match(/\b(rename|typo|spelling)\b/i)) tags.push("rename"); + meta.tags = tags; + + // Try to extract estimated lines from content + const estimateMatch = content.match(/~?\s*(\d+)\s*lines?\b/i); + if (estimateMatch) { + meta.estimatedLines = parseInt(estimateMatch[1], 10); + } + + // Phase 4: Deeper introspection signals + + // Count fenced code blocks (```) — more code blocks = more complex implementation + const codeBlockMatches = content.match(/^```/gm); + meta.codeBlockCount = 
codeBlockMatches ? Math.floor(codeBlockMatches.length / 2) : 0; + + // Detect complexity keywords that suggest harder tasks + const complexityKeywords: string[] = []; + if (content.match(/\b(migration|migrate|schema change)\b/i)) complexityKeywords.push("migration"); + if (content.match(/\b(architect|design pattern|system design)\b/i)) complexityKeywords.push("architecture"); + if (content.match(/\b(security|auth|encrypt|credential|vulnerability)\b/i)) complexityKeywords.push("security"); + if (content.match(/\b(performance|optimize|cache|index)\b/i)) complexityKeywords.push("performance"); + if (content.match(/\b(concurrent|parallel|race condition|mutex|lock)\b/i)) complexityKeywords.push("concurrency"); + if (content.match(/\b(backward.?compat|breaking change|deprecat)\b/i)) complexityKeywords.push("compatibility"); + meta.complexityKeywords = complexityKeywords; + } catch { + // Non-fatal — metadata extraction is best-effort + } + + return meta; +} + +// ─── Budget Pressure ───────────────────────────────────────────────────────── + +/** + * Apply budget pressure to a classification result. + * As budget usage increases, more aggressively downgrade tiers. 
+ * + * - <50%: Normal classification (no change) + * - 50-75%: Tier 2 → Tier 1 where possible + * - 75-90%: Only heavy tasks keep configured model + * - >90%: Heavy → standard; everything else → light + */ +function applyBudgetPressure( + result: ClassificationResult, + budgetPct?: number, +): ClassificationResult { + if (budgetPct === undefined || budgetPct < 0.5) return result; + + const original = result.tier; + + if (budgetPct >= 0.9) { + // >90%: almost everything goes to light + if (result.tier !== "heavy") { + result.tier = "light"; + } else { + // Even heavy gets downgraded to standard + result.tier = "standard"; + } + } else if (budgetPct >= 0.75) { + // 75-90%: only heavy stays, everything else goes to light + if (result.tier === "standard") { + result.tier = "light"; + } + } else { + // 50-75%: standard → light + if (result.tier === "standard") { + result.tier = "light"; + } + } + + if (result.tier !== original) { + result.downgraded = true; + result.reason = `${result.reason} (budget pressure: ${Math.round(budgetPct * 100)}%)`; + } + + return result; +} diff --git a/src/resources/extensions/gsd/metrics.ts b/src/resources/extensions/gsd/metrics.ts index c1a465ba4..a09de9b91 100644 --- a/src/resources/extensions/gsd/metrics.ts +++ b/src/resources/extensions/gsd/metrics.ts @@ -39,6 +39,8 @@ export interface UnitMetrics { toolCalls: number; assistantMessages: number; userMessages: number; + tier?: string; // complexity tier (light/standard/heavy) if dynamic routing active + modelDowngraded?: boolean; // true if dynamic routing used a cheaper model } export interface MetricsLedger { @@ -104,6 +106,7 @@ export function snapshotUnitMetrics( unitId: string, startedAt: number, model: string, + extras?: { tier?: string; modelDowngraded?: boolean }, ): UnitMetrics | null { if (!ledger) return null; @@ -156,6 +159,8 @@ export function snapshotUnitMetrics( toolCalls, assistantMessages, userMessages, + ...(extras?.tier ?
{ tier: extras.tier } : {}), + ...(extras?.modelDowngraded !== undefined ? { modelDowngraded: extras.modelDowngraded } : {}), }; ledger.units.push(unit); @@ -294,6 +299,49 @@ export function getProjectTotals(units: UnitMetrics[]): ProjectTotals { return totals; } +// ─── Tier Aggregation ──────────────────────────────────────────────────────── + +export interface TierAggregate { + tier: string; + units: number; + tokens: TokenCounts; + cost: number; + downgraded: number; // units that were downgraded by dynamic routing +} + +export function aggregateByTier(units: UnitMetrics[]): TierAggregate[] { + const map = new Map(); + for (const u of units) { + const tier = u.tier ?? "unknown"; + let agg = map.get(tier); + if (!agg) { + agg = { tier, units: 0, tokens: emptyTokens(), cost: 0, downgraded: 0 }; + map.set(tier, agg); + } + agg.units++; + agg.tokens = addTokens(agg.tokens, u.tokens); + agg.cost += u.cost; + if (u.modelDowngraded) agg.downgraded++; + } + const order = ["light", "standard", "heavy", "unknown"]; + return order.map(t => map.get(t)).filter((a): a is TierAggregate => !!a); +} + +/** + * Format a summary of savings from dynamic routing. + * Returns empty string if no units were downgraded. + */ +export function formatTierSavings(units: UnitMetrics[]): string { + const downgraded = units.filter(u => u.modelDowngraded); + if (downgraded.length === 0) return ""; + + const downgradedCost = downgraded.reduce((sum, u) => sum + u.cost, 0); + const totalUnits = units.filter(u => u.tier).length; + const pct = totalUnits > 0 ? 
Math.round((downgraded.length / totalUnits) * 100) : 0; + + return `Dynamic routing: ${downgraded.length}/${totalUnits} units downgraded (${pct}%), cost: ${formatCost(downgradedCost)}`; +} + // ─── Formatting helpers ─────────────────────────────────────────────────────── export function formatCost(cost: number): string { diff --git a/src/resources/extensions/gsd/model-cost-table.ts b/src/resources/extensions/gsd/model-cost-table.ts new file mode 100644 index 000000000..82be7930d --- /dev/null +++ b/src/resources/extensions/gsd/model-cost-table.ts @@ -0,0 +1,65 @@ +// GSD Extension — Model Cost Table +// Static cost reference for known models, used by the dynamic router +// for cross-provider cost comparison. +// +// Costs are approximate per-1K-token rates in USD (input tokens). +// Updated with GSD releases. Users can override via preferences. + +export interface ModelCostEntry { + /** Model ID (bare, without provider prefix) */ + id: string; + /** Approximate cost per 1K input tokens in USD */ + inputPer1k: number; + /** Approximate cost per 1K output tokens in USD */ + outputPer1k: number; + /** Last updated date */ + updatedAt: string; +} + +/** + * Bundled cost table for known models. + * Updated periodically with GSD releases. 
+ */ +export const BUNDLED_COST_TABLE: ModelCostEntry[] = [ + // Anthropic + { id: "claude-opus-4-6", inputPer1k: 0.015, outputPer1k: 0.075, updatedAt: "2025-03-15" }, + { id: "claude-sonnet-4-6", inputPer1k: 0.003, outputPer1k: 0.015, updatedAt: "2025-03-15" }, + { id: "claude-haiku-4-5", inputPer1k: 0.0008, outputPer1k: 0.004, updatedAt: "2025-03-15" }, + { id: "claude-sonnet-4-5-20250514", inputPer1k: 0.003, outputPer1k: 0.015, updatedAt: "2025-03-15" }, + { id: "claude-3-5-sonnet-latest", inputPer1k: 0.003, outputPer1k: 0.015, updatedAt: "2025-03-15" }, + { id: "claude-3-5-haiku-latest", inputPer1k: 0.0008, outputPer1k: 0.004, updatedAt: "2025-03-15" }, + { id: "claude-3-opus-latest", inputPer1k: 0.015, outputPer1k: 0.075, updatedAt: "2025-03-15" }, + + // OpenAI + { id: "gpt-4o", inputPer1k: 0.0025, outputPer1k: 0.01, updatedAt: "2025-03-15" }, + { id: "gpt-4o-mini", inputPer1k: 0.00015, outputPer1k: 0.0006, updatedAt: "2025-03-15" }, + { id: "o1", inputPer1k: 0.015, outputPer1k: 0.06, updatedAt: "2025-03-15" }, + { id: "o3", inputPer1k: 0.015, outputPer1k: 0.06, updatedAt: "2025-03-15" }, + { id: "gpt-4-turbo", inputPer1k: 0.01, outputPer1k: 0.03, updatedAt: "2025-03-15" }, + + // Google + { id: "gemini-2.0-flash", inputPer1k: 0.0001, outputPer1k: 0.0004, updatedAt: "2025-03-15" }, + { id: "gemini-flash-2.0", inputPer1k: 0.0001, outputPer1k: 0.0004, updatedAt: "2025-03-15" }, + { id: "gemini-2.5-pro", inputPer1k: 0.00125, outputPer1k: 0.005, updatedAt: "2025-03-15" }, + + // DeepSeek + { id: "deepseek-chat", inputPer1k: 0.00014, outputPer1k: 0.00028, updatedAt: "2025-03-15" }, +]; + +/** + * Lookup cost for a model ID. Returns undefined if not found. + */ +export function lookupModelCost(modelId: string): ModelCostEntry | undefined { + const bareId = modelId.includes("/") ? modelId.split("/").pop()! : modelId; + return BUNDLED_COST_TABLE.find(e => e.id === bareId) + ?? 
BUNDLED_COST_TABLE.find(e => bareId.includes(e.id) || e.id.includes(bareId)); +} + +/** + * Compare two models by input cost. Returns negative if a is cheaper. + */ +export function compareModelCost(modelIdA: string, modelIdB: string): number { + const costA = lookupModelCost(modelIdA)?.inputPer1k ?? 999; + const costB = lookupModelCost(modelIdB)?.inputPer1k ?? 999; + return costA - costB; +} diff --git a/src/resources/extensions/gsd/model-router.ts b/src/resources/extensions/gsd/model-router.ts new file mode 100644 index 000000000..fd76d53ca --- /dev/null +++ b/src/resources/extensions/gsd/model-router.ts @@ -0,0 +1,256 @@ +// GSD Extension — Dynamic Model Router +// Maps complexity tiers to models, enforcing downgrade-only semantics. +// The user's configured model is always the ceiling. + +import type { ComplexityTier, ClassificationResult } from "./complexity-classifier.js"; +import { tierOrdinal } from "./complexity-classifier.js"; +import type { ResolvedModelConfig } from "./preferences.js"; + +// ─── Types ─────────────────────────────────────────────────────────────────── + +export interface DynamicRoutingConfig { + enabled?: boolean; + tier_models?: { + light?: string; + standard?: string; + heavy?: string; + }; + escalate_on_failure?: boolean; // default: true + budget_pressure?: boolean; // default: true + cross_provider?: boolean; // default: true + hooks?: boolean; // default: true +} + +export interface RoutingDecision { + /** The model ID to use (may be downgraded from configured) */ + modelId: string; + /** Models to try after modelId. When downgraded: [...configured_fallbacks, configured_primary] with the selected model removed; otherwise the configured fallbacks unchanged */ + fallbacks: string[]; + /** The complexity tier that drove this decision */ + tier: ComplexityTier; + /** True if the model was downgraded from the configured primary */ + wasDowngraded: boolean; + /** Human-readable reason for this decision */ + reason: string; +} + +// ─── Known Model Tiers ─────────────────────────────────────────────────────── +// Maps known
model IDs to their capability tier. Used when tier_models is not +// explicitly configured to pick the best available model for each tier. + +const MODEL_CAPABILITY_TIER: Record = { + // Light-tier models (cheapest) + "claude-haiku-4-5": "light", + "claude-3-5-haiku-latest": "light", + "claude-3-haiku-20240307": "light", + "gpt-4o-mini": "light", + "gemini-2.0-flash": "light", + "gemini-flash-2.0": "light", + + // Standard-tier models + "claude-sonnet-4-6": "standard", + "claude-sonnet-4-5-20250514": "standard", + "claude-3-5-sonnet-latest": "standard", + "gpt-4o": "standard", + "gemini-2.5-pro": "standard", + "deepseek-chat": "standard", + + // Heavy-tier models (most capable) + "claude-opus-4-6": "heavy", + "claude-3-opus-latest": "heavy", + "gpt-4-turbo": "heavy", + "o1": "heavy", + "o3": "heavy", +}; + +// ─── Cost Table (per 1K input tokens, approximate USD) ─────────────────────── +// Used for cross-provider cost comparison when multiple providers offer +// the same capability tier. + +const MODEL_COST_PER_1K_INPUT: Record = { + "claude-haiku-4-5": 0.0008, + "claude-3-5-haiku-latest": 0.0008, + "claude-sonnet-4-6": 0.003, + "claude-sonnet-4-5-20250514": 0.003, + "claude-opus-4-6": 0.015, + "gpt-4o-mini": 0.00015, + "gpt-4o": 0.0025, + "gemini-2.0-flash": 0.0001, + "gemini-2.5-pro": 0.00125, + "deepseek-chat": 0.00014, +}; + +// ─── Public API ────────────────────────────────────────────────────────────── + +/** + * Resolve the model to use for a given complexity tier. + * + * Downgrade-only: the returned model is always equal to or cheaper than + * the user's configured primary model. Never upgrades beyond configuration. 
+ * + * @param classification The complexity classification result + * @param phaseConfig The user's configured model for this phase (ceiling) + * @param routingConfig Dynamic routing configuration + * @param availableModelIds List of available model IDs (from registry) + */ +export function resolveModelForComplexity( + classification: ClassificationResult, + phaseConfig: ResolvedModelConfig | undefined, + routingConfig: DynamicRoutingConfig, + availableModelIds: string[], +): RoutingDecision { + // If no phase config or routing disabled, pass through + if (!phaseConfig || !routingConfig.enabled) { + return { + modelId: phaseConfig?.primary ?? "", + fallbacks: phaseConfig?.fallbacks ?? [], + tier: classification.tier, + wasDowngraded: false, + reason: "dynamic routing disabled or no phase config", + }; + } + + const configuredPrimary = phaseConfig.primary; + const configuredTier = getModelTier(configuredPrimary); + const requestedTier = classification.tier; + + // Downgrade-only: if requested tier >= configured tier, no change + if (tierOrdinal(requestedTier) >= tierOrdinal(configuredTier)) { + return { + modelId: configuredPrimary, + fallbacks: phaseConfig.fallbacks, + tier: requestedTier, + wasDowngraded: false, + reason: `tier ${requestedTier} >= configured ${configuredTier}`, + }; + } + + // Find the best model for the requested tier + const targetModelId = findModelForTier( + requestedTier, + routingConfig, + availableModelIds, + routingConfig.cross_provider !== false, + ); + + if (!targetModelId) { + // No suitable model found — use configured primary + return { + modelId: configuredPrimary, + fallbacks: phaseConfig.fallbacks, + tier: requestedTier, + wasDowngraded: false, + reason: `no ${requestedTier}-tier model available`, + }; + } + + // Build fallback chain: [downgraded_model, ...configured_fallbacks, configured_primary] + const fallbacks = [ + ...phaseConfig.fallbacks.filter(f => f !== targetModelId), + configuredPrimary, + ].filter(f => f !== 
targetModelId); + + return { + modelId: targetModelId, + fallbacks, + tier: requestedTier, + wasDowngraded: true, + reason: classification.reason, + }; +} + +/** + * Escalate to the next tier after a failure. + * Returns the new tier, or null if already at heavy (max). + */ +export function escalateTier(currentTier: ComplexityTier): ComplexityTier | null { + switch (currentTier) { + case "light": return "standard"; + case "standard": return "heavy"; + case "heavy": return null; + } +} + +/** + * Get the default routing config: routing itself is off (enabled: false) until opted in; every sub-feature defaults to on. + */ +export function defaultRoutingConfig(): DynamicRoutingConfig { + return { + enabled: false, + escalate_on_failure: true, + budget_pressure: true, + cross_provider: true, + hooks: true, + }; +} + +// ─── Internal ──────────────────────────────────────────────────────────────── + +function getModelTier(modelId: string): ComplexityTier { + // Strip provider prefix if present + const bareId = modelId.includes("/") ? modelId.split("/").pop()! : modelId; + + // Check exact match first + if (MODEL_CAPABILITY_TIER[bareId]) return MODEL_CAPABILITY_TIER[bareId]; + + // Otherwise accept a substring match in either direction; the first table entry that matches wins + for (const [knownId, tier] of Object.entries(MODEL_CAPABILITY_TIER)) { + if (bareId.includes(knownId) || knownId.includes(bareId)) return tier; + } + + // Unknown models are assumed heavy (safest assumption) + return "heavy"; +} + +function findModelForTier( + tier: ComplexityTier, + config: DynamicRoutingConfig, + availableModelIds: string[], + crossProvider: boolean, +): string | null { + // 1. Check explicit tier_models config + const explicitModel = config.tier_models?.[tier]; + if (explicitModel && availableModelIds.includes(explicitModel)) { + return explicitModel; + } + // Also check with provider prefix stripped + if (explicitModel) { + const match = availableModelIds.find(id => { + const bareAvail = id.includes("/") ? id.split("/").pop()! : id; + const bareExplicit = explicitModel.includes("/") ?
explicitModel.split("/").pop()! : explicitModel; + return bareAvail === bareExplicit; + }); + if (match) return match; + } + + // 2. Auto-detect: find the cheapest available model in the requested tier + const candidates = availableModelIds + .filter(id => { + const modelTier = getModelTier(id); + return modelTier === tier; + }) + .sort((a, b) => { + if (!crossProvider) return 0; + const costA = getModelCost(a); + const costB = getModelCost(b); + return costA - costB; + }); + + return candidates[0] ?? null; +} + +function getModelCost(modelId: string): number { + const bareId = modelId.includes("/") ? modelId.split("/").pop()! : modelId; + + if (MODEL_COST_PER_1K_INPUT[bareId] !== undefined) { + return MODEL_COST_PER_1K_INPUT[bareId]; + } + + // Check partial matches + for (const [knownId, cost] of Object.entries(MODEL_COST_PER_1K_INPUT)) { + if (bareId.includes(knownId) || knownId.includes(bareId)) return cost; + } + + // Unknown cost — assume expensive to avoid routing to unknown cheap models + return 999; +} diff --git a/src/resources/extensions/gsd/preferences.ts b/src/resources/extensions/gsd/preferences.ts index 06227bc95..04fc534a5 100644 --- a/src/resources/extensions/gsd/preferences.ts +++ b/src/resources/extensions/gsd/preferences.ts @@ -4,6 +4,8 @@ import { isAbsolute, join } from "node:path"; import { getAgentDir } from "@gsd/pi-coding-agent"; import type { GitPreferences } from "./git-service.js"; import type { PostUnitHookConfig, PreDispatchHookConfig, BudgetEnforcementMode, NotificationPreferences, TokenProfile, InlineLevel, PhaseSkipPreferences } from "./types.js"; +import type { DynamicRoutingConfig } from "./model-router.js"; +import { defaultRoutingConfig } from "./model-router.js"; import { VALID_BRANCH_NAME } from "./git-service.js"; const GLOBAL_PREFERENCES_PATH = join(homedir(), ".gsd", "preferences.md"); @@ -36,6 +38,7 @@ const KNOWN_PREFERENCE_KEYS = new Set([ "git", "post_unit_hooks", "pre_dispatch_hooks", + "dynamic_routing", 
"token_profile", "phases", ]); @@ -128,6 +131,7 @@ export interface GSDPreferences { git?: GitPreferences; post_unit_hooks?: PostUnitHookConfig[]; pre_dispatch_hooks?: PreDispatchHookConfig[]; + dynamic_routing?: DynamicRoutingConfig; token_profile?: TokenProfile; phases?: PhaseSkipPreferences; } @@ -674,6 +678,20 @@ export function resolveModelWithFallbacksForUnit(unitType: string): ResolvedMode }; } +/** + * Resolve the dynamic routing configuration from effective preferences. + * Returns the merged config with defaults applied. + */ +export function resolveDynamicRoutingConfig(): DynamicRoutingConfig { + const prefs = loadEffectiveGSDPreferences(); + const configured = prefs?.preferences.dynamic_routing; + if (!configured) return defaultRoutingConfig(); + return { + ...defaultRoutingConfig(), + ...configured, + }; +} + export function resolveAutoSupervisorConfig(): AutoSupervisorConfig { const prefs = loadEffectiveGSDPreferences(); const configured = prefs?.preferences.auto_supervisor ?? {}; @@ -780,6 +798,9 @@ function mergePreferences(base: GSDPreferences, override: GSDPreferences): GSDPr : undefined, post_unit_hooks: mergePostUnitHooks(base.post_unit_hooks, override.post_unit_hooks), pre_dispatch_hooks: mergePreDispatchHooks(base.pre_dispatch_hooks, override.pre_dispatch_hooks), + dynamic_routing: (base.dynamic_routing || override.dynamic_routing) + ? { ...(base.dynamic_routing ?? {}), ...(override.dynamic_routing ?? {}) } as DynamicRoutingConfig + : undefined, token_profile: override.token_profile ?? base.token_profile, phases: (base.phases || override.phases) ? { ...(base.phases ?? {}), ...(override.phases ?? 
{}) } @@ -1100,6 +1121,56 @@ export function validatePreferences(preferences: GSDPreferences): { } } + // ─── Dynamic Routing ───────────────────────────────────────────────── + if (preferences.dynamic_routing !== undefined) { + if (typeof preferences.dynamic_routing === "object" && preferences.dynamic_routing !== null) { + const dr = preferences.dynamic_routing as unknown as Record; + const validDr: Partial = {}; + + if (dr.enabled !== undefined) { + if (typeof dr.enabled === "boolean") validDr.enabled = dr.enabled; + else errors.push("dynamic_routing.enabled must be a boolean"); + } + if (dr.escalate_on_failure !== undefined) { + if (typeof dr.escalate_on_failure === "boolean") validDr.escalate_on_failure = dr.escalate_on_failure; + else errors.push("dynamic_routing.escalate_on_failure must be a boolean"); + } + if (dr.budget_pressure !== undefined) { + if (typeof dr.budget_pressure === "boolean") validDr.budget_pressure = dr.budget_pressure; + else errors.push("dynamic_routing.budget_pressure must be a boolean"); + } + if (dr.cross_provider !== undefined) { + if (typeof dr.cross_provider === "boolean") validDr.cross_provider = dr.cross_provider; + else errors.push("dynamic_routing.cross_provider must be a boolean"); + } + if (dr.hooks !== undefined) { + if (typeof dr.hooks === "boolean") validDr.hooks = dr.hooks; + else errors.push("dynamic_routing.hooks must be a boolean"); + } + if (dr.tier_models !== undefined) { + if (typeof dr.tier_models === "object" && dr.tier_models !== null) { + const tm = dr.tier_models as Record; + const validTm: Record = {}; + for (const tier of ["light", "standard", "heavy"]) { + if (tm[tier] !== undefined) { + if (typeof tm[tier] === "string") validTm[tier] = tm[tier] as string; + else errors.push(`dynamic_routing.tier_models.${tier} must be a string`); + } + } + if (Object.keys(validTm).length > 0) validDr.tier_models = validTm as DynamicRoutingConfig["tier_models"]; + } else { + errors.push("dynamic_routing.tier_models must be an 
object"); + } + } + + if (Object.keys(validDr).length > 0) { + validated.dynamic_routing = validDr as unknown as DynamicRoutingConfig; + } + } else { + errors.push("dynamic_routing must be an object"); + } + } + // ─── Git Preferences ─────────────────────────────────────────────────── if (preferences.git && typeof preferences.git === "object") { const git: Record = {}; diff --git a/src/resources/extensions/gsd/tests/complexity-classifier.test.ts b/src/resources/extensions/gsd/tests/complexity-classifier.test.ts new file mode 100644 index 000000000..4c6a39c08 --- /dev/null +++ b/src/resources/extensions/gsd/tests/complexity-classifier.test.ts @@ -0,0 +1,181 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { classifyUnitComplexity, tierLabel, tierOrdinal } from "../complexity-classifier.js"; +import type { ComplexityTier, TaskMetadata } from "../complexity-classifier.js"; + +// ─── tierLabel ─────────────────────────────────────────────────────────────── + +test("tierLabel returns correct short labels", () => { + assert.equal(tierLabel("light"), "L"); + assert.equal(tierLabel("standard"), "S"); + assert.equal(tierLabel("heavy"), "H"); +}); + +// ─── tierOrdinal ───────────────────────────────────────────────────────────── + +test("tierOrdinal returns correct ordering", () => { + assert.ok(tierOrdinal("light") < tierOrdinal("standard")); + assert.ok(tierOrdinal("standard") < tierOrdinal("heavy")); +}); + +// ─── Unit Type Classification ──────────────────────────────────────────────── + +test("complete-slice classifies as light", () => { + const result = classifyUnitComplexity("complete-slice", "M001/S01", "/tmp/fake"); + assert.equal(result.tier, "light"); +}); + +test("run-uat classifies as light", () => { + const result = classifyUnitComplexity("run-uat", "M001/S01", "/tmp/fake"); + assert.equal(result.tier, "light"); +}); + +test("research-milestone classifies as standard", () => { + const result = 
classifyUnitComplexity("research-milestone", "M001", "/tmp/fake"); + assert.equal(result.tier, "standard"); +}); + +test("research-slice classifies as standard", () => { + const result = classifyUnitComplexity("research-slice", "M001/S01", "/tmp/fake"); + assert.equal(result.tier, "standard"); +}); + +test("plan-milestone classifies as standard", () => { + const result = classifyUnitComplexity("plan-milestone", "M001", "/tmp/fake"); + assert.equal(result.tier, "standard"); +}); + +test("plan-slice classifies as standard", () => { + const result = classifyUnitComplexity("plan-slice", "M001/S01", "/tmp/fake"); + assert.equal(result.tier, "standard"); +}); + +test("replan-slice classifies as heavy", () => { + const result = classifyUnitComplexity("replan-slice", "M001/S01", "/tmp/fake"); + assert.equal(result.tier, "heavy"); +}); + +test("reassess-roadmap classifies as heavy", () => { + const result = classifyUnitComplexity("reassess-roadmap", "M001", "/tmp/fake"); + assert.equal(result.tier, "heavy"); +}); + +test("hook units classify as light", () => { + const result = classifyUnitComplexity("hook/verify", "M001/S01/T01", "/tmp/fake"); + assert.equal(result.tier, "light"); + assert.match(result.reason, /hook/); +}); + +test("unknown unit types default to standard", () => { + const result = classifyUnitComplexity("custom-thing", "M001", "/tmp/fake"); + assert.equal(result.tier, "standard"); +}); + +// ─── Task Metadata Classification ──────────────────────────────────────────── + +test("execute-task with many dependencies classifies as heavy", () => { + const metadata: TaskMetadata = { dependencyCount: 4 }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "heavy"); + assert.match(result.reason, /dependencies/); +}); + +test("execute-task with many files classifies as heavy", () => { + const metadata: TaskMetadata = { fileCount: 8 }; + const result = 
classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "heavy"); + assert.match(result.reason, /files/); +}); + +test("execute-task with large estimated lines classifies as heavy", () => { + const metadata: TaskMetadata = { estimatedLines: 600 }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "heavy"); + assert.match(result.reason, /lines/); +}); + +test("execute-task with docs tags classifies as light", () => { + const metadata: TaskMetadata = { tags: ["docs"] }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "light"); +}); + +test("execute-task with single file modification classifies as light", () => { + const metadata: TaskMetadata = { fileCount: 1, isNewFile: false }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "light"); +}); + +test("execute-task with no metadata classifies as standard", () => { + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake"); + assert.equal(result.tier, "standard"); +}); + +// ─── Budget Pressure ───────────────────────────────────────────────────────── + +test("no budget pressure below 50%", () => { + const result = classifyUnitComplexity("research-slice", "M001/S01", "/tmp/fake", 0.3); + assert.equal(result.tier, "standard"); + assert.equal(result.downgraded, false); +}); + +test("budget pressure at 50% downgrades standard to light", () => { + const result = classifyUnitComplexity("research-slice", "M001/S01", "/tmp/fake", 0.55); + assert.equal(result.tier, "light"); + assert.equal(result.downgraded, true); + assert.match(result.reason, /budget pressure/); +}); + +test("budget pressure at 75% keeps heavy as heavy", () => { + const result = classifyUnitComplexity("replan-slice", 
"M001/S01", "/tmp/fake", 0.80); + assert.equal(result.tier, "heavy"); + assert.equal(result.downgraded, false); +}); + +test("budget pressure at 90% downgrades heavy to standard", () => { + const result = classifyUnitComplexity("replan-slice", "M001/S01", "/tmp/fake", 0.95); + assert.equal(result.tier, "standard"); + assert.equal(result.downgraded, true); +}); + +test("budget pressure at 90% downgrades standard to light", () => { + const result = classifyUnitComplexity("research-slice", "M001/S01", "/tmp/fake", 0.95); + assert.equal(result.tier, "light"); + assert.equal(result.downgraded, true); +}); + +test("budget pressure at 90% downgrades light stays light", () => { + const result = classifyUnitComplexity("complete-slice", "M001/S01", "/tmp/fake", 0.95); + assert.equal(result.tier, "light"); +}); + +// ─── Phase 4: Task Plan Introspection ──────────────────────────────────────── + +test("execute-task with multiple complexity keywords classifies as heavy", () => { + const metadata: TaskMetadata = { complexityKeywords: ["migration", "security"] }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "heavy"); + assert.match(result.reason, /migration/); + assert.match(result.reason, /security/); +}); + +test("execute-task with single complexity keyword classifies as standard", () => { + const metadata: TaskMetadata = { complexityKeywords: ["performance"] }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "standard"); + assert.match(result.reason, /performance/); +}); + +test("execute-task with many code blocks classifies as heavy", () => { + const metadata: TaskMetadata = { codeBlockCount: 6 }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "heavy"); + assert.match(result.reason, /code blocks/); +}); + 
+test("execute-task with few code blocks stays standard", () => { + const metadata: TaskMetadata = { codeBlockCount: 2 }; + const result = classifyUnitComplexity("execute-task", "M001/S01/T01", "/tmp/fake", undefined, metadata); + assert.equal(result.tier, "standard"); +}); diff --git a/src/resources/extensions/gsd/tests/model-cost-table.test.ts b/src/resources/extensions/gsd/tests/model-cost-table.test.ts new file mode 100644 index 000000000..98906c083 --- /dev/null +++ b/src/resources/extensions/gsd/tests/model-cost-table.test.ts @@ -0,0 +1,69 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { lookupModelCost, compareModelCost, BUNDLED_COST_TABLE } from "../model-cost-table.js"; + +// ─── lookupModelCost ───────────────────────────────────────────────────────── + +test("lookupModelCost finds exact match", () => { + const entry = lookupModelCost("claude-opus-4-6"); + assert.ok(entry); + assert.equal(entry.id, "claude-opus-4-6"); + assert.ok(entry.inputPer1k > 0); + assert.ok(entry.outputPer1k > 0); +}); + +test("lookupModelCost strips provider prefix", () => { + const entry = lookupModelCost("anthropic/claude-opus-4-6"); + assert.ok(entry); + assert.equal(entry.id, "claude-opus-4-6"); +}); + +test("lookupModelCost returns undefined for unknown model", () => { + const entry = lookupModelCost("totally-unknown-model"); + assert.equal(entry, undefined); +}); + +test("lookupModelCost finds haiku", () => { + const entry = lookupModelCost("claude-haiku-4-5"); + assert.ok(entry); + assert.ok(entry.inputPer1k < 0.001, "haiku should be cheap"); +}); + +// ─── compareModelCost ──────────────────────────────────────────────────────── + +test("haiku is cheaper than opus", () => { + assert.ok(compareModelCost("claude-haiku-4-5", "claude-opus-4-6") < 0); +}); + +test("opus is more expensive than sonnet", () => { + assert.ok(compareModelCost("claude-opus-4-6", "claude-sonnet-4-6") > 0); +}); + +test("same model has equal cost", () => { + 
assert.equal(compareModelCost("claude-opus-4-6", "claude-opus-4-6"), 0); +}); + +// ─── BUNDLED_COST_TABLE ────────────────────────────────────────────────────── + +test("cost table has entries for all major providers", () => { + const ids = BUNDLED_COST_TABLE.map(e => e.id); + // Anthropic + assert.ok(ids.includes("claude-opus-4-6")); + assert.ok(ids.includes("claude-sonnet-4-6")); + assert.ok(ids.includes("claude-haiku-4-5")); + // OpenAI + assert.ok(ids.includes("gpt-4o")); + assert.ok(ids.includes("gpt-4o-mini")); + // Google + assert.ok(ids.includes("gemini-2.0-flash")); +}); + +test("all cost table entries have valid data", () => { + for (const entry of BUNDLED_COST_TABLE) { + assert.ok(entry.id, `entry missing id`); + assert.ok(entry.inputPer1k >= 0, `${entry.id} inputPer1k should be >= 0`); + assert.ok(entry.outputPer1k >= 0, `${entry.id} outputPer1k should be >= 0`); + assert.ok(entry.updatedAt, `${entry.id} missing updatedAt`); + } +}); diff --git a/src/resources/extensions/gsd/tests/model-router.test.ts b/src/resources/extensions/gsd/tests/model-router.test.ts new file mode 100644 index 000000000..c7af7fcca --- /dev/null +++ b/src/resources/extensions/gsd/tests/model-router.test.ts @@ -0,0 +1,167 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { + resolveModelForComplexity, + escalateTier, + defaultRoutingConfig, +} from "../model-router.js"; +import type { DynamicRoutingConfig, RoutingDecision } from "../model-router.js"; +import type { ClassificationResult } from "../complexity-classifier.js"; + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +function makeClassification(tier: "light" | "standard" | "heavy", reason = "test"): ClassificationResult { + return { tier, reason, downgraded: false }; +} + +const AVAILABLE_MODELS = [ + "claude-opus-4-6", + "claude-sonnet-4-6", + "claude-haiku-4-5", + "gpt-4o-mini", +]; + +// ─── Passthrough when disabled 
─────────────────────────────────────────────── + +test("returns configured model when routing is disabled", () => { + const config = { ...defaultRoutingConfig(), enabled: false }; + const result = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + AVAILABLE_MODELS, + ); + assert.equal(result.modelId, "claude-opus-4-6"); + assert.equal(result.wasDowngraded, false); +}); + +test("returns configured model when no phase config", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result = resolveModelForComplexity( + makeClassification("light"), + undefined, + config, + AVAILABLE_MODELS, + ); + assert.equal(result.modelId, ""); + assert.equal(result.wasDowngraded, false); +}); + +// ─── Downgrade-only semantics ──────────────────────────────────────────────── + +test("does not downgrade when tier matches configured model tier", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result = resolveModelForComplexity( + makeClassification("heavy"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + AVAILABLE_MODELS, + ); + assert.equal(result.modelId, "claude-opus-4-6"); + assert.equal(result.wasDowngraded, false); +}); + +test("does not upgrade beyond configured model", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + // Configured model is sonnet (standard), classification says heavy + const result = resolveModelForComplexity( + makeClassification("heavy"), + { primary: "claude-sonnet-4-6", fallbacks: [] }, + config, + AVAILABLE_MODELS, + ); + assert.equal(result.modelId, "claude-sonnet-4-6"); + assert.equal(result.wasDowngraded, false); +}); + +test("downgrades from opus to haiku for light tier", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + 
AVAILABLE_MODELS, + ); + // Should pick haiku or gpt-4o-mini (cheapest light tier) + assert.ok( + result.modelId === "claude-haiku-4-5" || result.modelId === "gpt-4o-mini", + `Expected light-tier model, got ${result.modelId}`, + ); + assert.equal(result.wasDowngraded, true); +}); + +test("downgrades from opus to sonnet for standard tier", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result = resolveModelForComplexity( + makeClassification("standard"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + AVAILABLE_MODELS, + ); + assert.equal(result.modelId, "claude-sonnet-4-6"); + assert.equal(result.wasDowngraded, true); +}); + +// ─── Explicit tier_models ──────────────────────────────────────────────────── + +test("uses explicit tier_models when configured", () => { + const config: DynamicRoutingConfig = { + ...defaultRoutingConfig(), + enabled: true, + tier_models: { light: "gpt-4o-mini", standard: "claude-sonnet-4-6" }, + }; + const result = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + AVAILABLE_MODELS, + ); + assert.equal(result.modelId, "gpt-4o-mini"); + assert.equal(result.wasDowngraded, true); +}); + +// ─── Fallback chain construction ───────────────────────────────────────────── + +test("fallback chain includes configured primary as last resort", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + const result = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: ["claude-sonnet-4-6"] }, + config, + AVAILABLE_MODELS, + ); + assert.ok(result.wasDowngraded); + // Fallbacks should include the configured fallbacks and primary + assert.ok(result.fallbacks.includes("claude-opus-4-6"), "primary should be in fallbacks"); + assert.ok(result.fallbacks.includes("claude-sonnet-4-6"), "configured fallback should be in fallbacks"); +}); + +// ─── Escalation 
────────────────────────────────────────────────────────────── + +test("escalateTier moves light → standard", () => { + assert.equal(escalateTier("light"), "standard"); +}); + +test("escalateTier moves standard → heavy", () => { + assert.equal(escalateTier("standard"), "heavy"); +}); + +test("escalateTier returns null for heavy (max)", () => { + assert.equal(escalateTier("heavy"), null); +}); + +// ─── No suitable model available ───────────────────────────────────────────── + +test("falls back to configured model when no light-tier model available", () => { + const config = { ...defaultRoutingConfig(), enabled: true }; + // Only heavy-tier models available + const result = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + ["claude-opus-4-6"], + ); + assert.equal(result.modelId, "claude-opus-4-6"); + assert.equal(result.wasDowngraded, false); +}); diff --git a/src/resources/extensions/gsd/tests/routing-history.test.ts b/src/resources/extensions/gsd/tests/routing-history.test.ts index f3e09473c..887ad709d 100644 --- a/src/resources/extensions/gsd/tests/routing-history.test.ts +++ b/src/resources/extensions/gsd/tests/routing-history.test.ts @@ -1,87 +1,240 @@ -/** - * Routing History — structural tests for adaptive learning module. - * - * Verifies routing-history.ts exports and structure from #579. - * Uses source-level checks to avoid @gsd/pi-coding-agent import chain. 
- */ - import test from "node:test"; import assert from "node:assert/strict"; -import { readFileSync } from "node:fs"; -import { join, dirname } from "node:path"; -import { fileURLToPath } from "node:url"; +import { mkdirSync, rmSync, writeFileSync, readFileSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; -const __dirname = dirname(fileURLToPath(import.meta.url)); -const historySrc = readFileSync(join(__dirname, "..", "routing-history.ts"), "utf-8"); +import { + initRoutingHistory, + resetRoutingHistory, + recordOutcome, + recordFeedback, + getAdaptiveTierAdjustment, + clearRoutingHistory, + getRoutingHistory, +} from "../routing-history.js"; -// ═══════════════════════════════════════════════════════════════════════════ -// Module Exports -// ═══════════════════════════════════════════════════════════════════════════ +// ─── Test Setup ────────────────────────────────────────────────────────────── -test("routing-history: exports initRoutingHistory", () => { - assert.ok(historySrc.includes("export function initRoutingHistory"), "should export initRoutingHistory"); +function makeTmpDir(): string { + const dir = join(tmpdir(), `gsd-routing-test-${Date.now()}-${Math.random().toString(36).slice(2)}`); + mkdirSync(join(dir, ".gsd"), { recursive: true }); + return dir; +} + +function cleanup(dir: string): void { + try { rmSync(dir, { recursive: true, force: true }); } catch {} + resetRoutingHistory(); +} + +// ─── recordOutcome ─────────────────────────────────────────────────────────── + +test("recordOutcome tracks success and failure counts", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + recordOutcome("execute-task", "standard", true); + recordOutcome("execute-task", "standard", true); + recordOutcome("execute-task", "standard", false); + + const history = getRoutingHistory(); + assert.ok(history); + const pattern = history.patterns["execute-task"]; + assert.ok(pattern); + 
assert.equal(pattern.standard.success, 2); + assert.equal(pattern.standard.fail, 1); + } finally { + cleanup(dir); + } }); -test("routing-history: exports recordOutcome", () => { - assert.ok(historySrc.includes("export function recordOutcome"), "should export recordOutcome"); +test("recordOutcome tracks tag-specific patterns", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + recordOutcome("execute-task", "light", true, ["docs"]); + + const history = getRoutingHistory(); + assert.ok(history); + assert.ok(history.patterns["execute-task:docs"]); + assert.equal(history.patterns["execute-task:docs"].light.success, 1); + } finally { + cleanup(dir); + } }); -test("routing-history: exports recordFeedback", () => { - assert.ok(historySrc.includes("export function recordFeedback"), "should export recordFeedback"); +test("recordOutcome applies rolling window", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + // Record 60 successes — should be capped to 50 + for (let i = 0; i < 60; i++) { + recordOutcome("execute-task", "standard", true); + } + + const history = getRoutingHistory(); + assert.ok(history); + const total = history.patterns["execute-task"].standard.success + + history.patterns["execute-task"].standard.fail; + assert.ok(total <= 50, `total ${total} should be <= 50`); + } finally { + cleanup(dir); + } }); -test("routing-history: exports getAdaptiveTierAdjustment", () => { - assert.ok(historySrc.includes("export function getAdaptiveTierAdjustment"), "should export getAdaptiveTierAdjustment"); +// ─── getAdaptiveTierAdjustment ─────────────────────────────────────────────── + +test("no adjustment when insufficient data", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + recordOutcome("execute-task", "light", false); + // Only 1 data point — not enough + const adj = getAdaptiveTierAdjustment("execute-task", "light"); + assert.equal(adj, null); + } finally { + cleanup(dir); + } }); 
-test("routing-history: exports resetRoutingHistory", () => { - assert.ok(historySrc.includes("export function resetRoutingHistory"), "should export resetRoutingHistory"); +test("bumps tier when failure rate exceeds threshold", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + // Record high failure rate at light tier + recordOutcome("execute-task", "light", false); + recordOutcome("execute-task", "light", false); + recordOutcome("execute-task", "light", true); + // 2/3 = 66% failure rate > 20% threshold + + const adj = getAdaptiveTierAdjustment("execute-task", "light"); + assert.equal(adj, "standard"); + } finally { + cleanup(dir); + } }); -// ═══════════════════════════════════════════════════════════════════════════ -// Design Constants -// ═══════════════════════════════════════════════════════════════════════════ - -test("routing-history: uses rolling window of 50 entries", () => { - assert.ok(historySrc.includes("ROLLING_WINDOW = 50"), "should use 50-entry rolling window"); +test("no adjustment when success rate is high", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + for (let i = 0; i < 10; i++) { + recordOutcome("execute-task", "light", true); + } + const adj = getAdaptiveTierAdjustment("execute-task", "light"); + assert.equal(adj, null); + } finally { + cleanup(dir); + } }); -test("routing-history: failure threshold is 20%", () => { - assert.ok(historySrc.includes("FAILURE_THRESHOLD = 0.20"), "should use 20% failure threshold"); +test("tag-specific patterns take precedence", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + // Base pattern has high success rate (tagged calls also count toward base) + for (let i = 0; i < 15; i++) { + recordOutcome("execute-task", "light", true); + } + // But docs-tagged tasks fail at light + recordOutcome("execute-task", "light", false, ["docs"]); + recordOutcome("execute-task", "light", false, ["docs"]); + recordOutcome("execute-task", "light", true, 
["docs"]); + + // With tags, should bump (docs pattern: 1/3 success = 66% failure) + const adj = getAdaptiveTierAdjustment("execute-task", "light", ["docs"]); + assert.equal(adj, "standard"); + + // Without tags, should not bump (base: 16/18 success = 11% failure) + const adjBase = getAdaptiveTierAdjustment("execute-task", "light"); + assert.equal(adjBase, null); + } finally { + cleanup(dir); + } }); -test("routing-history: feedback weight is 2x", () => { - assert.ok(historySrc.includes("FEEDBACK_WEIGHT = 2"), "feedback should count 2x"); +// ─── recordFeedback ────────────────────────────────────────────────────────── + +test("recordFeedback stores feedback entries", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + recordFeedback("execute-task", "M001/S01/T01", "standard", "over"); + + const history = getRoutingHistory(); + assert.ok(history); + assert.equal(history.feedback.length, 1); + assert.equal(history.feedback[0].rating, "over"); + assert.equal(history.feedback[0].tier, "standard"); + } finally { + cleanup(dir); + } }); -// ═══════════════════════════════════════════════════════════════════════════ -// Type Structure -// ═══════════════════════════════════════════════════════════════════════════ +test("recordFeedback 'under' increases failure count at tier", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + recordFeedback("execute-task", "M001/S01/T01", "light", "under"); -test("routing-history: imports ComplexityTier from types.ts", () => { - assert.ok( - historySrc.includes('from "./types.js"') && historySrc.includes("ComplexityTier"), - "should import ComplexityTier from types.ts", - ); + const history = getRoutingHistory(); + assert.ok(history); + // "under" adds 2 (FEEDBACK_WEIGHT) failures + assert.equal(history.patterns["execute-task"].light.fail, 2); + } finally { + cleanup(dir); + } }); -test("routing-history: defines RoutingHistoryData interface", () => { - assert.ok(historySrc.includes("interface 
RoutingHistoryData"), "should define RoutingHistoryData"); +test("recordFeedback 'over' increases success count at lower tier", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + recordFeedback("execute-task", "M001/S01/T01", "standard", "over"); + + const history = getRoutingHistory(); + assert.ok(history); + // "over" at standard → adds 2 successes at light + assert.equal(history.patterns["execute-task"].light.success, 2); + } finally { + cleanup(dir); + } }); -test("routing-history: defines FeedbackEntry interface", () => { - assert.ok(historySrc.includes("interface FeedbackEntry"), "should define FeedbackEntry"); +// ─── clearRoutingHistory ───────────────────────────────────────────────────── + +test("clearRoutingHistory resets all data", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + recordOutcome("execute-task", "light", true); + clearRoutingHistory(dir); + + const history = getRoutingHistory(); + assert.ok(history); + assert.deepEqual(history.patterns, {}); + assert.deepEqual(history.feedback, []); + } finally { + cleanup(dir); + } }); -// ═══════════════════════════════════════════════════════════════════════════ -// Persistence -// ═══════════════════════════════════════════════════════════════════════════ +// ─── Persistence ───────────────────────────────────────────────────────────── -test("routing-history: persists to routing-history.json", () => { - assert.ok(historySrc.includes("routing-history.json"), "should persist to routing-history.json"); -}); +test("routing history persists to disk and reloads", () => { + const dir = makeTmpDir(); + try { + initRoutingHistory(dir); + recordOutcome("execute-task", "standard", true); + recordOutcome("execute-task", "standard", true); + resetRoutingHistory(); -test("routing-history: has save and load functions", () => { - assert.ok(historySrc.includes("saveHistory") || historySrc.includes("function save"), "should have save"); - 
assert.ok(historySrc.includes("loadHistory") || historySrc.includes("function load"), "should have load"); + // Reload from disk + initRoutingHistory(dir); + const history = getRoutingHistory(); + assert.ok(history); + assert.equal(history.patterns["execute-task"].standard.success, 2); + } finally { + cleanup(dir); + } });