diff --git a/docs/ADR-004-capability-aware-model-routing.md b/docs/ADR-004-capability-aware-model-routing.md index 93d28f862..c2ce3d2d2 100644 --- a/docs/ADR-004-capability-aware-model-routing.md +++ b/docs/ADR-004-capability-aware-model-routing.md @@ -1,8 +1,8 @@ # ADR-004: Capability-Aware Model Routing -**Status:** Proposed (Revised) +**Status:** Implemented (Phase 2) **Date:** 2026-03-26 -**Revised:** 2026-03-26 +**Revised:** 2026-04-03 **Deciders:** Jeremy McSpadden **Related:** ADR-003 (pipeline simplification), [Issue #2655](https://github.com/gsd-build/gsd-2/issues/2655), `docs/dynamic-model-routing.md` diff --git a/docs/configuration.md b/docs/configuration.md index b223f8b7b..00512fa22 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -686,6 +686,7 @@ Complexity-based model routing. See [Dynamic Model Routing](./dynamic-model-rout ```yaml dynamic_routing: enabled: true + capability_routing: true # score models by task capability (v2.59) tier_models: light: claude-haiku-4-5 standard: claude-sonnet-4-6 @@ -695,6 +696,18 @@ dynamic_routing: cross_provider: true ``` +### `context_management` (v2.59) + +Controls observation masking and tool result truncation during auto-mode sessions. Reduces context bloat between compactions with zero LLM overhead. + +```yaml +context_management: + observation_masking: true # replace old tool results with placeholders (default: true) + observation_mask_turns: 8 # keep results from last N user turns (1-50, default: 8) + compaction_threshold_percent: 0.70 # target compaction at 70% context usage (0.5-0.95, default: 0.70) + tool_result_max_chars: 800 # cap individual tool result content (200-10000, default: 800) +``` + ### `service_tier` (v2.42) OpenAI service tier preference for supported models. Toggle with `/gsd fast`. 
diff --git a/docs/dynamic-model-routing.md b/docs/dynamic-model-routing.md index 9d0d5525e..9bbf125fe 100644 --- a/docs/dynamic-model-routing.md +++ b/docs/dynamic-model-routing.md @@ -70,6 +70,36 @@ When approaching the budget ceiling, the router progressively downgrades: When enabled, the router may select models from providers other than your primary. This uses the built-in cost table to find the cheapest model at each tier. Requires the target provider to be configured. +## Capability-Aware Scoring + +*Introduced in v2.59.0 (ADR-004 Phase 2)* + +When `capability_routing` is enabled, the router goes beyond tier classification and scores models against task-specific capability requirements. Each known model has a 7-dimension profile: + +| Dimension | What It Measures | +|-----------|-----------------| +| `coding` | Code generation, refactoring, implementation quality | +| `debugging` | Error diagnosis, fix accuracy | +| `research` | Information gathering, codebase exploration | +| `reasoning` | Multi-step logic, architectural decisions | +| `speed` | Response latency (inverse of cost) | +| `longContext` | Performance with large context windows | +| `instruction` | Adherence to structured instructions and templates | + +Each unit type maps to a weighted requirement vector. For example, `execute-task` weights `coding: 0.9, reasoning: 0.6, debugging: 0.5` while `research-slice` weights `research: 0.9, reasoning: 0.7, longContext: 0.5`. + +For `execute-task` units, the classifier also inspects task metadata (tags, description) to refine requirements. Documentation tasks boost `instruction` and lower `coding`; test tasks boost `debugging`. + +Enable capability routing: + +```yaml +dynamic_routing: + enabled: true + capability_routing: true +``` + +When enabled, models within the target tier are ranked by capability score rather than selected arbitrarily. When disabled (the default), the existing tier-only selection applies. 
+ ## Complexity Classification Units are classified using pure heuristics — no LLM calls, sub-millisecond: diff --git a/docs/pi-context-optimization-opportunities.md b/docs/pi-context-optimization-opportunities.md new file mode 100644 index 000000000..9e34cc44c --- /dev/null +++ b/docs/pi-context-optimization-opportunities.md @@ -0,0 +1,198 @@ +# pi-coding-agent: Context Optimization Opportunities + +> **Status**: Research only — not planned for implementation. +> Scope: `packages/pi-coding-agent` and `packages/pi-agent-core` infrastructure. +> These changes would benefit every consumer of the pi engine, not just GSD. + +--- + +## 1. Prompt Caching (`cache_control`) — Highest Impact + +**Current state**: Every LLM call re-pays full input token cost for the system prompt, tool definitions, and context files. No `cache_control` breakpoints are set anywhere in the API call path. + +**Opportunity**: Anthropic's KV cache delivers 90% cost reduction on cached tokens (0.1x input rate). Claude Code achieves 92–98% cache hit rates by placing stable content before volatile content. + +**Where to instrument** (`packages/pi-ai/src/providers/anthropic.ts`): +- Set `cache_control: { type: "ephemeral" }` on the last tool definition block +- Set `cache_control` after the static system prompt sections (base boilerplate + context files) +- Leave the per-turn user message uncached + +**Critical constraint**: The cache breakpoint must be placed *after* all static content and *before* any dynamic content (timestamps, per-request variables). Moving a timestamp before a cache breakpoint defeats it on every call. + +**Cache hierarchy**: Tools → system → messages. Changing a tool definition invalidates system and message caches. Tool definitions should be sorted deterministically (alphabetically) to prevent spurious cache misses. + +**Expected savings**: 80–90% reduction in input token cost for multi-turn sessions (the dominant cost pattern in GSD auto-mode). + +--- + +## 2. 
Observation Masking in the Message Pipeline + +**Current state**: `agent-loop.ts` passes the full `context.messages` array to the LLM on every turn. Tool results from 50 turns ago are re-read in full on every subsequent call. The `transformContext` hook exists on `AgentContext` and fires before every LLM call, but has no default implementation — extensions are responsible for any pruning. + +**Opportunity**: Replace old tool result content with lightweight placeholders after N turns. JetBrains Research tested this on SWE-bench Verified (500 tasks, up to 250-turn trajectories) and found: +- 50%+ cost reduction vs. unmanaged history +- Performance matched or slightly exceeded LLM summarization +- Zero overhead (no extra LLM call required) + +**Proposed implementation** (default `transformContext` in `pi-agent-core`): +```typescript +// Keep last KEEP_RECENT_TURNS verbatim; mask older tool results +const KEEP_RECENT_TURNS = 8; + +function defaultObservationMask(messages: AgentMessage[]): AgentMessage[] { + const cutoff = findTurnBoundary(messages, KEEP_RECENT_TURNS); + return messages.map((m, i) => { + if (i >= cutoff) return m; + if (m.type === "toolResult" || m.type === "bashExecution") { + return { ...m, content: "[result masked — within summarized history]", excludeFromContext: false }; + } + return m; + }); +} +``` + +**Compaction interaction**: Observation masking reduces the token accumulation rate, pushing the compaction threshold further out. The two mechanisms are complementary — masking handles the steady state, compaction handles the rare deep-session case. + +--- + +## 3. Earlier Compaction Threshold + +**Current state** (`packages/pi-coding-agent/src/core/constants.ts`): +```typescript +COMPACTION_RESERVE_TOKENS = 16_384 // triggers at contextWindow - 16K +COMPACTION_KEEP_RECENT_TOKENS = 20_000 +``` + +For a 200K context window, compaction fires at ~183K tokens — 91.5% utilization. 
+ +**Problem**: Context drift (not raw exhaustion) causes ~65% of enterprise agent failures. Performance degrades measurably beyond ~30K tokens per Zylos production data. The current threshold lets sessions run degraded for a long stretch before compaction fires. + +**Opportunity**: Lower the trigger to 70% utilization. For a 200K window, this means compacting at ~140K tokens — 43K tokens earlier. + +```typescript +// Proposed +COMPACTION_THRESHOLD_PERCENT = 0.70 // fire at 70% of contextWindow +COMPACTION_RESERVE_TOKENS = contextWindow * (1 - COMPACTION_THRESHOLD_PERCENT) +``` + +**Trade-off**: More frequent compactions, each happening earlier when there's more "fresh" content to keep. Summary quality improves because less material needs to be discarded at each cut. + +--- + +## 4. Tool Result Truncation at Write Time + +**Current state**: `TOOL_RESULT_MAX_CHARS = 2_000` in `constants.ts`, but this limit is only applied *during compaction summarization*, not when the tool result enters the message store. A bash result returning 50KB of log output is stored and re-sent verbatim until compaction fires. + +**Opportunity**: Truncate at write time in `messages.ts` → `convertToLlm()` or in the tool result handler. Two strategies: + +- **Hard truncation**: Slice at N chars, append `"\n[truncated — {original_length} chars]"`. Simple, zero overhead. +- **Semantic head/tail**: Keep first 500 chars (context, command echo) + last 1000 chars (final output, errors). Better for bash results where the end contains the error. + +**Recommendation**: Semantic head/tail as the default, configurable per tool type. File read results benefit from head; bash/test output benefits from head+tail. + +--- + +## 5. 
Context File Deduplication and Trim + +**Current state** (`packages/pi-coding-agent/src/core/resource-loader.ts`, lines 84–109): +- Searches from `~/.gsd/agent/` → ancestor dirs → cwd +- Deduplicates by *file path* but not by *content* +- Entire file content concatenated verbatim into system prompt — no trimming, no summarization + +**Anti-pattern**: A project with AGENTS.md at 3 ancestor levels (repo root, workspace, home) injects all three in full. If they share common boilerplate, that content is re-injected multiple times. + +**Opportunities**: +1. **Content deduplication**: Hash paragraph-level chunks; skip any chunk already seen in a previously-loaded file +2. **Section-aware loading**: Parse `## ` headings in AGENTS.md; only include sections relevant to the current task type (e.g., `## Testing` section only when running tests) +3. **Token budget enforcement**: If total context files exceed N tokens, summarize oldest/most-distant file rather than including verbatim + +--- + +## 6. Skill Content Lazy Loading and Summarization + +**Current state**: When `/skill:name` is invoked, the full skill file content is injected inline as `<skill>...</skill>` in the user message. No chunking, no summarization. A 10KB skill file adds ~2,500 tokens to that turn. + +**Opportunity**: +- **Cached skill injection**: If the same skill is used across multiple turns (rare but possible), it's re-injected each time. Cache with `cache_control` after first injection. +- **Skill digest mode**: Inject a 200-token summary of the skill on first reference; full content only if the model requests it via a `get_skill_detail` tool call. Reduces cost for skills that don't end up being followed. +- **Skill prefetching**: Before a known long session (e.g., auto-mode start), pre-inject all likely skills with `cache_control` so they're cached for the entire session. + +--- + +## 7. Token Estimation Accuracy + +**Current state** (`compaction.ts`, line 216): `chars / 4` heuristic. 
This overestimates token count for English prose (~3.5 chars/token) and underestimates for code with short identifiers or Unicode. + +**Opportunity**: Use a proper tokenizer. +- `@anthropic-ai/tokenizer` (tiktoken-compatible, ships with the SDK) — accurate but ~5ms per call +- Tiered approach: use chars/4 for display; use proper tokenizer only for compaction threshold decisions (where accuracy matters) + +**Impact**: More accurate compaction timing, fewer unnecessary compactions, slightly better `COMPACTION_KEEP_RECENT_TOKENS` boundary placement. + +--- + +## 8. Format: Markdown over XML for Internal Context + +**Current state**: The message pipeline uses ``, ``, `` XML wrappers in several places. System prompt sections are largely prose Markdown. + +**Findings**: XML tags carry 15–40% more tokens than equivalent Markdown for the same semantic content, due to paired open/close tags. However, Claude was optimized for XML and shows higher accuracy on tasks requiring precise section parsing. + +**Recommendation**: Audit XML usage in the pipeline and convert to Markdown where the content is: +- Non-nested (flat instructions, status messages) +- Human-readable rather than machine-parsed by the model +- Not requiring precise boundary detection + +Keep XML for: few-shot examples with ambiguous boundaries, skill content (requires precise isolation from surrounding text), compaction summaries that the model must treat as authoritative history. + +**Estimated savings**: 5–15% reduction in system prompt token count. + +--- + +## 9. Dynamic Tool Set Delivery + +**Current state**: All tool definitions are included in every LLM request. Tool descriptions consume 60–80% of input tokens in static configurations. As new extensions register tools, the baseline grows linearly. + +**Opportunity** (higher complexity): Implement the three-function Dynamic Toolset pattern: +1. `search_tools(query)` — semantic search over tool catalog +2. 
`describe_tools(ids[])` — fetch full schemas on demand +3. `execute_tool(id, params)` — unchanged execution + +Speakeasy measured 91–97% token reduction with 100% task success rate. Trade-off: 2–3x more tool calls, ~50% longer wall time. Net cost dramatically lower. + +**Feasibility for pi**: The tool registry (`packages/pi-coding-agent/src/core/tool-registry.ts`) already stores tool metadata separately from definitions. The primary engineering work is the semantic search index and the `describe_tools` / `search_tools` tool implementations. + +--- + +## 10. Cost Attribution and Per-Phase Reporting + +**Current state**: `SessionManager.getUsageTotals()` accumulates cost across the entire session. No per-phase or per-agent breakdown is stored. Cost visibility is limited to the footer total and `GSD_SHOW_TOKEN_COST=1` per-turn display. + +**Opportunity**: Emit structured cost events that extensions can subscribe to: +```typescript +interface CostCheckpointEvent { + type: "cost_checkpoint"; + label: string; // "discuss-phase", "execute-slice-3" + deltaTokens: Usage; // tokens since last checkpoint + cumulativeTokens: Usage; + cumulativeCost: number; +} +``` + +GSD extension could consume these events to surface per-milestone cost in `/gsd stats` and flag milestones that are disproportionately expensive — enabling budget-aware planning. 
+ +--- + +## Implementation Ordering (if pursued) + +| Priority | Item | Effort | Expected Impact | +|----------|------|--------|-----------------| +| 1 | Prompt caching (`cache_control`) | Low | 80–90% input cost reduction | +| 2 | Earlier compaction threshold (70%) | Trivial | Reduces drift in long sessions | +| 3 | Tool result truncation at write time | Low | Reduces context bloat between compactions | +| 4 | Context file deduplication | Medium | Variable — high for multi-level AGENTS.md setups | +| 5 | Observation masking (default `transformContext`) | Medium | 50%+ on long-running agents | +| 6 | Token estimation (proper tokenizer) | Low | Accuracy improvement, minor cost impact | +| 7 | Markdown over XML audit | Low | 5–15% system prompt reduction | +| 8 | Skill caching with `cache_control` | Low | Meaningful for skill-heavy sessions | +| 9 | Dynamic tool set delivery | High | 90%+ on large tool catalogs; major architecture change | +| 10 | Per-phase cost attribution events | Medium | Visibility only; enables future budget routing | diff --git a/docs/token-optimization.md b/docs/token-optimization.md index 5c5ea3466..4a3a423af 100644 --- a/docs/token-optimization.md +++ b/docs/token-optimization.md @@ -262,15 +262,59 @@ PREFERENCES.md ├─ resolveProfileDefaults() → model defaults + phase skip defaults ├─ resolveInlineLevel() → standard │ └─ prompt builders gate context inclusion by level - └─ classifyUnitComplexity() → routes to execution/execution_simple model - ├─ task plan analysis (steps, files, signals) - ├─ unit type defaults - ├─ budget pressure adjustment - └─ adaptive learning from routing-history.json + ├─ classifyUnitComplexity() → routes to execution/execution_simple model + │ ├─ task plan analysis (steps, files, signals) + │ ├─ unit type defaults + │ ├─ budget pressure adjustment + │ ├─ adaptive learning from routing-history.json + │ └─ capability scoring (when capability_routing: true) + │ └─ 7-dimension model profiles × task requirement vectors 
+ └─ context_management + ├─ observation masking (before_provider_request hook) + ├─ tool result truncation (tool_result_max_chars) + └─ phase handoff anchors (injected into prompt builders) ``` The profile is resolved once and flows through the entire dispatch pipeline. Explicit preferences override profile defaults at every layer. +## Observation Masking + +*Introduced in v2.59.0* + +During auto-mode sessions, tool results accumulate in the conversation history and consume context window space. Observation masking replaces tool result content older than N user turns with a lightweight placeholder before each LLM call. This reduces token usage with zero LLM overhead — no summarization calls, no latency. + +Masking is enabled by default during auto-mode. Configure via preferences: + +```yaml +context_management: + observation_masking: true # default: true (set false to disable) + observation_mask_turns: 8 # keep results from last 8 user turns (range: 1-50) + tool_result_max_chars: 800 # truncate individual tool results beyond this length +``` + +### How It Works + +1. Before each provider request, the `before_provider_request` hook inspects the messages array +2. Tool results (`toolResult`, `bashExecution`) older than the configured turn threshold are replaced with `[result masked — within summarized history]` +3. Recent tool results (within the keep window) are preserved in full +4. All assistant and user messages are always preserved — only tool result content is masked + +This pairs with the existing compaction system: masking reduces context pressure between compactions, and compaction handles the full context reset when the window fills. + +### Tool Result Truncation + +Individual tool results that exceed `tool_result_max_chars` (default: 800) are truncated with a `…[truncated]` marker. This prevents a single large tool output from dominating the context window. 
+ +## Phase Handoff Anchors + +*Introduced in v2.59.0* + +When auto-mode transitions between phases (research → planning → execution), structured JSON anchors are written to `.gsd/milestones/<milestone-id>/anchors/<phase>.json`. Downstream prompt builders inject these anchors so the next phase inherits intent, decisions, blockers, and next steps without re-inferring from artifact files. + +This reduces context drift — the 65% of enterprise agent failures caused by agents losing track of prior decisions across phase boundaries. + +Anchors are written automatically after successful completion of `research-milestone`, `research-slice`, `plan-milestone`, and `plan-slice` units. No configuration needed. + +## Prompt Compression + +*Introduced in v2.29.0* diff --git a/src/resources/extensions/gsd/auto-model-selection.ts b/src/resources/extensions/gsd/auto-model-selection.ts index 60cca2663..cf2326e35 100644 --- a/src/resources/extensions/gsd/auto-model-selection.ts +++ b/src/resources/extensions/gsd/auto-model-selection.ts @@ -9,7 +9,7 @@ import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent"; import type { GSDPreferences } from "./preferences.js"; import { resolveModelWithFallbacksForUnit, resolveDynamicRoutingConfig } from "./preferences.js"; import type { ComplexityTier } from "./complexity-classifier.js"; -import { classifyUnitComplexity, tierLabel } from "./complexity-classifier.js"; +import { classifyUnitComplexity, tierLabel, extractTaskMetadata } from "./complexity-classifier.js"; import { resolveModelForComplexity, escalateTier } from "./model-router.js"; import { getLedger, getProjectTotals } from "./metrics.js"; import { unitPhaseLabel } from "./auto-dashboard.js"; @@ -107,7 +107,15 @@ export async function selectAndApplyModel( } } - const routingResult = resolveModelForComplexity(classification, modelConfig, routingConfig, availableModelIds); + // Extract task metadata for capability scoring + const taskMeta = unitType === "execute-task" + ? 
extractTaskMetadata(unitId, basePath) + : undefined; + + const routingResult = resolveModelForComplexity( + classification, modelConfig, routingConfig, availableModelIds, + unitType, taskMeta, + ); if (routingResult.wasDowngraded) { effectiveModelConfig = { @@ -115,8 +123,9 @@ export async function selectAndApplyModel( fallbacks: routingResult.fallbacks, }; if (verbose) { + const method = routingResult.selectionMethod === "capability-scored" ? "capability-scored" : "tier-only"; ctx.ui.notify( - `Dynamic routing [${tierLabel(classification.tier)}]: ${routingResult.modelId} (${classification.reason})`, + `Dynamic routing [${tierLabel(classification.tier)}]: ${routingResult.modelId} (${method} — ${classification.reason})`, "info", ); } diff --git a/src/resources/extensions/gsd/auto-prompts.ts b/src/resources/extensions/gsd/auto-prompts.ts index 5b6e9de5b..33000a526 100644 --- a/src/resources/extensions/gsd/auto-prompts.ts +++ b/src/resources/extensions/gsd/auto-prompts.ts @@ -26,6 +26,7 @@ import { existsSync } from "node:fs"; import { computeBudgets, resolveExecutorContextWindow, truncateAtSectionBoundary } from "./context-budget.js"; import { getPendingGates } from "./gsd-db.js"; import { formatDecisionsCompact, formatRequirementsCompact } from "./structured-data-formatter.js"; +import { readPhaseAnchor, formatAnchorForPrompt } from "./phase-anchor.js"; // ─── Preamble Cap ───────────────────────────────────────────────────────────── @@ -906,6 +907,11 @@ export async function buildPlanMilestonePrompt(mid: string, midTitle: string, ba const researchRel = relMilestoneFile(base, mid, "RESEARCH"); const inlined: string[] = []; + + // Inject phase handoff anchor from research phase (if available) + const researchAnchor = readPhaseAnchor(base, mid, "research-milestone"); + if (researchAnchor) inlined.push(formatAnchorForPrompt(researchAnchor)); + inlined.push(await inlineFile(contextPath, contextRel, "Milestone Context")); const researchInline = await 
inlineFileOptional(researchPath, researchRel, "Milestone Research"); if (researchInline) inlined.push(researchInline); @@ -1033,6 +1039,11 @@ export async function buildPlanSlicePrompt( const researchRel = relSliceFile(base, mid, sid, "RESEARCH"); const inlined: string[] = []; + + // Inject phase handoff anchor from research phase (if available) + const researchSliceAnchor = readPhaseAnchor(base, mid, "research-slice"); + if (researchSliceAnchor) inlined.push(formatAnchorForPrompt(researchSliceAnchor)); + inlined.push(await inlineFile(roadmapPath, roadmapRel, "Milestone Roadmap")); const researchInline = await inlineFileOptional(researchPath, researchRel, "Slice Research"); if (researchInline) inlined.push(researchInline); @@ -1100,6 +1111,9 @@ export async function buildExecuteTaskPrompt( : { level: level as InlineLevel | undefined }; const inlineLevel = opts.level ?? resolveInlineLevel(); + // Inject phase handoff anchor from planning phase (if available) + const planAnchor = readPhaseAnchor(base, mid, "plan-slice"); + const priorSummaries = opts.carryForwardPaths ?? await getPriorTaskSummaryPaths(mid, sid, tid, base); const priorLines = priorSummaries.length > 0 ? priorSummaries.map(p => `- \`${p}\``).join("\n") @@ -1190,9 +1204,12 @@ export async function buildExecuteTaskPrompt( ? `### Runtime Context\nSource: \`.gsd/RUNTIME.md\`\n\n${runtimeContent.trim()}` : ""; + const phaseAnchorSection = planAnchor ? 
formatAnchorForPrompt(planAnchor) : ""; + return loadPrompt("execute-task", { overridesSection, runtimeContext, + phaseAnchorSection, workingDirectory: base, milestoneId: mid, sliceId: sid, sliceTitle: sTitle, taskId: tid, taskTitle: tTitle, planPath: join(base, relSliceFile(base, mid, sid, "PLAN")), diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index 620fe6809..5b0caaa1c 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -1205,6 +1205,23 @@ export async function runUnitPhase( s.unitRecoveryCount.delete(`${unitType}/${unitId}`); } + // Write phase handoff anchor after successful research/planning completion + const anchorPhases = new Set(["research-milestone", "research-slice", "plan-milestone", "plan-slice"]); + if (artifactVerified && mid && anchorPhases.has(unitType)) { + try { + const { writePhaseAnchor } = await import("../phase-anchor.js"); + writePhaseAnchor(s.basePath, mid, { + phase: unitType, + milestoneId: mid, + generatedAt: new Date().toISOString(), + intent: `Completed ${unitType} for ${unitId}`, + decisions: [], + blockers: [], + nextSteps: [], + }); + } catch { /* non-fatal — anchor is advisory */ } + } + deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified, ...(unitResult.errorContext ? 
{ errorContext: unitResult.errorContext } : {}) }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } }); return { action: "next", data: { unitStartedAt: s.currentUnit?.startedAt } }; diff --git a/src/resources/extensions/gsd/bootstrap/register-hooks.ts b/src/resources/extensions/gsd/bootstrap/register-hooks.ts index d7504fa52..d76b046a1 100644 --- a/src/resources/extensions/gsd/bootstrap/register-hooks.ts +++ b/src/resources/extensions/gsd/bootstrap/register-hooks.ts @@ -263,13 +263,62 @@ export function registerHooks(pi: ExtensionAPI): void { }); pi.on("before_provider_request", async (event) => { - const modelId = event.model?.id; - if (!modelId) return; - const { getEffectiveServiceTier, supportsServiceTier } = await import("../service-tier.js"); - const tier = getEffectiveServiceTier(); - if (!tier || !supportsServiceTier(modelId)) return; const payload = event.payload as Record | null; if (!payload || typeof payload !== "object") return; + + // ── Observation Masking ───────────────────────────────────────────── + // Replace old tool results with placeholders to reduce context bloat. + // Only active during auto-mode when context_management.observation_masking is enabled. + if (isAutoActive()) { + try { + const { loadEffectiveGSDPreferences } = await import("../preferences.js"); + const prefs = loadEffectiveGSDPreferences(); + const cmConfig = prefs?.preferences.context_management; + + // Observation masking: replace old tool results with placeholders + if (cmConfig?.observation_masking !== false) { + const keepTurns = cmConfig?.observation_mask_turns ?? 8; + const { createObservationMask } = await import("../context-masker.js"); + const mask = createObservationMask(keepTurns); + const messages = payload.messages; + if (Array.isArray(messages)) { + payload.messages = mask(messages); + } + } + + // Tool result truncation: cap individual tool result content length. + // In pi-ai format, toolResult messages have role: "toolResult" and content: TextContent[]. 
+ // Creates new objects to avoid mutating shared conversation state. + const maxChars = cmConfig?.tool_result_max_chars ?? 800; + const msgs = payload.messages; + if (Array.isArray(msgs)) { + payload.messages = msgs.map((msg: Record<string, unknown>) => { + // Match toolResult messages (role: "toolResult", content is array of content blocks) + if (msg?.role === "toolResult" && Array.isArray(msg.content)) { + const blocks = msg.content as Array<Record<string, unknown>>; + const totalLen = blocks.reduce((sum: number, b) => sum + (typeof b.text === "string" ? b.text.length : 0), 0); + if (totalLen > maxChars) { + const truncated = blocks.map(b => { + if (typeof b.text === "string" && b.text.length > maxChars) { + return { ...b, text: b.text.slice(0, maxChars) + "\n…[truncated]" }; + } + return b; + }); + return { ...msg, content: truncated }; + } + } + return msg; + }); + } + } catch { /* non-fatal */ } + } + + // ── Service Tier ──────────────────────────────────────────────────── + const modelId = event.model?.id; + if (!modelId) return payload; + const { getEffectiveServiceTier, supportsServiceTier } = await import("../service-tier.js"); + const tier = getEffectiveServiceTier(); + if (!tier || !supportsServiceTier(modelId)) return payload; payload.service_tier = tier; return payload; }); diff --git a/src/resources/extensions/gsd/captures.ts b/src/resources/extensions/gsd/captures.ts index 645d907f6..052c43211 100644 --- a/src/resources/extensions/gsd/captures.ts +++ b/src/resources/extensions/gsd/captures.ts @@ -15,7 +15,7 @@ import { gsdRoot } from "./paths.js"; // ─── Types ──────────────────────────────────────────────────────────────────── -export type Classification = "quick-task" | "inject" | "defer" | "replan" | "note"; +export type Classification = "quick-task" | "inject" | "defer" | "replan" | "note" | "stop" | "backtrack"; export interface CaptureEntry { id: string; @@ -42,7 +42,7 @@ export interface TriageResult { const CAPTURES_FILENAME = "CAPTURES.md"; const VALID_CLASSIFICATIONS: readonly 
string[] = [ - "quick-task", "inject", "defer", "replan", "note", + "quick-task", "inject", "defer", "replan", "note", "stop", "backtrack", ]; // ─── Path Resolution ────────────────────────────────────────────────────────── diff --git a/src/resources/extensions/gsd/complexity-classifier.ts b/src/resources/extensions/gsd/complexity-classifier.ts index c7ae14dbf..114178810 100644 --- a/src/resources/extensions/gsd/complexity-classifier.ts +++ b/src/resources/extensions/gsd/complexity-classifier.ts @@ -212,7 +212,7 @@ function analyzePlanComplexity( /** * Extract task metadata from the task plan file on disk. */ -function extractTaskMetadata(unitId: string, basePath: string): TaskMetadata { +export function extractTaskMetadata(unitId: string, basePath: string): TaskMetadata { const meta: TaskMetadata = {}; const { milestone: mid, slice: sid, task: tid } = parseUnitId(unitId); if (!mid || !sid || !tid) return meta; diff --git a/src/resources/extensions/gsd/context-masker.ts b/src/resources/extensions/gsd/context-masker.ts new file mode 100644 index 000000000..824c3a91e --- /dev/null +++ b/src/resources/extensions/gsd/context-masker.ts @@ -0,0 +1,74 @@ +/** + * Observation masking for GSD auto-mode sessions. + * + * Replaces tool result content older than N turns with a placeholder. + * Reduces context bloat between compactions with zero LLM overhead. + * Preserves message ordering, roles, and all assistant/user messages. + * + * Operates on the pi-ai Message[] format (post-convertToLlm, pre-provider): + * - toolResult messages: { role: "toolResult", content: TextContent[] } + * - bash results are already converted to: { role: "user", content: [{type:"text",text:"..."}] } + * and start with "Ran `" from bashExecutionToText. 
+ */ + +interface MaskableMessage { + role: string; + content: unknown; + type?: string; + [key: string]: unknown; +} + +const MASK_PLACEHOLDER = "[result masked — within summarized history]"; +const MASK_CONTENT_BLOCK = [{ type: "text" as const, text: MASK_PLACEHOLDER }]; + +function findTurnBoundary(messages: MaskableMessage[], keepRecentTurns: number): number { + let turnsSeen = 0; + for (let i = messages.length - 1; i >= 0; i--) { + const m = messages[i]; + // In the LLM payload, genuine user turns have role "user". + // Tool results have role "toolResult" and are excluded by this check. + if (m.role === "user") { + // Skip bash-result user messages (converted from bashExecution) — these aren't real user turns + if (isBashResultUserMessage(m)) continue; + turnsSeen++; + if (turnsSeen >= keepRecentTurns) return i; + } + } + return 0; +} + +/** + * Detect user messages that originated from bashExecution. + * After convertToLlm, these are {role: "user", content: [{type:"text", text:"Ran `cmd`\n..."}]}. + * The bashExecutionToText format always starts with "Ran `". 
+ */ +function isBashResultUserMessage(m: MaskableMessage): boolean { + if (m.role !== "user" || !Array.isArray(m.content)) return false; + const first = m.content[0]; + return first && typeof first === "object" && "text" in first && + typeof first.text === "string" && first.text.startsWith("Ran `"); +} + +function isMaskableMessage(m: MaskableMessage): boolean { + // Tool result messages (role: "toolResult" in pi-ai format) + if (m.role === "toolResult") return true; + // Bash-result user messages (converted from bashExecution by convertToLlm) + if (isBashResultUserMessage(m)) return true; + return false; +} + +export function createObservationMask(keepRecentTurns: number = 8) { + return (messages: MaskableMessage[]): MaskableMessage[] => { + const boundary = findTurnBoundary(messages, keepRecentTurns); + if (boundary === 0) return messages; + + return messages.map((m, i) => { + if (i >= boundary) return m; + if (isMaskableMessage(m)) { + // Content may be string or array of content blocks — always replace with array + return { ...m, content: MASK_CONTENT_BLOCK }; + } + return m; + }); + }; +} diff --git a/src/resources/extensions/gsd/docs/preferences-reference.md b/src/resources/extensions/gsd/docs/preferences-reference.md index 8f110ce37..2ae6b8b6e 100644 --- a/src/resources/extensions/gsd/docs/preferences-reference.md +++ b/src/resources/extensions/gsd/docs/preferences-reference.md @@ -189,6 +189,13 @@ Setting `prefer_skills: []` does **not** disable skill discovery — it just mea - `budget_pressure`: boolean — downgrade model tier when budget is under pressure. Default: `true`. - `cross_provider`: boolean — allow routing across different providers. Default: `true`. - `hooks`: boolean — enable routing hooks. Default: `true`. + - `capability_routing`: boolean — enable capability-profile scoring for model selection within a tier. Requires `enabled: true`. Default: `false`. + +- `context_management`: configures context hygiene for auto-mode sessions. 
Keys: + - `observation_masking`: boolean — mask old tool results to reduce context bloat. Default: `true`. + - `observation_mask_turns`: number — keep this many recent turns verbatim (1-50). Default: `8`. + - `compaction_threshold_percent`: number — trigger compaction at this % of context window (0.5-0.95). Lower values fire compaction earlier, reducing drift. Default: `0.70`. + - `tool_result_max_chars`: number — max chars per tool result in GSD sessions (200-10000). Default: `800`. - `auto_visualize`: boolean — show a visualizer hint after each milestone completion in auto-mode. Default: `false`. diff --git a/src/resources/extensions/gsd/model-router.ts b/src/resources/extensions/gsd/model-router.ts index f97a69561..5b45ef9b4 100644 --- a/src/resources/extensions/gsd/model-router.ts +++ b/src/resources/extensions/gsd/model-router.ts @@ -10,6 +10,7 @@ import type { ResolvedModelConfig } from "./preferences.js"; export interface DynamicRoutingConfig { enabled?: boolean; + capability_routing?: boolean; // default: false — enable capability profile scoring tier_models?: { light?: string; standard?: string; @@ -32,6 +33,12 @@ export interface RoutingDecision { wasDowngraded: boolean; /** Human-readable reason for this decision */ reason: string; + /** How the model was selected. */ + selectionMethod?: "tier-only" | "capability-scored"; + /** Capability scores per model (when capability-scored). */ + capabilityScores?: Record<string, number>; + /** Task requirement vector (when capability-scored). */ + taskRequirements?: Partial<Record<keyof ModelCapabilities, number>>; } // ─── Known Model Tiers ─────────────────────────────────────────────────────── @@ -114,6 +121,91 @@ const MODEL_COST_PER_1K_INPUT: Record<string, number> = { "deepseek-chat": 0.00014, }; +// ─── Capability Profiles (ADR-004 Phase 2) ────────────────────────────────── +// 7-dimension profiles, 0–100 normalized. Models without a profile +// score 50 uniformly — capability scoring is a no-op for them.
+ +export interface ModelCapabilities { + coding: number; + debugging: number; + research: number; + reasoning: number; + speed: number; + longContext: number; + instruction: number; +} + +export const MODEL_CAPABILITY_PROFILES: Record<string, ModelCapabilities> = { + "claude-opus-4-6": { coding: 95, debugging: 90, research: 85, reasoning: 95, speed: 30, longContext: 80, instruction: 90 }, + "claude-sonnet-4-6": { coding: 85, debugging: 80, research: 75, reasoning: 80, speed: 60, longContext: 75, instruction: 85 }, + "claude-haiku-4-5": { coding: 60, debugging: 50, research: 45, reasoning: 50, speed: 95, longContext: 50, instruction: 75 }, + "gpt-4o": { coding: 80, debugging: 75, research: 70, reasoning: 75, speed: 65, longContext: 70, instruction: 80 }, + "gpt-4o-mini": { coding: 55, debugging: 45, research: 40, reasoning: 45, speed: 90, longContext: 45, instruction: 70 }, + "gemini-2.5-pro": { coding: 75, debugging: 70, research: 85, reasoning: 75, speed: 55, longContext: 90, instruction: 75 }, + "gemini-2.0-flash": { coding: 50, debugging: 40, research: 50, reasoning: 40, speed: 95, longContext: 60, instruction: 65 }, + "deepseek-chat": { coding: 75, debugging: 65, research: 55, reasoning: 70, speed: 70, longContext: 55, instruction: 65 }, + "o3": { coding: 80, debugging: 85, research: 80, reasoning: 92, speed: 25, longContext: 70, instruction: 85 }, +}; + +const BASE_REQUIREMENTS: Record<string, Partial<Record<keyof ModelCapabilities, number>>> = { + "execute-task": { coding: 0.9, instruction: 0.7, speed: 0.3 }, + "research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, + "research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 }, + "plan-milestone": { reasoning: 0.9, coding: 0.5 }, + "plan-slice": { reasoning: 0.9, coding: 0.5 }, + "replan-slice": { reasoning: 0.9, debugging: 0.6, coding: 0.5 }, + "reassess-roadmap": { reasoning: 0.9, research: 0.5 }, + "complete-slice": { instruction: 0.8, speed: 0.7 }, + "run-uat": { instruction: 0.7, speed: 0.8 }, + "discuss-milestone": { reasoning: 0.6, instruction: 0.7 }, +
"complete-milestone": { instruction: 0.8, reasoning: 0.5 }, +}; + +/** + * Compute a task requirement vector from unit type and optional metadata. + */ +export function computeTaskRequirements( + unitType: string, + metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number }, +): Partial<Record<keyof ModelCapabilities, number>> { + const base = { ...(BASE_REQUIREMENTS[unitType] ?? { reasoning: 0.5 }) }; + + if (unitType === "execute-task" && metadata) { + if (metadata.tags?.some(t => /^(docs?|readme|comment|config|typo|rename)$/i.test(t))) { + return { ...base, instruction: 0.9, coding: 0.3, speed: 0.7 }; + } + if (metadata.complexityKeywords?.some(k => k === "concurrency" || k === "compatibility")) { + return { ...base, debugging: 0.9, reasoning: 0.8 }; + } + if (metadata.complexityKeywords?.some(k => k === "migration" || k === "architecture")) { + return { ...base, reasoning: 0.9, coding: 0.8 }; + } + if ((metadata.fileCount ?? 0) >= 6 || (metadata.estimatedLines ?? 0) >= 500) { + return { ...base, coding: 0.9, reasoning: 0.7 }; + } + } + + return base; +} + +/** + * Score a model against a task requirement vector. + * Returns weighted average in range 0–100. Returns 50 for empty requirements. + */ +export function scoreModel( + capabilities: ModelCapabilities, + requirements: Partial<Record<keyof ModelCapabilities, number>>, +): number { + let weightedSum = 0; + let weightSum = 0; + for (const [dim, weight] of Object.entries(requirements)) { + const capability = capabilities[dim as keyof ModelCapabilities] ?? 50; + weightedSum += weight * capability; + weightSum += weight; + } + return weightSum > 0 ?
weightedSum / weightSum : 50; +} + // ─── Public API ────────────────────────────────────────────────────────────── /** @@ -132,6 +224,8 @@ export function resolveModelForComplexity( phaseConfig: ResolvedModelConfig | undefined, routingConfig: DynamicRoutingConfig, availableModelIds: string[], + unitType?: string, + metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number }, ): RoutingDecision { // If no phase config or routing disabled, pass through if (!phaseConfig || !routingConfig.enabled) { @@ -175,25 +269,40 @@ } // Find the best model for the requested tier - const targetModelId = findModelForTier( - requestedTier, - routingConfig, - availableModelIds, - routingConfig.cross_provider !== false, - ); + const useCapabilityScoring = routingConfig.capability_routing && unitType; + + let targetModelId: string | null; + let capabilityScores: Record<string, number> | undefined; + let taskRequirements: Partial<Record<keyof ModelCapabilities, number>> | undefined; + let selectionMethod: "tier-only" | "capability-scored" = "tier-only"; + + if (useCapabilityScoring) { + const result = findModelForTierWithCapability( + requestedTier, routingConfig, availableModelIds, + routingConfig.cross_provider !== false, unitType, metadata, + ); + targetModelId = result.modelId; + capabilityScores = Object.keys(result.scores).length > 0 ? result.scores : undefined; + taskRequirements = Object.keys(result.requirements).length > 0 ? result.requirements : undefined; + selectionMethod = capabilityScores ?
"capability-scored" : "tier-only"; + } else { + targetModelId = findModelForTier( + requestedTier, routingConfig, availableModelIds, + routingConfig.cross_provider !== false, + ); + } if (!targetModelId) { - // No suitable model found — use configured primary return { modelId: configuredPrimary, fallbacks: phaseConfig.fallbacks, tier: requestedTier, wasDowngraded: false, reason: `no ${requestedTier}-tier model available`, + selectionMethod, }; } - // Build fallback chain: [downgraded_model, ...configured_fallbacks, configured_primary] const fallbacks = [ ...phaseConfig.fallbacks.filter(f => f !== targetModelId), configuredPrimary, @@ -205,6 +314,9 @@ tier: requestedTier, wasDowngraded: true, reason: classification.reason, + selectionMethod, + capabilityScores, + taskRequirements, }; } @@ -226,6 +338,7 @@ export function escalateTier(currentTier: ComplexityTier): ComplexityTier | null export function defaultRoutingConfig(): DynamicRoutingConfig { return { enabled: true, + capability_routing: false, escalate_on_failure: true, budget_pressure: true, cross_provider: true, @@ -298,6 +411,56 @@ return candidates[0] ?? null; } +function findModelForTierWithCapability( + tier: ComplexityTier, + config: DynamicRoutingConfig, + availableModelIds: string[], + crossProvider: boolean, + unitType: string, + metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number }, +): { modelId: string | null; scores: Record<string, number>; requirements: Partial<Record<keyof ModelCapabilities, number>> } { + const explicitModel = config.tier_models?.[tier]; + if (explicitModel) { + const match = availableModelIds.find(id => { + const bareAvail = id.includes("/") ? id.split("/").pop()! : id; + const bareExplicit = explicitModel.includes("/") ? explicitModel.split("/").pop()!
: explicitModel; + return bareAvail === bareExplicit || id === explicitModel; + }); + if (match) return { modelId: match, scores: {}, requirements: {} }; + } + + const requirements = computeTaskRequirements(unitType, metadata); + const candidates = availableModelIds.filter(id => getModelTier(id) === tier); + if (candidates.length === 0) return { modelId: null, scores: {}, requirements }; + + const scores: Record<string, number> = {}; + for (const id of candidates) { + const bareId = id.includes("/") ? id.split("/").pop()! : id; + const profile = getModelProfile(bareId); + scores[id] = scoreModel(profile, requirements); + } + + candidates.sort((a, b) => { + const scoreDiff = scores[b] - scores[a]; + if (Math.abs(scoreDiff) > 2) return scoreDiff; + if (crossProvider) { + const costDiff = getModelCost(a) - getModelCost(b); + if (costDiff !== 0) return costDiff; + } + return a.localeCompare(b); + }); + + return { modelId: candidates[0], scores, requirements }; +} + +function getModelProfile(bareId: string): ModelCapabilities { + if (MODEL_CAPABILITY_PROFILES[bareId]) return MODEL_CAPABILITY_PROFILES[bareId]; + for (const [knownId, profile] of Object.entries(MODEL_CAPABILITY_PROFILES)) { + if (bareId.includes(knownId) || knownId.includes(bareId)) return profile; + } + return { coding: 50, debugging: 50, research: 50, reasoning: 50, speed: 50, longContext: 50, instruction: 50 }; +} + function getModelCost(modelId: string): number { const bareId = modelId.includes("/") ? modelId.split("/").pop()! : modelId; diff --git a/src/resources/extensions/gsd/phase-anchor.ts b/src/resources/extensions/gsd/phase-anchor.ts new file mode 100644 index 000000000..16f1df5e1 --- /dev/null +++ b/src/resources/extensions/gsd/phase-anchor.ts @@ -0,0 +1,71 @@ +/** + * Phase handoff anchors — compact structured summaries written between + * GSD auto-mode phases so downstream agents inherit decisions, blockers, + * and intent without re-inferring from scratch.
+ */ + +import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; +import { join } from "node:path"; +import { gsdRoot } from "./paths.js"; + +export interface PhaseAnchor { + phase: string; + milestoneId: string; + generatedAt: string; + intent: string; + decisions: string[]; + blockers: string[]; + nextSteps: string[]; +} + +function anchorsDir(basePath: string, milestoneId: string): string { + return join(gsdRoot(basePath), "milestones", milestoneId, "anchors"); +} + +function anchorPath(basePath: string, milestoneId: string, phase: string): string { + return join(anchorsDir(basePath, milestoneId), `${phase}.json`); +} + +export function writePhaseAnchor(basePath: string, milestoneId: string, anchor: PhaseAnchor): void { + const dir = anchorsDir(basePath, milestoneId); + if (!existsSync(dir)) { + mkdirSync(dir, { recursive: true }); + } + writeFileSync(anchorPath(basePath, milestoneId, anchor.phase), JSON.stringify(anchor, null, 2), "utf-8"); +} + +export function readPhaseAnchor(basePath: string, milestoneId: string, phase: string): PhaseAnchor | null { + const path = anchorPath(basePath, milestoneId, phase); + if (!existsSync(path)) return null; + try { + return JSON.parse(readFileSync(path, "utf-8")) as PhaseAnchor; + } catch { + return null; + } +} + +export function formatAnchorForPrompt(anchor: PhaseAnchor): string { + const lines: string[] = [ + `## Handoff from ${anchor.phase}`, + "", + `**Intent:** ${anchor.intent}`, + ]; + + if (anchor.decisions.length > 0) { + lines.push("", "**Decisions:**"); + for (const d of anchor.decisions) lines.push(`- ${d}`); + } + + if (anchor.blockers.length > 0) { + lines.push("", "**Blockers:**"); + for (const b of anchor.blockers) lines.push(`- ${b}`); + } + + if (anchor.nextSteps.length > 0) { + lines.push("", "**Next steps:**"); + for (const s of anchor.nextSteps) lines.push(`- ${s}`); + } + + lines.push("", "---"); + return lines.join("\n"); +} diff --git 
a/src/resources/extensions/gsd/preferences-types.ts b/src/resources/extensions/gsd/preferences-types.ts index 7ae8c9bda..4356badca 100644 --- a/src/resources/extensions/gsd/preferences-types.ts +++ b/src/resources/extensions/gsd/preferences-types.ts @@ -21,6 +21,13 @@ import type { GateEvaluationConfig, } from "./types.js"; import type { DynamicRoutingConfig } from "./model-router.js"; + +export interface ContextManagementConfig { + observation_masking?: boolean; // default: true + observation_mask_turns?: number; // default: 8, range: 1-50 + compaction_threshold_percent?: number; // default: 0.70, range: 0.5-0.95 + tool_result_max_chars?: number; // default: 800, range: 200-10000 +} import type { GitHubSyncConfig } from "../github-sync/types.js"; // ─── Workflow Modes ────────────────────────────────────────────────────────── @@ -94,6 +101,7 @@ export const KNOWN_PREFERENCE_KEYS = new Set([ "forensics_dedup", "show_token_cost", "stale_commit_threshold_minutes", + "context_management", "experimental", ]); @@ -227,6 +235,7 @@ export interface GSDPreferences { post_unit_hooks?: PostUnitHookConfig[]; pre_dispatch_hooks?: PreDispatchHookConfig[]; dynamic_routing?: DynamicRoutingConfig; + context_management?: ContextManagementConfig; token_profile?: TokenProfile; phases?: PhaseSkipPreferences; auto_visualize?: boolean; diff --git a/src/resources/extensions/gsd/preferences-validation.ts b/src/resources/extensions/gsd/preferences-validation.ts index 6b4e0e217..57a715521 100644 --- a/src/resources/extensions/gsd/preferences-validation.ts +++ b/src/resources/extensions/gsd/preferences-validation.ts @@ -428,6 +428,10 @@ export function validatePreferences(preferences: GSDPreferences): { if (typeof dr.hooks === "boolean") validDr.hooks = dr.hooks; else errors.push("dynamic_routing.hooks must be a boolean"); } + if (dr.capability_routing !== undefined) { + if (typeof dr.capability_routing === "boolean") validDr.capability_routing = dr.capability_routing; + else 
errors.push("dynamic_routing.capability_routing must be a boolean"); + } if (dr.tier_models !== undefined) { if (typeof dr.tier_models === "object" && dr.tier_models !== null) { const tm = dr.tier_models as Record<string, unknown>; @@ -452,6 +456,40 @@ } } + // ─── Context Management ────────────────────────────────────────────── + if (preferences.context_management !== undefined) { + if (typeof preferences.context_management === "object" && preferences.context_management !== null) { + const cm = preferences.context_management as unknown as Record<string, unknown>; + const validCm: Record<string, unknown> = {}; + + if (cm.observation_masking !== undefined) { + if (typeof cm.observation_masking === "boolean") validCm.observation_masking = cm.observation_masking; + else errors.push("context_management.observation_masking must be a boolean"); + } + if (cm.observation_mask_turns !== undefined) { + const turns = cm.observation_mask_turns; + if (typeof turns === "number" && turns >= 1 && turns <= 50) validCm.observation_mask_turns = turns; + else errors.push("context_management.observation_mask_turns must be a number between 1 and 50"); + } + if (cm.compaction_threshold_percent !== undefined) { + const pct = cm.compaction_threshold_percent; + if (typeof pct === "number" && pct >= 0.5 && pct <= 0.95) validCm.compaction_threshold_percent = pct; + else errors.push("context_management.compaction_threshold_percent must be a number between 0.5 and 0.95"); + } + if (cm.tool_result_max_chars !== undefined) { + const chars = cm.tool_result_max_chars; + if (typeof chars === "number" && chars >= 200 && chars <= 10000) validCm.tool_result_max_chars = chars; + else errors.push("context_management.tool_result_max_chars must be a number between 200 and 10000"); + } + + if (Object.keys(validCm).length > 0) { + validated.context_management = validCm as any; + } + } else { + errors.push("context_management must be an object"); + } + } + // ─── Parallel Config
──────────────────────────────── if (preferences.parallel && typeof preferences.parallel === "object") { const p = preferences.parallel as unknown as Record<string, unknown>; diff --git a/src/resources/extensions/gsd/prompts/execute-task.md b/src/resources/extensions/gsd/prompts/execute-task.md index b433638ac..f1f22fe86 100644 --- a/src/resources/extensions/gsd/prompts/execute-task.md +++ b/src/resources/extensions/gsd/prompts/execute-task.md @@ -12,6 +12,8 @@ A researcher explored the codebase and a planner decomposed the work — you are {{runtimeContext}} +{{phaseAnchorSection}} + {{resumeSection}} {{carryForwardSection}} diff --git a/src/resources/extensions/gsd/tests/context-masker.test.ts b/src/resources/extensions/gsd/tests/context-masker.test.ts new file mode 100644 index 000000000..e09f11c14 --- /dev/null +++ b/src/resources/extensions/gsd/tests/context-masker.test.ts @@ -0,0 +1,122 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +import { createObservationMask } from "../context-masker.js"; + +// These helpers produce messages in the pi-ai LLM payload format +// (post-convertToLlm, pre-provider), which is what before_provider_request sees.
+ +function userMsg(content: string) { + return { role: "user", content: [{ type: "text", text: content }] }; +} + +function assistantMsg(content: string) { + return { role: "assistant", content: [{ type: "text", text: content }] }; +} + +/** toolResult in pi-ai format: role "toolResult", content as TextContent[] */ +function toolResult(text: string) { + return { role: "toolResult", content: [{ type: "text", text }], toolCallId: "toolu_test", toolName: "Read", isError: false }; +} + +/** bashExecution after convertToLlm: becomes a user message with "Ran `cmd`" prefix */ +function bashResult(text: string) { + return { role: "user", content: [{ type: "text", text: `Ran \`echo test\`\n\`\`\`\n${text}\n\`\`\`` }] }; +} + +const MASK_TEXT = "[result masked — within summarized history]"; + +test("masks nothing when message count is within keepRecentTurns", () => { + const mask = createObservationMask(8); + const messages = [ + userMsg("hello"), + assistantMsg("hi"), + toolResult("file contents"), + ]; + const result = mask(messages as any); + assert.equal(result.length, 3); + assert.deepEqual((result[2].content as any)[0].text, "file contents"); +}); + +test("masks tool results older than keepRecentTurns", () => { + const mask = createObservationMask(2); + const messages = [ + userMsg("turn 1"), + toolResult("old tool output"), + assistantMsg("response 1"), + userMsg("turn 2"), + toolResult("newer tool output"), + assistantMsg("response 2"), + userMsg("turn 3"), + toolResult("newest tool output"), + assistantMsg("response 3"), + ]; + const result = mask(messages as any); + // Old tool result (before boundary) should be masked + assert.equal((result[1].content as any)[0].text, MASK_TEXT); + // Recent tool results (within keep window) should be preserved + assert.equal((result[4].content as any)[0].text, "newer tool output"); + assert.equal((result[7].content as any)[0].text, "newest tool output"); +}); + +test("never masks assistant messages", () => { + const mask = 
createObservationMask(1); + const messages = [ + userMsg("turn 1"), + assistantMsg("old reasoning"), + userMsg("turn 2"), + assistantMsg("new reasoning"), + ]; + const result = mask(messages as any); + assert.equal((result[1].content as any)[0].text, "old reasoning"); + assert.equal((result[3].content as any)[0].text, "new reasoning"); +}); + +test("never masks user messages", () => { + const mask = createObservationMask(1); + const messages = [ + userMsg("old user message"), + assistantMsg("response"), + userMsg("new user message"), + assistantMsg("response"), + ]; + const result = mask(messages as any); + assert.equal((result[0].content as any)[0].text, "old user message"); +}); + +test("masks bash result user messages", () => { + const mask = createObservationMask(1); + const messages = [ + userMsg("turn 1"), + bashResult("huge log output"), + assistantMsg("response 1"), + userMsg("turn 2"), + assistantMsg("response 2"), + ]; + const result = mask(messages as any); + assert.equal((result[1].content as any)[0].text, MASK_TEXT); +}); + +test("returns same array length", () => { + const mask = createObservationMask(1); + const messages = [ + userMsg("a"), toolResult("b"), assistantMsg("c"), + userMsg("d"), toolResult("e"), assistantMsg("f"), + ]; + const result = mask(messages as any); + assert.equal(result.length, messages.length); +}); + +test("masks toolResult by role, not by type field", () => { + const mask = createObservationMask(1); + const messages = [ + userMsg("turn 1"), + // This is the actual pi-ai format: role "toolResult", no type field + { role: "toolResult", content: [{ type: "text", text: "old result" }], toolCallId: "t1", toolName: "Read", isError: false }, + assistantMsg("response 1"), + userMsg("turn 2"), + assistantMsg("response 2"), + ]; + const result = mask(messages as any); + assert.equal((result[1].content as any)[0].text, MASK_TEXT); +}); diff --git a/src/resources/extensions/gsd/tests/model-router.test.ts 
b/src/resources/extensions/gsd/tests/model-router.test.ts index fb1128eb5..f15977495 100644 --- a/src/resources/extensions/gsd/tests/model-router.test.ts +++ b/src/resources/extensions/gsd/tests/model-router.test.ts @@ -5,8 +5,11 @@ import { resolveModelForComplexity, escalateTier, defaultRoutingConfig, + scoreModel, + computeTaskRequirements, + MODEL_CAPABILITY_PROFILES, } from "../model-router.js"; -import type { DynamicRoutingConfig, RoutingDecision } from "../model-router.js"; +import type { DynamicRoutingConfig, RoutingDecision, ModelCapabilities } from "../model-router.js"; import type { ClassificationResult } from "../complexity-classifier.js"; // ─── Helpers ───────────────────────────────────────────────────────────────── @@ -206,6 +209,89 @@ test("#2192: known model is still downgraded normally", () => { assert.notEqual(result.modelId, "claude-opus-4-6"); }); +// ─── Capability Scoring (ADR-004 Phase 2) ─────────────────────────────────── + +test("defaultRoutingConfig includes capability_routing: false", () => { + const config = defaultRoutingConfig(); + assert.equal(config.capability_routing, false); +}); + +test("scoreModel computes weighted average of capability × requirement", () => { + const caps: ModelCapabilities = { + coding: 90, debugging: 80, research: 70, + reasoning: 85, speed: 50, longContext: 60, instruction: 75, + }; + const reqs = { coding: 0.9, reasoning: 0.5 }; + const score = scoreModel(caps, reqs); + // Expected: (0.9*90 + 0.5*85) / (0.9 + 0.5) = (81 + 42.5) / 1.4 = 88.21... 
+ assert.ok(Math.abs(score - 88.21) < 0.1, `score ${score} should be ~88.21`); +}); + +test("scoreModel returns 50 for empty requirements", () => { + const caps: ModelCapabilities = { + coding: 90, debugging: 80, research: 70, + reasoning: 85, speed: 50, longContext: 60, instruction: 75, + }; + const score = scoreModel(caps, {}); + assert.equal(score, 50); +}); + +test("computeTaskRequirements returns base vector for known unit type", () => { + const reqs = computeTaskRequirements("execute-task"); + assert.ok(reqs.coding !== undefined && reqs.coding > 0); +}); + +test("computeTaskRequirements boosts instruction for docs-tagged tasks", () => { + const reqs = computeTaskRequirements("execute-task", { tags: ["docs"] }); + assert.ok((reqs.instruction ?? 0) >= 0.8); + assert.ok((reqs.coding ?? 1) <= 0.4); +}); + +test("computeTaskRequirements returns generic vector for unknown unit type", () => { + const reqs = computeTaskRequirements("unknown-unit"); + assert.ok(reqs.reasoning !== undefined); +}); + +test("resolveModelForComplexity uses capability scoring when enabled", () => { + const config: DynamicRoutingConfig = { + ...defaultRoutingConfig(), + enabled: true, + capability_routing: true, + }; + const result = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + ["claude-opus-4-6", "claude-haiku-4-5", "gpt-4o-mini"], + "execute-task", + ); + assert.equal(result.wasDowngraded, true); + assert.equal(result.selectionMethod, "capability-scored"); +}); + +test("resolveModelForComplexity falls back to tier-only when capability_routing is false", () => { + const config: DynamicRoutingConfig = { + ...defaultRoutingConfig(), + enabled: true, + capability_routing: false, + }; + const result = resolveModelForComplexity( + makeClassification("light"), + { primary: "claude-opus-4-6", fallbacks: [] }, + config, + ["claude-opus-4-6", "claude-haiku-4-5", "gpt-4o-mini"], + ); + assert.equal(result.wasDowngraded, true); 
+ assert.ok(!result.selectionMethod || result.selectionMethod === "tier-only"); +}); + +test("MODEL_CAPABILITY_PROFILES has entries for core models", () => { + const profiledModels = Object.keys(MODEL_CAPABILITY_PROFILES); + assert.ok(profiledModels.length >= 9, `Expected ≥9 profiles, got ${profiledModels.length}`); + assert.ok(MODEL_CAPABILITY_PROFILES["claude-opus-4-6"]); + assert.ok(MODEL_CAPABILITY_PROFILES["claude-haiku-4-5"]); +}); + // ─── #2885: openai-codex and modern OpenAI models in tier map ──────────────── test("#2885: openai-codex light-tier models are recognized", () => { diff --git a/src/resources/extensions/gsd/tests/phase-anchor.test.ts b/src/resources/extensions/gsd/tests/phase-anchor.test.ts new file mode 100644 index 000000000..825bb6cc8 --- /dev/null +++ b/src/resources/extensions/gsd/tests/phase-anchor.test.ts @@ -0,0 +1,83 @@ +import test from "node:test"; +import assert from "node:assert/strict"; +import { mkdtempSync, mkdirSync, rmSync, existsSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; + +import { writePhaseAnchor, readPhaseAnchor, formatAnchorForPrompt } from "../phase-anchor.js"; +import type { PhaseAnchor } from "../phase-anchor.js"; + +function makeTempBase(): string { + const tmp = mkdtempSync(join(tmpdir(), "gsd-anchor-test-")); + mkdirSync(join(tmp, ".gsd", "milestones", "M001", "anchors"), { recursive: true }); + return tmp; +} + +test("writePhaseAnchor creates anchor file in correct location", () => { + const base = makeTempBase(); + try { + const anchor: PhaseAnchor = { + phase: "discuss", + milestoneId: "M001", + generatedAt: new Date().toISOString(), + intent: "Define authentication requirements", + decisions: ["Use JWT tokens", "Session expiry 24h"], + blockers: [], + nextSteps: ["Plan the implementation slices"], + }; + writePhaseAnchor(base, "M001", anchor); + assert.ok(existsSync(join(base, ".gsd", "milestones", "M001", "anchors", "discuss.json"))); + } finally { + 
rmSync(base, { recursive: true, force: true }); + } +}); + +test("readPhaseAnchor returns written anchor", () => { + const base = makeTempBase(); + try { + const anchor: PhaseAnchor = { + phase: "plan", + milestoneId: "M001", + generatedAt: new Date().toISOString(), + intent: "Break work into slices", + decisions: ["3 slices: auth, UI, tests"], + blockers: ["Need DB schema first"], + nextSteps: ["Execute S01"], + }; + writePhaseAnchor(base, "M001", anchor); + const read = readPhaseAnchor(base, "M001", "plan"); + assert.ok(read); + assert.equal(read!.intent, "Break work into slices"); + assert.deepEqual(read!.decisions, ["3 slices: auth, UI, tests"]); + assert.deepEqual(read!.blockers, ["Need DB schema first"]); + } finally { + rmSync(base, { recursive: true, force: true }); + } +}); + +test("readPhaseAnchor returns null when no anchor exists", () => { + const base = makeTempBase(); + try { + const read = readPhaseAnchor(base, "M001", "discuss"); + assert.equal(read, null); + } finally { + rmSync(base, { recursive: true, force: true }); + } +}); + +test("formatAnchorForPrompt produces markdown block", () => { + const anchor: PhaseAnchor = { + phase: "discuss", + milestoneId: "M001", + generatedAt: "2026-04-03T00:00:00.000Z", + intent: "Define requirements", + decisions: ["Use JWT"], + blockers: [], + nextSteps: ["Plan slices"], + }; + const md = formatAnchorForPrompt(anchor); + assert.ok(md.includes("## Handoff from discuss")); + assert.ok(md.includes("Define requirements")); + assert.ok(md.includes("Use JWT")); + assert.ok(md.includes("Plan slices")); +}); diff --git a/src/resources/extensions/gsd/triage-ui.ts b/src/resources/extensions/gsd/triage-ui.ts index a9b81f46f..da9030d41 100644 --- a/src/resources/extensions/gsd/triage-ui.ts +++ b/src/resources/extensions/gsd/triage-ui.ts @@ -49,10 +49,18 @@ const CLASSIFICATION_LABELS: Record