diff --git a/docs/ADR-004-capability-aware-model-routing.md b/docs/ADR-004-capability-aware-model-routing.md
index 93d28f862..c2ce3d2d2 100644
--- a/docs/ADR-004-capability-aware-model-routing.md
+++ b/docs/ADR-004-capability-aware-model-routing.md
@@ -1,8 +1,8 @@
# ADR-004: Capability-Aware Model Routing
-**Status:** Proposed (Revised)
+**Status:** Implemented (Phase 2)
**Date:** 2026-03-26
-**Revised:** 2026-03-26
+**Revised:** 2026-04-03
**Deciders:** Jeremy McSpadden
**Related:** ADR-003 (pipeline simplification), [Issue #2655](https://github.com/gsd-build/gsd-2/issues/2655), `docs/dynamic-model-routing.md`
diff --git a/docs/configuration.md b/docs/configuration.md
index b223f8b7b..00512fa22 100644
--- a/docs/configuration.md
+++ b/docs/configuration.md
@@ -686,6 +686,7 @@ Complexity-based model routing. See [Dynamic Model Routing](./dynamic-model-rout
```yaml
dynamic_routing:
enabled: true
+ capability_routing: true # score models by task capability (v2.59)
tier_models:
light: claude-haiku-4-5
standard: claude-sonnet-4-6
@@ -695,6 +696,18 @@ dynamic_routing:
cross_provider: true
```
+### `context_management` (v2.59)
+
+Controls observation masking and tool result truncation during auto-mode sessions. Reduces context bloat between compactions with zero LLM overhead.
+
+```yaml
+context_management:
+ observation_masking: true # replace old tool results with placeholders (default: true)
+ observation_mask_turns: 8 # keep results from last N user turns (1-50, default: 8)
+ compaction_threshold_percent: 0.70 # target compaction at 70% context usage (0.5-0.95, default: 0.70)
+ tool_result_max_chars: 800 # cap individual tool result content (200-10000, default: 800)
+```
+
### `service_tier` (v2.42)
OpenAI service tier preference for supported models. Toggle with `/gsd fast`.
diff --git a/docs/dynamic-model-routing.md b/docs/dynamic-model-routing.md
index 9d0d5525e..9bbf125fe 100644
--- a/docs/dynamic-model-routing.md
+++ b/docs/dynamic-model-routing.md
@@ -70,6 +70,36 @@ When approaching the budget ceiling, the router progressively downgrades:
When enabled, the router may select models from providers other than your primary. This uses the built-in cost table to find the cheapest model at each tier. Requires the target provider to be configured.
+## Capability-Aware Scoring
+
+*Introduced in v2.59.0 (ADR-004 Phase 2)*
+
+When `capability_routing` is enabled, the router goes beyond tier classification and scores models against task-specific capability requirements. Each known model has a 7-dimension profile:
+
+| Dimension | What It Measures |
+|-----------|-----------------|
+| `coding` | Code generation, refactoring, implementation quality |
+| `debugging` | Error diagnosis, fix accuracy |
+| `research` | Information gathering, codebase exploration |
+| `reasoning` | Multi-step logic, architectural decisions |
+| `speed` | Response speed (inverse of latency — lower-latency models score higher) |
+| `longContext` | Performance with large context windows |
+| `instruction` | Adherence to structured instructions and templates |
+
+Each unit type maps to a weighted requirement vector. For example, `execute-task` weights `coding: 0.9, reasoning: 0.6, debugging: 0.5` while `research-slice` weights `research: 0.9, reasoning: 0.7, longContext: 0.5`.
+
+For `execute-task` units, the classifier also inspects task metadata (tags, description) to refine requirements. Documentation tasks boost `instruction` and lower `coding`; test tasks boost `debugging`.
+
+Enable capability routing:
+
+```yaml
+dynamic_routing:
+ enabled: true
+ capability_routing: true
+```
+
+When enabled, models within the target tier are ranked by capability score rather than selected arbitrarily. When disabled (the default), the existing tier-only selection applies.
+
## Complexity Classification
Units are classified using pure heuristics — no LLM calls, sub-millisecond:
diff --git a/docs/pi-context-optimization-opportunities.md b/docs/pi-context-optimization-opportunities.md
new file mode 100644
index 000000000..9e34cc44c
--- /dev/null
+++ b/docs/pi-context-optimization-opportunities.md
@@ -0,0 +1,198 @@
+# pi-coding-agent: Context Optimization Opportunities
+
+> **Status**: Research only — not planned for implementation.
+> Scope: `packages/pi-coding-agent` and `packages/pi-agent-core` infrastructure.
+> These changes would benefit every consumer of the pi engine, not just GSD.
+
+---
+
+## 1. Prompt Caching (`cache_control`) — Highest Impact
+
+**Current state**: Every LLM call re-pays full input token cost for the system prompt, tool definitions, and context files. No `cache_control` breakpoints are set anywhere in the API call path.
+
+**Opportunity**: Anthropic's KV cache delivers 90% cost reduction on cached tokens (0.1x input rate). Claude Code achieves 92–98% cache hit rates by placing stable content before volatile content.
+
+**Where to instrument** (`packages/pi-ai/src/providers/anthropic.ts`):
+- Set `cache_control: { type: "ephemeral" }` on the last tool definition block
+- Set `cache_control` after the static system prompt sections (base boilerplate + context files)
+- Leave the per-turn user message uncached
+
+**Critical constraint**: The cache breakpoint must be placed *after* all static content and *before* any dynamic content (timestamps, per-request variables). Moving a timestamp before a cache breakpoint defeats it on every call.
+
+**Cache hierarchy**: Tools → system → messages. Changing a tool definition invalidates system and message caches. Tool definitions should be sorted deterministically (alphabetically) to prevent spurious cache misses.
+
+**Expected savings**: 80–90% reduction in input token cost for multi-turn sessions (the dominant cost pattern in GSD auto-mode).
+
+---
+
+## 2. Observation Masking in the Message Pipeline
+
+**Current state**: `agent-loop.ts` passes the full `context.messages` array to the LLM on every turn. Tool results from 50 turns ago are re-read in full on every subsequent call. The `transformContext` hook exists on `AgentContext` and fires before every LLM call, but has no default implementation — extensions are responsible for any pruning.
+
+**Opportunity**: Replace old tool result content with lightweight placeholders after N turns. JetBrains Research tested this on SWE-bench Verified (500 tasks, up to 250-turn trajectories) and found:
+- 50%+ cost reduction vs. unmanaged history
+- Performance matched or slightly exceeded LLM summarization
+- Zero overhead (no extra LLM call required)
+
+**Proposed implementation** (default `transformContext` in `pi-agent-core`):
+```typescript
+// Keep last KEEP_RECENT_TURNS verbatim; mask older tool results
+const KEEP_RECENT_TURNS = 8;
+
+function defaultObservationMask(messages: AgentMessage[]): AgentMessage[] {
+ const cutoff = findTurnBoundary(messages, KEEP_RECENT_TURNS);
+ return messages.map((m, i) => {
+ if (i >= cutoff) return m;
+ if (m.type === "toolResult" || m.type === "bashExecution") {
+ return { ...m, content: "[result masked — within summarized history]", excludeFromContext: false };
+ }
+ return m;
+ });
+}
+```
+
+**Compaction interaction**: Observation masking reduces the token accumulation rate, pushing the compaction threshold further out. The two mechanisms are complementary — masking handles the steady state, compaction handles the rare deep-session case.
+
+---
+
+## 3. Earlier Compaction Threshold
+
+**Current state** (`packages/pi-coding-agent/src/core/constants.ts`):
+```typescript
+COMPACTION_RESERVE_TOKENS = 16_384 // triggers at contextWindow - 16K
+COMPACTION_KEEP_RECENT_TOKENS = 20_000
+```
+
+For a 200K context window, compaction fires at ~183K tokens — 91.5% utilization.
+
+**Problem**: Context drift (not raw exhaustion) causes ~65% of enterprise agent failures. Performance degrades measurably beyond ~30K tokens per Zylos production data. The current threshold lets sessions run degraded for a long stretch before compaction fires.
+
+**Opportunity**: Lower the trigger to 70% utilization. For a 200K window, this means compacting at ~140K tokens — 43K tokens earlier.
+
+```typescript
+// Proposed
+COMPACTION_THRESHOLD_PERCENT = 0.70 // fire at 70% of contextWindow
+COMPACTION_RESERVE_TOKENS = contextWindow * (1 - COMPACTION_THRESHOLD_PERCENT)
+```
+
+**Trade-off**: More frequent compactions, each happening earlier when there's more "fresh" content to keep. Summary quality improves because less material needs to be discarded at each cut.
+
+---
+
+## 4. Tool Result Truncation at Write Time
+
+**Current state**: `TOOL_RESULT_MAX_CHARS = 2_000` in `constants.ts`, but this limit is only applied *during compaction summarization*, not when the tool result enters the message store. A bash result returning 50KB of log output is stored and re-sent verbatim until compaction fires.
+
+**Opportunity**: Truncate at write time in `messages.ts` → `convertToLlm()` or in the tool result handler. Two strategies:
+
+- **Hard truncation**: Slice at N chars, append `"\n[truncated — {original_length} chars]"`. Simple, zero overhead.
+- **Semantic head/tail**: Keep first 500 chars (context, command echo) + last 1000 chars (final output, errors). Better for bash results where the end contains the error.
+
+**Recommendation**: Semantic head/tail as the default, configurable per tool type. File read results benefit from head; bash/test output benefits from head+tail.
+
+---
+
+## 5. Context File Deduplication and Trim
+
+**Current state** (`packages/pi-coding-agent/src/core/resource-loader.ts`, lines 84–109):
+- Searches from `~/.gsd/agent/` → ancestor dirs → cwd
+- Deduplicates by *file path* but not by *content*
+- Entire file content concatenated verbatim into system prompt — no trimming, no summarization
+
+**Anti-pattern**: A project with AGENTS.md at 3 ancestor levels (repo root, workspace, home) injects all three in full. If they share common boilerplate, that content is re-injected multiple times.
+
+**Opportunities**:
+1. **Content deduplication**: Hash paragraph-level chunks; skip any chunk already seen in a previously-loaded file
+2. **Section-aware loading**: Parse `## ` headings in AGENTS.md; only include sections relevant to the current task type (e.g., `## Testing` section only when running tests)
+3. **Token budget enforcement**: If total context files exceed N tokens, summarize oldest/most-distant file rather than including verbatim
+
+---
+
+## 6. Skill Content Lazy Loading and Summarization
+
+**Current state**: When `/skill:name` is invoked, the full skill file content is injected inline in the user message. No chunking, no summarization. A 10KB skill file adds ~2,500 tokens to that turn.
+
+**Opportunity**:
+- **Cached skill injection**: If the same skill is used across multiple turns (rare but possible), it's re-injected each time. Cache with `cache_control` after first injection.
+- **Skill digest mode**: Inject a 200-token summary of the skill on first reference; full content only if the model requests it via a `get_skill_detail` tool call. Reduces cost for skills that don't end up being followed.
+- **Skill prefetching**: Before a known long session (e.g., auto-mode start), pre-inject all likely skills with `cache_control` so they're cached for the entire session.
+
+---
+
+## 7. Token Estimation Accuracy
+
+**Current state** (`compaction.ts`, line 216): `chars / 4` heuristic. This underestimates token count for English prose (~3.5 chars/token) and underestimates further for code with short identifiers or Unicode.
+
+**Opportunity**: Use a proper tokenizer.
+- `@anthropic-ai/tokenizer` (tiktoken-compatible, ships with the SDK) — accurate but ~5ms per call
+- Tiered approach: use chars/4 for display; use proper tokenizer only for compaction threshold decisions (where accuracy matters)
+
+**Impact**: More accurate compaction timing, fewer unnecessary compactions, slightly better `COMPACTION_KEEP_RECENT_TOKENS` boundary placement.
+
+---
+
+## 8. Format: Markdown over XML for Internal Context
+
+**Current state**: The message pipeline wraps content in paired XML tags in several places. System prompt sections are largely prose Markdown.
+
+**Findings**: XML tags carry 15–40% more tokens than equivalent Markdown for the same semantic content, due to paired open/close tags. However, Claude was optimized for XML and shows higher accuracy on tasks requiring precise section parsing.
+
+**Recommendation**: Audit XML usage in the pipeline and convert to Markdown where the content is:
+- Non-nested (flat instructions, status messages)
+- Human-readable rather than machine-parsed by the model
+- Not requiring precise boundary detection
+
+Keep XML for: few-shot examples with ambiguous boundaries, skill content (requires precise isolation from surrounding text), compaction summaries that the model must treat as authoritative history.
+
+**Estimated savings**: 5–15% reduction in system prompt token count.
+
+---
+
+## 9. Dynamic Tool Set Delivery
+
+**Current state**: All tool definitions are included in every LLM request. Tool descriptions consume 60–80% of input tokens in static configurations. As new extensions register tools, the baseline grows linearly.
+
+**Opportunity** (higher complexity): Implement the three-function Dynamic Toolset pattern:
+1. `search_tools(query)` — semantic search over tool catalog
+2. `describe_tools(ids[])` — fetch full schemas on demand
+3. `execute_tool(id, params)` — unchanged execution
+
+Speakeasy measured 91–97% token reduction with 100% task success rate. Trade-off: 2–3x more tool calls, ~50% longer wall time. Net cost dramatically lower.
+
+**Feasibility for pi**: The tool registry (`packages/pi-coding-agent/src/core/tool-registry.ts`) already stores tool metadata separately from definitions. The primary engineering work is the semantic search index and the `describe_tools` / `search_tools` tool implementations.
+
+---
+
+## 10. Cost Attribution and Per-Phase Reporting
+
+**Current state**: `SessionManager.getUsageTotals()` accumulates cost across the entire session. No per-phase or per-agent breakdown is stored. Cost visibility is limited to the footer total and `GSD_SHOW_TOKEN_COST=1` per-turn display.
+
+**Opportunity**: Emit structured cost events that extensions can subscribe to:
+```typescript
+interface CostCheckpointEvent {
+ type: "cost_checkpoint";
+ label: string; // "discuss-phase", "execute-slice-3"
+ deltaTokens: Usage; // tokens since last checkpoint
+ cumulativeTokens: Usage;
+ cumulativeCost: number;
+}
+```
+
+GSD extension could consume these events to surface per-milestone cost in `/gsd stats` and flag milestones that are disproportionately expensive — enabling budget-aware planning.
+
+---
+
+## Implementation Ordering (if pursued)
+
+| Priority | Item | Effort | Expected Impact |
+|----------|------|--------|-----------------|
+| 1 | Prompt caching (`cache_control`) | Low | 80–90% input cost reduction |
+| 2 | Earlier compaction threshold (70%) | Trivial | Reduces drift in long sessions |
+| 3 | Tool result truncation at write time | Low | Reduces context bloat between compactions |
+| 4 | Context file deduplication | Medium | Variable — high for multi-level AGENTS.md setups |
+| 5 | Observation masking (default `transformContext`) | Medium | 50%+ on long-running agents |
+| 6 | Token estimation (proper tokenizer) | Low | Accuracy improvement, minor cost impact |
+| 7 | Markdown over XML audit | Low | 5–15% system prompt reduction |
+| 8 | Skill caching with `cache_control` | Low | Meaningful for skill-heavy sessions |
+| 9 | Dynamic tool set delivery | High | 90%+ on large tool catalogs; major architecture change |
+| 10 | Per-phase cost attribution events | Medium | Visibility only; enables future budget routing |
diff --git a/docs/token-optimization.md b/docs/token-optimization.md
index 5c5ea3466..4a3a423af 100644
--- a/docs/token-optimization.md
+++ b/docs/token-optimization.md
@@ -262,15 +262,59 @@ PREFERENCES.md
├─ resolveProfileDefaults() → model defaults + phase skip defaults
├─ resolveInlineLevel() → standard
│ └─ prompt builders gate context inclusion by level
- └─ classifyUnitComplexity() → routes to execution/execution_simple model
- ├─ task plan analysis (steps, files, signals)
- ├─ unit type defaults
- ├─ budget pressure adjustment
- └─ adaptive learning from routing-history.json
+ ├─ classifyUnitComplexity() → routes to execution/execution_simple model
+ │ ├─ task plan analysis (steps, files, signals)
+ │ ├─ unit type defaults
+ │ ├─ budget pressure adjustment
+ │ ├─ adaptive learning from routing-history.json
+ │ └─ capability scoring (when capability_routing: true)
+ │ └─ 7-dimension model profiles × task requirement vectors
+ └─ context_management
+ ├─ observation masking (before_provider_request hook)
+ ├─ tool result truncation (tool_result_max_chars)
+ └─ phase handoff anchors (injected into prompt builders)
```
The profile is resolved once and flows through the entire dispatch pipeline. Explicit preferences override profile defaults at every layer.
+## Observation Masking
+
+*Introduced in v2.59.0*
+
+During auto-mode sessions, tool results accumulate in the conversation history and consume context window space. Observation masking replaces tool result content older than N user turns with a lightweight placeholder before each LLM call. This reduces token usage with zero LLM overhead — no summarization calls, no latency.
+
+Masking is enabled by default during auto-mode. Configure via preferences:
+
+```yaml
+context_management:
+ observation_masking: true # default: true (set false to disable)
+ observation_mask_turns: 8 # keep results from last 8 user turns (range: 1-50)
+ tool_result_max_chars: 800 # truncate individual tool results beyond this length
+```
+
+### How It Works
+
+1. Before each provider request, the `before_provider_request` hook inspects the messages array
+2. Tool results (`toolResult`, `bashExecution`) older than the configured turn threshold are replaced with `[result masked — within summarized history]`
+3. Recent tool results (within the keep window) are preserved in full
+4. All assistant and user messages are always preserved — only tool result content is masked
+
+This pairs with the existing compaction system: masking reduces context pressure between compactions, and compaction handles the full context reset when the window fills.
+
+### Tool Result Truncation
+
+Individual tool results that exceed `tool_result_max_chars` (default: 800) are truncated with a `…[truncated]` marker. This prevents a single large tool output from dominating the context window.
+
+## Phase Handoff Anchors
+
+*Introduced in v2.59.0*
+
+When auto-mode transitions between phases (research → planning → execution), structured JSON anchors are written to `.gsd/milestones/<milestone-id>/anchors/<phase>.json`. Downstream prompt builders inject these anchors so the next phase inherits intent, decisions, blockers, and next steps without re-inferring from artifact files.
+
+This reduces context drift — the 65% of enterprise agent failures caused by agents losing track of prior decisions across phase boundaries.
+
+Anchors are written automatically after successful completion of `research-milestone`, `research-slice`, `plan-milestone`, and `plan-slice` units. No configuration needed.
+
## Prompt Compression
*Introduced in v2.29.0*
diff --git a/src/resources/extensions/gsd/auto-model-selection.ts b/src/resources/extensions/gsd/auto-model-selection.ts
index 60cca2663..cf2326e35 100644
--- a/src/resources/extensions/gsd/auto-model-selection.ts
+++ b/src/resources/extensions/gsd/auto-model-selection.ts
@@ -9,7 +9,7 @@ import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent";
import type { GSDPreferences } from "./preferences.js";
import { resolveModelWithFallbacksForUnit, resolveDynamicRoutingConfig } from "./preferences.js";
import type { ComplexityTier } from "./complexity-classifier.js";
-import { classifyUnitComplexity, tierLabel } from "./complexity-classifier.js";
+import { classifyUnitComplexity, tierLabel, extractTaskMetadata } from "./complexity-classifier.js";
import { resolveModelForComplexity, escalateTier } from "./model-router.js";
import { getLedger, getProjectTotals } from "./metrics.js";
import { unitPhaseLabel } from "./auto-dashboard.js";
@@ -107,7 +107,15 @@ export async function selectAndApplyModel(
}
}
- const routingResult = resolveModelForComplexity(classification, modelConfig, routingConfig, availableModelIds);
+ // Extract task metadata for capability scoring
+ const taskMeta = unitType === "execute-task"
+ ? extractTaskMetadata(unitId, basePath)
+ : undefined;
+
+ const routingResult = resolveModelForComplexity(
+ classification, modelConfig, routingConfig, availableModelIds,
+ unitType, taskMeta,
+ );
if (routingResult.wasDowngraded) {
effectiveModelConfig = {
@@ -115,8 +123,9 @@ export async function selectAndApplyModel(
fallbacks: routingResult.fallbacks,
};
if (verbose) {
+ const method = routingResult.selectionMethod === "capability-scored" ? "capability-scored" : "tier-only";
ctx.ui.notify(
- `Dynamic routing [${tierLabel(classification.tier)}]: ${routingResult.modelId} (${classification.reason})`,
+ `Dynamic routing [${tierLabel(classification.tier)}]: ${routingResult.modelId} (${method} — ${classification.reason})`,
"info",
);
}
diff --git a/src/resources/extensions/gsd/auto-prompts.ts b/src/resources/extensions/gsd/auto-prompts.ts
index 5b6e9de5b..33000a526 100644
--- a/src/resources/extensions/gsd/auto-prompts.ts
+++ b/src/resources/extensions/gsd/auto-prompts.ts
@@ -26,6 +26,7 @@ import { existsSync } from "node:fs";
import { computeBudgets, resolveExecutorContextWindow, truncateAtSectionBoundary } from "./context-budget.js";
import { getPendingGates } from "./gsd-db.js";
import { formatDecisionsCompact, formatRequirementsCompact } from "./structured-data-formatter.js";
+import { readPhaseAnchor, formatAnchorForPrompt } from "./phase-anchor.js";
// ─── Preamble Cap ─────────────────────────────────────────────────────────────
@@ -906,6 +907,11 @@ export async function buildPlanMilestonePrompt(mid: string, midTitle: string, ba
const researchRel = relMilestoneFile(base, mid, "RESEARCH");
const inlined: string[] = [];
+
+ // Inject phase handoff anchor from research phase (if available)
+ const researchAnchor = readPhaseAnchor(base, mid, "research-milestone");
+ if (researchAnchor) inlined.push(formatAnchorForPrompt(researchAnchor));
+
inlined.push(await inlineFile(contextPath, contextRel, "Milestone Context"));
const researchInline = await inlineFileOptional(researchPath, researchRel, "Milestone Research");
if (researchInline) inlined.push(researchInline);
@@ -1033,6 +1039,11 @@ export async function buildPlanSlicePrompt(
const researchRel = relSliceFile(base, mid, sid, "RESEARCH");
const inlined: string[] = [];
+
+ // Inject phase handoff anchor from research phase (if available)
+ const researchSliceAnchor = readPhaseAnchor(base, mid, "research-slice");
+ if (researchSliceAnchor) inlined.push(formatAnchorForPrompt(researchSliceAnchor));
+
inlined.push(await inlineFile(roadmapPath, roadmapRel, "Milestone Roadmap"));
const researchInline = await inlineFileOptional(researchPath, researchRel, "Slice Research");
if (researchInline) inlined.push(researchInline);
@@ -1100,6 +1111,9 @@ export async function buildExecuteTaskPrompt(
: { level: level as InlineLevel | undefined };
const inlineLevel = opts.level ?? resolveInlineLevel();
+ // Inject phase handoff anchor from planning phase (if available)
+ const planAnchor = readPhaseAnchor(base, mid, "plan-slice");
+
const priorSummaries = opts.carryForwardPaths ?? await getPriorTaskSummaryPaths(mid, sid, tid, base);
const priorLines = priorSummaries.length > 0
? priorSummaries.map(p => `- \`${p}\``).join("\n")
@@ -1190,9 +1204,12 @@ export async function buildExecuteTaskPrompt(
? `### Runtime Context\nSource: \`.gsd/RUNTIME.md\`\n\n${runtimeContent.trim()}`
: "";
+ const phaseAnchorSection = planAnchor ? formatAnchorForPrompt(planAnchor) : "";
+
return loadPrompt("execute-task", {
overridesSection,
runtimeContext,
+ phaseAnchorSection,
workingDirectory: base,
milestoneId: mid, sliceId: sid, sliceTitle: sTitle, taskId: tid, taskTitle: tTitle,
planPath: join(base, relSliceFile(base, mid, sid, "PLAN")),
diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts
index 620fe6809..5b0caaa1c 100644
--- a/src/resources/extensions/gsd/auto/phases.ts
+++ b/src/resources/extensions/gsd/auto/phases.ts
@@ -1205,6 +1205,23 @@ export async function runUnitPhase(
s.unitRecoveryCount.delete(`${unitType}/${unitId}`);
}
+ // Write phase handoff anchor after successful research/planning completion
+ const anchorPhases = new Set(["research-milestone", "research-slice", "plan-milestone", "plan-slice"]);
+ if (artifactVerified && mid && anchorPhases.has(unitType)) {
+ try {
+ const { writePhaseAnchor } = await import("../phase-anchor.js");
+ writePhaseAnchor(s.basePath, mid, {
+ phase: unitType,
+ milestoneId: mid,
+ generatedAt: new Date().toISOString(),
+ intent: `Completed ${unitType} for ${unitId}`,
+ decisions: [],
+ blockers: [],
+ nextSteps: [],
+ });
+ } catch { /* non-fatal — anchor is advisory */ }
+ }
+
deps.emitJournalEvent({ ts: new Date().toISOString(), flowId: ic.flowId, seq: ic.nextSeq(), eventType: "unit-end", data: { unitType, unitId, status: unitResult.status, artifactVerified, ...(unitResult.errorContext ? { errorContext: unitResult.errorContext } : {}) }, causedBy: { flowId: ic.flowId, seq: unitStartSeq } });
return { action: "next", data: { unitStartedAt: s.currentUnit?.startedAt } };
diff --git a/src/resources/extensions/gsd/bootstrap/register-hooks.ts b/src/resources/extensions/gsd/bootstrap/register-hooks.ts
index d7504fa52..d76b046a1 100644
--- a/src/resources/extensions/gsd/bootstrap/register-hooks.ts
+++ b/src/resources/extensions/gsd/bootstrap/register-hooks.ts
@@ -263,13 +263,62 @@ export function registerHooks(pi: ExtensionAPI): void {
});
pi.on("before_provider_request", async (event) => {
- const modelId = event.model?.id;
- if (!modelId) return;
- const { getEffectiveServiceTier, supportsServiceTier } = await import("../service-tier.js");
- const tier = getEffectiveServiceTier();
- if (!tier || !supportsServiceTier(modelId)) return;
const payload = event.payload as Record<string, unknown> | null;
if (!payload || typeof payload !== "object") return;
+
+ // ── Observation Masking ─────────────────────────────────────────────
+ // Replace old tool results with placeholders to reduce context bloat.
+ // Only active during auto-mode when context_management.observation_masking is enabled.
+ if (isAutoActive()) {
+ try {
+ const { loadEffectiveGSDPreferences } = await import("../preferences.js");
+ const prefs = loadEffectiveGSDPreferences();
+ const cmConfig = prefs?.preferences.context_management;
+
+ // Observation masking: replace old tool results with placeholders
+ if (cmConfig?.observation_masking !== false) {
+ const keepTurns = cmConfig?.observation_mask_turns ?? 8;
+ const { createObservationMask } = await import("../context-masker.js");
+ const mask = createObservationMask(keepTurns);
+ const messages = payload.messages;
+ if (Array.isArray(messages)) {
+ payload.messages = mask(messages);
+ }
+ }
+
+ // Tool result truncation: cap individual tool result content length.
+ // In pi-ai format, toolResult messages have role: "toolResult" and content: TextContent[].
+ // Creates new objects to avoid mutating shared conversation state.
+ const maxChars = cmConfig?.tool_result_max_chars ?? 800;
+ const msgs = payload.messages;
+ if (Array.isArray(msgs)) {
+ payload.messages = msgs.map((msg: Record<string, unknown>) => {
+ // Match toolResult messages (role: "toolResult", content is array of content blocks)
+ if (msg?.role === "toolResult" && Array.isArray(msg.content)) {
+ const blocks = msg.content as Array<Record<string, unknown>>;
+ const totalLen = blocks.reduce((sum: number, b) => sum + (typeof b.text === "string" ? b.text.length : 0), 0);
+ if (totalLen > maxChars) {
+ const truncated = blocks.map(b => {
+ if (typeof b.text === "string" && b.text.length > maxChars) {
+ return { ...b, text: b.text.slice(0, maxChars) + "\n…[truncated]" };
+ }
+ return b;
+ });
+ return { ...msg, content: truncated };
+ }
+ }
+ return msg;
+ });
+ }
+ } catch { /* non-fatal */ }
+ }
+
+ // ── Service Tier ────────────────────────────────────────────────────
+ const modelId = event.model?.id;
+ if (!modelId) return payload;
+ const { getEffectiveServiceTier, supportsServiceTier } = await import("../service-tier.js");
+ const tier = getEffectiveServiceTier();
+ if (!tier || !supportsServiceTier(modelId)) return payload;
payload.service_tier = tier;
return payload;
});
diff --git a/src/resources/extensions/gsd/captures.ts b/src/resources/extensions/gsd/captures.ts
index 645d907f6..052c43211 100644
--- a/src/resources/extensions/gsd/captures.ts
+++ b/src/resources/extensions/gsd/captures.ts
@@ -15,7 +15,7 @@ import { gsdRoot } from "./paths.js";
// ─── Types ────────────────────────────────────────────────────────────────────
-export type Classification = "quick-task" | "inject" | "defer" | "replan" | "note";
+export type Classification = "quick-task" | "inject" | "defer" | "replan" | "note" | "stop" | "backtrack";
export interface CaptureEntry {
id: string;
@@ -42,7 +42,7 @@ export interface TriageResult {
const CAPTURES_FILENAME = "CAPTURES.md";
const VALID_CLASSIFICATIONS: readonly string[] = [
- "quick-task", "inject", "defer", "replan", "note",
+ "quick-task", "inject", "defer", "replan", "note", "stop", "backtrack",
];
// ─── Path Resolution ──────────────────────────────────────────────────────────
diff --git a/src/resources/extensions/gsd/complexity-classifier.ts b/src/resources/extensions/gsd/complexity-classifier.ts
index c7ae14dbf..114178810 100644
--- a/src/resources/extensions/gsd/complexity-classifier.ts
+++ b/src/resources/extensions/gsd/complexity-classifier.ts
@@ -212,7 +212,7 @@ function analyzePlanComplexity(
/**
* Extract task metadata from the task plan file on disk.
*/
-function extractTaskMetadata(unitId: string, basePath: string): TaskMetadata {
+export function extractTaskMetadata(unitId: string, basePath: string): TaskMetadata {
const meta: TaskMetadata = {};
const { milestone: mid, slice: sid, task: tid } = parseUnitId(unitId);
if (!mid || !sid || !tid) return meta;
diff --git a/src/resources/extensions/gsd/context-masker.ts b/src/resources/extensions/gsd/context-masker.ts
new file mode 100644
index 000000000..824c3a91e
--- /dev/null
+++ b/src/resources/extensions/gsd/context-masker.ts
@@ -0,0 +1,74 @@
+/**
+ * Observation masking for GSD auto-mode sessions.
+ *
+ * Replaces tool result content older than N turns with a placeholder.
+ * Reduces context bloat between compactions with zero LLM overhead.
+ * Preserves message ordering, roles, and all assistant/user messages.
+ *
+ * Operates on the pi-ai Message[] format (post-convertToLlm, pre-provider):
+ * - toolResult messages: { role: "toolResult", content: TextContent[] }
+ * - bash results are already converted to: { role: "user", content: [{type:"text",text:"..."}] }
+ * and start with "Ran `" from bashExecutionToText.
+ */
+
+interface MaskableMessage {
+ role: string;
+ content: unknown;
+ type?: string;
+ [key: string]: unknown;
+}
+
+const MASK_PLACEHOLDER = "[result masked — within summarized history]";
+const MASK_CONTENT_BLOCK = [{ type: "text" as const, text: MASK_PLACEHOLDER }];
+
+function findTurnBoundary(messages: MaskableMessage[], keepRecentTurns: number): number {
+ let turnsSeen = 0;
+ for (let i = messages.length - 1; i >= 0; i--) {
+ const m = messages[i];
+ // In the LLM payload, genuine user turns have role "user".
+ // Tool results have role "toolResult" and are excluded by this check.
+ if (m.role === "user") {
+ // Skip bash-result user messages (converted from bashExecution) — these aren't real user turns
+ if (isBashResultUserMessage(m)) continue;
+ turnsSeen++;
+ if (turnsSeen >= keepRecentTurns) return i;
+ }
+ }
+ return 0;
+}
+
+/**
+ * Detect user messages that originated from bashExecution.
+ * After convertToLlm, these are {role: "user", content: [{type:"text", text:"Ran `cmd`\n..."}]}.
+ * The bashExecutionToText format always starts with "Ran `".
+ */
+function isBashResultUserMessage(m: MaskableMessage): boolean {
+ if (m.role !== "user" || !Array.isArray(m.content)) return false;
+ const first = m.content[0];
+ return first && typeof first === "object" && "text" in first &&
+ typeof first.text === "string" && first.text.startsWith("Ran `");
+}
+
+function isMaskableMessage(m: MaskableMessage): boolean {
+ // Tool result messages (role: "toolResult" in pi-ai format)
+ if (m.role === "toolResult") return true;
+ // Bash-result user messages (converted from bashExecution by convertToLlm)
+ if (isBashResultUserMessage(m)) return true;
+ return false;
+}
+
+export function createObservationMask(keepRecentTurns: number = 8) {
+ return (messages: MaskableMessage[]): MaskableMessage[] => {
+ const boundary = findTurnBoundary(messages, keepRecentTurns);
+ if (boundary === 0) return messages;
+
+ return messages.map((m, i) => {
+ if (i >= boundary) return m;
+ if (isMaskableMessage(m)) {
+ // Content may be string or array of content blocks — always replace with array
+ return { ...m, content: MASK_CONTENT_BLOCK };
+ }
+ return m;
+ });
+ };
+}
diff --git a/src/resources/extensions/gsd/docs/preferences-reference.md b/src/resources/extensions/gsd/docs/preferences-reference.md
index 8f110ce37..2ae6b8b6e 100644
--- a/src/resources/extensions/gsd/docs/preferences-reference.md
+++ b/src/resources/extensions/gsd/docs/preferences-reference.md
@@ -189,6 +189,13 @@ Setting `prefer_skills: []` does **not** disable skill discovery — it just mea
- `budget_pressure`: boolean — downgrade model tier when budget is under pressure. Default: `true`.
- `cross_provider`: boolean — allow routing across different providers. Default: `true`.
- `hooks`: boolean — enable routing hooks. Default: `true`.
+ - `capability_routing`: boolean — enable capability-profile scoring for model selection within a tier. Requires `enabled: true`. Default: `false`.
+
+- `context_management`: configures context hygiene for auto-mode sessions. Keys:
+ - `observation_masking`: boolean — mask old tool results to reduce context bloat. Default: `true`.
+ - `observation_mask_turns`: number — keep this many recent turns verbatim (1-50). Default: `8`.
+ - `compaction_threshold_percent`: number — trigger compaction at this % of context window (0.5-0.95). Lower values fire compaction earlier, reducing drift. Default: `0.70`.
+ - `tool_result_max_chars`: number — max chars per tool result in GSD sessions (200-10000). Default: `800`.
- `auto_visualize`: boolean — show a visualizer hint after each milestone completion in auto-mode. Default: `false`.
diff --git a/src/resources/extensions/gsd/model-router.ts b/src/resources/extensions/gsd/model-router.ts
index f97a69561..5b45ef9b4 100644
--- a/src/resources/extensions/gsd/model-router.ts
+++ b/src/resources/extensions/gsd/model-router.ts
@@ -10,6 +10,7 @@ import type { ResolvedModelConfig } from "./preferences.js";
export interface DynamicRoutingConfig {
enabled?: boolean;
+ capability_routing?: boolean; // default: false — enable capability profile scoring
tier_models?: {
light?: string;
standard?: string;
@@ -32,6 +33,12 @@ export interface RoutingDecision {
wasDowngraded: boolean;
/** Human-readable reason for this decision */
reason: string;
+ /** How the model was selected. */
+ selectionMethod?: "tier-only" | "capability-scored";
+ /** Capability scores per model (when capability-scored). */
+ capabilityScores?: Record<string, number>;
+ /** Task requirement vector (when capability-scored). */
+ taskRequirements?: Partial<Record<keyof ModelCapabilities, number>>;
}
// ─── Known Model Tiers ───────────────────────────────────────────────────────
@@ -114,6 +121,91 @@ const MODEL_COST_PER_1K_INPUT: Record<string, number> = {
"deepseek-chat": 0.00014,
};
+// ─── Capability Profiles (ADR-004 Phase 2) ──────────────────────────────────
+// 7-dimension profiles, 0–100 normalized. Models without a profile
+// score 50 uniformly — capability scoring is a no-op for them.
+
+export interface ModelCapabilities {
+ coding: number;
+ debugging: number;
+ research: number;
+ reasoning: number;
+ speed: number;
+ longContext: number;
+ instruction: number;
+}
+
+export const MODEL_CAPABILITY_PROFILES: Record<string, ModelCapabilities> = {
+ "claude-opus-4-6": { coding: 95, debugging: 90, research: 85, reasoning: 95, speed: 30, longContext: 80, instruction: 90 },
+ "claude-sonnet-4-6": { coding: 85, debugging: 80, research: 75, reasoning: 80, speed: 60, longContext: 75, instruction: 85 },
+ "claude-haiku-4-5": { coding: 60, debugging: 50, research: 45, reasoning: 50, speed: 95, longContext: 50, instruction: 75 },
+ "gpt-4o": { coding: 80, debugging: 75, research: 70, reasoning: 75, speed: 65, longContext: 70, instruction: 80 },
+ "gpt-4o-mini": { coding: 55, debugging: 45, research: 40, reasoning: 45, speed: 90, longContext: 45, instruction: 70 },
+ "gemini-2.5-pro": { coding: 75, debugging: 70, research: 85, reasoning: 75, speed: 55, longContext: 90, instruction: 75 },
+ "gemini-2.0-flash": { coding: 50, debugging: 40, research: 50, reasoning: 40, speed: 95, longContext: 60, instruction: 65 },
+ "deepseek-chat": { coding: 75, debugging: 65, research: 55, reasoning: 70, speed: 70, longContext: 55, instruction: 65 },
+ "o3": { coding: 80, debugging: 85, research: 80, reasoning: 92, speed: 25, longContext: 70, instruction: 85 },
+};
+
+const BASE_REQUIREMENTS: Record<string, Partial<Record<keyof ModelCapabilities, number>>> = {
+ "execute-task": { coding: 0.9, instruction: 0.7, speed: 0.3 },
+ "research-milestone": { research: 0.9, longContext: 0.7, reasoning: 0.5 },
+ "research-slice": { research: 0.9, longContext: 0.7, reasoning: 0.5 },
+ "plan-milestone": { reasoning: 0.9, coding: 0.5 },
+ "plan-slice": { reasoning: 0.9, coding: 0.5 },
+ "replan-slice": { reasoning: 0.9, debugging: 0.6, coding: 0.5 },
+ "reassess-roadmap": { reasoning: 0.9, research: 0.5 },
+ "complete-slice": { instruction: 0.8, speed: 0.7 },
+ "run-uat": { instruction: 0.7, speed: 0.8 },
+ "discuss-milestone": { reasoning: 0.6, instruction: 0.7 },
+ "complete-milestone": { instruction: 0.8, reasoning: 0.5 },
+};
+
+/**
+ * Compute a task requirement vector from unit type and optional metadata.
+ */
+export function computeTaskRequirements(
+ unitType: string,
+ metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number },
+): Partial<Record<keyof ModelCapabilities, number>> {
+ const base = { ...(BASE_REQUIREMENTS[unitType] ?? { reasoning: 0.5 }) };
+
+ if (unitType === "execute-task" && metadata) {
+ if (metadata.tags?.some(t => /^(docs?|readme|comment|config|typo|rename)$/i.test(t))) {
+ return { ...base, instruction: 0.9, coding: 0.3, speed: 0.7 };
+ }
+ if (metadata.complexityKeywords?.some(k => k === "concurrency" || k === "compatibility")) {
+ return { ...base, debugging: 0.9, reasoning: 0.8 };
+ }
+ if (metadata.complexityKeywords?.some(k => k === "migration" || k === "architecture")) {
+ return { ...base, reasoning: 0.9, coding: 0.8 };
+ }
+ if ((metadata.fileCount ?? 0) >= 6 || (metadata.estimatedLines ?? 0) >= 500) {
+ return { ...base, coding: 0.9, reasoning: 0.7 };
+ }
+ }
+
+ return base;
+}
+
+/**
+ * Score a model against a task requirement vector.
+ * Returns weighted average in range 0–100. Returns 50 for empty requirements.
+ */
+export function scoreModel(
+ capabilities: ModelCapabilities,
+ requirements: Partial<Record<keyof ModelCapabilities, number>>,
+): number {
+ let weightedSum = 0;
+ let weightSum = 0;
+ for (const [dim, weight] of Object.entries(requirements)) {
+ const capability = capabilities[dim as keyof ModelCapabilities] ?? 50;
+ weightedSum += weight * capability;
+ weightSum += weight;
+ }
+ return weightSum > 0 ? weightedSum / weightSum : 50;
+}
+
// ─── Public API ──────────────────────────────────────────────────────────────
/**
@@ -132,6 +224,8 @@ export function resolveModelForComplexity(
phaseConfig: ResolvedModelConfig | undefined,
routingConfig: DynamicRoutingConfig,
availableModelIds: string[],
+ unitType?: string,
+ metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number },
): RoutingDecision {
// If no phase config or routing disabled, pass through
if (!phaseConfig || !routingConfig.enabled) {
@@ -175,25 +269,40 @@ export function resolveModelForComplexity(
}
// Find the best model for the requested tier
- const targetModelId = findModelForTier(
- requestedTier,
- routingConfig,
- availableModelIds,
- routingConfig.cross_provider !== false,
- );
+ const useCapabilityScoring = routingConfig.capability_routing && unitType;
+
+ let targetModelId: string | null;
+ let capabilityScores: Record<string, number> | undefined;
+ let taskRequirements: Partial<Record<keyof ModelCapabilities, number>> | undefined;
+ let selectionMethod: "tier-only" | "capability-scored" = "tier-only";
+
+ if (useCapabilityScoring) {
+ const result = findModelForTierWithCapability(
+ requestedTier, routingConfig, availableModelIds,
+ routingConfig.cross_provider !== false, unitType, metadata,
+ );
+ targetModelId = result.modelId;
+ capabilityScores = Object.keys(result.scores).length > 0 ? result.scores : undefined;
+ taskRequirements = Object.keys(result.requirements).length > 0 ? result.requirements : undefined;
+ selectionMethod = capabilityScores ? "capability-scored" : "tier-only";
+ } else {
+ targetModelId = findModelForTier(
+ requestedTier, routingConfig, availableModelIds,
+ routingConfig.cross_provider !== false,
+ );
+ }
if (!targetModelId) {
- // No suitable model found — use configured primary
return {
modelId: configuredPrimary,
fallbacks: phaseConfig.fallbacks,
tier: requestedTier,
wasDowngraded: false,
reason: `no ${requestedTier}-tier model available`,
+ selectionMethod,
};
}
- // Build fallback chain: [downgraded_model, ...configured_fallbacks, configured_primary]
const fallbacks = [
...phaseConfig.fallbacks.filter(f => f !== targetModelId),
configuredPrimary,
@@ -205,6 +314,9 @@ export function resolveModelForComplexity(
tier: requestedTier,
wasDowngraded: true,
reason: classification.reason,
+ selectionMethod,
+ capabilityScores,
+ taskRequirements,
};
}
@@ -226,6 +338,7 @@ export function escalateTier(currentTier: ComplexityTier): ComplexityTier | null
export function defaultRoutingConfig(): DynamicRoutingConfig {
return {
enabled: true,
+ capability_routing: false,
escalate_on_failure: true,
budget_pressure: true,
cross_provider: true,
@@ -298,6 +411,56 @@ function findModelForTier(
return candidates[0] ?? null;
}
+function findModelForTierWithCapability(
+ tier: ComplexityTier,
+ config: DynamicRoutingConfig,
+ availableModelIds: string[],
+ crossProvider: boolean,
+ unitType: string,
+ metadata?: { tags?: string[]; complexityKeywords?: string[]; fileCount?: number; estimatedLines?: number },
+): { modelId: string | null; scores: Record<string, number>; requirements: Partial<Record<keyof ModelCapabilities, number>> } {
+ const explicitModel = config.tier_models?.[tier];
+ if (explicitModel) {
+ const match = availableModelIds.find(id => {
+ const bareAvail = id.includes("/") ? id.split("/").pop()! : id;
+ const bareExplicit = explicitModel.includes("/") ? explicitModel.split("/").pop()! : explicitModel;
+ return bareAvail === bareExplicit || id === explicitModel;
+ });
+ if (match) return { modelId: match, scores: {}, requirements: {} };
+ }
+
+ const requirements = computeTaskRequirements(unitType, metadata);
+ const candidates = availableModelIds.filter(id => getModelTier(id) === tier);
+ if (candidates.length === 0) return { modelId: null, scores: {}, requirements };
+
+ const scores: Record<string, number> = {};
+ for (const id of candidates) {
+ const bareId = id.includes("/") ? id.split("/").pop()! : id;
+ const profile = getModelProfile(bareId);
+ scores[id] = scoreModel(profile, requirements);
+ }
+
+ candidates.sort((a, b) => {
+ const scoreDiff = scores[b] - scores[a];
+ if (Math.abs(scoreDiff) > 2) return scoreDiff;
+ if (crossProvider) {
+ const costDiff = getModelCost(a) - getModelCost(b);
+ if (costDiff !== 0) return costDiff;
+ }
+ return a.localeCompare(b);
+ });
+
+ return { modelId: candidates[0], scores, requirements };
+}
+
+function getModelProfile(bareId: string): ModelCapabilities {
+ if (MODEL_CAPABILITY_PROFILES[bareId]) return MODEL_CAPABILITY_PROFILES[bareId];
+ for (const [knownId, profile] of Object.entries(MODEL_CAPABILITY_PROFILES)) {
+ if (bareId.includes(knownId) || knownId.includes(bareId)) return profile;
+ }
+ return { coding: 50, debugging: 50, research: 50, reasoning: 50, speed: 50, longContext: 50, instruction: 50 };
+}
+
function getModelCost(modelId: string): number {
const bareId = modelId.includes("/") ? modelId.split("/").pop()! : modelId;
diff --git a/src/resources/extensions/gsd/phase-anchor.ts b/src/resources/extensions/gsd/phase-anchor.ts
new file mode 100644
index 000000000..16f1df5e1
--- /dev/null
+++ b/src/resources/extensions/gsd/phase-anchor.ts
@@ -0,0 +1,71 @@
+/**
+ * Phase handoff anchors — compact structured summaries written between
+ * GSD auto-mode phases so downstream agents inherit decisions, blockers,
+ * and intent without re-inferring from scratch.
+ */
+
+import { existsSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
+import { join } from "node:path";
+import { gsdRoot } from "./paths.js";
+
+export interface PhaseAnchor {
+ phase: string;
+ milestoneId: string;
+ generatedAt: string;
+ intent: string;
+ decisions: string[];
+ blockers: string[];
+ nextSteps: string[];
+}
+
+function anchorsDir(basePath: string, milestoneId: string): string {
+ return join(gsdRoot(basePath), "milestones", milestoneId, "anchors");
+}
+
+function anchorPath(basePath: string, milestoneId: string, phase: string): string {
+ return join(anchorsDir(basePath, milestoneId), `${phase}.json`);
+}
+
+export function writePhaseAnchor(basePath: string, milestoneId: string, anchor: PhaseAnchor): void {
+ const dir = anchorsDir(basePath, milestoneId);
+ if (!existsSync(dir)) {
+ mkdirSync(dir, { recursive: true });
+ }
+ writeFileSync(anchorPath(basePath, milestoneId, anchor.phase), JSON.stringify(anchor, null, 2), "utf-8");
+}
+
+export function readPhaseAnchor(basePath: string, milestoneId: string, phase: string): PhaseAnchor | null {
+ const path = anchorPath(basePath, milestoneId, phase);
+ if (!existsSync(path)) return null;
+ try {
+ return JSON.parse(readFileSync(path, "utf-8")) as PhaseAnchor;
+ } catch {
+ return null;
+ }
+}
+
+export function formatAnchorForPrompt(anchor: PhaseAnchor): string {
+ const lines: string[] = [
+ `## Handoff from ${anchor.phase}`,
+ "",
+ `**Intent:** ${anchor.intent}`,
+ ];
+
+ if (anchor.decisions.length > 0) {
+ lines.push("", "**Decisions:**");
+ for (const d of anchor.decisions) lines.push(`- ${d}`);
+ }
+
+ if (anchor.blockers.length > 0) {
+ lines.push("", "**Blockers:**");
+ for (const b of anchor.blockers) lines.push(`- ${b}`);
+ }
+
+ if (anchor.nextSteps.length > 0) {
+ lines.push("", "**Next steps:**");
+ for (const s of anchor.nextSteps) lines.push(`- ${s}`);
+ }
+
+ lines.push("", "---");
+ return lines.join("\n");
+}
diff --git a/src/resources/extensions/gsd/preferences-types.ts b/src/resources/extensions/gsd/preferences-types.ts
index 7ae8c9bda..4356badca 100644
--- a/src/resources/extensions/gsd/preferences-types.ts
+++ b/src/resources/extensions/gsd/preferences-types.ts
@@ -21,6 +21,13 @@ import type {
GateEvaluationConfig,
} from "./types.js";
import type { DynamicRoutingConfig } from "./model-router.js";
+
+export interface ContextManagementConfig {
+ observation_masking?: boolean; // default: true
+ observation_mask_turns?: number; // default: 8, range: 1-50
+ compaction_threshold_percent?: number; // default: 0.70, range: 0.5-0.95
+ tool_result_max_chars?: number; // default: 800, range: 200-10000
+}
import type { GitHubSyncConfig } from "../github-sync/types.js";
// ─── Workflow Modes ──────────────────────────────────────────────────────────
@@ -94,6 +101,7 @@ export const KNOWN_PREFERENCE_KEYS = new Set([
"forensics_dedup",
"show_token_cost",
"stale_commit_threshold_minutes",
+ "context_management",
"experimental",
]);
@@ -227,6 +235,7 @@ export interface GSDPreferences {
post_unit_hooks?: PostUnitHookConfig[];
pre_dispatch_hooks?: PreDispatchHookConfig[];
dynamic_routing?: DynamicRoutingConfig;
+ context_management?: ContextManagementConfig;
token_profile?: TokenProfile;
phases?: PhaseSkipPreferences;
auto_visualize?: boolean;
diff --git a/src/resources/extensions/gsd/preferences-validation.ts b/src/resources/extensions/gsd/preferences-validation.ts
index 6b4e0e217..57a715521 100644
--- a/src/resources/extensions/gsd/preferences-validation.ts
+++ b/src/resources/extensions/gsd/preferences-validation.ts
@@ -428,6 +428,10 @@ export function validatePreferences(preferences: GSDPreferences): {
if (typeof dr.hooks === "boolean") validDr.hooks = dr.hooks;
else errors.push("dynamic_routing.hooks must be a boolean");
}
+ if (dr.capability_routing !== undefined) {
+ if (typeof dr.capability_routing === "boolean") validDr.capability_routing = dr.capability_routing;
+ else errors.push("dynamic_routing.capability_routing must be a boolean");
+ }
if (dr.tier_models !== undefined) {
if (typeof dr.tier_models === "object" && dr.tier_models !== null) {
const tm = dr.tier_models as Record<string, unknown>;
@@ -452,6 +456,40 @@ export function validatePreferences(preferences: GSDPreferences): {
}
}
+ // ─── Context Management ──────────────────────────────────────────────
+ if (preferences.context_management !== undefined) {
+ if (typeof preferences.context_management === "object" && preferences.context_management !== null) {
+ const cm = preferences.context_management as unknown as Record<string, unknown>;
+ const validCm: Record<string, unknown> = {};
+
+ if (cm.observation_masking !== undefined) {
+ if (typeof cm.observation_masking === "boolean") validCm.observation_masking = cm.observation_masking;
+ else errors.push("context_management.observation_masking must be a boolean");
+ }
+ if (cm.observation_mask_turns !== undefined) {
+ const turns = cm.observation_mask_turns;
+ if (typeof turns === "number" && turns >= 1 && turns <= 50) validCm.observation_mask_turns = turns;
+ else errors.push("context_management.observation_mask_turns must be a number between 1 and 50");
+ }
+ if (cm.compaction_threshold_percent !== undefined) {
+ const pct = cm.compaction_threshold_percent;
+ if (typeof pct === "number" && pct >= 0.5 && pct <= 0.95) validCm.compaction_threshold_percent = pct;
+ else errors.push("context_management.compaction_threshold_percent must be a number between 0.5 and 0.95");
+ }
+ if (cm.tool_result_max_chars !== undefined) {
+ const chars = cm.tool_result_max_chars;
+ if (typeof chars === "number" && chars >= 200 && chars <= 10000) validCm.tool_result_max_chars = chars;
+ else errors.push("context_management.tool_result_max_chars must be a number between 200 and 10000");
+ }
+
+ if (Object.keys(validCm).length > 0) {
+ validated.context_management = validCm as any;
+ }
+ } else {
+ errors.push("context_management must be an object");
+ }
+ }
+
// ─── Parallel Config ────────────────────────────────────────────────────
if (preferences.parallel && typeof preferences.parallel === "object") {
const p = preferences.parallel as unknown as Record;
diff --git a/src/resources/extensions/gsd/prompts/execute-task.md b/src/resources/extensions/gsd/prompts/execute-task.md
index b433638ac..f1f22fe86 100644
--- a/src/resources/extensions/gsd/prompts/execute-task.md
+++ b/src/resources/extensions/gsd/prompts/execute-task.md
@@ -12,6 +12,8 @@ A researcher explored the codebase and a planner decomposed the work — you are
{{runtimeContext}}
+{{phaseAnchorSection}}
+
{{resumeSection}}
{{carryForwardSection}}
diff --git a/src/resources/extensions/gsd/tests/context-masker.test.ts b/src/resources/extensions/gsd/tests/context-masker.test.ts
new file mode 100644
index 000000000..e09f11c14
--- /dev/null
+++ b/src/resources/extensions/gsd/tests/context-masker.test.ts
@@ -0,0 +1,122 @@
+import test from "node:test";
+import assert from "node:assert/strict";
+
+import { createObservationMask } from "../context-masker.js";
+
+// These helpers produce messages in the pi-ai LLM payload format
+// (post-convertToLlm, pre-provider), which is what before_provider_request sees.
+
+function userMsg(content: string) {
+ return { role: "user", content: [{ type: "text", text: content }] };
+}
+
+function assistantMsg(content: string) {
+ return { role: "assistant", content: [{ type: "text", text: content }] };
+}
+
+/** toolResult in pi-ai format: role "toolResult", content as TextContent[] */
+function toolResult(text: string) {
+ return { role: "toolResult", content: [{ type: "text", text }], toolCallId: "toolu_test", toolName: "Read", isError: false };
+}
+
+/** bashExecution after convertToLlm: becomes a user message with "Ran `cmd`" prefix */
+function bashResult(text: string) {
+ return { role: "user", content: [{ type: "text", text: `Ran \`echo test\`\n\`\`\`\n${text}\n\`\`\`` }] };
+}
+
+const MASK_TEXT = "[result masked — within summarized history]";
+
+test("masks nothing when message count is within keepRecentTurns", () => {
+ const mask = createObservationMask(8);
+ const messages = [
+ userMsg("hello"),
+ assistantMsg("hi"),
+ toolResult("file contents"),
+ ];
+ const result = mask(messages as any);
+ assert.equal(result.length, 3);
+ assert.deepEqual((result[2].content as any)[0].text, "file contents");
+});
+
+test("masks tool results older than keepRecentTurns", () => {
+ const mask = createObservationMask(2);
+ const messages = [
+ userMsg("turn 1"),
+ toolResult("old tool output"),
+ assistantMsg("response 1"),
+ userMsg("turn 2"),
+ toolResult("newer tool output"),
+ assistantMsg("response 2"),
+ userMsg("turn 3"),
+ toolResult("newest tool output"),
+ assistantMsg("response 3"),
+ ];
+ const result = mask(messages as any);
+ // Old tool result (before boundary) should be masked
+ assert.equal((result[1].content as any)[0].text, MASK_TEXT);
+ // Recent tool results (within keep window) should be preserved
+ assert.equal((result[4].content as any)[0].text, "newer tool output");
+ assert.equal((result[7].content as any)[0].text, "newest tool output");
+});
+
+test("never masks assistant messages", () => {
+ const mask = createObservationMask(1);
+ const messages = [
+ userMsg("turn 1"),
+ assistantMsg("old reasoning"),
+ userMsg("turn 2"),
+ assistantMsg("new reasoning"),
+ ];
+ const result = mask(messages as any);
+ assert.equal((result[1].content as any)[0].text, "old reasoning");
+ assert.equal((result[3].content as any)[0].text, "new reasoning");
+});
+
+test("never masks user messages", () => {
+ const mask = createObservationMask(1);
+ const messages = [
+ userMsg("old user message"),
+ assistantMsg("response"),
+ userMsg("new user message"),
+ assistantMsg("response"),
+ ];
+ const result = mask(messages as any);
+ assert.equal((result[0].content as any)[0].text, "old user message");
+});
+
+test("masks bash result user messages", () => {
+ const mask = createObservationMask(1);
+ const messages = [
+ userMsg("turn 1"),
+ bashResult("huge log output"),
+ assistantMsg("response 1"),
+ userMsg("turn 2"),
+ assistantMsg("response 2"),
+ ];
+ const result = mask(messages as any);
+ assert.equal((result[1].content as any)[0].text, MASK_TEXT);
+});
+
+test("returns same array length", () => {
+ const mask = createObservationMask(1);
+ const messages = [
+ userMsg("a"), toolResult("b"), assistantMsg("c"),
+ userMsg("d"), toolResult("e"), assistantMsg("f"),
+ ];
+ const result = mask(messages as any);
+ assert.equal(result.length, messages.length);
+});
+
+test("masks toolResult by role, not by type field", () => {
+ const mask = createObservationMask(1);
+ const messages = [
+ userMsg("turn 1"),
+ // This is the actual pi-ai format: role "toolResult", no type field
+ { role: "toolResult", content: [{ type: "text", text: "old result" }], toolCallId: "t1", toolName: "Read", isError: false },
+ assistantMsg("response 1"),
+ userMsg("turn 2"),
+ assistantMsg("response 2"),
+ ];
+ const result = mask(messages as any);
+ assert.equal((result[1].content as any)[0].text, MASK_TEXT);
+});
diff --git a/src/resources/extensions/gsd/tests/model-router.test.ts b/src/resources/extensions/gsd/tests/model-router.test.ts
index fb1128eb5..f15977495 100644
--- a/src/resources/extensions/gsd/tests/model-router.test.ts
+++ b/src/resources/extensions/gsd/tests/model-router.test.ts
@@ -5,8 +5,11 @@ import {
resolveModelForComplexity,
escalateTier,
defaultRoutingConfig,
+ scoreModel,
+ computeTaskRequirements,
+ MODEL_CAPABILITY_PROFILES,
} from "../model-router.js";
-import type { DynamicRoutingConfig, RoutingDecision } from "../model-router.js";
+import type { DynamicRoutingConfig, RoutingDecision, ModelCapabilities } from "../model-router.js";
import type { ClassificationResult } from "../complexity-classifier.js";
// ─── Helpers ─────────────────────────────────────────────────────────────────
@@ -206,6 +209,89 @@ test("#2192: known model is still downgraded normally", () => {
assert.notEqual(result.modelId, "claude-opus-4-6");
});
+// ─── Capability Scoring (ADR-004 Phase 2) ───────────────────────────────────
+
+test("defaultRoutingConfig includes capability_routing: false", () => {
+ const config = defaultRoutingConfig();
+ assert.equal(config.capability_routing, false);
+});
+
+test("scoreModel computes weighted average of capability × requirement", () => {
+ const caps: ModelCapabilities = {
+ coding: 90, debugging: 80, research: 70,
+ reasoning: 85, speed: 50, longContext: 60, instruction: 75,
+ };
+ const reqs = { coding: 0.9, reasoning: 0.5 };
+ const score = scoreModel(caps, reqs);
+ // Expected: (0.9*90 + 0.5*85) / (0.9 + 0.5) = (81 + 42.5) / 1.4 = 88.21...
+ assert.ok(Math.abs(score - 88.21) < 0.1, `score ${score} should be ~88.21`);
+});
+
+test("scoreModel returns 50 for empty requirements", () => {
+ const caps: ModelCapabilities = {
+ coding: 90, debugging: 80, research: 70,
+ reasoning: 85, speed: 50, longContext: 60, instruction: 75,
+ };
+ const score = scoreModel(caps, {});
+ assert.equal(score, 50);
+});
+
+test("computeTaskRequirements returns base vector for known unit type", () => {
+ const reqs = computeTaskRequirements("execute-task");
+ assert.ok(reqs.coding !== undefined && reqs.coding > 0);
+});
+
+test("computeTaskRequirements boosts instruction for docs-tagged tasks", () => {
+ const reqs = computeTaskRequirements("execute-task", { tags: ["docs"] });
+ assert.ok((reqs.instruction ?? 0) >= 0.8);
+ assert.ok((reqs.coding ?? 1) <= 0.4);
+});
+
+test("computeTaskRequirements returns generic vector for unknown unit type", () => {
+ const reqs = computeTaskRequirements("unknown-unit");
+ assert.ok(reqs.reasoning !== undefined);
+});
+
+test("resolveModelForComplexity uses capability scoring when enabled", () => {
+ const config: DynamicRoutingConfig = {
+ ...defaultRoutingConfig(),
+ enabled: true,
+ capability_routing: true,
+ };
+ const result = resolveModelForComplexity(
+ makeClassification("light"),
+ { primary: "claude-opus-4-6", fallbacks: [] },
+ config,
+ ["claude-opus-4-6", "claude-haiku-4-5", "gpt-4o-mini"],
+ "execute-task",
+ );
+ assert.equal(result.wasDowngraded, true);
+ assert.equal(result.selectionMethod, "capability-scored");
+});
+
+test("resolveModelForComplexity falls back to tier-only when capability_routing is false", () => {
+ const config: DynamicRoutingConfig = {
+ ...defaultRoutingConfig(),
+ enabled: true,
+ capability_routing: false,
+ };
+ const result = resolveModelForComplexity(
+ makeClassification("light"),
+ { primary: "claude-opus-4-6", fallbacks: [] },
+ config,
+ ["claude-opus-4-6", "claude-haiku-4-5", "gpt-4o-mini"],
+ );
+ assert.equal(result.wasDowngraded, true);
+ assert.ok(!result.selectionMethod || result.selectionMethod === "tier-only");
+});
+
+test("MODEL_CAPABILITY_PROFILES has entries for core models", () => {
+ const profiledModels = Object.keys(MODEL_CAPABILITY_PROFILES);
+ assert.ok(profiledModels.length >= 9, `Expected ≥9 profiles, got ${profiledModels.length}`);
+ assert.ok(MODEL_CAPABILITY_PROFILES["claude-opus-4-6"]);
+ assert.ok(MODEL_CAPABILITY_PROFILES["claude-haiku-4-5"]);
+});
+
// ─── #2885: openai-codex and modern OpenAI models in tier map ────────────────
test("#2885: openai-codex light-tier models are recognized", () => {
diff --git a/src/resources/extensions/gsd/tests/phase-anchor.test.ts b/src/resources/extensions/gsd/tests/phase-anchor.test.ts
new file mode 100644
index 000000000..825bb6cc8
--- /dev/null
+++ b/src/resources/extensions/gsd/tests/phase-anchor.test.ts
@@ -0,0 +1,83 @@
+import test from "node:test";
+import assert from "node:assert/strict";
+import { mkdtempSync, mkdirSync, rmSync, existsSync } from "node:fs";
+import { join } from "node:path";
+import { tmpdir } from "node:os";
+
+import { writePhaseAnchor, readPhaseAnchor, formatAnchorForPrompt } from "../phase-anchor.js";
+import type { PhaseAnchor } from "../phase-anchor.js";
+
+function makeTempBase(): string {
+ const tmp = mkdtempSync(join(tmpdir(), "gsd-anchor-test-"));
+ mkdirSync(join(tmp, ".gsd", "milestones", "M001", "anchors"), { recursive: true });
+ return tmp;
+}
+
+test("writePhaseAnchor creates anchor file in correct location", () => {
+ const base = makeTempBase();
+ try {
+ const anchor: PhaseAnchor = {
+ phase: "discuss",
+ milestoneId: "M001",
+ generatedAt: new Date().toISOString(),
+ intent: "Define authentication requirements",
+ decisions: ["Use JWT tokens", "Session expiry 24h"],
+ blockers: [],
+ nextSteps: ["Plan the implementation slices"],
+ };
+ writePhaseAnchor(base, "M001", anchor);
+ assert.ok(existsSync(join(base, ".gsd", "milestones", "M001", "anchors", "discuss.json")));
+ } finally {
+ rmSync(base, { recursive: true, force: true });
+ }
+});
+
+test("readPhaseAnchor returns written anchor", () => {
+ const base = makeTempBase();
+ try {
+ const anchor: PhaseAnchor = {
+ phase: "plan",
+ milestoneId: "M001",
+ generatedAt: new Date().toISOString(),
+ intent: "Break work into slices",
+ decisions: ["3 slices: auth, UI, tests"],
+ blockers: ["Need DB schema first"],
+ nextSteps: ["Execute S01"],
+ };
+ writePhaseAnchor(base, "M001", anchor);
+ const read = readPhaseAnchor(base, "M001", "plan");
+ assert.ok(read);
+ assert.equal(read!.intent, "Break work into slices");
+ assert.deepEqual(read!.decisions, ["3 slices: auth, UI, tests"]);
+ assert.deepEqual(read!.blockers, ["Need DB schema first"]);
+ } finally {
+ rmSync(base, { recursive: true, force: true });
+ }
+});
+
+test("readPhaseAnchor returns null when no anchor exists", () => {
+ const base = makeTempBase();
+ try {
+ const read = readPhaseAnchor(base, "M001", "discuss");
+ assert.equal(read, null);
+ } finally {
+ rmSync(base, { recursive: true, force: true });
+ }
+});
+
+test("formatAnchorForPrompt produces markdown block", () => {
+ const anchor: PhaseAnchor = {
+ phase: "discuss",
+ milestoneId: "M001",
+ generatedAt: "2026-04-03T00:00:00.000Z",
+ intent: "Define requirements",
+ decisions: ["Use JWT"],
+ blockers: [],
+ nextSteps: ["Plan slices"],
+ };
+ const md = formatAnchorForPrompt(anchor);
+ assert.ok(md.includes("## Handoff from discuss"));
+ assert.ok(md.includes("Define requirements"));
+ assert.ok(md.includes("Use JWT"));
+ assert.ok(md.includes("Plan slices"));
+});
diff --git a/src/resources/extensions/gsd/triage-ui.ts b/src/resources/extensions/gsd/triage-ui.ts
index a9b81f46f..da9030d41 100644
--- a/src/resources/extensions/gsd/triage-ui.ts
+++ b/src/resources/extensions/gsd/triage-ui.ts
@@ -49,10 +49,18 @@ const CLASSIFICATION_LABELS: Record