merge: resolve upstream/main conflicts for PR #3177

2026-04-11 22:59:58 -05:00 · 2026-04-11 22:59:58 -05:00 · 4b69e44a42
commit 4b69e44a42
parent 92a3460b66 12ed853dc3
1004 changed files with 89413 additions and 5192 deletions
--- a/.github/workflows/ai-triage.yml
+++ b/.github/workflows/ai-triage.yml
@ -96,41 +96,47 @@ jobs:
            Be generous in your assessment — only flag clear violations. Ambiguous cases should be marked as aligned.
            Do NOT flag issues/PRs that are legitimately reporting bugs or requesting features, even if they could be better written.`;

-            const response = await fetch('https://api.anthropic.com/v1/messages', {
-              method: 'POST',
-              headers: {
-                'x-api-key': process.env.ANTHROPIC_API_KEY,
-                'content-type': 'application/json',
-                'anthropic-version': '2023-06-01'
-              },
-              body: JSON.stringify({
-                model: 'claude-haiku-4-5-20251001',
-                max_tokens: 1024,
-                messages: [{ role: 'user', content: prompt }]
-              })
-            });
-
-            if (!response.ok) {
-              const err = await response.text();
-              core.setFailed(`Anthropic API error: ${response.status} ${err}`);
-              return;
-            }
-
-            const data = await response.json();
-            const text = data.content[0].text;
-
-            // Extract JSON from response (handle markdown code blocks)
-            const jsonMatch = text.match(/\{[\s\S]*\}/);
-            if (!jsonMatch) {
-              core.setFailed(`Could not parse Claude response: ${text}`);
+            if (!process.env.ANTHROPIC_API_KEY) {
+              core.warning('Skipping AI triage because ANTHROPIC_API_KEY is not configured.');
              return;
            }

            let result;
            try {
+              const response = await fetch('https://api.anthropic.com/v1/messages', {
+                method: 'POST',
+                headers: {
+                  'x-api-key': process.env.ANTHROPIC_API_KEY,
+                  'content-type': 'application/json',
+                  'anthropic-version': '2023-06-01'
+                },
+                body: JSON.stringify({
+                  model: 'claude-haiku-4-5-20251001',
+                  max_tokens: 1024,
+                  messages: [{ role: 'user', content: prompt }]
+                }),
+                signal: AbortSignal.timeout(20000)
+              });
+
+              if (!response.ok) {
+                const err = await response.text();
+                core.warning(`Skipping AI triage after Anthropic API error: ${response.status} ${err}`);
+                return;
+              }
+
+              const data = await response.json();
+              const text = data.content?.[0]?.text ?? '';
+
+              // Extract JSON from response (handle markdown code blocks)
+              const jsonMatch = text.match(/\{[\s\S]*\}/);
+              if (!jsonMatch) {
+                core.warning(`Skipping AI triage because the model response was not parseable JSON: ${text}`);
+                return;
+              }
+
              result = JSON.parse(jsonMatch[0]);
            } catch (e) {
-              core.setFailed(`JSON parse error: ${e.message}\nRaw text: ${text}`);
+              core.warning(`Skipping AI triage after unexpected failure: ${e.message}`);
              return;
            }
            core.info(`Triage result: ${JSON.stringify(result, null, 2)}`);
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@ -155,7 +155,7 @@ jobs:
        run: npm run test:coverage

  windows-portability:
-    timeout-minutes: 15
+    timeout-minutes: 25
    needs: detect-changes
    if: >-
      needs.detect-changes.outputs.docs-only != 'true'
@ -180,12 +180,17 @@ jobs:
      - name: Typecheck extensions
        run: npm run typecheck:extensions

-      - name: Run unit tests
-        run: npm run test:unit
-
      - name: Run package tests
        run: npm run test:packages

+      - name: Run Windows portability tests
+        run: >-
+          node --import ./src/resources/extensions/gsd/tests/resolve-ts.mjs
+          --experimental-strip-types --test
+          src/tests/windows-portability.test.ts
+          src/resources/extensions/gsd/tests/validate-directory.test.ts
+          src/tests/integration/web-mode-windows-hide.test.ts
+
  rtk-portability:
    timeout-minutes: 20
    needs: detect-changes
--- a/.gitignore
+++ b/.gitignore
@ -2,6 +2,16 @@
 # ── Compiled test output ──
 dist-test/

+# ── Compiled output in src/ (should only contain .ts source) ──
+src/**/*.js
+src/**/*.js.map
+src/**/*.d.ts
+src/**/*.d.ts.map
+!src/**/*.test.js
+
+# ── Repowise index (local machine-generated cache) ──
+.repowise/
+
 # ── GSD project state (development-only, lives in worktree branches) ──
 package-lock.json
 .claude/
@ -42,6 +52,9 @@ tmp/
 packages/*/dist/
 packages/*/node_modules/

+# ── Scratch/WIP files ──
+preflight-script.ts
+
 # ── GSD baseline (auto-generated) ──
 dist/
 !/pkg/dist/modes/
@ -55,6 +68,7 @@ TODOS.md
 .planning/
 .audits/
 docs/coherence-audit/
+.plans/

 # ── GSD project state (per-worktree, never committed) ──
 .gsd/
@ -65,3 +79,6 @@ bun.lock

 # ── GSD baseline (auto-generated) ──
 .gsd
+
+# ── GSD baseline (auto-generated) ──
+.gsd-id
--- a/.mcp.json
+++ b/.mcp.json
@ -0,0 +1,14 @@
+{
+  "mcpServers": {
+    "repowise": {
+      "command": "repowise",
+      "args": [
+        "mcp",
+        "/Users/jeremymcspadden/Github/gsd-2",
+        "--transport",
+        "stdio"
+      ],
+      "description": "repowise: codebase intelligence \u2014 docs, graph, git signals, dead code, decisions"
+    }
+  }
+}
--- a/.plans/extension-loading-multi-path.md
+++ b/.plans/extension-loading-multi-path.md
@ -0,0 +1,138 @@
+# Extension Loading: Dependency Sort + Unified Enable/Disable
+
+## Context
+
+GSD-2 has a well-structured extension system with three discovery paths (bundled, global/community, project-local) that are **already wired up** through pi's `DefaultPackageManager.addAutoDiscoveredResources()`. However, two critical gaps remain:
+
+1. `sortExtensionPaths()` (topological dependency sort) is implemented but **never called** — `dependencies.extensions` in manifests is decorative
+2. The GSD extension registry (enable/disable) only applies to **bundled** extensions — community extensions bypass it entirely
+
+### Architecture (Current Flow)
+
+```
+GSD loader.ts
+  → discoverExtensionEntryPaths(bundledExtDir)
+  → filter by GSD registry (isExtensionEnabled)
+  → set GSD_BUNDLED_EXTENSION_PATHS env var
+      ↓
+DefaultResourceLoader.reload()
+  → packageManager.resolve()
+    → addAutoDiscoveredResources()
+      → project: cwd/.gsd/extensions/     (CONFIG_DIR_NAME = ".gsd")
+      → global:  ~/.gsd/agent/extensions/  (includes synced bundled)
+  → loadExtensions(mergedPaths)            ← NO sort, NO registry check on community
+```
+
+### Key Files
+
+| File | Role |
+|------|------|
+| `src/loader.ts` (lines 146-161) | GSD startup — bundled discovery + registry filter |
+| `src/extension-sort.ts` | Topological sort (Kahn's BFS) — EXISTS but NEVER CALLED |
+| `src/extension-registry.ts` | Registry I/O, enable/disable, tier checks |
+| `src/resource-loader.ts` (lines 589-607) | `buildResourceLoader()` — constructs DefaultResourceLoader |
+| `packages/pi-coding-agent/src/core/resource-loader.ts` (lines 311-395) | `reload()` — merges paths, calls `loadExtensions()` |
+| `packages/pi-coding-agent/src/core/package-manager.ts` (lines 1585-1700) | `addAutoDiscoveredResources()` — auto-discovers from .gsd/ dirs |
+| `packages/pi-coding-agent/src/core/extensions/loader.ts` (lines 945-1002) | `discoverAndLoadExtensions()` — DEAD CODE, never invoked |
+
+---
+
+## Plan
+
+### Task 1: Wire topological sort into extension loading
+
+**What:** Call `sortExtensionPaths()` on the merged extension paths before passing them to `loadExtensions()`.
+
+**Where:** `packages/pi-coding-agent/src/core/resource-loader.ts` ~line 381-385
+
+**Before:**
+```typescript
+const extensionsResult = await loadExtensions(extensionPaths, this.cwd, this.eventBus);
+```
+
+**After:**
+```typescript
+import { sortExtensionPaths } from '../../../../src/extension-sort.js';
+
+const { sortedPaths, warnings } = sortExtensionPaths(extensionPaths);
+for (const w of warnings) {
+  // emit as diagnostic, not hard error
+}
+const extensionsResult = await loadExtensions(sortedPaths, this.cwd, this.eventBus);
+```
+
+**Consideration:** `sortExtensionPaths` lives in `src/` (GSD side), not in `packages/pi-coding-agent/`. Need to either:
+- (a) Move it into pi-coding-agent as a shared utility, OR
+- (b) Import it cross-package (already done for other GSD→pi imports), OR
+- (c) Call it on the GSD side before paths reach pi — harder since auto-discovered paths are added inside pi's package manager
+
+Option (a) is cleanest — the sort logic's only dependency is `readManifestFromEntryPath`, which also lives on the GSD side (in `src/extension-registry.ts`) but could be duplicated or shared.
+
+### Task 2: Apply GSD registry to community extensions
+
+**What:** When `buildResourceLoader()` in `src/resource-loader.ts` constructs the DefaultResourceLoader, also discover and filter community extensions from `~/.gsd/agent/extensions/` through the GSD registry — same as it already does for `~/.pi/agent/extensions/` paths.
+
+**Where:** `src/resource-loader.ts` → `buildResourceLoader()` (lines 589-607)
+
+**Current code already filters pi extensions:**
+```typescript
+const piExtensionPaths = discoverExtensionEntryPaths(piExtensionsDir)
+  .filter((entryPath) => !bundledKeys.has(getExtensionKey(entryPath, piExtensionsDir)))
+  .filter((entryPath) => {
+    const manifest = readManifestFromEntryPath(entryPath)
+    if (!manifest) return true
+    return isExtensionEnabled(registry, manifest.id)
+  })
+```
+
+**Add similar filtering for community extensions in agentDir:**
+- Discover extensions in `~/.gsd/agent/extensions/` that are NOT bundled
+- Filter through `isExtensionEnabled(registry, manifest.id)`
+- Pass as disabled (via override patterns or pre-filtering) to the resource loader
+
+**Alternative approach:** Hook into `addAutoDiscoveredResources` or the `addResource` call to check the GSD registry. This might be cleaner since the auto-discovery already happens inside pi's package manager.
+
+### Task 3: Emit sort warnings as diagnostics
+
+**What:** Surface dependency warnings (missing deps, cycles) through GSD's diagnostic system so users see them.
+
+**Where:** Wherever the sort is invoked from Task 1.
+
+**Format:**
+```
+⚠ Extension 'gsd-watch' declares dependency 'gsd' which is not installed — loading anyway
+⚠ Extensions 'foo' and 'bar' form a dependency cycle — loading in alphabetical order
+```
+
+### Task 4: Clean up dead code
+
+**What:** The `discoverAndLoadExtensions()` function in `packages/pi-coding-agent/src/core/extensions/loader.ts` (lines 945-1002) is exported but never invoked. The project-local trust model inside it (`getUntrustedExtensionPaths`) also never runs.
+
+**Options:**
+- (a) Remove it entirely — it's dead
+- (b) Mark deprecated — in case upstream pi uses it
+- (c) Leave it — lowest risk
+
+Recommend (b) for now — add `@deprecated` JSDoc so it doesn't grow new callers.
+
+### Task 5: Tests
+
+- **Sort integration test:** Create two extensions where A depends on B. Verify B loads before A after sort.
+- **Registry community test:** Drop a community extension in `~/.gsd/agent/extensions/`, run `gsd extensions disable <id>`, verify it doesn't load.
+- **Conflict test:** Same extension ID in project-local and global — verify project-local wins.
+- **Missing dep test:** Extension declares dependency on non-existent extension — verify warning emitted, extension still loads.
+- **Cycle test:** Two extensions that depend on each other — verify warning, both load.
+
+---
+
+## Follow-up PR (separate)
+
+**Subagent extension forwarding:** Update `src/resources/extensions/subagent/index.ts` to forward ALL extension paths (not just bundled) to child processes. May need a second env var like `GSD_COMMUNITY_EXTENSION_PATHS` or consolidate into `GSD_EXTENSION_PATHS`.
+
+---
+
+## Open Questions
+
+1. **Where should `sortExtensionPaths` live?** Currently in `src/` (GSD side). Needs to be callable from pi's resource-loader. Options: move to pi, keep and import cross-package, or duplicate.
+2. **Should community extensions respect the same registry as bundled?** Or should they have their own enable/disable mechanism? Current plan unifies them.
+3. **Project-local trust:** The TOFU model in the dead `discoverAndLoadExtensions()` never runs. Should `addAutoDiscoveredResources` also gate project-local extensions behind trust? Or is `.gsd/extensions/` in your own project always trusted?
--- a/.plans/ollama-native-provider.md
+++ b/.plans/ollama-native-provider.md
@ -0,0 +1,241 @@
+# Ollama Extension — First-Class Local LLM Support
+
+## Status: DRAFT — Awaiting approval
+
+## Problem
+
+Ollama support in GSD2 currently requires manual `models.json` configuration. Users must:
+1. Know the OpenAI-compatibility endpoint (`localhost:11434/v1`)
+2. Manually list every model they want to use
+3. Set compat flags (`supportsDeveloperRole: false`, etc.)
+4. Use a dummy API key
+
+There's an `ollama-cloud` provider for hosted Ollama, and a discovery adapter that can list models, but no first-class **local Ollama** extension that "just works."
+
+## Goal
+
+Make Ollama the easiest way to use GSD2 — zero config when Ollama is running locally. All Ollama functionality lives in a single extension: `src/resources/extensions/ollama/`.
+
+## Architecture
+
+Everything is a self-contained extension under `src/resources/extensions/ollama/`. The extension:
+- Auto-detects Ollama on startup via health check
+- Discovers and registers local models with the model registry
+- Provides native Ollama API streaming (not OpenAI shim)
+- Exposes `/ollama` slash commands for model management
+- Registers an LLM-callable tool for model pull/status
+
+Minimal core changes — only `KnownProvider` and `KnownApi` type additions in `pi-ai`, and `env-api-keys.ts` for key resolution. Everything else is in the extension.
+
+## File Structure
+
+```
+src/resources/extensions/ollama/
+├── index.ts                  # Extension entry — wires everything on session_start
+├── ollama-client.ts          # HTTP client for Ollama REST API (/api/*)
+├── ollama-discovery.ts       # Model discovery + capability detection
+├── ollama-provider.ts        # Native /api/chat streaming provider (registers with pi-ai)
+├── ollama-commands.ts        # /ollama slash commands (status, pull, list, remove, ps)
+├── ollama-tool.ts            # LLM-callable tool for model management
+├── model-capabilities.ts     # Known model capability table (context window, vision, reasoning)
+└── types.ts                  # Shared types for Ollama API responses
+```
+
+## Scope
+
+### Phase 1: Auto-Discovery + OpenAI-Compat Routing
+
+**What:** Extension that auto-detects Ollama, discovers models, registers them using the existing `openai-completions` API provider. Zero config needed.
+
+**Extension files:**
+- `ollama/index.ts` — Main entry. On `session_start`:
+  1. Probe `localhost:11434` (or `OLLAMA_HOST`) with 1.5s timeout
+  2. If reachable, discover models via `/api/tags`
+  3. Register discovered models with `ctx.modelRegistry` using correct defaults
+  4. Show status widget if Ollama is detected
+- `ollama/ollama-client.ts` — Low-level HTTP client:
+  - `isRunning()` — `GET /` health check
+  - `getVersion()` — `GET /api/version`
+  - `listModels()` — `GET /api/tags`
+  - `showModel(name)` — `POST /api/show` (details, template, parameters, size)
+  - `getRunningModels()` — `GET /api/ps` (loaded models, VRAM usage)
+  - `pullModel(name, onProgress)` — `POST /api/pull` (streaming progress)
+  - `deleteModel(name)` — `DELETE /api/delete`
+  - `copyModel(source, dest)` — `POST /api/copy`
+  - Respects `OLLAMA_HOST` env var for non-default endpoints
+- `ollama/ollama-discovery.ts` — Enhanced model discovery:
+  - Calls `/api/tags` to get model list
+  - Calls `/api/show` per model (batch, cached) to get:
+    - `details.parameter_size` → estimate context window
+    - `details.families` → detect vision (clip), reasoning (deepseek-r1)
+    - `modelfile` → extract default parameters
+  - Returns enriched `DiscoveredModel[]` with proper capabilities
+- `ollama/model-capabilities.ts` — Known model lookup table:
+  - Maps well-known model families to capabilities
+  - e.g., `llama3.1` → `{ contextWindow: 131072, input: ["text"] }`
+  - e.g., `llava` → `{ contextWindow: 4096, input: ["text", "image"] }`
+  - e.g., `deepseek-r1` → `{ reasoning: true, contextWindow: 131072 }`
+  - e.g., `qwen2.5-coder` → `{ contextWindow: 131072, input: ["text"] }`
+  - Fallback: estimate from parameter count if not in table
+- `ollama/types.ts` — Ollama API response types
+
+**Core changes (minimal):**
+- `packages/pi-ai/src/types.ts` — Add `"ollama"` to `KnownProvider`
+- `packages/pi-ai/src/env-api-keys.ts` — Add `"ollama"` key resolution (returns `"ollama"` placeholder — no real key needed)
+- `src/onboarding.ts` — Add `"ollama"` to provider selection list
+- `src/wizard.ts` — Add `ollama` entry (no key required)
+
+**Model registration details:**
+Each discovered model registers as:
+```typescript
+{
+  id: "llama3.1:8b",           // from /api/tags
+  name: "Llama 3.1 8B",        // humanized
+  api: "openai-completions",    // uses existing provider
+  provider: "ollama",
+  baseUrl: "http://localhost:11434/v1",
+  cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
+  reasoning: false,             // from capabilities table
+  input: ["text"],              // from capabilities table
+  contextWindow: 131072,        // from capabilities table or /api/show
+  maxTokens: 16384,             // conservative default
+  compat: {
+    supportsDeveloperRole: false,
+    supportsReasoningEffort: false,
+    supportsUsageInStreaming: false,
+    maxTokensField: "max_tokens",
+  },
+}
+```
+
+**Behavior:**
+- `gsd --list-models` shows all locally-pulled Ollama models automatically
+- `/model ollama/llama3.1:8b` works without any config file
+- If Ollama isn't running, extension is silent — no errors, no models listed
+- `models.json` overrides still work (user config wins over auto-discovery)
+
+### Phase 2: Native Ollama API Provider (`/api/chat`)
+
+**What:** A dedicated streaming provider that talks Ollama's native protocol instead of the OpenAI compatibility shim.
+
+**Extension files:**
+- `ollama/ollama-provider.ts` — Native `/api/chat` streaming:
+  - Registers `"ollama-chat"` API with `registerApiProvider()`
+  - Implements `stream()` and `streamSimple()`:
+    - Maps GSD `Context` → Ollama messages format
+    - Maps GSD `Tool[]` → Ollama tool format
+    - Streams NDJSON responses, maps back to `AssistantMessage` events
+    - Extracts `<think>` blocks for reasoning models (deepseek-r1, qwq)
+  - Ollama-specific options:
+    - `keep_alive` — control model memory retention (default: "5m")
+    - `num_ctx` — pass through model's context window
+    - `num_predict` — max output tokens
+    - Temperature, top_p, top_k
+  - Response metadata:
+    - `eval_count` / `eval_duration` → tokens/sec in usage stats
+    - `total_duration`, `load_duration` → performance visibility
+  - Vision support: converts image content to base64 for multimodal models
+
+**Core changes:**
+- `packages/pi-ai/src/types.ts` — Add `"ollama-chat"` to `KnownApi`
+
+**Once Phase 2 lands, models auto-discovered in Phase 1 switch to `api: "ollama-chat"` by default.** Users can force OpenAI-compat via `models.json` override if needed.
+
+**Why native over OpenAI-compat:**
+- Full `keep_alive` / `num_ctx` control
+- Better error messages (Ollama-native vs generic OpenAI)
+- More reliable tool calling on Ollama's native format
+- Performance metrics in response (tokens/sec)
+- Foundation for model management commands
+
+### Phase 3: Local LLM Management UX
+
+**What:** `/ollama` slash commands and an LLM tool for model management.
+
+**Extension files:**
+- `ollama/ollama-commands.ts` — Slash commands registered via `pi.registerCommand()`:
+  - `/ollama` — Status overview:
+    ```
+    Ollama v0.5.7 — running (localhost:11434)
+
+    Loaded:
+      llama3.1:8b       4.7 GB VRAM   idle 3m
+
+    Available:
+      llama3.1:8b       (4.7 GB)
+      qwen2.5-coder:7b  (4.4 GB)
+      deepseek-r1:8b    (4.9 GB)
+    ```
+  - `/ollama pull <model>` — Pull with streaming progress via `ctx.ui.setWidget()`
+  - `/ollama list` — List all local models with sizes and families
+  - `/ollama remove <model>` — Delete a model (with confirmation)
+  - `/ollama ps` — Running models + VRAM usage
+- `ollama/ollama-tool.ts` — LLM-callable tool registered via `pi.registerTool()`:
+  - `ollama_manage` tool — lets the agent pull/list/check models
+  - Parameters: `{ action: "list" | "pull" | "status" | "ps", model?: string }`
+  - Use case: agent detects it needs a model, pulls it automatically
+
+**UX Flow:**
+```
+$ gsd
+> /ollama
+Ollama v0.5.7 — running (localhost:11434)
+Loaded:
+  llama3.1:8b    — 4.7 GB VRAM, idle 3m
+Available:
+  llama3.1:8b    (4.7 GB)
+  qwen2.5-coder:7b (4.4 GB)
+  deepseek-r1:8b (4.9 GB)
+
+> /ollama pull codestral:22b
+Pulling codestral:22b...
+████████████████████████████░░░░ 78% (14.2 GB / 18.1 GB)
+✓ codestral:22b ready
+
+> /model ollama/codestral:22b
+Switched to codestral:22b (local, Ollama)
+```
+
+## Implementation Order
+
+1. **Phase 1** — Auto-discovery with OpenAI-compat routing. Biggest user impact, smallest risk.
+2. **Phase 3** — Management UX (`/ollama` commands). Valuable even before native API.
+3. **Phase 2** — Native `/api/chat` provider. Optimization over OpenAI-compat; do last.
+
+## Core Changes Summary (minimal)
+
+| File | Change |
+|------|--------|
+| `packages/pi-ai/src/types.ts` | Add `"ollama"` to `KnownProvider`, `"ollama-chat"` to `KnownApi` (Phase 2) |
+| `packages/pi-ai/src/env-api-keys.ts` | Add `"ollama"` → always returns `"ollama"` placeholder |
+| `src/onboarding.ts` | Add `"ollama"` to provider picker |
+| `src/wizard.ts` | Add `"ollama"` key mapping (no key required) |
+
+Everything else lives in `src/resources/extensions/ollama/`.
+
+## Risks & Mitigations
+
+| Risk | Mitigation |
+|------|------------|
+| Ollama not running — startup probe latency | 1.5s timeout; cache result; probe async so it doesn't block TUI paint |
+| Model capabilities unknown | Known-model table + `/api/show` fallback + parameter_size estimation |
+| Tool calling unreliable on small models | Detect param count; warn on <7B models |
+| Ollama API changes between versions | Version detect via `/api/version`; stable endpoints only |
+| Conflicts with `models.json` Ollama config | User config always wins; auto-discovered models merge beneath manual config |
+| Extension disabled — no impact on core | Extension is additive; disabling removes all Ollama features cleanly |
+
+## Testing Strategy
+
+- Unit tests: `ollama-client.ts` with mocked fetch responses
+- Unit tests: `ollama-discovery.ts` model capability parsing
+- Unit tests: `ollama-provider.ts` message format mapping + NDJSON stream parsing
+- Unit tests: `model-capabilities.ts` known model lookups
+- Integration test: mock HTTP server simulating Ollama `/api/tags`, `/api/chat`, `/api/pull`
+- Manual test: real Ollama instance with llama3.1, qwen2.5-coder, deepseek-r1
+
+## Open Questions
+
+1. **Startup probe** — Probe Ollama on `session_start` (adds ~1.5s if not running) or lazy on first `/model`? **Recommendation: async probe on session_start (non-blocking), eager if `OLLAMA_HOST` is set.**
+2. **Auto-start** — Try to launch Ollama if installed but not running? **Recommendation: no — too invasive. Show helpful message in `/ollama` status.**
+3. **Vision support** — Support multimodal models (llava, etc.) in Phase 2 native API? **Recommendation: yes, detected via capabilities table.**
+4. **Model refresh** — How often to re-probe Ollama for new models? **Recommendation: on `/ollama list`, on `/model` command, and every 5 min (existing TTL).**
--- a/.prompt-injection-scanignore
+++ b/.prompt-injection-scanignore
@ -0,0 +1,2 @@
+# False positives in GSD prompt templates — these are legitimate LLM instructions, not injection
+src/resources/extensions/gsd/prompts/doctor-heal.md:You are now responsible
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -6,6 +6,592 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).

 ## [Unreleased]

+## [2.71.0] - 2026-04-11
+
+### Added
+- **mcp-server**: add secure_env_collect tool via MCP form elicitation
+
+### Fixed
+- **tui**: clear pinned output on message_end to prevent duplicate display
+- **tui**: clear pinned latest output on turn completion
+- **tui**: restore pinned output above editor during tool execution
+- TOCTOU file locking race conditions in event log and custom workflow graph
+- **tui**: mask secure extension input values in interactive mode
+- **claude-code**: harden MCP elicitation schema handling
+- **claude-code**: accept secure_env_collect MCP elicitation forms
+- **interactive**: keep MCP tool output ordered and restore secure prompt fallback
+- **interactive**: preserve MCP tool output stream ordering
+- **gsd**: resolve workflow MCP test typing regressions
+- **mcp**: return isError flag on workflow tool execution failures
+- **discuss**: add structuredQuestionsAvailable conditional to all gates
+- **discuss**: add multi-round questioning to new-project discuss phase
+- **gsd**: harden claude-code workflow MCP bootstrap
+- **web**: drop provisional pre-tool question text
+
+### Changed
+- extract deriveStateFromDb logic into composable helpers
+- **pr**: drop web-layer changes from MCP stream-order fix
+
+## [2.70.1] - 2026-04-11
+
+### Fixed
+- **routing**: address codex review — complete interactive bypass and accurate banner
+- **routing**: skip dynamic routing for interactive dispatches, always show model changes (#3962)
+- **ci**: trim windows portability integration load
+- **ci**: narrow windows portability coverage
+- **ci**: skip validate-pack in windows portability job
+- **ci**: unblock windows portability follow-up
+- **windows**: harden portability across runtime and tooling
+- **auto**: use pathToFileURL for cross-platform import and reconcile regression test
+- **auto**: resolve resource-loader.js from GSD_PKG_ROOT on resume (#3949)
+- **mcp-server**: importLocalModule resolves src/ paths from dist/ context
+- **gsd**: surface scoped doctor health warnings
+- **gsd**: skip skipped slices in milestone prompts
+- **gsd**: handle doubled-backtick pre-exec paths
+- **update**: fetch latest version from registry
+
+## [2.70.0] - 2026-04-10
+
+### Added
+- **mcp-server**: expose ask_user_questions via elicitation
+
+### Fixed
+- **pi-ai**: remove Anthropic OAuth flow for TOS compliance
+- **mcp-server**: hydrate model credentials into env
+- **mcp-server**: hydrate stored tool credentials on startup
+- **gsd**: auto-enable cmux when detected instead of prompting
+- **mcp-server**: URL scheme regex no longer matches Windows drive letters
+
+## [2.69.0] - 2026-04-10
+
+### Added
+- **gsd**: implement ADR-005 multi-model provider and tool strategy
+- **gsd**: complete ADR-004 capability-aware model routing implementation
+
+### Fixed
+- **gsd**: add missing directories to codebase generator exclude list
+- **gsd**: wire ADR-005 infrastructure into live paths
+- **gsd**: replace empty catch with logWarning for CI compliance
+- **gsd**: merge enhanced context sections into standard template, clean up stale gate patterns
+- **gsd**: remove broken discuss-prepared template, inject briefs into discuss.md
+
+## [2.68.1] - 2026-04-10
+
+### Fixed
+- **ci**: update FILE-SYSTEM-MAP.md path after docs reorganization
+- **test**: update discord invite test path after docs reorganization
+- **gsd**: resolve resource-loader import for deployed extensions
+
+## [2.68.0] - 2026-04-10
+
+### Added
+- expose slice replanning over workflow MCP
+- expose milestone workflow tools over MCP
+- expose slice completion over workflow MCP
+- expose task completion alias over workflow MCP
+- expose GSD planning tools over MCP
+- gate workflow MCP units by provider transport capabilities
+- expose core GSD workflow tools over MCP
+- add contextual tips system for TUI and web terminal
+
+### Fixed
+- **state**: prevent false degraded-mode warning when DB not yet initialized
+- **gsd**: use debugLog in catch block to satisfy empty-catch lint
+- **gsd**: avoid false manifest and skipped-slice warnings
+- **gsd**: replace empty catch block with descriptive comment
+- guard autoCommitDirtyState and restore cwd on MergeConflictError (#2929)
+- Claude Code MCP tool output rendering and real-time streaming
+- **gsd**: surface warnings when DB or STATE.md init fails
+- **gsd**: create gsd.db, runtime/, and STATE.md during init (#3880)
+- **gsd**: suppress workflow stderr during /gsd
+- **gsd**: enforce workflow write gates over MCP
+- restore autoStartTime on resume + replace empty catch blocks (#3585)
+- **mcp**: harden workflow tool boundary
+- **gsd**: accept em-dash none verification rationale
+- **gsd**: resync managed resources on auto resume
+- **gsd**: stop stale forensics context hijacks
+- **gsd**: serialize workflow MCP execution state
+- **gsd**: restore milestone status db preflight
+- **claude-code-cli**: suppress streamed internal tool noise
+- **gsd**: skip same-path planning artifact copies
+- **claude-code-cli**: suppress internal tool call noise
+- **pi-coding-agent**: avoid oauth login for api-key providers
+- **gsd**: snapshot new untracked files before dispatch
+- **platform**: harden command execution and stabilize onboarding sync
+- **pi-ai**: restore event stream factory export
+- **gsd**: use valid codebase refresh logger
+- **gsd**: auto-refresh codebase cache
+- **gsd**: align model switching and prefs surfaces
+- route slice and validation artifacts through DB tools
+- make gsd_complete_task the only execute-task summary path
+- **docs**: stop pointing repo documentation to gsd.build
+- add activeEngineId and activeRunDir to PausedSessionMetadata interface
+- **gsd**: address QA round 4
+- **gsd**: address QA round 3
+- **gsd**: address QA round 2
+- **gsd**: address QA round 1
+- **gsd**: address review feedback from trek-e
+- **gsd**: assess recovery from paused worktree state
+- **gsd**: satisfy extension typecheck for interrupted recovery
+- **gsd**: restore hook dispatch export and guided flow imports
+- **gsd**: clear stale paused metadata in guided flow
+- **gsd**: preserve interrupted-session resume mode
+- preserve explicit interrupted-session resume mode
+- preserve step-mode and suppress stale paused resumes
+- suppress stale interrupted-session resume prompts
+
+### Changed
+- harden workflow MCP executor loading
+- **ci**: add weekly workflow to regenerate model registry
+- **deps**: refresh audited package locks
+
+## [2.67.0] - 2026-04-09
+
+### Added
+- **context**: implement R005 decision scope cascade and derive scope from slice metadata
+- **M005**: Tiered Context Injection - relevance-scoped context with 65%+ reduction
+
+### Fixed
+- **test**: align auto-loop test timers with updated session timeout
+- **gsd**: repair CI after branch split
+- **gsd**: repair CI after branch split
+- **gsd**: repair CI after branch split
+- **gsd**: fail closed for discussion gate enforcement
+- **gsd**: harden auto merge recovery and session safety
+- **gsd**: repair overlay, shortcut, and widget surfaces
+- **gsd**: prevent stale workflow reconcile state writes
+- **gsd**: align prompt contracts and validation flow
+- **pi-tui**: harden input parsing and editor focus behavior
+- **remote-questions**: cancel local TUI when remote answer wins the race
+- **auto**: increase session timeout to 120s and treat timeout as recoverable pause (#3767)
+- **ui**: apply anthropic-api display name to all model/provider UI surfaces
+- **ui**: display 'anthropic-api' in GSD preferences wizard provider list
+- **remote-questions**: race local TUI against remote channel instead of remote-only routing
+- **ui**: display 'anthropic-api' in model selector to distinguish from claude-code
+- **gates**: add mechanical enforcement for discussion question gates
+- **prompts**: harden non-bypassable gates and exclude dot-folders from scanning
+- **gsd**: ignore filename headings in parsePlan
+- **providers**: match 'out of extra usage' error and respect claude-code provider in model resolution (#3772)
+- **pi-ai**: recover XML parameters trapped in JSON strings
+- **retry**: guard claude-code fallback to anthropic provider only
+- **providers**: route Anthropic subscription users through Claude Code CLI (#3772)
+- **claude-code**: use native Windows claude lookup
+- **gsd**: suppress repeated preferences section warnings
+- **gsd**: normalize described expected output paths
+- **auto**: resilient transient error recovery — defer to Core RetryHandler and fix cmdCtx race
+
+## [2.66.1] - 2026-04-08
+
+### Fixed
+- **pi-tui**: revert contentCursorRow, use hardwareCursorRow as movement baseline
+- **pi-tui**: use contentCursorRow for render movement baseline instead of cursorRow
+- **gsd**: add logWarning to empty catch block in orphaned worktree cleanup
+- **gsd**: add consecutiveFinalizeTimeouts to LoopState in journal tests
+- **gsd**: add escalation and unit-detach guards to finalize timeout handlers
+- **gsd**: add timeout guard around postUnitPreVerification to prevent auto-loop hang
+- **gsd**: OS-specific keyboard shortcut hints via formatShortcut helper
+- **subagent**: support list-style tools frontmatter
+- clear autocomplete rows from content bottom
+- parse annotated pre-exec file paths
+- **gsd**: add orphaned milestone branch audit at auto-mode bootstrap
+
+## [2.66.0] - 2026-04-08
+
+### Added
+- **gsd**: add fast path for queued milestone discussion
+- **gsd**: add /gsd show-config command
+- **reactive**: graph diagnostics and subagent_model config
+- **dispatch**: parallel research slices and parallel milestone validation
+- **parallel**: worker model override for parallel milestone workers
+
+### Fixed
+- **gsd**: validate depth verification answer before unlocking write-gate
+- **gsd**: revert unknown artifact check to warn-and-proceed
+- **gsd**: add missing cmd field to test base WorkflowEvent
+- **gsd**: address remaining adversarial review findings for wave 3
+- **gsd**: detect concurrent event log growth during reconcile
+- **gsd**: address adversarial review findings for wave 3
+- **gsd**: address adversarial review findings for wave 2
+- **gsd**: address adversarial review findings for wave 1
+- **gsd**: WAL-safe migration backup + stronger regression tests
+- **gsd**: consistency and cleanup (wave 5/5)
+- **gsd**: write safety — atomic writes and randomized tmp paths (wave 4/5)
+- **gsd**: session and recovery robustness (wave 3/5)
+- **gsd**: event log and reconciliation robustness (wave 2/5)
+- **gsd**: critical state machine data integrity fixes (wave 1/5)
+- **gsd**: critical state machine data integrity fixes (wave 1/5)
+- **gsd**: remove ecosystem research stub and address adversarial review
+- **gsd**: suppress model change notification in auto-mode unless verbose
+- **gsd**: exclude task.files from checkTaskOrdering to prevent false positives
+- **state**: skip ghost check for queued milestones in registry build
+- **ci**: replace empty catch blocks and raw stderr with logWarning
+- **logging**: add debugLog to empty catch in reopen-milestone
+- **state-machine**: 9 resilience fixes + 86 regression tests (#3161)
+- **gsd**: add incremental persistence to discuss prompts
+- replace empty catch with logWarning for silent-catch-diagnostics test
+- **test**: escape regex metacharacters in skip-by-preference pattern test
+- **test**: search for numbered step definitions in prompt ordering test
+- **test**: update notes loop test for notesVisible guard behavior
+- **test**: update action count for note captures now included in results
+- **test**: remove extraneous test file from wrong branch
+- **test**: update worktree sync tests to use separate milestone IDs
+- **gsd**: use valid LogComponent type for stale branch guard warning
+- **test**: update rogue detection test for auto-remediation behavior
+- **test**: update stuck-planning test to expect executing after reconciliation
+- **test**: update file path consistency tests for inputs-only checking
+- **test**: add CONTEXT file to queued milestone ghost detection test
+- **test**: update needs-remediation test to expect validating-milestone phase
+- **gsd**: import all-done milestones as complete during DB migration
+- **gsd**: allow milestone completion when validation skipped by preference
+- **gsd**: set slice sequence at all three insertion sites
+- **gsd**: four prompt/runtime fixes for completion and session stability
+- **gsd**: default insertMilestone status to queued instead of active
+- **gsd**: suppress repeated frontmatter YAML parse warnings
+- **gsd**: normalize list inputs in complete-task + fix roadmap dep parsing
+- **gsd**: open DB before status derivation + respect isolation:none in quick
+- **gsd**: add .bg-shell/ to baseline gitignore patterns
+- **tui**: prevent Enter key infinite loop in interview notes mode
+- **provider**: handle Enter key to initiate auth setup in provider manager
+- **gsd**: cap run-uat dispatch attempts to prevent infinite replay loop
+- **mcp**: use createRequire to resolve SDK wildcard subpath imports
+- **gsd**: mark note captures as executed in executeTriageResolutions
+- **gsd**: validate main_branch preference exists before using in merge
+- **gsd**: handle deleted cwd in projectRoot to prevent ENOENT crash
+- **gsd**: skip current milestone in syncWorktreeStateBack to prevent merge conflicts
+- **gsd**: add structuredQuestionsAvailable conditional to slice discuss
+- **gsd**: restore full tool set after discuss flow scoping
+- **gsd**: tighten verifyExpectedArtifact to prevent rogue-write false positives
+- **gsd**: add verification gate to complete-slice tool
+- **gsd**: fix pre-execution-checks false positives from backticks and task.files
+- **gsd**: stop renderAllProjections from overwriting authoritative PLAN.md
+- **gsd**: auto-checkout to main when isolation:none finds stale milestone branch
+- **gsd**: auto-remediate stale slice DB status when SUMMARY exists on disk
+- **gsd**: open DB on demand in gsd_milestone_status for non-auto sessions
+- **gsd**: detect phantom milestones from abandoned gsd_milestone_generate_id
+- **gsd**: force re-validation when verdict is needs-remediation
+- **gsd**: exclude closed slices from findMissingSummaries check
+- **gsd**: recover from stale lockfile after crash or SIGKILL
+- **gsd**: add createdAt timestamp and 30s age guard to staleness check
+- **gsd**: clear stale pendingAutoStart after /clear interrupts discussion
+- **gsd**: suppress misleading warnings for expected ENOENT/EISDIR conditions
+- **gsd**: extract real error from message content when errorMessage is useless
+- **gsd**: extract real error from message content when errorMessage is useless
+- **gsd**: show accurate pause message for queued-user-message skip
+- **gsd**: treat queued-user-message skip as non-retryable interruption
+- **gsd**: recognize "Not provided." default in isVerificationNotApplicable
+- **gsd**: discoverManifests skips symlinked extension directories
+- **gsd**: recognize "Not provided." default in isVerificationNotApplicable
+- **gsd**: reconcile plan-file tasks into DB when planner skips persistence (#3600)
+- **gsd**: use isClosedStatus() in dispatch guard instead of raw complete check
+- **browser-tools**: make sharp an optional lazy dependency
+- **gsd**: pass required arguments in defer-milestone-stamp test
+- **gsd**: replace remaining empty catch with logWarning
+- **gsd**: use logWarning instead of raw stderr in catch blocks
+- **gsd**: log error instead of empty catch in STATE.md rebuild
+- **gsd**: log error instead of empty catch in skip_slice
+- **gsd**: cast milestone classification to string for type safety
+- **gsd**: treat zero-slice roadmap as pre-planning in guided flow
+- **gsd**: rebuild STATE.md after skip-slice and strengthen rethink prompt
+- **gsd**: use main_branch preference in worktree creation
+- **gsd**: stamp defer and milestone captures as executed after triage
+- **tui**: treat absolute file paths as plain text, not commands
+- **tui**: break infinite re-render loop for images in cmux
+- **gsd**: rebuild STATE.md before guided-flow dispatch
+- **gsd**: defer queued shells in active milestone selection
+- **retry**: prevent 429 quota cascade and 30-min lockout
+- **gsd**: add fastPathInstruction to buildDiscussMilestonePrompt loadPrompt call
+
+### Changed
+- auto-commit after quick-task
+- auto-commit after quick-task
+- auto-commit after quick-task
+- auto-commit after quick-task
+- auto-commit after quick-task
+- auto-commit after quick-task
+- auto-commit after quick-task
+
+## [2.65.0] - 2026-04-07
+
+### Added
+- **gsd**: persistent notification panel with TUI overlay, widget, and web API
+- **gsd**: wire blocking behavior and strict mode for enhanced verification
+- **gsd**: add post-execution cross-task consistency checks
+- **gsd**: add pre-execution plan verification checks
+
+### Fixed
+- **gsd**: wrap long notification messages and fit overlay to content
+- **gsd**: remove background color from backdrop, fix message truncation
+- **gsd**: restore consistent overlay height to prevent ghost artifacts
+- **gsd**: improve notification overlay backdrop and content-fit sizing
+- **gsd**: only unlink notification lock when owned, prevent foreign lock deletion
+- **gsd**: add backdrop dimming and viewport padding to notification overlay
+- **gsd**: add intent + phase guards to resume context fallback (#3615)
+- **gsd**: inject task context for unstructured resume prompts (#3615)
+- **pi-coding-agent**: restore extension tools after session switch (#3616)
+- **agent-loop**: schema overload cap ignores bash execution errors (#3618)
+- **bg-shell**: prevent signal handler accumulation + cap alert queue
+- **gsd**: coerce plain-string provides field to array in complete-slice (#3585)
+- address PR #3468 review findings
+- **gsd**: persist autoStartTime across session resume so elapsed timer survives /exit
+- **gsd**: add enhanced_verification preferences to mergePreferences
+- **headless**: treat discuss and plan as multi-turn commands
+
+### Changed
+- **interactive**: cap rendered chat components + kill orphan descendants
+- **tui**: render-skip, frame isolation, Text cache guard, dispose
+
+## [2.64.0] - 2026-04-06
+
+### Added
+- **gsd**: add LLM safety harness for auto-mode damage control
+- **ollama**: native /api/chat provider with full option exposure
+- **parallel**: slice-level parallelism with dependency-aware dispatch (#3315)
+- **mcp-client**: add OAuth auth provider for HTTP transport (#3295)
+
+### Fixed
+- **ui**: remove 200-column cap on welcome screen width
+- address adversarial review findings for #3576
+- **gsd**: replace hardcoded agent skill paths with dynamic resolution (#3575)
+- **headless**: sync resources and use agent dir for query
+- **cli**: show latest version and bypass npm cache in update check
+- **gsd**: follow CONTRIBUTING standards for #3565
+- **gsd**: address Codex adversarial review findings for #3565
+- **gsd**: coerce string arrays to objects in complete-slice/task tools (#3565)
+- **gsd**: harden flat-rate routing guard against alias/resolution gaps
+- **pi-coding-agent**: register models.json providers and await Ollama probe in headless mode
+- **ollama**: use apiKey auth mode to avoid streamSimple crash
+- **gsd**: disable dynamic model routing for flat-rate providers
+- **gsd**: address Codex adversarial review findings
+- **gsd**: prevent LLM from querying gsd.db directly via bash (#3541)
+- **gsd**: seed requirements table from REQUIREMENTS.md on first update
+- **gsd**: inject S##-CONTEXT.md from slice discussion into all prompt builders
+- **cli**: guard model re-apply against session restore and async rejection
+- **pi-coding-agent**: resolve model fallback race that ignores configured provider (#3534)
+- **detection**: add xcodegen and Xcode bundle support to project detection (#1882)
+- **perf**: share jiti module cache across extension loads (#3308)
+- **resource-sync**: prune removed bundled subdirectory extensions on upgrade (#1972)
+- recognize U+2705 checkmark emoji as completion marker in prose roadmaps (#1897)
+- **web**: use safePackageRootFromImportUrl for cross-platform package root (#1881) (#1893)
+- isolate CmuxClient stdio to prevent TUI hangs in CMUX (#3306)
+- worktree health check walks parent dirs for monorepo support (#3313)
+- **gsd**: promote milestone status from queued to active in plan-milestone (#3317)
+- **worktree**: correct merge failure notification command from /complete-milestone to /gsd dispatch complete-milestone (#1901)
+- detect and block Gemini CLI OAuth tokens used as API keys (#3296)
+- **auto**: break retry loop on tool invocation errors (malformed JSON) (#3298)
+- **git**: use git add -u in symlink .gsd fallback to prevent hang (#3299)
+- handle complete-slice context exhaustion to unblock downstream slices (#3300)
+- cap consecutive tool validation failures to prevent stuck-loop (#3301)
+- make enrichment tool params optional for limited-toolcall models (#3302)
+- add filesystem safety guard to complete-slice.md (#3304)
+- **extensions**: use bundledExtensionKeys for conflict detection instead of broken path heuristic (#3305)
+- scope tools during discuss flows to prevent grammar overflow (#3307)
+- **preferences**: warn on silent parse failure for non-frontmatter files (#3310)
+- track remote-questions in managed-resources manifest (#3312)
+- **auto**: add timeout guard for postUnitPostVerification in runFinalize (#3314)
+- **gsd**: handle large markdown parameters in complete-milestone JSON parsing (#3316)
+- **metrics**: deduplicate idle-watchdog entries and fix forensics false-positives (#1973)
+- prevent milestone/slice artifact rendering corruption (#3293)
+- **doctor**: strip --fix flag before positional parse (#1919) (#1926)
+- resolve external-state worktree DB path (#2952) (#3303)
+- **gsd**: worktree teardown path validation prevents data loss (#3311)
+- prevent auto-mode from dispatching deferred slices (#3309)
+- preserve completed slice status on plan-milestone re-plan (#3318)
+- reopen DB on cold resume, recognize heavy check mark (#3319)
+- dashboard model label shows dispatched model, not stale previous unit (#3320)
+
+### Changed
+- **gsd**: remove copyright line from test file
+- **gsd**: trim promptGuidelines to 1 line to reduce per-turn token cost
+- **web**: consolidate subprocess boilerplate into shared runner (#1899)
+
+## [2.63.0] - 2026-04-05
+
+### Added
+- **mcp-server**: add 6 read-only tools for project state queries (#3515)
+
+### Fixed
+- **gsd**: enrich vague diagnostic messages with root-cause context
+- **test**: reset dedup cache between ask-user-freetext tests
+- **db**: delete orphaned WAL/SHM files alongside empty gsd.db (#2478)
+- **gsd**: prevent auto-wrapup from interrupting in-flight tool calls (#3512)
+- **gsd**: handle bare model IDs in resolveDefaultSessionModel (#3517)
+- **gsd**: wrap decision and requirement saves in transaction to prevent ID races
+- **gsd**: prefer PREFERENCES.md over settings.json for session bootstrap model (#3517)
+- **gsd**: add Claude Code official skill directories to skill resolution
+- **dedup**: hash full question payload, not just IDs
+- **gsd**: prevent duplicate ask_user_questions dispatches with per-turn dedup cache
+- **pi-ai**: extend repairToolJson to handle XML tags and truncated numbers
+- **pi-coding-agent**: cancel stale retries after model switch
+
+### Changed
+- untrack .repowise/ and add to .gitignore
+
+## [2.62.1] - 2026-04-05
+
+### Fixed
+- **gsd**: gate steer worktree routing on active session, fix messaging
+- **gsd**: resolve steer overrides to worktree path when worktree is active
+
+## [2.62.0] - 2026-04-04
+
+### Added
+- **gsd**: enhance /gsd codebase with preferences, --collapse-threshold, and auto-init
+- **01-05**: fire before_model_select hook, add verbose scoring output, load capability overrides
+- **01-04**: register before_model_select placeholder handler in GSD hooks
+- **01-04**: add BeforeModelSelectEvent to extension API and wire emission
+- **01-03**: wire taskMetadata from selectAndApplyModel to resolveModelForComplexity
+- **01-03**: insert STEP 2 capability scoring into resolveModelForComplexity
+- **01-01**: add taskMetadata to ClassificationResult and export extractTaskMetadata
+- **01-01**: add capability types, data tables, and scoring functions to model-router
+
+### Fixed
+- **gsd**: add codebase validation in validatePreferences so preferences are not silently dropped
+- **test**: update db-path-worktree-symlink test for simplified diagnostic logging
+- **gsd**: update tests for errors-only audit persistence, fix empty catch blocks
+- **gsd**: harden audit log persistence — errors-only, sanitized, demote probe warnings
+- **gsd**: address adversarial review findings on workflow-logger migration
+- **gsd**: fail-closed stop guard, harden backtrack parsing, fix prompt params
+- **gsd**: add diagnostic logging to empty catch blocks in auto-mode
+- **lsp**: add legacy alias for renamed kotlin-language-server key
+- break infinite notes loop when selecting "None of the above"
+- align defaultRoutingConfig capability_routing to true
+- **pi-coding-agent**: upgrade Kotlin LSP to official Kotlin/kotlin-lsp
+- **test**: use correct RequirementCounts type fields in edge case tests
+- **remote-questions**: fire configured channels in interactive mode
+
+### Changed
+- **gsd**: migrate all catch blocks to centralized workflow-logger
+- init gsd
+
+## [2.61.0] - 2026-04-04
+
+### Added
+- stop/backtrack capture classifications for milestone regression (#3488)
+- GSD context optimization with model routing and context masking
+
+## [2.60.0] - 2026-04-04
+
+### Added
+- add /btw skill — ephemeral side questions from conversation context
+
+### Fixed
+- **btw**: remove LLM-specific references from skill description
+
+## [2.59.0] - 2026-04-03
+
+### Added
+- **extensions**: add Ollama extension for first-class local LLM support (#3371)
+- **doctor**: stale commit safety check with gsd snapshot and auto-cleanup
+- **extensions**: wire up topological sort and unified registry filtering (#3152)
+- **widget**: add last commit display and dashboard layout improvements (#3226)
+- **model-routing**: enable dynamic routing by default (#3120)
+- **vscode**: sidebar redesign, SCM provider, checkpoints, diagnostics [3/3]
+- **splash**: add remote channel indicator to welcome screen tools row
+- stream full text and thinking output in headless verbose mode (#2934)
+- **gsd**: add codebase map — structural orientation for fresh agent contexts
+
+### Fixed
+- **worktree**: resolve merge conflict for PR #3322 — adopt comprehensive pre-merge cleanup
+- **merge**: clean stale MERGE_HEAD before squash merge (#2912)
+- **state**: always run disk→DB reconciliation when DB is available (#2631)
+- **git-service**: fix merge-base ancestry check and .gsd/ leakage in snapshot absorption
+- **extensions**: update provides.hooks in 7 extension manifests to match actual registrations (#3157)
+- surface nativeCommit errors in reconcileMergeState instead of silently swallowing (#3052)
+- **parallel**: scope commits to milestone boundaries in parallel mode (#3047)
+- add windowsHide to all web-mode subprocess spawns (#2628) (#3046)
+- skip auto-mode pause on empty-content aborted messages (#2695) (#3045)
+- detect and remove nested .git dirs in worktree cleanup to prevent data loss (#3044)
+- prevent data loss when git isolation default changes (#2625) (#3043)
+- **read-tool**: clamp offset to file bounds instead of throwing (#3007) (#3042)
+- **gsd**: preserve queued milestones with worktrees in ghost detection (#3041)
+- **compaction**: add chunked fallback when messages exceed model context window (#3038)
+- preserve interactive terminal across tab switches and project changes (#3055)
+- call cleanupQuickBranch on turn_end to squash-merge quick branch back (#3054)
+- align run-uat artifact path to ASSESSMENT, preventing false stuck retries (#3053)
+- replace invalid Discord invite links with canonical URL (#3056)
+- add Windows shell guard to remaining spawn sites (#3058)
+- route `gsd auto` to headless runner to prevent hang on piped stdin/stdout (#3057)
+- respect .gitignore for .gsd/ in rethink prompt (#3059)
+- migrate unit ownership from JSON to SQLite to eliminate read-modify-write race (#3061)
+- **roadmap**: handle numbered, bracketed, and indented prose H3 headers in slice parser (#3063)
+- add worktree-merge to resolveModelWithFallbacksForUnit switch and update KNOWN_UNIT_TYPES (#3066)
+- clean up MERGE_HEAD on all error paths in mergeMilestoneToMain (#2912) (#3068)
+- prevent LLM from confusing background task output with user input (#3069)
+- add openai-codex provider and modern OpenAI models to MODEL_CAPABILITY_TIER and cost tables (#3070)
+- preserve active tab when switching projects (#3071)
+- include project name in desktop notifications (#3072)
+- recover from many-image dimension overflow by stripping older images (#3075)
+- resolve bare model IDs to anthropic over claude-code provider (#3076)
+- **auto**: move selectAndApplyModel before updateProgressWidget (#3079)
+- detect project relocation and recover state without data loss (#3080)
+- add free-text input to ask-user-questions when "None of the above" is selected (#3081)
+- block work execution during /gsd queue mode (#2545) (#3082)
+- detect worktree basePath in gsdRoot() to prevent escaping to project root (#3083)
+- invalidate stale quick-task captures across milestone boundaries (#3084)
+- defer model validation until after extensions register (#3089)
+- repair YAML bullet lists in malformed tool-call JSON (#3090)
+- unify SUMMARY.md render paths for projection fidelity (#3091)
+- chat mode misrepresents terminal output, looks stuck, omits user messages (#3092)
+- resolve 4 state corruption bugs in milestone/slice completion (#2945) (#3093)
+- isolate guided-flow session state and key discussion milestone queries (#2985) (#3094)
+- **guided-flow**: route dispatchWorkflow through dynamic routing pipeline (#3153)
+- skip external state migration inside git worktrees (#2970) (#3227)
+- coerce non-numeric strings in DB columns during manifest serialization (#2962) (#3229)
+- route allDiscussed and zero-slices paths to queued milestone discussion (#3150) (#3230)
+- use loose equality for null checks in secure_env_collect (#2997) (#3231)
+- prevent prompt explosion from $' in template replacement values (#2968) (#3232)
+- resolve OAuth API key in buildMemoryLLMCall via modelRegistry (#2959) (#3233)
+- **forensics**: read completion status from DB instead of legacy file (#3129) (#3234)
+- use camelCase parameter names in execute-task and complete-slice prompts (#2933) (#3236)
+- check bootstrap completeness in init wizard gate, not just .gsd/ existence (#2942) (#3237)
+- specify write tool for PROJECT.md in milestone/slice prompts (#3238)
+- widen completing-milestone gate to accept "None required" and similar phrasings (#2931) (#3239)
+- prevent ask_user_questions from poisoning auto-mode dispatch (#2936) (#3240)
+- guard null s.currentUnit in runUnitPhase closeout after stopAuto race (#2939) (#3241)
+- replace `web_search` with `search-the-web` in prompts and agent frontmatter (#2920) (#3245)
+- preserve milestone title in upsertMilestonePlanning when DB row pre-exists (#2879) (#3247)
+- invalidate stale milestone validation on roadmap reassessment (#2957) (#3242)
+- **discuss**: add roadmap fallback when DB is open but empty (#2892) (#3244)
+- integrate Codex & Gemini CLI into provider routes and rate-limit handling (#2922) (#3246)
+- **error-classifier**: widen STREAM_RE to cover all 7 V8 JSON parse error variants (#2916) (#3243)
+- prevent git stash from destroying queued milestone CONTEXT files (#2505) (#3273)
+- skip staleness rebuild in npm tarball installs (#2877) (#3250)
+- **parallel**: check worktree DB for milestone completion in merge (#2812) (#3256)
+- make claude-code provider stateful with full context and sidechain events (#2859) (#3254)
+- **worktree**: preserve non-empty gsd.db during sync to prevent truncation (#2815) (#3255)
+- align @gsd/native module type with compiled output (#3253)
+- parse hook/* completed-unit keys correctly in forensics + doctor (#2826) (#3252)
+- copy mcp.json into auto-mode worktrees (#2791) (#3251)
+- add gsd_requirement_save and upsert path for requirement updates (#3249)
+- handle pause_turn stop reason to prevent 400 errors with native web search (#2869) (#3248)
+- use authoritative milestone status in web roadmap (#2807) (#3258)
+- classify long-context entitlement 429 as quota_exhausted, not rate_limit (#2803) (#3257)
+- **docs**: use ~/.pi/agent/extensions/ for community extension install path (#3131) (#3259)
+- add disk→DB slice reconciliation in deriveStateFromDb (#2533) (#3262)
+- run forensics duplicate detection before investigation (#2704) (#3260)
+- skip TUI render loop on non-TTY stdout to prevent CPU burn (#3095) (#3263)
+- persist forensics report context across follow-up turns (#2941) (#3261)
+- invalidate workspace state on turn_end so milestones list stays current (#2706) (#3266)
+- eliminate 3 recurring doctor audit false positives (#3105) (#3264)
+- **web**: reconcile auto-mode state with on-disk lock in dashboard (#2705) (#3265)
+- treat ghost milestones as ineligible for parallel execution (#2501) (#3268)
+- redirect auto-mode to headless when stdout is piped (#2732) (#3269)
+- attempt VACUUM recovery when initSchema fails with corrupt freelist (#2519) (#3270)
+- resolve db_unavailable loop in worktree/symlink layouts (#2517) (#3271)
+- correct OAuth fallback request shape for google_search (#2963) (#3272)
+- prevent UAT stuck-loop and orphaned worktree after milestone completion (#3065)
+- **mcp**: handle server names with spaces in mcp_discover (#3037)
+- **gsd**: detect markdown body verdicts and guard plan-milestone against completed slices (#2960) (#3035)
+- **error-classifier**: replace STREAM_RE whack-a-mole with catch-all V8 JSON.parse pattern
+- type _borderColorKey as 'dim' | 'bashMode' to match ThemeColor
+- **tui**: comprehensive TUI review — layout, flow, rendering, and state fixes
+- **gsd**: harden codebase-map — bug fixes, UX polish, and expanded tests
+
+### Changed
+- **state**: centralize pipeline logging through workflow logger (#3282)
+- **gitignore**: exclude src/ build artifacts, scratch files, and .plans/
+- **complexity**: reclassify planning phases from standard to heavy tier
+
 ## [2.58.0] - 2026-03-28

 ### Added
@ -2154,7 +2740,24 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 ### Changed
 - License updated to MIT

-[Unreleased]: https://github.com/gsd-build/gsd-2/compare/v2.58.0...HEAD
+[Unreleased]: https://github.com/gsd-build/gsd-2/compare/v2.71.0...HEAD
+[2.71.0]: https://github.com/gsd-build/gsd-2/compare/v2.70.1...v2.71.0
+[2.70.1]: https://github.com/gsd-build/gsd-2/compare/v2.70.0...v2.70.1
+[2.70.0]: https://github.com/gsd-build/gsd-2/compare/v2.69.0...v2.70.0
+[2.69.0]: https://github.com/gsd-build/gsd-2/compare/v2.68.1...v2.69.0
+[2.68.1]: https://github.com/gsd-build/gsd-2/compare/v2.68.0...v2.68.1
+[2.68.0]: https://github.com/gsd-build/gsd-2/compare/v2.67.0...v2.68.0
+[2.67.0]: https://github.com/gsd-build/gsd-2/compare/v2.66.1...v2.67.0
+[2.66.1]: https://github.com/gsd-build/gsd-2/compare/v2.66.0...v2.66.1
+[2.66.0]: https://github.com/gsd-build/gsd-2/compare/v2.65.0...v2.66.0
+[2.65.0]: https://github.com/gsd-build/gsd-2/compare/v2.64.0...v2.65.0
+[2.64.0]: https://github.com/gsd-build/gsd-2/compare/v2.63.0...v2.64.0
+[2.63.0]: https://github.com/gsd-build/gsd-2/compare/v2.62.1...v2.63.0
+[2.62.1]: https://github.com/gsd-build/gsd-2/compare/v2.62.0...v2.62.1
+[2.62.0]: https://github.com/gsd-build/gsd-2/compare/v2.61.0...v2.62.0
+[2.61.0]: https://github.com/gsd-build/gsd-2/compare/v2.60.0...v2.61.0
+[2.60.0]: https://github.com/gsd-build/gsd-2/compare/v2.59.0...v2.60.0
+[2.59.0]: https://github.com/gsd-build/gsd-2/compare/v2.58.0...v2.59.0
 [2.58.0]: https://github.com/gsd-build/gsd-2/compare/v2.57.0...v2.58.0
 [2.57.0]: https://github.com/gsd-build/gsd-2/compare/v2.56.0...v2.57.0
 [2.56.0]: https://github.com/gsd-build/gsd-2/compare/v2.55.0...v2.56.0
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@ -146,9 +146,14 @@ The codebase is organized into these areas. All are open to contributions:
 | AI/LLM layer | `packages/pi-ai` | Provider integrations, model handling |
 | Agent core | `packages/pi-agent-core` | Agent orchestration — RFC required for changes |
 | Coding agent | `packages/pi-coding-agent` | The main coding agent |
+| MCP server | `packages/mcp-server` | Project state tools and MCP protocol |
 | GSD extension | `src/resources/extensions/gsd/` | GSD workflow — RFC required for auto-mode |
-| Native bindings | `native/` | Platform-specific native code |
+| Other extensions | `src/resources/extensions/` | Browser, search, voice, MCP client, etc. |
+| Native engine | `native/` | Rust N-API modules (grep, git, AST, etc.) |
+| VS Code extension | `vscode-extension/` | Chat participant, sidebar, RPC integration |
+| Web interface | `web/` | Browser-based dashboard |
 | CI/Build | `.github/`, `scripts/` | Workflows, build scripts |
+| Documentation | `docs/` | User guides, ADRs, SDK docs |

 ## Review process

--- a/README.md
+++ b/README.md
@ -7,7 +7,7 @@
 [![npm version](https://img.shields.io/npm/v/gsd-pi?style=for-the-badge&logo=npm&logoColor=white&color=CB3837)](https://www.npmjs.com/package/gsd-pi)
 [![npm downloads](https://img.shields.io/npm/dm/gsd-pi?style=for-the-badge&logo=npm&logoColor=white&color=CB3837)](https://www.npmjs.com/package/gsd-pi)
 [![GitHub stars](https://img.shields.io/github/stars/gsd-build/GSD-2?style=for-the-badge&logo=github&color=181717)](https://github.com/gsd-build/GSD-2)
-[![Discord](https://img.shields.io/badge/Discord-Join%20us-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.gg/gsd)
+[![Discord](https://img.shields.io/badge/Discord-Join%20us-5865F2?style=for-the-badge&logo=discord&logoColor=white)](https://discord.com/invite/nKXTsAcmbT)
 [![License](https://img.shields.io/badge/license-MIT-blue?style=for-the-badge)](LICENSE)
 [![$GSD Token](https://img.shields.io/badge/$GSD-Dexscreener-1C1C1C?style=for-the-badge&logo=data:image/svg+xml;base64,PHN2ZyB3aWR0aD0iMjQiIGhlaWdodD0iMjQiIHZpZXdCb3g9IjAgMCAyNCAyNCIgZmlsbD0ibm9uZSIgeG1sbnM9Imh0dHA6Ly93d3cudzMub3JnLzIwMDAvc3ZnIj48Y2lyY2xlIGN4PSIxMiIgY3k9IjEyIiByPSIxMCIgZmlsbD0iIzAwRkYwMCIvPjwvc3ZnPg==&logoColor=00FF00)](https://dexscreener.com/solana/dwudwjvan7bzkw9zwlbyv6kspdlvhwzrqy6ebk8xzxkv)

@ -21,187 +21,107 @@ One command. Walk away. Come back to a built project with clean git history.

 > GSD now provisions a managed [RTK](https://github.com/rtk-ai/rtk) binary on supported macOS, Linux, and Windows installs to compress shell-command output in `bash`, `async_bash`, `bg_shell`, and verification flows. GSD forces `RTK_TELEMETRY_DISABLED=1` for all managed invocations. Set `GSD_RTK_DISABLED=1` to disable the integration.

-> **📋 NOTICE: New to Node on Mac?** If you installed Node.js via Homebrew, you may be running a development release instead of LTS. **[Read this guide](./docs/node-lts-macos.md)** to pin Node 24 LTS and avoid compatibility issues.
+> **📋 NOTICE: New to Node on Mac?** If you installed Node.js via Homebrew, you may be running a development release instead of LTS. **[Read this guide](./docs/user-docs/node-lts-macos.md)** to pin Node 24 LTS and avoid compatibility issues.

 </div>

 ---

-## What's New in v2.52.0
+## What's New in v2.71

-### VS Code Extension & Web UI
+### MCP Secure Env Collect

- **VS Code integration** — status bar, file decorations, bash terminal, session tree, conversation history, and code lens. (#2651)
- **Dark mode contrast** — raised token floor and flattened opacity tier system for better readability. (#2734)
- **Auth token gate** — synthetic 401 on missing token, unauthenticated boot state, and recovery screen. (#2740)
+- **Secure credential collection over MCP** — the new `secure_env_collect` tool uses MCP form elicitation to collect secrets (API keys, tokens) from external clients without exposing values in tool output. Input is masked in interactive mode.
+- **Hardened elicitation schema** — MCP elicitation schema handling is stricter, with proper validation and fallback for providers that don't support forms.

-### Capability Metadata & Model Routing
+### MCP Reliability

- **Capability-based model selection** — replaced model-ID pattern matching with capability metadata, making custom provider integration more reliable. (#2548)
+- **Stream ordering preserved** — MCP tool output now renders in the correct order, fixing interleaved output in Claude Code and other MCP clients.
+- **isError flag propagation** — workflow tool execution failures now correctly return `isError: true`, so MCP clients can distinguish success from failure.
+- **Multi-round discuss questions** — new-project discuss phase supports multi-round questioning with structured question gates.

-### Key Changes
+### TUI Fixes

- **`--bare` mode** — wired across headless, pi-coding-agent, and resource-loader for minimal-output operation.
- **RPC protocol v2** — new types, init handshake with version detection, and runId generation on prompt/steer/follow_up commands.
- **PREFERENCES.md rename** — `preferences.md` renamed to `PREFERENCES.md` for consistency. (#2700, #2738)
- **Comprehensive SQLite audit** — indexes, caching, safety, and reconciliation fixes across gsd-db.
- **Unified error classifier** — three overlapping error classifiers consolidated into a single classify-decide-act pipeline.
+- **Pinned output restored** — pinned output bar displays above the editor during tool execution again.
+- **Turn completion cleanup** — pinned latest output is cleared on turn completion, preventing stale output from persisting.
+- **Secure input masking** — extension input values are masked in interactive mode when collecting secrets.

-### Key Fixes
+### Reliability & Internals

- **Auto-mode stops on provider errors** — auto loop now halts after provider errors instead of retrying indefinitely. (#2762, #2764)
- **Transaction safety** — state machine guards moved inside transactions in 5 tool handlers (#2752), and `transaction()` made re-entrant.
- **Worktree seeding** — `preferences.md` seeded into auto-mode worktrees and included in worktree sync. (#2693)
- **Idle watchdog** — interactive tools exempted from stall detection (#2676), and filesystem activity no longer overrides stalled-tool detection. (#2697)
- **Milestone guards** — `allSlicesDone` guarded against vacuous truth on empty slice arrays (#2679), and `complete-milestone` dispatch blocked when validation is `needs-remediation`. (#2682)
- **Docker overhaul** — fragile setup replaced with proven container patterns. (#2716)
- **Windows** — EINVAL prevented by disabling detached process groups on Win32. (#2744)
- **Audit log** — `setLogBasePath` wired into engine init to resurrect audit logging. (#2745)
+- **TOCTOU file locking** — race conditions in event log and custom workflow graph file locking are fixed with proper atomic lock acquisition.
+- **State derive refactor** — `deriveStateFromDb` god function extracted into composable, testable helpers.
+- **Windows portability** — hardened cross-platform portability across runtime, tooling, and CI.
+- **Model routing transparency** — dynamic routing is skipped for interactive dispatches; model changes are always shown in the banner.
+- **Capability-aware routing (ADR-004)** — full implementation of capability scoring, `before_model_select` hook, and task metadata extraction.
+- **Multi-model provider strategy (ADR-005)** — infrastructure for multi-provider model selection wired into live paths.

-### v2.51.0 — Skills, RTK, and Verification
+See the full [Changelog](./CHANGELOG.md) for details on every release.

- **`/terminal` command** — direct shell execution from the slash command interface. (#2349)
- **Managed RTK integration** — RTK binary auto-provisioned with opt-in preference and web UI toggle. (#2620)
- **Verification classes** — compliance checked before milestone completion, with classes injected into validation prompts. (#2621, #2623)
- **Skills overhaul** — 30+ new skill packs covering major frameworks, databases, and cloud platforms; curated catalog with `~/.agents/skills/` as primary directory.
+<details>
+<summary>Previous highlights (v2.70 and earlier)</summary>

-### v2.50.0 — Quality Gates
+- **Full workflow over MCP (v2.68)** — slice replanning, milestone management, slice completion, task completion, and core planning tools exposed over MCP
+- **Transport-gated MCP (v2.68)** — workflow tool availability adapts to provider transport capabilities automatically
+- **Contextual tips system (v2.68)** — TUI and web terminal surface contextual tips based on workflow state
+- **Ask user questions over MCP (v2.70)** — interactive questions exposed via elicitation for external integrations
+- **Tiered Context Injection (M005)** — relevance-scoped context with 65%+ token reduction
+- **Resilient transient error recovery** — defers to Core RetryHandler and fixes cmdCtx race conditions
+- **Anthropic subscription routing** — auto-routed through Claude Code CLI provider with proper display names
+- **5-wave state machine hardening** — critical data integrity fixes across atomic writes, event log reconciliation, and session recovery
+- **Discussion gate enforcement** — mechanical enforcement with fail-closed behavior
+- **Slice-level parallelism** — dependency-aware parallel dispatch within a milestone
+- **Persistent notification panel** — TUI overlay, widget, and web API for real-time notifications
+- **MCP server** — 6 read-only project state tools for external integrations, auto-wrapup guard, and question dedup
+- **Ollama extension** — first-class local LLM support via Ollama, with dynamic routing enabled by default
+- **Discord bot & daemon** — dedicated daemon package, Discord bot, and headless text mode with tool calls
+- **Capability-aware model routing (ADR-004)** — capability scoring, `before_model_select` hook, and task metadata extraction
+- **VS Code sidebar redesign** — SCM provider, checkpoints, diagnostics panel, activity feed, workflow controls, session forking
+- **`/gsd parallel watch`** — native TUI overlay for real-time worker monitoring
+- **Codebase map** — automatic codebase map injection for fresh agent contexts
+- **`--resume` flag** — resume previous sessions from the CLI
+- **Concurrent invocation guard** — prevents overlapping auto-mode runs
+- **VS Code integration** — status bar, file decorations, bash terminal, session tree, conversation history, and code lens
+- **Skills overhaul** — 30+ skill packs covering major frameworks, databases, and cloud platforms
+- **Single-writer state engine** — disciplined state transitions with machine guards and TOCTOU hardening
+- **DB-backed planning tools** — atomic SQLite tool calls for state transitions
+- **Declarative workflow engine** — YAML workflows through auto-loop
+- **Doctor: worktree lifecycle checks** — validates worktree health, detects orphans, consolidates cleanup

- **Quality gates** — 8-question quality gates added to planning and completion templates, with parallel evaluation via `evaluating-gates` phase.
- **Structured error propagation** — errors wired through `UnitResult` for better diagnostics.
-
-### v2.49.0 — Git Trailers & Yolo Mode
-
- **`--yolo` flag** — `/gsd auto --yolo` for non-interactive project init.
- **Git trailers** — GSD metadata moved from commit subject scopes to git trailers.
-
-### v2.48.0 — Forensics & Discussion
-
- **`/gsd discuss` for queued milestones** — target milestones still in the queue. (#2349)
- **Enhanced forensics** — journal and activity log awareness added to `/gsd forensics`.
-
-### v2.47.0 — External Providers
-
- **External tool execution mode** — `externalToolExecution` mode for external providers in agent-core.
- **Claude Code CLI provider** — new provider extension for Claude Code CLI. (#2382)
-
-### Previous highlights (v2.42–v2.46)
-
- **Single-writer state engine** — disciplined state transitions with machine guards, actor identity, reversibility, and TOCTOU hardening. (#2494)
- **`/gsd rethink`** — conversational project reorganization. (#2459)
- **`/gsd mcp`** — MCP server status and connectivity. (#2362)
- **Complete offline mode** — fully offline with local models. (#2429)
- **Global KNOWLEDGE.md injection** — cross-project knowledge via `~/.gsd/agent/KNOWLEDGE.md`. (#2331)
- **Mobile-responsive web UI** — browser interface works on phones and tablets. (#2354)
- **Default isolation mode changed to `none`** — set `git.isolation: worktree` explicitly if needed. (#2481)
- **Non-API-key provider extensions** — support for Claude Code CLI and similar providers. (#2382)
- **Docker sandbox template** — official Docker template for isolated auto mode. (#2360)
- **DB-backed planning tools** — write-side state transitions use atomic SQLite tool calls. (#2141)
- **Declarative workflow engine** — YAML workflows through auto-loop. (#2024)
- **`/gsd fast`** — toggle service tier for prioritized API routing. (#1862)
-
---
-
-## What's New in v2.41.0
-
-### New Features
-
- **Browser-based web interface** — run GSD from the browser with `gsd --web`. Full project management, real-time progress, and multi-project support via server-sent events. (#1717)
- **Doctor: worktree lifecycle checks** — `/gsd doctor` now validates worktree health, detects orphaned worktrees, consolidates cleanup, and enhances `/worktree list` with lifecycle status. (#1814)
- **CI: docs-only PR detection** — PRs that only change documentation skip build and test steps, with a new prompt injection scan for security. (#1699)
- **Custom Models guide** — new documentation for adding custom providers (Ollama, vLLM, LM Studio, proxies) via `models.json`. (#1670)
-
-### Data Loss Prevention (Critical Fixes)
-
-This release includes 7 fixes preventing silent data loss in auto-mode:
-
- **Hallucination guard** — execute-task agents that complete with zero tool calls are now rejected as hallucinated. Previously, agents could produce detailed but fabricated summaries without writing any code, wasting ~$25/milestone. (#1838)
- **Merge anchor verification** — before deleting a milestone worktree/branch, GSD now verifies the code is actually on the integration branch. Prevents orphaning commits when squash-merge produces an empty diff. (#1829)
- **Dirty working tree detection** — `nativeMergeSquash` now distinguishes dirty-tree rejections from content conflicts, preventing silent commit loss when synced `.gsd/` files block the merge. (#1752)
- **Doctor cleanup safety** — the `orphaned_completed_units` check no longer auto-fixes during post-task health checks. Previously, timing races could cause the doctor to remove valid completion keys, reverting users to earlier tasks. (#1825)
- **Root file reverse-sync** — worktree teardown now syncs root-level `.gsd/` files (PROJECT.md, REQUIREMENTS.md, completed-units.json) back to the project root. Previously these were lost on milestone closeout. (#1831)
- **Empty merge guard** — milestone branches with unanchored code changes are preserved instead of deleted when squash-merge produces nothing to commit. (#1755)
- **Crash-safe task closeout** — orphaned checkboxes in PLAN.md are unchecked on retry, preventing phantom task completion. (#1759)
-
-### Auto-Mode Stability
-
- **Terminal hang fix** — `stopAuto()` now resolves pending promises, preventing the terminal from freezing permanently after stopping auto-mode. (#1818)
- **Signal handler coverage** — SIGHUP and SIGINT now clean up lock files, not just SIGTERM. Prevents stranded locks on VS-Code crash. (#1821)
- **Needs-discussion routing** — milestones in `needs-discussion` phase now route to the smart entry UI instead of hard-stopping, breaking the infinite loop. (#1820)
- **Infrastructure error handling** — auto-mode stops immediately on ENOSPC, ENOMEM, and similar unrecoverable errors instead of retrying. (#1780)
- **Dependency-aware dispatch** — slice dispatch now uses declared `depends_on` instead of positional ordering. (#1770)
- **Queue mode depth verification** — the write gate now processes depth verification in queue mode, fixing a deadlock where CONTEXT.md writes were permanently blocked. (#1823)
-
-### Roadmap Parser Improvements
-
- **Table format support** — roadmaps using markdown tables (`| S01 | Title | Risk | Status |`) are now parsed correctly. (#1741)
- **Prose header fallback** — when `## Slices` contains H3 headers instead of checkboxes, the prose parser is invoked as a fallback. (#1744)
- **Completion marker detection** — prose headers with `✓` or `(Complete)` markers are correctly identified as done. (#1816)
- **Zero-slice stub handling** — stub roadmaps from `/gsd queue` return `pre-planning` instead of `blocked`. (#1826)
- **Immediate roadmap fix** — roadmap checkbox and UAT stub are fixed immediately after last task instead of deferring to `complete-slice`. (#1819)
-
-### State & Git Improvements
-
- **CONTEXT-DRAFT.md fallback** — `depends_on` is read from CONTEXT-DRAFT.md when CONTEXT.md doesn't exist, preventing draft milestones from being promoted past dependency constraints. (#1743)
- **Unborn branch support** — `nativeBranchExists` handles repos with zero commits, preventing dispatch deadlock on new repos. (#1815)
- **Ghost milestone detection** — empty `.gsd/milestones/` directories are skipped instead of crashing `deriveState()`. (#1817)
- **Default branch detection** — milestone merge detects `master` vs `main` instead of hardcoding. (#1669)
- **Milestone title extraction** — titles are pulled from CONTEXT.md headings when no ROADMAP exists. (#1729)
-
-### Windows & Platform
-
- **Windows path handling** — 8.3 short paths, `pathToFileURL` for ESM imports, and `realpathSync.native` fixes across the test suite and verification gate. (#1804)
- **DEP0190 fix** — `spawnSync` deprecation warning eliminated by passing commands to shell explicitly. (#1827)
- **Web build skip on Windows** — Next.js webpack EPERM errors on system directories are handled gracefully.
-
-### Developer Experience
-
- **@ file finder fix** — typing `@` no longer freezes the TUI. The fix adds debounce, dedup, and empty-query short-circuit. (#1832)
- **Tool-call loop guard** — detects and breaks infinite tool-call loops within a single unit, preventing stack overflow. (#1801)
- **Completion deferral fix** — roadmap checkbox and UAT stub are fixed at task level, closing the fragile handoff window between last task and `complete-slice`. (#1819)
-
-See the full [Changelog](./CHANGELOG.md) for all 70+ fixes in this release.
-
-### Previous highlights (v2.39–v2.41)
-
- **Browser-based web interface** — run GSD from the browser with `gsd --web`
- **GitHub sync extension** — auto-sync milestones to GitHub Issues, PRs, and Milestones
- **Skill tool resolution** — skills auto-activate in dispatched prompts
- **Health check phase 2** — real-time doctor issues in dashboard and visualizer
- **Forensics upgrade** — full-access GSD debugger with anomaly detection
- **7 data-loss prevention fixes** — hallucination guard, merge anchor verification, dirty tree detection, and more
- **Pipeline decomposition** — auto-loop rewritten as linear phase pipeline
- **Sliding-window stuck detection** — pattern-aware, fewer false positives
- **Data-loss recovery** — automatic detection and recovery from v2.30–v2.38 migration issues
+</details>

 ---

 ## Documentation

-Full documentation is available at **[gsd.build](https://gsd.build)** (powered by Mintlify) and in the [`docs/`](./docs/) directory:
+Full documentation is in the [`docs/`](./docs/) directory:

- **[Getting Started](./docs/getting-started.md)** — install, first run, basic usage
- **[Auto Mode](./docs/auto-mode.md)** — autonomous execution deep-dive
- **[Configuration](./docs/configuration.md)** — all preferences, models, git, and hooks
- **[Custom Models](./docs/custom-models.md)** — add custom providers (Ollama, vLLM, LM Studio, proxies)
- **[Token Optimization](./docs/token-optimization.md)** — profiles, context compression, complexity routing
- **[Cost Management](./docs/cost-management.md)** — budgets, tracking, projections
- **[Git Strategy](./docs/git-strategy.md)** — worktree isolation, branching, merge behavior
- **[Parallel Orchestration](./docs/parallel-orchestration.md)** — run multiple milestones simultaneously
- **[Working in Teams](./docs/working-in-teams.md)** — unique IDs, shared artifacts
- **[Skills](./docs/skills.md)** — bundled skills, discovery, custom authoring
- **[Commands Reference](./docs/commands.md)** — all commands and keyboard shortcuts
- **[Architecture](./docs/architecture.md)** — system design and dispatch pipeline
- **[Troubleshooting](./docs/troubleshooting.md)** — common issues, doctor, forensics, recovery
- **[CI/CD Pipeline](./docs/ci-cd-pipeline.md)** — three-stage promotion pipeline (Dev → Test → Prod)
- **[VS Code Extension](./vscode-extension/README.md)** — chat participant, sidebar dashboard, RPC integration
- **[Visualizer](./docs/visualizer.md)** — workflow visualizer with stats and discussion status
- **[Remote Questions](./docs/remote-questions.md)** — route decisions to Slack or Discord when human input is needed
- **[Dynamic Model Routing](./docs/dynamic-model-routing.md)** — complexity-based model selection and budget pressure
- **[Web Interface](./docs/web-interface.md)** — browser-based project management and real-time progress
- **[Pipeline Simplification (ADR-003)](./docs/ADR-003-pipeline-simplification.md)** — merged research into planning, mechanical completion
+### User Guides
+
+- **[Getting Started](./docs/user-docs/getting-started.md)** — install, first run, basic usage
+- **[Auto Mode](./docs/user-docs/auto-mode.md)** — autonomous execution deep-dive
+- **[Configuration](./docs/user-docs/configuration.md)** — all preferences, models, git, and hooks
+- **[Custom Models](./docs/user-docs/custom-models.md)** — add custom providers (Ollama, vLLM, LM Studio, proxies)
+- **[Token Optimization](./docs/user-docs/token-optimization.md)** — profiles, context compression, complexity routing
+- **[Cost Management](./docs/user-docs/cost-management.md)** — budgets, tracking, projections
+- **[Git Strategy](./docs/user-docs/git-strategy.md)** — worktree isolation, branching, merge behavior
+- **[Parallel Orchestration](./docs/user-docs/parallel-orchestration.md)** — run multiple milestones simultaneously
+- **[Working in Teams](./docs/user-docs/working-in-teams.md)** — unique IDs, shared artifacts
+- **[Skills](./docs/user-docs/skills.md)** — bundled skills, discovery, custom authoring
+- **[Commands Reference](./docs/user-docs/commands.md)** — all commands and keyboard shortcuts
+- **[Troubleshooting](./docs/user-docs/troubleshooting.md)** — common issues, doctor, forensics, recovery
+- **[Visualizer](./docs/user-docs/visualizer.md)** — workflow visualizer with stats and discussion status
+- **[Remote Questions](./docs/user-docs/remote-questions.md)** — route decisions to Slack or Discord when human input is needed
+- **[Dynamic Model Routing](./docs/user-docs/dynamic-model-routing.md)** — complexity-based model selection and budget pressure
+- **[Web Interface](./docs/user-docs/web-interface.md)** — browser-based project management and real-time progress
+- **[Migration from v1](./docs/user-docs/migration.md)** — `.planning` → `.gsd` migration
 - **[Docker Sandbox](./docker/README.md)** — run GSD auto mode in an isolated Docker container
- **[Migration from v1](./docs/migration.md)** — `.planning` → `.gsd` migration
+
+### Developer Docs
+
+- **[Architecture](./docs/dev/architecture.md)** — system design and dispatch pipeline
+- **[CI/CD Pipeline](./docs/dev/ci-cd-pipeline.md)** — three-stage promotion pipeline (Dev → Test → Prod)
+- **[Pipeline Simplification (ADR-003)](./docs/dev/ADR-003-pipeline-simplification.md)** — merged research into planning, mechanical completion
+- **[VS Code Extension](./vscode-extension/README.md)** — chat participant, sidebar dashboard, RPC integration

 ---

@ -417,7 +337,7 @@ gsd headless query
 gsd headless dispatch plan
 ```

-Headless auto-responds to interactive prompts, detects completion, and exits with structured codes: `0` complete, `1` error/timeout, `2` blocked. Auto-restarts on crash with exponential backoff. Use `gsd headless query` for instant, machine-readable state inspection — returns phase, next dispatch preview, and parallel worker costs as a single JSON object without spawning an LLM session. Pair with [remote questions](./docs/remote-questions.md) to route decisions to Slack or Discord when human input is needed.
+Headless auto-responds to interactive prompts, detects completion, and exits with structured codes: `0` complete, `1` error/timeout, `2` blocked. Auto-restarts on crash with exponential backoff. Use `gsd headless query` for instant, machine-readable state inspection — returns phase, next dispatch preview, and parallel worker costs as a single JSON object without spawning an LLM session. Pair with [remote questions](./docs/user-docs/remote-questions.md) to route decisions to Slack or Discord when human input is needed.

 **Multi-session orchestration** — headless mode supports file-based IPC in `.gsd/parallel/` for coordinating multiple GSD workers across milestones. Build orchestrators that spawn, monitor, and budget-cap a fleet of GSD workers.

@ -590,9 +510,8 @@ auto_report: true
 | `verification_commands`| Array of shell commands to run after task execution (e.g., `["npm run lint", "npm run test"]`)        |
 | `verification_auto_fix`| Auto-retry on verification failures (default: true)                                                   |
 | `verification_max_retries` | Max retries for verification failures (default: 2)                                               |
-| `require_slice_discussion` | Pause auto-mode before each slice for human discussion review                                    |
+| `phases.require_slice_discussion` | Pause auto-mode before each slice for human discussion review                                    |
 | `auto_report`          | Auto-generate HTML reports after milestone completion (default: true)                                 |
-| `searchExcludeDirs`    | Directories to exclude from `@` file autocomplete (e.g., `["node_modules", ".git", "dist"]`)          |

 ### Agent Instructions

@ -622,11 +541,11 @@ token_profile: budget      # or balanced (default), quality

 **Budget pressure** graduates model downgrading as you approach your budget ceiling — 50%, 75%, and 90% thresholds progressively shift work to cheaper tiers.

-See the full [Token Optimization Guide](./docs/token-optimization.md) for details.
+See the full [Token Optimization Guide](./docs/user-docs/token-optimization.md) for details.

 ### Bundled Tools

-GSD ships with 19 extensions, all loaded automatically:
+GSD ships with 24 extensions, all loaded automatically:

 | Extension              | What it provides                                                                                                       |
 | ---------------------- | ---------------------------------------------------------------------------------------------------------------------- |
@ -648,17 +567,24 @@ GSD ships with 19 extensions, all loaded automatically:
 | **Remote Questions**   | Route decisions to Slack/Discord when human input is needed in headless/CI mode                                         |
 | **Universal Config**   | Discover and import MCP servers and rules from other AI coding tools                                                    |
 | **AWS Auth**           | Automatic Bedrock credential refresh for AWS-hosted models                                                              |
-| **TTSR**               | Tool-use type-safe runtime validation                                                                                   |
+| **Ollama**             | First-class local LLM support via Ollama                                                                                |
+| **Claude Code CLI**    | External provider extension for Claude Code CLI                                                                         |
+| **cmux**               | Claude multiplexer integration — desktop notifications, sidebar metadata, visual subagent splits                        |
+| **GitHub Sync**        | Auto-sync milestones to GitHub Issues, PRs, and Milestones                                                              |
+| **LSP**                | Language Server Protocol — diagnostics, definitions, references, hover, rename                                          |
+| **TTSR**               | Tool-triggered system rules — conditional context injection based on tool usage                                         |

 ### Bundled Agents

-Three specialized subagents for delegated work:
+Five specialized subagents for delegated work:

-| Agent          | Role                                                         |
-| -------------- | ------------------------------------------------------------ |
-| **Scout**      | Fast codebase recon — returns compressed context for handoff |
-| **Researcher** | Web research — finds and synthesizes current information     |
-| **Worker**     | General-purpose execution in an isolated context window      |
+| Agent               | Role                                                         |
+| ------------------- | ------------------------------------------------------------ |
+| **Scout**           | Fast codebase recon — returns compressed context for handoff |
+| **Researcher**      | Web research — finds and synthesizes current information     |
+| **Worker**          | General-purpose execution in an isolated context window      |
+| **JavaScript Pro**  | JavaScript-specialized execution and debugging               |
+| **TypeScript Pro**  | TypeScript-specialized execution and debugging               |

 ---

@ -733,9 +659,8 @@ gsd (CLI binary)
          ├─ resource-loader.ts  Syncs bundled extensions + agents to ~/.gsd/agent/
          └─ src/resources/
              ├─ extensions/gsd/    Core GSD extension (auto, state, commands, ...)
-              ├─ extensions/...     18 supporting extensions
-              ├─ agents/            scout, researcher, worker
-              ├─ AGENTS.md          Agent routing instructions
+              ├─ extensions/...     21 supporting extensions
+              ├─ agents/            scout, researcher, worker, javascript-pro, typescript-pro
              └─ GSD-WORKFLOW.md    Manual bootstrap protocol
 ```

--- a/docs/README.md
+++ b/docs/README.md
@ -4,51 +4,67 @@ Welcome to the GSD documentation. This covers everything from getting started to

 ## User Documentation

+Guides for installing, configuring, and using GSD day-to-day. Located in [`user-docs/`](./user-docs/).
+
 | Guide | Description |
 |-------|-------------|
-| [Getting Started](./getting-started.md) | Installation, first run, and basic usage |
-| [Auto Mode](./auto-mode.md) | How autonomous execution works — the state machine, crash recovery, and steering |
-| [Commands Reference](./commands.md) | All commands, keyboard shortcuts, and CLI flags |
-| [Remote Questions](./remote-questions.md) | Discord and Slack integration for headless auto-mode |
-| [Configuration](./configuration.md) | Preferences, model selection, git settings, and token profiles |
-| [Custom Models](./custom-models.md) | Add custom providers (Ollama, vLLM, LM Studio, proxies) via models.json |
-| [Token Optimization](./token-optimization.md) | Token profiles, context compression, complexity routing, and adaptive learning (v2.17) |
-| [Dynamic Model Routing](./dynamic-model-routing.md) | Complexity-based model selection, cost tables, escalation, and budget pressure (v2.19) |
-| [Captures & Triage](./captures-triage.md) | Fire-and-forget thought capture during auto-mode with automated triage (v2.19) |
-| [Workflow Visualizer](./visualizer.md) | Interactive TUI overlay for progress, dependencies, metrics, and timeline (v2.19) |
-| [Cost Management](./cost-management.md) | Budget ceilings, cost tracking, projections, and enforcement modes |
-| [Git Strategy](./git-strategy.md) | Worktree isolation, branching model, and merge behavior |
-| [Parallel Orchestration](./parallel-orchestration.md) | Run multiple milestones simultaneously with worker isolation and coordination |
-| [Working in Teams](./working-in-teams.md) | Unique milestone IDs, `.gitignore` setup, and shared planning artifacts |
-| [Skills](./skills.md) | Bundled skills, skill discovery, and custom skill authoring |
-| [Migration from v1](./migration.md) | Migrating `.planning` directories from the original GSD |
-| [Troubleshooting](./troubleshooting.md) | Common issues, `/gsd doctor` (real-time visibility v2.40), `/gsd forensics` (full debugger v2.40), and recovery procedures |
-| [Web Interface](./web-interface.md) | Browser-based project management with `gsd --web` (v2.41) |
+| [Getting Started](./user-docs/getting-started.md) | Installation, first run, and basic usage |
+| [Auto Mode](./user-docs/auto-mode.md) | How autonomous execution works — the state machine, crash recovery, and steering |
+| [Commands Reference](./user-docs/commands.md) | All commands, keyboard shortcuts, and CLI flags |
+| [Remote Questions](./user-docs/remote-questions.md) | Discord and Slack integration for headless auto-mode |
+| [Configuration](./user-docs/configuration.md) | Preferences, model selection, git settings, and token profiles |
+| [Provider Setup](./user-docs/providers.md) | Step-by-step setup for OpenRouter, Ollama, LM Studio, vLLM, and all supported providers |
+| [Custom Models](./user-docs/custom-models.md) | Advanced model configuration — models.json schema, compat flags, overrides |
+| [Token Optimization](./user-docs/token-optimization.md) | Token profiles, context compression, complexity routing, and adaptive learning (v2.17) |
+| [Dynamic Model Routing](./user-docs/dynamic-model-routing.md) | Complexity-based model selection, cost tables, escalation, and budget pressure (v2.19) |
+| [Captures & Triage](./user-docs/captures-triage.md) | Fire-and-forget thought capture during auto-mode with automated triage (v2.19) |
+| [Workflow Visualizer](./user-docs/visualizer.md) | Interactive TUI overlay for progress, dependencies, metrics, and timeline (v2.19) |
+| [Cost Management](./user-docs/cost-management.md) | Budget ceilings, cost tracking, projections, and enforcement modes |
+| [Git Strategy](./user-docs/git-strategy.md) | Worktree isolation, branching model, and merge behavior |
+| [Parallel Orchestration](./user-docs/parallel-orchestration.md) | Run multiple milestones simultaneously with worker isolation and coordination |
+| [Working in Teams](./user-docs/working-in-teams.md) | Unique milestone IDs, `.gitignore` setup, and shared planning artifacts |
+| [Skills](./user-docs/skills.md) | Bundled skills, skill discovery, and custom skill authoring |
+| [Migration from v1](./user-docs/migration.md) | Migrating `.planning` directories from the original GSD |
+| [Troubleshooting](./user-docs/troubleshooting.md) | Common issues, `/gsd doctor` (real-time visibility v2.40), `/gsd forensics` (full debugger v2.40), and recovery procedures |
+| [Web Interface](./user-docs/web-interface.md) | Browser-based project management with `gsd --web` (v2.41) |
 | [VS Code Extension](../vscode-extension/README.md) | Chat participant, sidebar dashboard, and RPC integration for VS Code |

 ## Architecture & Internals

+Design documents, ADRs, and internal references. Located in [`dev/`](./dev/).
+
 | Guide | Description |
 |-------|-------------|
-| [Architecture Overview](./architecture.md) | System design, extension model, state-on-disk, and dispatch pipeline |
+| [Architecture Overview](./dev/architecture.md) | System design, extension model, state-on-disk, and dispatch pipeline |
 | [Native Engine](../native/README.md) | Rust N-API modules for performance-critical operations |
-| [ADR-001: Branchless Worktree Architecture](./ADR-001-branchless-worktree-architecture.md) | Decision record for the v2.14 git architecture |
-| [ADR-003: Pipeline Simplification](./ADR-003-pipeline-simplification.md) | Research merged into planning, mechanical completion (v2.30) |
-| [ADR-004: Capability-Aware Model Routing](./ADR-004-capability-aware-model-routing.md) | Extend routing from tier/cost selection to task-capability matching |
+| [ADR-001: Branchless Worktree Architecture](./dev/ADR-001-branchless-worktree-architecture.md) | Decision record for the v2.14 git architecture |
+| [ADR-003: Pipeline Simplification](./dev/ADR-003-pipeline-simplification.md) | Research merged into planning, mechanical completion (v2.30) |
+| [ADR-004: Capability-Aware Model Routing](./dev/ADR-004-capability-aware-model-routing.md) | Extend routing from tier/cost selection to task-capability matching |
+| [ADR-007: Model Catalog Split](./dev/ADR-007-model-catalog-split.md) | Split the generated model catalog into per-provider files and stop barrel-exporting provider internals |
+| [ADR-008: GSD Tools over MCP](./dev/ADR-008-gsd-tools-over-mcp-for-provider-parity.md) | Expose GSD workflow tools over MCP for providers that cannot use the native in-process tool registry |
+| [ADR-008: Implementation Plan](./dev/ADR-008-IMPLEMENTATION-PLAN.md) | Implementation plan for ADR-008 |
+| [Context Optimization Opportunities](./dev/pi-context-optimization-opportunities.md) | Analysis of context window usage and optimization strategies |
+| [File System Map](./dev/FILE-SYSTEM-MAP.md) | Complete file system reference |
+| [CI/CD Pipeline](./dev/ci-cd-pipeline.md) | Continuous integration and deployment pipeline |
+| [Frontier Techniques](./dev/FRONTIER-TECHNIQUES.md) | Advanced techniques and research |
+| [PRD: Branchless Worktree](./dev/PRD-branchless-worktree-architecture.md) | Product requirements for branchless worktree architecture |
+| [Agent Knowledge Index](./dev/agent-knowledge-index.md) | Index of agent knowledge resources |

 ## Pi SDK Documentation

-These guides cover the underlying Pi SDK that GSD is built on. Useful if you want to extend GSD or build your own agent application.
+Guides for the underlying Pi SDK that GSD is built on. Located in [`dev/`](./dev/).

 | Guide | Description |
 |-------|-------------|
-| [What is Pi](./what-is-pi/README.md) | Core concepts — modes, agent loop, sessions, tools, providers |
-| [Extending Pi](./extending-pi/README.md) | Building extensions — tools, commands, UI, events, state |
-| [Context & Hooks](./context-and-hooks/README.md) | Context pipeline, hook reference, inter-extension communication |
-| [Pi UI / TUI](./pi-ui-tui/README.md) | Terminal UI components, theming, keyboard input, rendering |
+| [What is Pi](./dev/what-is-pi/README.md) | Core concepts — modes, agent loop, sessions, tools, providers |
+| [Extending Pi](./dev/extending-pi/README.md) | Building extensions — tools, commands, UI, events, state |
+| [Context & Hooks](./dev/context-and-hooks/README.md) | Context pipeline, hook reference, inter-extension communication |
+| [Pi UI / TUI](./dev/pi-ui-tui/README.md) | Terminal UI components, theming, keyboard input, rendering |

 ## Research

 | Guide | Description |
 |-------|-------------|
-| [Building Coding Agents](./building-coding-agents/README.md) | Research notes on agent design — decomposition, context engineering, cost/quality tradeoffs |
+| [Building Coding Agents](./dev/building-coding-agents/README.md) | Research notes on agent design — decomposition, context engineering, cost/quality tradeoffs |
+| [Proposals](./dev/proposals/) | Feature proposals and workflow definitions |
+| [Superpowers](./dev/superpowers/) | Plans and specs for superpower features |
--- a/docs/dev/ADR-001-branchless-worktree-architecture.md
+++ b/docs/dev/ADR-001-branchless-worktree-architecture.md
--- a/docs/dev/ADR-003-pipeline-simplification.md
+++ b/docs/dev/ADR-003-pipeline-simplification.md
--- a/docs/dev/ADR-004-capability-aware-model-routing.md
+++ b/docs/dev/ADR-004-capability-aware-model-routing.md
@ -1,8 +1,8 @@
 # ADR-004: Capability-Aware Model Routing

-**Status:** Proposed (Revised)
+**Status:** Implemented (Phase 2)
 **Date:** 2026-03-26
-**Revised:** 2026-03-26
+**Revised:** 2026-04-03
 **Deciders:** Jeremy McSpadden
 **Related:** ADR-003 (pipeline simplification), [Issue #2655](https://github.com/gsd-build/gsd-2/issues/2655), `docs/dynamic-model-routing.md`

--- a/docs/dev/ADR-005-multi-model-provider-tool-strategy.md
+++ b/docs/dev/ADR-005-multi-model-provider-tool-strategy.md
@ -0,0 +1,67 @@
+# ADR-005: Multi-Model, Multi-Provider, and Tool Strategy
+
+**Status:** Accepted
+**Date:** 2026-03-27
+**Deciders:** Jeremy McSpadden
+**Related:** ADR-004 (capability-aware model routing), ADR-003 (pipeline simplification), [Issue #2790](https://github.com/gsd-build/gsd-2/issues/2790)
+
+## Context
+
+PR #2755 lands capability-aware model routing (ADR-004), extending the router from a one-dimensional complexity-tier system to a two-dimensional system that scores models across 7 capability dimensions. GSD can now intelligently pick the best model for a task from a heterogeneous pool.
+
+But model selection is only one piece of the multi-model puzzle. The system faces structural gaps as users configure diverse provider pools:
+
+1. **Tool compatibility is assumed, not verified** — Every registered tool is sent to every model regardless of provider capabilities.
+2. **No tool-aware model routing** — ADR-004 scores 7 capability dimensions but none encode whether a model can actually use the tools a task requires.
+3. **Provider failover loses context fidelity** — Cross-provider switches silently degrade conversation quality (thinking blocks dropped, tool IDs remapped).
+4. **Tool availability is static across a session** — The same tools are presented regardless of the selected model's capabilities.
+5. **No provider capability registry** — Provider quirks are scattered across `*-shared.ts` files.
+
+## Decision
+
+Introduce a provider capability registry and tool compatibility layer that integrates with ADR-004's capability-aware model router.
+
+### Design Principles
+
+1. **Layered on ADR-004, not replacing it.** Capability scoring remains primary. This adds tool compatibility as a hard constraint.
+2. **Hard constraints filter; soft scores rank.** Tool support is binary — it filters the eligible set before scoring.
+3. **Provider knowledge is declarative, not scattered.** Provider capabilities move to an explicit registry.
+4. **Tool sets adapt to model capabilities.** Active tool set adjusts when the router selects a different model.
+5. **Graceful degradation preserved.** Unknown providers get full tool access — same as today.
+
+### Implementation Phases
+
+1. **Phase 1:** Provider Capabilities Registry (`packages/pi-ai/src/providers/provider-capabilities.ts`)
+2. **Phase 2:** Tool Compatibility Metadata (extend `ToolDefinition` with `compatibility` field)
+3. **Phase 3:** Tool-compatibility filter in routing pipeline + `ProviderSwitchReport` in `transform-messages.ts`
+4. **Phase 4:** `adjustToolSet` extension hook
+
+## Consequences
+
+### Positive
+- Eliminates silent tool failures when routing to incompatible providers
+- Makes cross-provider routing safe by default
+- Provider knowledge becomes queryable (registry vs scattered code)
+- Cross-provider context loss becomes visible via `ProviderSwitchReport`
+
+### Negative
+- More metadata to maintain (provider capabilities, tool compatibility)
+- Tool filtering adds a pipeline step (sub-millisecond, O(models × tools))
+- Risk of over-filtering (mitigated: opt-in per tool, permissive defaults)
+
+### Neutral
+- Existing behavior unchanged without metadata
+- ADR-004 scoring is unmodified
+- Provider implementations simplify over time as registry replaces scattered workarounds
+
+## Appendix: Architecture Reference
+
+| File | Role |
+|------|------|
+| `packages/pi-ai/src/providers/register-builtins.ts` | Provider registration |
+| `packages/pi-ai/src/providers/*-shared.ts` | Provider-specific handling |
+| `packages/pi-ai/src/providers/transform-messages.ts` | Cross-provider normalization |
+| `packages/pi-ai/src/types.ts` | Core types |
+| `packages/pi-coding-agent/src/core/extensions/types.ts` | ToolDefinition, ExtensionAPI |
+| `src/resources/extensions/gsd/model-router.ts` | Capability scoring (ADR-004) |
+| `src/resources/extensions/gsd/auto-model-selection.ts` | Model selection orchestration |
--- a/docs/dev/ADR-007-model-catalog-split.md
+++ b/docs/dev/ADR-007-model-catalog-split.md
@ -0,0 +1,285 @@
+# ADR-007: Model Catalog Split and Provider API Encapsulation
+
+**Status:** Proposed
+**Date:** 2026-04-03
+**Deciders:** Jeremy McSpadden
+**Related:** ADR-004 (capability-aware model routing), [ADR-005](https://github.com/gsd-build/gsd-2/issues/2790), [ADR-006](https://github.com/gsd-build/gsd-2/issues/2995), `packages/pi-ai/src/providers/`, `packages/pi-ai/src/models.ts`
+
+## Context
+
+The model/provider system in `pi-ai` has two structural problems worth fixing — but the system is **not fundamentally broken**. The heavy lifting (lazy SDK imports, registry-based dispatch, extension-based registration) is already well-designed. This ADR targets the two areas where the current design creates real friction without proposing unnecessary runtime changes.
+
+### Current Architecture
+
+```
+stream.ts
+  └─ import "./providers/register-builtins.js"  ← side-effect import at load time
+       ├─ import anthropic.ts            (6.8 KB)
+       ├─ import anthropic-vertex.ts     (3.9 KB)
+       ├─ import openai-completions.ts   (26 KB)
+       ├─ import openai-responses.ts     (6.4 KB)
+       ├─ import openai-codex-responses.ts (29 KB)
+       ├─ import azure-openai-responses.ts (7.8 KB)
+       ├─ import google.ts              (13.6 KB)
+       ├─ import google-vertex.ts       (14.5 KB)
+       ├─ import google-gemini-cli.ts   (30 KB)
+       ├─ import mistral.ts             (18.9 KB)
+       └─ amazon-bedrock.ts             (24 KB) ← only lazy-loaded provider
+
+models.ts
+  └─ import models.generated.ts   ← 13,848 lines, ALL providers, loaded at init
+  └─ import models.custom.ts      ← 197 lines, additional providers
+```
+
+### What Already Works Well
+
+1. **SDK lazy loading.** Every provider file uses `async function getXxxClass()` with a cached dynamic `import()`. The heavy npm packages (`@anthropic-ai/sdk`, `openai`, `@google/genai`, `@aws-sdk/*`, `@mistralai/*`) are only loaded on first API call. This is where the real startup cost would be — and it's already handled.
+
+2. **Registry-based dispatch.** `api-registry.ts` cleanly maps API types to stream functions. Callers use `stream(model, context)` and the registry routes to the right provider. This pattern is sound.
+
+3. **Extension registration.** Ollama and Claude Code CLI register via `registerApiProvider()` at runtime. This extensibility point works correctly.
+
+4. **Provider implementation code loading (~200KB total).** While all providers load eagerly, V8 parses local `.js` files in single-digit milliseconds each. The total parse cost for all provider files is ~10-30ms — not a user-visible bottleneck on a CLI that's about to make a multi-second API call anyway.
+
+### What's Actually Worth Fixing
+
+#### Problem 1: Monolithic model catalog — developer experience, not runtime
+
+`models.generated.ts` is **13,848 lines in a single file**. This creates real friction:
+
+- **PR reviews are painful.** When the generation script runs, the diff is a wall of changes across unrelated providers. Reviewers can't tell what actually changed for a specific provider.
+- **Navigation is slow.** Finding a specific model requires scrolling or searching through thousands of lines of static object literals.
+- **Merge conflicts are frequent.** Any two PRs that touch model generation will conflict on the same monolithic file.
+- **Git blame is useless.** Every line was "last changed" by the generation script, obscuring the history of individual provider additions.
+
+The runtime cost of loading all model definitions is negligible — a Map of ~200 model objects is maybe 50-100KB of heap. The problem is purely about code organization and developer workflow.
+
+#### Problem 2: Barrel export leaks provider internals — API design
+
+`packages/pi-ai/src/index.ts` re-exports every provider module's internals:
+
+```typescript
+export * from "./providers/anthropic.js";
+export * from "./providers/google.js";
+export * from "./providers/google-gemini-cli.js";
+export * from "./providers/google-vertex.js";
+export * from "./providers/mistral.js";
+export * from "./providers/openai-completions.js";
+export * from "./providers/openai-responses.js";
+// ... etc
+```
+
+This is a public API problem:
+
+- **Consumers can bypass the registry.** Any code that does `import { streamAnthropic } from "pi-ai"` takes a direct dependency on an implementation detail that should be internal.
+- **Refactoring is blocked.** Renaming a function inside a provider file is a breaking change because it's re-exported from the package root.
+- **API surface is unnecessarily large.** The public API should be `stream()`, `streamSimple()`, `registerApiProvider()`, model utilities, and types. Provider-specific stream functions are implementation details.
+
+### What Is NOT Worth Changing
+
+**Lazy provider loading (converting `register-builtins.ts` to async on-demand loading).** This was considered and rejected because:
+
+1. **The SDKs are already lazy.** The heavy cost is handled. Provider implementation code (~200KB of local `.js`) parses in ~10-30ms total.
+2. **Async resolution adds complexity to the hot path.** `stream.ts` currently does a synchronous `Map.get()`. Making `resolveApiProvider` async adds a microtask hop to every API call — not just the first. Small but measurable, and for no user-visible gain.
+3. **High blast radius, low payoff.** Touching `stream.ts`, `api-registry.ts`, and the registration lifecycle simultaneously risks regressions in the core streaming path for an optimization that wouldn't show up in profiling.
+4. **Bedrock's lazy loading is a special case, not a template.** It exists because `@aws-sdk/client-bedrock-runtime` is uniquely massive. Generalizing this pattern to providers where the SDK is already lazy-imported doesn't compound the benefit.
+
+## Decision
+
+**Make two targeted improvements to code organization and API hygiene. Do not change runtime loading behavior.**
+
+### Change 1: Split `models.generated.ts` into per-provider files
+
+Replace the monolithic 13,848-line generated file with per-provider files:
+
+```
+packages/pi-ai/src/models/
+  ├── index.ts                  ← re-exports combined registry, same public API
+  ├── generated/
+  │   ├── anthropic.ts          ← Anthropic model definitions
+  │   ├── openai.ts             ← OpenAI model definitions
+  │   ├── google.ts             ← Google model definitions
+  │   ├── mistral.ts            ← Mistral model definitions
+  │   ├── amazon-bedrock.ts     ← Bedrock model definitions
+  │   ├── groq.ts               ← Groq model definitions
+  │   ├── xai.ts                ← xAI model definitions
+  │   ├── cerebras.ts           ← Cerebras model definitions
+  │   ├── openrouter.ts         ← OpenRouter model definitions
+  │   └── ...                   ← one file per provider in the catalog
+  ├── custom.ts                 ← replaces models.custom.ts (unchanged content)
+  └── capability-patches.ts     ← CAPABILITY_PATCHES extracted for clarity
+```
+
+**`models/index.ts` keeps the exact same synchronous public API:**
+
+```typescript
+// models/index.ts
+// GSD-2 — Model registry (split by provider for maintainability)
+
+import { ANTHROPIC_MODELS } from "./generated/anthropic.js";
+import { OPENAI_MODELS } from "./generated/openai.js";
+import { GOOGLE_MODELS } from "./generated/google.js";
+// ... one import per provider
+
+import { CUSTOM_MODELS } from "./custom.js";
+import { CAPABILITY_PATCHES, applyCapabilityPatches } from "./capability-patches.js";
+import type { Api, KnownProvider, Model, Usage } from "../types.js";
+
+// Combine all generated models into single registry — same as today
+const MODELS = {
+  ...ANTHROPIC_MODELS,
+  ...OPENAI_MODELS,
+  ...GOOGLE_MODELS,
+  // ...
+};
+
+// Rest of the file is identical to current models.ts:
+// modelRegistry Map construction, capability patch application,
+// getModel(), getProviders(), getModels(), calculateCost(),
+// supportsXhigh(), modelsAreEqual()
+```
+
+**Key constraint: loading stays synchronous and eager.** All model files are statically imported. The Map is built at module init exactly as today. No async, no lazy loading, no runtime behavior change. This is purely a file organization change.
+
+**Update `generate-models.ts`** to emit one file per provider instead of a single `models.generated.ts`. The script already groups models by provider internally — it just needs to write separate files instead of one.
+
+#### Why this matters
+
+| Before | After |
+|--------|-------|
+| PR diffs show 13K-line file changes | PR diffs scoped to the provider that changed |
+| Merge conflicts on any concurrent model update | Conflicts only when same provider is touched |
+| `git blame` shows "regenerate models" for every line | `git blame` shows per-provider history |
+| Finding a model = search through 13K lines | Finding a model = open the provider file |
+| One reviewer must understand all providers | Reviewers only need context for affected provider |
+
+### Change 2: Stop barrel-exporting provider internals
+
+**Update `packages/pi-ai/src/index.ts`:**
+
+```typescript
+// Before (current — 17 re-exports including all providers):
+export * from "./providers/anthropic.js";
+export * from "./providers/azure-openai-responses.js";
+export * from "./providers/google.js";
+export * from "./providers/google-gemini-cli.js";
+export * from "./providers/google-vertex.js";
+export * from "./providers/mistral.js";
+export * from "./providers/openai-completions.js";
+export * from "./providers/openai-responses.js";
+export * from "./providers/register-builtins.js";
+// ...
+
+// After (clean public API):
+export * from "./api-registry.js";
+export * from "./env-api-keys.js";
+export * from "./models/index.js";
+export * from "./providers/register-builtins.js";  // resetApiProviders() is public
+export * from "./stream.js";
+export * from "./types.js";
+export * from "./utils/event-stream.js";
+export * from "./utils/json-parse.js";
+export type { OAuthAuthInfo, OAuthCredentials, /* ... */ } from "./utils/oauth/types.js";
+export * from "./utils/overflow.js";
+export * from "./utils/typebox-helpers.js";
+export * from "./utils/repair-tool-json.js";
+export * from "./utils/validation.js";
+```
+
+Provider-specific exports (`streamAnthropic`, `streamGoogle`, etc.) are removed from the public API. Any external consumer that imported them directly should use the registry-based `stream()` / `streamSimple()` functions instead — which is how all internal callers already work.
+
+#### Why this matters
+
+- **Enforces the registry pattern.** The correct way to call a provider is `stream(model, context)`. Direct provider function imports create fragile coupling.
+- **Enables future refactoring.** Provider internal function signatures can change without breaking the package API. Today, renaming `streamAnthropic` would be a semver-breaking change.
+- **Reduces API surface.** Consumers see only what they need: `stream`, `streamSimple`, `registerApiProvider`, model utilities, and types.
+
+### What Does NOT Change
+
+- **Runtime behavior** — all providers still load eagerly, same as today
+- **The `Model<TApi>` type system** — all types, interfaces, and generics stay the same
+- **The `ApiProvider` interface** — providers still implement `{ api, stream, streamSimple }`
+- **The `api-registry.ts` registry** — synchronous `Map.get()` dispatch, unchanged
+- **`stream.ts`** — no changes to the streaming entry point
+- **`register-builtins.ts`** — still eagerly imports and registers all providers (only `resetApiProviders` remains in barrel export)
+- **The extension system** — `registerApiProvider()` continues to work for Ollama, Claude Code CLI, etc.
+- **`models.json` user config** — custom models, overrides, provider settings are unaffected
+- **Model discovery** — discovery adapters are already lazy and independent
+- **Model routing** — ADR-004's capability-aware routing is orthogonal
+
+## Consequences
+
+### Positive
+
+1. **Cleaner PRs.** Model catalog changes are scoped to the provider that changed. Reviewers see a 200-line diff in `models/generated/openai.ts` instead of a 13K-line diff in `models.generated.ts`.
+
+2. **Fewer merge conflicts.** Two PRs that update different providers no longer conflict on the same file.
+
+3. **Better navigability.** Developers can jump directly to `models/generated/anthropic.ts` to see Anthropic's model definitions instead of searching through a monolith.
+
+4. **Cleaner package API.** `pi-ai` exports only what consumers need. Provider internals are properly encapsulated.
+
+5. **Future-proofs refactoring.** Provider implementation details can evolve without breaking the public API contract.
+
+6. **Zero runtime risk.** No changes to loading, registration, streaming, or dispatch. The refactor is purely structural.
+
+### Negative
+
+1. **More files.** Instead of 1 generated file + 1 custom file, we'll have ~15-20 generated files. Marginal complexity increase, but each file is focused and small.
+
+2. **Generation script update.** `generate-models.ts` needs to write per-provider files. The script already groups by provider, so this is straightforward but requires testing.
+
+3. **Import audit for barrel export change.** Any code that directly imports `streamAnthropic` (etc.) from `pi-ai` needs to be updated. Based on research, the main in-repo user of these functions is `register-builtins.ts`, which imports the provider modules directly (not through the barrel) and is therefore unaffected. External usage should be minimal.
+
+## Alternatives Considered
+
+### 1. Full lazy provider loading (original ADR-005 proposal)
+
+Make all providers load on-demand via async dynamic imports, generalizing the Bedrock pattern. **Rejected** because:
+- SDK imports are already lazy — the heavy cost is handled
+- Provider implementation parsing is ~10-30ms total — not a bottleneck
+- Adds async complexity to the synchronous stream dispatch hot path
+- High migration effort and regression risk for unmeasurable performance gain
+
+### 2. Plugin architecture with separate npm packages
+
+Move each provider to its own package (`@gsd/provider-anthropic`, etc.). Maximum isolation but dramatically more complex build/release/versioning. Overkill for a monorepo where all providers ship together.
+
+### 3. Do nothing
+
+The current architecture works. This is a valid choice. The split is justified by the developer experience friction (13K-line file, merge conflicts, unusable git blame) and the API hygiene issue (leaking provider internals), not by a runtime problem. If the team is not experiencing these friction points, deferring is reasonable.
+
+## Implementation Plan
+
+### Wave 1: Split Model Catalog (Low-Medium Risk)
+1. Update `generate-models.ts` to emit per-provider files into `models/generated/`
+2. Create `models/index.ts` that imports all per-provider files and builds the same registry
+3. Extract `CAPABILITY_PATCHES` into `models/capability-patches.ts`
+4. Move `models.custom.ts` to `models/custom.ts`
+5. Update imports in `models.ts` (or replace it with the new `models/index.ts`)
+6. Verify `npm run build` and `npm run test` pass
+7. Delete `models.generated.ts` and `models.custom.ts`
+
+### Wave 2: Clean Up Barrel Export (Low Risk)
+1. Remove provider re-exports from `index.ts`
+2. Grep for direct provider imports from `"pi-ai"` across the codebase
+3. Migrate any found usages to use `stream()` / `streamSimple()` through the registry
+4. Verify build and tests
+
+### Wave 3: Validate
+1. Run full test suite
+2. Verify extension registration (Ollama, Claude Code CLI) still works
+3. Verify `resetApiProviders()` test helper still works
+4. Spot-check a few providers end-to-end
+
+## References
+
+- Current model catalog: `packages/pi-ai/src/models.generated.ts` (13,848 lines)
+- Current barrel export: `packages/pi-ai/src/index.ts`
+- Model registry: `packages/pi-ai/src/models.ts`
+- API provider registry: `packages/pi-ai/src/api-registry.ts`
+- Eager registration: `packages/pi-ai/src/providers/register-builtins.ts`
+- Stream dispatch: `packages/pi-ai/src/stream.ts`
+- Generation script: `packages/pi-ai/scripts/generate-models.ts`
+- Extension registration: `packages/pi-coding-agent/src/core/model-registry.ts`
+- ADR-004: `docs/dev/ADR-004-capability-aware-model-routing.md`
--- a/docs/dev/ADR-008-IMPLEMENTATION-PLAN.md
+++ b/docs/dev/ADR-008-IMPLEMENTATION-PLAN.md
@ -0,0 +1,335 @@
+# ADR-008 Implementation Plan
+
+**Related ADR:** [ADR-008-gsd-tools-over-mcp-for-provider-parity.md](./ADR-008-gsd-tools-over-mcp-for-provider-parity.md)
+**Status:** Draft
+**Date:** 2026-04-09
+
+## Objective
+
+Implement the ADR-008 decision by exposing the core GSD workflow tool contract over MCP, then wiring MCP-backed access into provider paths that cannot use the native in-process GSD tool registry directly.
+
+The first usable outcome is:
+
+- a Claude Code-backed execution session can complete a task using canonical GSD tools
+- no manual summary-writing fallback is needed
+- native provider behavior remains unchanged
+
+## Non-Goals
+
+- Replacing native in-process GSD tools with MCP
+- Exporting every historical alias in the first rollout
+- Reworking the entire session-oriented MCP server before proving the workflow-tool surface
+- Supporting every provider path before Claude Code is working end-to-end
+
+## Constraints
+
+- Native and MCP tool paths must share business logic
+- MCP must not bypass write-gate or discussion-gate protections
+- Canonical GSD state transitions must remain DB-backed
+- Provider capability mismatches must fail early, not degrade silently
+
+## Workstreams
+
+### 1. Shared Handler Extraction
+
+Goal: separate business logic from transport registration.
+
+Targets:
+
+- `src/resources/extensions/gsd/bootstrap/db-tools.ts`
+- `src/resources/extensions/gsd/bootstrap/query-tools.ts`
+- `src/resources/extensions/gsd/tools/complete-task.ts`
+- sibling modules used by planning/summary/validation tools
+
+Deliverables:
+
+- transport-neutral handler entrypoints for the minimum workflow tool set
+- thin native registration wrappers that call those handlers
+- thin MCP registration wrappers that call those handlers
+
+Exit criteria:
+
+- native tool behavior is unchanged
+- no workflow tool logic is duplicated in MCP server code
+
+### 2. Workflow-Tool MCP Surface
+
+Goal: add an MCP server surface for real GSD workflow tools, distinct from the current session/read API.
+
+Preferred first-cut tool set:
+
+- `gsd_summary_save`
+- `gsd_decision_save`
+- `gsd_plan_milestone`
+- `gsd_plan_slice`
+- `gsd_plan_task`
+- `gsd_task_complete`
+- `gsd_slice_complete`
+- `gsd_complete_milestone`
+- `gsd_validate_milestone`
+- `gsd_replan_slice`
+- `gsd_reassess_roadmap`
+- `gsd_save_gate_result`
+- `gsd_milestone_status`
+
+Likely files:
+
+- `packages/mcp-server/src/server.ts` or a new sibling server package
+- `packages/mcp-server/src/...` supporting modules
+- shared tool-definition metadata if needed
+
+Decisions to make during implementation:
+
+- extend existing MCP package vs create `packages/mcp-gsd-tools-server`
+- canonical names only vs selected alias export
+- single combined server vs separate “session” and “workflow” server modes
+
+Exit criteria:
+
+- MCP tool discovery shows the minimum tool set
+- each MCP tool invokes the shared handlers successfully in isolation
+
+### 3. Safety and Policy Parity
+
+Goal: ensure MCP mutations enforce the same rules as native tool calls.
+
+Targets:
+
+- `src/resources/extensions/gsd/bootstrap/write-gate.ts`
+- any current tool-call gating hooks tied to native runtime only
+- MCP wrapper layer before shared handler invocation
+
+Required protections:
+
+- discussion gate blocking
+- queue-mode restrictions
+- write-path restrictions
+- canonical DB/file rendering order
+
+Exit criteria:
+
+- MCP cannot be used to bypass native write restrictions
+- blocked native scenarios remain blocked over MCP
+
+### 4. Claude Code Provider Integration
+
+Goal: attach the GSD workflow-tool MCP surface to Claude Code sessions.
+
+Targets:
+
+- `src/resources/extensions/claude-code-cli/stream-adapter.ts`
+- `src/resources/extensions/claude-code-cli/index.ts`
+
+Expected work:
+
+- build a GSD-managed `mcpServers` config for the Claude SDK session
+- attach the workflow MCP server only when the session requires GSD tools
+- keep current Claude Code streaming behavior intact
+
+Exit criteria:
+
+- Claude Code session can discover the GSD workflow MCP tools
+- task execution path can call `gsd_task_complete` successfully
+
+### 5. Capability Detection and Failure Path
+
+Goal: refuse to start tool-dependent workflows when required capabilities are unavailable.
+
+Targets:
+
+- GSD dispatch / auto-mode preflight
+- provider selection and routing checks
+- user-facing compatibility errors
+
+Required behavior:
+
+- if native GSD tools are available, proceed
+- else if GSD workflow MCP tools are available, proceed
+- else fail fast with a precise message
+
+Exit criteria:
+
+- no execution prompt is sent that requires unavailable tools
+- users with only unsupported capability combinations get a hard error, not a fake fallback
+
+### 6. Prompt and Documentation Alignment
+
+Goal: keep the workflow contract strict while removing transport assumptions from docs and runtime messaging.
+
+Targets:
+
+- `src/resources/extensions/gsd/prompts/execute-task.md`
+- related planning/discuss prompts that reference tool availability
+- provider and MCP docs
+
+Rules:
+
+- prompts should keep requiring canonical GSD completion/planning tools
+- prompts should not imply “native in-process tool only”
+- docs should explain native vs MCP-backed fulfillment paths
+
+Exit criteria:
+
+- prompt contract matches runtime reality
+- no provider is told to use a tool surface it cannot access
+
+## Phase Plan
+
+## Phase 1: Spike and Handler Extraction
+
+Scope:
+
+- extract shared logic for `gsd_summary_save`, `gsd_task_complete`, and `gsd_milestone_status`
+- prove native wrappers still work
+
+Why first:
+
+- these tools are enough to test end-to-end completion semantics without migrating the full catalog
+
+Verification:
+
+- existing native tests still pass
+- new unit tests cover shared handler entrypoints directly
+
+## Phase 2: Minimal Workflow MCP Server
+
+Scope:
+
+- expose the three extracted tools over MCP
+- ensure discovery schemas are clean and canonical
+
+Verification:
+
+- MCP discovery returns all three tools
+- direct MCP calls succeed against a fixture project
+
+## Phase 3: Claude Code End-to-End Proof
+
+Scope:
+
+- wire the minimal workflow MCP server into the Claude SDK session
+- run a single execution path that ends with task completion
+
+Verification:
+
+- Claude Code can call `gsd_task_complete`
+- summary file, DB state, and plan checkbox update correctly
+
+## Phase 4: Expand to Full Minimum Workflow Set
+
+Scope:
+
+- add the remaining first-cut tools: decision save, planning and slice replanning, slice completion, milestone completion and validation, roadmap reassessment, and gate result tools
+
+Verification:
+
+- discuss/plan/execute/complete lifecycle works over MCP for the supported flow set
+
+## Phase 5: Capability Gating and UX Hardening
+
+Scope:
+
+- add preflight capability checks
+- add clear error messaging for unsupported setups
+
+Verification:
+
+- unsupported provider/session combinations fail before execution starts
+
+## Phase 6: Prompt and Doc Cleanup
+
+Scope:
+
+- align prompts and docs with the new transport-neutral contract
+
+Verification:
+
+- prompt references are accurate
+- docs describe the supported architecture and limitations
+
+## File-Level Starting Map
+
+High-probability files for the first implementation:
+
+- `src/resources/extensions/gsd/bootstrap/db-tools.ts`
+- `src/resources/extensions/gsd/bootstrap/query-tools.ts`
+- `src/resources/extensions/gsd/bootstrap/write-gate.ts`
+- `src/resources/extensions/gsd/tools/complete-task.ts`
+- `src/resources/extensions/claude-code-cli/stream-adapter.ts`
+- `src/resources/extensions/claude-code-cli/index.ts`
+- `packages/mcp-server/src/server.ts`
+- `packages/mcp-server/src/session-manager.ts`
+- `packages/mcp-server/README.md`
+- `src/resources/extensions/gsd/prompts/execute-task.md`
+
+## Testing Strategy
+
+### Unit
+
+- shared handlers
+- MCP wrapper adapters
+- gating / capability-check helpers
+
+### Integration
+
+- direct MCP tool invocation against fixture projects
+- native tool invocation regression coverage
+- Claude Code provider path with MCP attached
+
+### End-to-End
+
+- plan or execute a small fixture task and complete it through canonical GSD tools
+- confirm DB row, rendered summary, and plan state stay in sync
+
+## Risks
+
+### Risk 1: Logic Drift
+
+If native and MCP wrappers each evolve their own behavior, parity will collapse quickly.
+
+Mitigation:
+
+- shared handler extraction before broad MCP exposure
+
+### Risk 2: Safety Regression
+
+If MCP becomes a side door around native gating, the architecture is worse than before.
+
+Mitigation:
+
+- centralize or reuse gating checks before shared handler invocation
+
+### Risk 3: Overly Broad First Rollout
+
+Exporting every tool and alias immediately increases scope and test burden.
+
+Mitigation:
+
+- ship a minimal workflow tool set first
+
+### Risk 4: Claude SDK Session Wiring Complexity
+
+Attaching MCP servers dynamically may expose edge cases around cwd, permissions, or subprocess lifecycle.
+
+Mitigation:
+
+- prove a narrow spike with the three Phase 1 tools (`gsd_summary_save`, `gsd_task_complete`, `gsd_milestone_status`) before expanding
+
+## Exit Criteria for ADR-008
+
+ADR-008 is considered implemented when:
+
+1. Claude Code-backed execution can use canonical GSD workflow tools over MCP.
+2. Native provider behavior remains intact.
+3. Shared handlers back both native and MCP invocation.
+4. Gating and state integrity protections apply equally to MCP mutations.
+5. Capability checks prevent prompts from requiring unavailable tools.
+
+## Recommended Next Task
+
+Start with a narrow spike:
+
+1. Extract shared handlers for `gsd_summary_save`, `gsd_task_complete`, and `gsd_milestone_status`.
+2. Expose those tools through a minimal workflow MCP server.
+3. Attach that MCP server to Claude Code sessions.
+4. Prove end-to-end task completion on a fixture project.
--- a/docs/dev/ADR-008-gsd-tools-over-mcp-for-provider-parity.md
+++ b/docs/dev/ADR-008-gsd-tools-over-mcp-for-provider-parity.md
@ -0,0 +1,240 @@
+# ADR-008: Expose GSD Workflow Tools Over MCP for Provider Parity
+
+**Status:** Proposed
+**Date:** 2026-04-09
+**Deciders:** Jeremy McSpadden
+**Related:** ADR-004 (capability-aware model routing), ADR-007 (model catalog split and provider API encapsulation), `src/resources/extensions/gsd/bootstrap/db-tools.ts`, `src/resources/extensions/claude-code-cli/stream-adapter.ts`, `packages/mcp-server/src/server.ts`
+
+## Context
+
+GSD currently has two different tool surfaces:
+
+1. **In-process extension tools** registered directly into the runtime via `pi.registerTool(...)`.
+2. **An external MCP server** that exposes session orchestration and read-only project inspection.
+
+This split is now creating a real provider compatibility problem.
+
+### What exists today
+
+The core GSD workflow tools are internal extension tools. Examples include:
+
+- `gsd_summary_save`
+- `gsd_plan_milestone`
+- `gsd_plan_slice`
+- `gsd_plan_task`
+- `gsd_task_complete` / `gsd_complete_task`
+- `gsd_slice_complete`
+- `gsd_complete_milestone`
+- `gsd_validate_milestone`
+- `gsd_replan_slice`
+- `gsd_reassess_roadmap`
+
+These are registered in `src/resources/extensions/gsd/bootstrap/db-tools.ts` and related bootstrap files. GSD prompts assume these tools are available during discuss, plan, and execute flows.
+
+Separately, `packages/mcp-server/src/server.ts` exposes a different tool surface:
+
+- session control: `gsd_execute`, `gsd_status`, `gsd_result`, `gsd_cancel`, `gsd_query`, `gsd_resolve_blocker`
+- read-only inspection: `gsd_progress`, `gsd_roadmap`, `gsd_history`, `gsd_doctor`, `gsd_captures`, `gsd_knowledge`
+
+That MCP server is useful, but it is **not** a transport for the internal workflow/mutation tools.
+
+### The current failure mode
+
+The Claude Code CLI provider uses the Anthropic Agent SDK through `src/resources/extensions/claude-code-cli/stream-adapter.ts`. That adapter starts a Claude SDK session, but it does not forward the internal GSD tool registry into the SDK session, nor does it attach a GSD MCP server for those tools.
+
+As a result:
+
+- prompts tell the model to call tools like `gsd_complete_task`
+- the tools exist in GSD
+- but Claude Code sessions do not actually receive those tools
+
+This produces a contract mismatch: the model is required to use tools that are unavailable in that provider path.
+
+### Why this matters
+
+This is not a one-off Claude Code bug. It reveals a deeper architectural issue:
+
+- GSD’s core workflow contract is transport-specific
+- prompt authors assume “internal extension tool availability”
+- provider integrations do not all share the same execution surface
+
+If GSD wants provider parity, its workflow tools need a transport-neutral exposure model.
+
+## Decision
+
+**Expose the GSD workflow tool contract over MCP as a first-class transport, and make MCP the compatibility layer for providers that cannot directly access the in-process GSD tool registry.**
+
+This means:
+
+1. GSD will keep its existing in-process tool registration for native runtime use.
+2. GSD will add an MCP execution surface for the same workflow tools.
+3. Both surfaces must call the same underlying business logic.
+4. Provider integrations such as Claude Code will use the MCP surface when they cannot access native in-process tools directly.
+
+The decision is explicitly **not** to replace the native tool system with MCP everywhere. MCP is the parity and portability layer, not the only runtime path.
+
+## Decision Details
+
+### 1. One handler layer, multiple transports
+
+GSD tool behavior must not be implemented twice.
+
+The transport-neutral business logic for workflow tools should be shared by:
+
+- native extension tool registration (`pi.registerTool(...)`)
+- MCP server tool registration
+
+The MCP server should wrap the same handlers used by `db-tools.ts`, `query-tools.ts`, and related modules. This avoids logic drift and keeps validation, DB writes, file rendering, and recovery behavior consistent.
+
+### 2. Add a workflow-tool MCP surface
+
+GSD will expose the workflow tools required for discuss, planning, execution, and completion over MCP.
+
+Initial minimum set:
+
+- `gsd_summary_save`
+- `gsd_decision_save`
+- `gsd_plan_milestone`
+- `gsd_plan_slice`
+- `gsd_plan_task`
+- `gsd_task_complete`
+- `gsd_slice_complete`
+- `gsd_complete_milestone`
+- `gsd_validate_milestone`
+- `gsd_replan_slice`
+- `gsd_reassess_roadmap`
+- `gsd_save_gate_result`
+- selected read/query tools such as `gsd_milestone_status`
+
+Aliases should be treated conservatively. MCP should prefer canonical names unless compatibility requires exposing aliases.
+
+### 3. Preserve safety semantics
+
+The current GSD safety model includes write gates, discussion gates, queue-mode restrictions, and state integrity guarantees.
+
+Those guarantees must continue to apply when tools are invoked over MCP. In particular:
+
+- MCP must not create a path that bypasses write gating
+- MCP mutations must preserve the same DB/file/state invariants as native tools
+- provider-specific fallback behavior must not allow manual summary writing in place of canonical completion tools
+
+### 4. Make provider capability checks explicit
+
+Before dispatching a workflow that requires GSD workflow tools, GSD should check whether the selected provider/session can access the required tool surface.
+
+If a provider cannot access either:
+
+- native in-process GSD tools, or
+- the GSD MCP workflow tool surface
+
+then GSD must fail early with a clear compatibility error rather than allowing execution to continue in a degraded, state-breaking mode.
+
+### 5. Keep the existing session/read MCP server
+
+The existing MCP server in `packages/mcp-server` remains valid. It serves a different purpose:
+
+- remote session orchestration
+- status/result polling
+- filesystem-backed project inspection
+
+The new workflow-tool MCP surface is complementary, not a replacement.
+
+## Alternatives Considered
+
+### Alternative A: Reroute away from Claude Code whenever tool-backed execution is needed
+
+This would fix the immediate failure for multi-provider users, but it does not solve provider parity. It also fails completely for users who only have Claude Code configured.
+
+**Rejected** because it treats the symptom, not the architectural gap.
+
+### Alternative B: Hard-fail Claude Code and require another provider
+
+This is a valid short-term guardrail and may still be used before MCP support is complete.
+
+**Rejected as the long-term architecture** because it permanently excludes a supported provider from first-class GSD execution.
+
+### Alternative C: Inject the internal GSD tool registry directly into the Claude Agent SDK without MCP
+
+This would tightly couple GSD’s internal extension runtime to a provider-specific integration path. It would not generalize well to other providers or external tool clients.
+
+**Rejected** because it creates a provider-specific bridge instead of a transport-neutral contract.
+
+### Alternative D: Replace native GSD tools entirely with MCP
+
+This would simplify the conceptual model, but it would force all runtimes through an external protocol boundary even when the native in-process path is faster and already works well.
+
+**Rejected** because MCP is needed for portability, not because the native tool system is flawed.
+
+## Consequences
+
+### Positive
+
+1. **Provider parity improves.** Providers that can consume MCP tools can participate in full GSD workflow execution.
+2. **The workflow contract becomes transport-neutral.** Prompts can rely on capabilities rather than a specific runtime implementation detail.
+3. **One compatibility story for external clients.** Claude Code, Cursor, and other MCP-capable clients can use the same workflow tool surface.
+4. **Better long-term architecture.** Internal tools and external transports converge on shared handlers instead of diverging implementations.
+
+### Negative
+
+1. **Larger surface area to secure and test.** Mutation tools over MCP are higher risk than read-only inspection tools.
+2. **Migration complexity.** Tool registration, gating, and handler extraction must be refactored carefully.
+3. **Two transport paths must remain aligned.** Native and MCP invocation semantics must stay behaviorally identical.
+
+### Neutral / Tradeoff
+
+The system will now support:
+
+- native in-process tool execution when available
+- MCP-backed tool execution when native access is unavailable
+
+That is more complex than a single-path system, but it is the cost of provider portability without sacrificing native runtime quality.
+
+## Migration Plan
+
+### Phase 1: Extract shared handlers
+
+Refactor workflow tools so MCP and native registration can call the same transport-neutral functions.
+
+Priority targets:
+
+- `gsd_summary_save`
+- `gsd_task_complete`
+- `gsd_plan_milestone`
+- `gsd_plan_slice`
+- `gsd_plan_task`
+
+### Phase 2: Stand up the workflow-tool MCP server
+
+Add a new MCP surface for workflow tool execution. This may extend the existing MCP package or live as a sibling package, but it must be clearly separated from the current session/read API.
+
+### Phase 3: Port safety enforcement
+
+Move or centralize write gates and related policy checks so MCP mutations cannot bypass the existing safety model.
+
+### Phase 4: Attach MCP workflow tools to Claude Code sessions
+
+Update the Claude Code provider integration to pass a GSD-managed `mcpServers` configuration into the Claude Agent SDK session when required.
+
+### Phase 5: Add provider capability gating
+
+Before tool-dependent flows begin, verify that the active provider can access the required GSD workflow tools via either native registration or MCP.
+
+### Phase 6: Update prompts and docs
+
+Prompt contracts should remain strict about using canonical GSD completion/planning tools, but documentation and runtime messaging must no longer assume that only native in-process tool registration satisfies that contract.
+
+## Validation
+
+Success is defined by all of the following:
+
+1. A Claude Code-backed execution session can complete a task using canonical GSD workflow tools without manual summary writing.
+2. Native provider behavior remains unchanged.
+3. MCP-invoked workflow tools produce the same DB updates, rendered artifacts, and state transitions as native tool calls.
+4. Write-gate and discussion-gate protections still hold under MCP invocation.
+5. When required capabilities are unavailable, GSD fails early with a precise compatibility error.
+
+## Scope Notes
+
+This ADR establishes the architectural direction. It does **not** require full MCP exposure of every historical alias or every auxiliary tool in the first implementation.
+
+The first implementation should prioritize the minimum workflow tool set needed to make discuss/plan/execute/complete flows work safely for MCP-capable providers.
--- a/docs/dev/FILE-SYSTEM-MAP.md
+++ b/docs/dev/FILE-SYSTEM-MAP.md
--- a/docs/dev/FRONTIER-TECHNIQUES.md
+++ b/docs/dev/FRONTIER-TECHNIQUES.md
--- a/docs/dev/PRD-branchless-worktree-architecture.md
+++ b/docs/dev/PRD-branchless-worktree-architecture.md
--- a/docs/dev/agent-knowledge-index.md
+++ b/docs/dev/agent-knowledge-index.md
--- a/docs/dev/architecture.md
+++ b/docs/dev/architecture.md
@ -14,7 +14,7 @@ gsd (CLI binary)
          ├─ resource-loader.ts  Syncs bundled extensions + agents to ~/.gsd/agent/
          └─ src/resources/
              ├─ extensions/gsd/    Core GSD extension
-              ├─ extensions/...     12 supporting extensions
+              ├─ extensions/...     23 supporting extensions
              ├─ agents/            scout, researcher, worker
              ├─ AGENTS.md          Agent routing instructions
              └─ GSD-WORKFLOW.md    Manual bootstrap protocol
@ -73,6 +73,12 @@ Every dispatch creates a new agent session. The LLM starts with a clean context
 | **Remote Questions** | Discord, Slack, and Telegram integration for headless question routing |
 | **TTSR** | Tool-triggered system rules — conditional context injection based on tool usage |
 | **Universal Config** | Discovery of existing AI tool configurations (Claude Code, Cursor, Windsurf, etc.) |
+| **AWS Auth** | AWS credential management and authentication |
+| **Claude Code CLI** | Claude Code CLI integration |
+| **cmux** | Context multiplexing for multi-session coordination |
+| **GitHub Sync** | GitHub issue and PR synchronization |
+| **Ollama** | Local Ollama model integration |
+| **Shared** | Shared utilities across extensions |

 ## Bundled Agents

@ -122,7 +128,7 @@ The auto mode dispatch pipeline:

 Phase skipping (from token profile) gates steps 2-3: if a phase is skipped, the corresponding unit type is never dispatched.

-## Key Modules (v2.33)
+## Key Modules (v2.67)

 | Module | Purpose |
 |--------|---------|
@ -160,3 +166,11 @@ Phase skipping (from token profile) gates steps 2-3: if a phase is skipped, the
 | `memory-extractor.ts` | Extract reusable knowledge from session transcripts |
 | `memory-store.ts` | Persistent memory store for cross-session knowledge |
 | `queue-order.ts` | Milestone queue ordering |
+| `context-masker.ts` | Context masking for model routing optimization |
+| `phase-anchor.ts` | Phase anchoring for dispatch pipeline |
+| `slice-parallel-orchestrator.ts` | Slice-level parallelism with dependency-aware dispatch |
+| `slice-parallel-eligibility.ts` | Slice parallel eligibility checks |
+| `slice-parallel-conflict.ts` | Slice parallel conflict detection |
+| `preferences-models.ts` | Model preferences configuration |
+| `preferences-validation.ts` | Preferences validation |
+| `preferences-types.ts` | Preferences type definitions |
--- a/docs/dev/building-coding-agents/01-work-decomposition.md
+++ b/docs/dev/building-coding-agents/01-work-decomposition.md
--- a/docs/dev/building-coding-agents/02-what-to-keep-discard-from-human-engineering.md
+++ b/docs/dev/building-coding-agents/02-what-to-keep-discard-from-human-engineering.md
--- a/docs/dev/building-coding-agents/03-state-machine-context-management.md
+++ b/docs/dev/building-coding-agents/03-state-machine-context-management.md
--- a/docs/dev/building-coding-agents/04-optimal-storage-for-project-context.md
+++ b/docs/dev/building-coding-agents/04-optimal-storage-for-project-context.md
--- a/docs/dev/building-coding-agents/05-parallelization-strategy.md
+++ b/docs/dev/building-coding-agents/05-parallelization-strategy.md
--- a/docs/dev/building-coding-agents/06-maximizing-agent-autonomy-superpowers.md
+++ b/docs/dev/building-coding-agents/06-maximizing-agent-autonomy-superpowers.md
--- a/docs/dev/building-coding-agents/07-system-prompt-llm-vs-deterministic-split.md
+++ b/docs/dev/building-coding-agents/07-system-prompt-llm-vs-deterministic-split.md
--- a/docs/dev/building-coding-agents/08-speed-optimization.md
+++ b/docs/dev/building-coding-agents/08-speed-optimization.md
--- a/docs/dev/building-coding-agents/09-top-10-tips-for-a-world-class-agent.md
+++ b/docs/dev/building-coding-agents/09-top-10-tips-for-a-world-class-agent.md
--- a/docs/dev/building-coding-agents/10-top-10-pitfalls-to-avoid.md
+++ b/docs/dev/building-coding-agents/10-top-10-pitfalls-to-avoid.md
--- a/docs/dev/building-coding-agents/11-god-tier-context-engineering.md
+++ b/docs/dev/building-coding-agents/11-god-tier-context-engineering.md
--- a/docs/dev/building-coding-agents/12-handling-ambiguity-contradiction.md
+++ b/docs/dev/building-coding-agents/12-handling-ambiguity-contradiction.md
--- a/docs/dev/building-coding-agents/13-long-running-memory-fidelity.md
+++ b/docs/dev/building-coding-agents/13-long-running-memory-fidelity.md
--- a/docs/dev/building-coding-agents/14-multi-agent-semantic-conflict-resolution.md
+++ b/docs/dev/building-coding-agents/14-multi-agent-semantic-conflict-resolution.md
--- a/docs/dev/building-coding-agents/15-legacy-code-brownfield-onboarding.md
+++ b/docs/dev/building-coding-agents/15-legacy-code-brownfield-onboarding.md
--- a/docs/dev/building-coding-agents/16-encoding-taste-aesthetics.md
+++ b/docs/dev/building-coding-agents/16-encoding-taste-aesthetics.md
--- a/docs/dev/building-coding-agents/17-irreversible-operations-safety-architecture.md
+++ b/docs/dev/building-coding-agents/17-irreversible-operations-safety-architecture.md
--- a/docs/dev/building-coding-agents/18-the-handoff-problem-agent-human-maintainability.md
+++ b/docs/dev/building-coding-agents/18-the-handoff-problem-agent-human-maintainability.md
--- a/docs/dev/building-coding-agents/19-when-to-scrap-and-start-over.md
+++ b/docs/dev/building-coding-agents/19-when-to-scrap-and-start-over.md
--- a/docs/dev/building-coding-agents/20-error-taxonomy-routing.md
+++ b/docs/dev/building-coding-agents/20-error-taxonomy-routing.md
--- a/docs/dev/building-coding-agents/21-cost-quality-tradeoff-model-routing.md
+++ b/docs/dev/building-coding-agents/21-cost-quality-tradeoff-model-routing.md
--- a/docs/dev/building-coding-agents/22-cross-project-learning-reusable-intelligence.md
+++ b/docs/dev/building-coding-agents/22-cross-project-learning-reusable-intelligence.md
--- a/docs/dev/building-coding-agents/23-evolution-across-project-scale.md
+++ b/docs/dev/building-coding-agents/23-evolution-across-project-scale.md
--- a/docs/dev/building-coding-agents/24-security-trust-boundaries.md
+++ b/docs/dev/building-coding-agents/24-security-trust-boundaries.md
--- a/docs/dev/building-coding-agents/25-designing-for-non-technical-users-vibe-coders.md
+++ b/docs/dev/building-coding-agents/25-designing-for-non-technical-users-vibe-coders.md
--- a/docs/dev/building-coding-agents/26-cross-cutting-themes-where-all-4-models-converge.md
+++ b/docs/dev/building-coding-agents/26-cross-cutting-themes-where-all-4-models-converge.md
--- a/docs/dev/building-coding-agents/README.md
+++ b/docs/dev/building-coding-agents/README.md
--- a/docs/dev/ci-cd-pipeline.md
+++ b/docs/dev/ci-cd-pipeline.md
--- a/docs/dev/context-and-hooks/01-the-context-pipeline.md
+++ b/docs/dev/context-and-hooks/01-the-context-pipeline.md
--- a/docs/dev/context-and-hooks/02-hook-reference.md
+++ b/docs/dev/context-and-hooks/02-hook-reference.md
--- a/docs/dev/context-and-hooks/03-context-injection-patterns.md
+++ b/docs/dev/context-and-hooks/03-context-injection-patterns.md
--- a/docs/dev/context-and-hooks/04-message-types-and-llm-visibility.md
+++ b/docs/dev/context-and-hooks/04-message-types-and-llm-visibility.md
--- a/docs/dev/context-and-hooks/05-inter-extension-communication.md
+++ b/docs/dev/context-and-hooks/05-inter-extension-communication.md
--- a/docs/dev/context-and-hooks/06-advanced-patterns-from-source.md
+++ b/docs/dev/context-and-hooks/06-advanced-patterns-from-source.md
--- a/docs/dev/context-and-hooks/07-the-system-prompt-anatomy.md
+++ b/docs/dev/context-and-hooks/07-the-system-prompt-anatomy.md
--- a/docs/dev/context-and-hooks/README.md
+++ b/docs/dev/context-and-hooks/README.md
--- a/docs/dev/extending-pi/01-what-are-extensions.md
+++ b/docs/dev/extending-pi/01-what-are-extensions.md
--- a/docs/dev/extending-pi/02-architecture-mental-model.md
+++ b/docs/dev/extending-pi/02-architecture-mental-model.md
--- a/docs/dev/extending-pi/03-getting-started.md
+++ b/docs/dev/extending-pi/03-getting-started.md
--- a/docs/dev/extending-pi/04-extension-locations-discovery.md
+++ b/docs/dev/extending-pi/04-extension-locations-discovery.md
--- a/docs/dev/extending-pi/05-extension-structure-styles.md
+++ b/docs/dev/extending-pi/05-extension-structure-styles.md
--- a/docs/dev/extending-pi/06-the-extension-lifecycle.md
+++ b/docs/dev/extending-pi/06-the-extension-lifecycle.md
--- a/docs/dev/extending-pi/07-events-the-nervous-system.md
+++ b/docs/dev/extending-pi/07-events-the-nervous-system.md
--- a/docs/dev/extending-pi/08-extensioncontext-what-you-can-access.md
+++ b/docs/dev/extending-pi/08-extensioncontext-what-you-can-access.md
--- a/docs/dev/extending-pi/09-extensionapi-what-you-can-do.md
+++ b/docs/dev/extending-pi/09-extensionapi-what-you-can-do.md
--- a/docs/dev/extending-pi/10-custom-tools-giving-the-llm-new-abilities.md
+++ b/docs/dev/extending-pi/10-custom-tools-giving-the-llm-new-abilities.md
--- a/docs/dev/extending-pi/11-custom-commands-user-facing-actions.md
+++ b/docs/dev/extending-pi/11-custom-commands-user-facing-actions.md
--- a/docs/dev/extending-pi/12-custom-ui-visual-components.md
+++ b/docs/dev/extending-pi/12-custom-ui-visual-components.md
--- a/docs/dev/extending-pi/13-state-management-persistence.md
+++ b/docs/dev/extending-pi/13-state-management-persistence.md
--- a/docs/dev/extending-pi/14-custom-rendering-controlling-what-the-user-sees.md
+++ b/docs/dev/extending-pi/14-custom-rendering-controlling-what-the-user-sees.md
--- a/docs/dev/extending-pi/15-system-prompt-modification.md
+++ b/docs/dev/extending-pi/15-system-prompt-modification.md
--- a/docs/dev/extending-pi/16-compaction-session-control.md
+++ b/docs/dev/extending-pi/16-compaction-session-control.md
--- a/docs/dev/extending-pi/17-model-provider-management.md
+++ b/docs/dev/extending-pi/17-model-provider-management.md
--- a/docs/dev/extending-pi/18-remote-execution-tool-overrides.md
+++ b/docs/dev/extending-pi/18-remote-execution-tool-overrides.md
--- a/docs/dev/extending-pi/19-packaging-distribution.md
+++ b/docs/dev/extending-pi/19-packaging-distribution.md
--- a/docs/dev/extending-pi/20-mode-behavior.md
+++ b/docs/dev/extending-pi/20-mode-behavior.md
--- a/docs/dev/extending-pi/21-error-handling.md
+++ b/docs/dev/extending-pi/21-error-handling.md
--- a/docs/dev/extending-pi/22-key-rules-gotchas.md
+++ b/docs/dev/extending-pi/22-key-rules-gotchas.md
--- a/docs/dev/extending-pi/23-file-reference-documentation.md
+++ b/docs/dev/extending-pi/23-file-reference-documentation.md
--- a/docs/dev/extending-pi/24-file-reference-example-extensions.md
+++ b/docs/dev/extending-pi/24-file-reference-example-extensions.md
--- a/docs/dev/extending-pi/25-slash-command-subcommand-patterns.md
+++ b/docs/dev/extending-pi/25-slash-command-subcommand-patterns.md
--- a/docs/dev/extending-pi/README.md
+++ b/docs/dev/extending-pi/README.md
--- a/docs/dev/pi-context-optimization-opportunities.md
+++ b/docs/dev/pi-context-optimization-opportunities.md
@ -0,0 +1,198 @@
+# pi-coding-agent: Context Optimization Opportunities
+
+> **Status**: Research only — not planned for implementation.
+> Scope: `packages/pi-coding-agent` and `packages/pi-agent-core` infrastructure.
+> These changes would benefit every consumer of the pi engine, not just GSD.
+
+---
+
+## 1. Prompt Caching (`cache_control`) — Highest Impact
+
+**Current state**: Every LLM call re-pays full input token cost for the system prompt, tool definitions, and context files. No `cache_control` breakpoints are set anywhere in the API call path.
+
+**Opportunity**: Anthropic's KV cache delivers 90% cost reduction on cached tokens (0.1x input rate). Claude Code achieves 92–98% cache hit rates by placing stable content before volatile content.
+
+**Where to instrument** (`packages/pi-ai/src/providers/anthropic.ts`):
+- Set `cache_control: { type: "ephemeral" }` on the last tool definition block
+- Set `cache_control` after the static system prompt sections (base boilerplate + context files)
+- Leave the per-turn user message uncached
+
+**Critical constraint**: The cache breakpoint must be placed *after* all static content and *before* any dynamic content (timestamps, per-request variables). Moving a timestamp before a cache breakpoint defeats it on every call.
+
+**Cache hierarchy**: Tools → system → messages. Changing a tool definition invalidates system and message caches. Tool definitions should be sorted deterministically (alphabetically) to prevent spurious cache misses.
+
+**Expected savings**: 80–90% reduction in input token cost for multi-turn sessions (the dominant cost pattern in GSD auto-mode).
+
+---
+
+## 2. Observation Masking in the Message Pipeline
+
+**Current state**: `agent-loop.ts` passes the full `context.messages` array to the LLM on every turn. Tool results from 50 turns ago are re-read in full on every subsequent call. The `transformContext` hook exists on `AgentContext` and fires before every LLM call, but has no default implementation — extensions are responsible for any pruning.
+
+**Opportunity**: Replace old tool result content with lightweight placeholders after N turns. JetBrains Research tested this on SWE-bench Verified (500 tasks, up to 250-turn trajectories) and found:
+- 50%+ cost reduction vs. unmanaged history
+- Performance matched or slightly exceeded LLM summarization
+- Zero overhead (no extra LLM call required)
+
+**Proposed implementation** (default `transformContext` in `pi-agent-core`):
+```typescript
+// Keep last KEEP_RECENT_TURNS verbatim; mask older tool results
+const KEEP_RECENT_TURNS = 8;
+
+function defaultObservationMask(messages: AgentMessage[]): AgentMessage[] {
+  const cutoff = findTurnBoundary(messages, KEEP_RECENT_TURNS);
+  return messages.map((m, i) => {
+    if (i >= cutoff) return m;
+    if (m.type === "toolResult" || m.type === "bashExecution") {
+      return { ...m, content: "[result masked — within summarized history]", excludeFromContext: false };
+    }
+    return m;
+  });
+}
+```
+
+**Compaction interaction**: Observation masking reduces the token accumulation rate, so a session takes longer to reach the compaction threshold. The two mechanisms are complementary — masking handles the steady state, compaction handles the rare deep-session case.
+
+---
+
+## 3. Earlier Compaction Threshold
+
+**Current state** (`packages/pi-coding-agent/src/core/constants.ts`):
+```typescript
+COMPACTION_RESERVE_TOKENS = 16_384   // triggers at contextWindow - 16K
+COMPACTION_KEEP_RECENT_TOKENS = 20_000
+```
+
+For a 200K context window, compaction fires at ~183K tokens (200,000 − 16,384 = 183,616) — roughly 92% utilization.
+
+**Problem**: Context drift (not raw exhaustion) causes ~65% of enterprise agent failures. According to Zylos production data, performance degrades measurably beyond ~30K tokens. The current threshold lets sessions run in that degraded range for a long stretch before compaction fires.
+
+**Opportunity**: Lower the trigger to 70% utilization. For a 200K window, this means compacting at ~140K tokens — 43K tokens earlier.
+
+```typescript
+// Proposed
+COMPACTION_THRESHOLD_PERCENT = 0.70   // fire at 70% of contextWindow
+COMPACTION_RESERVE_TOKENS = contextWindow * (1 - COMPACTION_THRESHOLD_PERCENT)
+```
+
+**Trade-off**: More frequent compactions, each happening earlier when there's more "fresh" content to keep. Summary quality improves because less material needs to be discarded at each cut.
+
+---
+
+## 4. Tool Result Truncation at Write Time
+
+**Current state**: `TOOL_RESULT_MAX_CHARS = 2_000` in `constants.ts`, but this limit is only applied *during compaction summarization*, not when the tool result enters the message store. A bash result returning 50KB of log output is stored and re-sent verbatim until compaction fires.
+
+**Opportunity**: Truncate at write time in `messages.ts` → `convertToLlm()` or in the tool result handler. Two strategies:
+
+- **Hard truncation**: Slice at N chars, append `"\n[truncated — {original_length} chars]"`. Simple, zero overhead.
+- **Semantic head/tail**: Keep first 500 chars (context, command echo) + last 1000 chars (final output, errors). Better for bash results where the end contains the error.
+
+**Recommendation**: Semantic head/tail as the default, configurable per tool type. File read results benefit from head; bash/test output benefits from head+tail.
+
+---
+
+## 5. Context File Deduplication and Trim
+
+**Current state** (`packages/pi-coding-agent/src/core/resource-loader.ts`, lines 84–109):
+- Searches from `~/.gsd/agent/` → ancestor dirs → cwd
+- Deduplicates by *file path* but not by *content*
+- Entire file content concatenated verbatim into system prompt — no trimming, no summarization
+
+**Anti-pattern**: A project with AGENTS.md at 3 ancestor levels (repo root, workspace, home) injects all three in full. If they share common boilerplate, that content is re-injected multiple times.
+
+**Opportunities**:
+1. **Content deduplication**: Hash paragraph-level chunks; skip any chunk already seen in a previously-loaded file
+2. **Section-aware loading**: Parse `## ` headings in AGENTS.md; only include sections relevant to the current task type (e.g., `## Testing` section only when running tests)
+3. **Token budget enforcement**: If total context files exceed N tokens, summarize oldest/most-distant file rather than including verbatim
+
+---
+
+## 6. Skill Content Lazy Loading and Summarization
+
+**Current state**: When `/skill:name` is invoked, the full skill file content is injected inline as `<skill>...</skill>` in the user message. No chunking, no summarization. A 10KB skill file adds ~2,500 tokens to that turn.
+
+**Opportunity**:
+- **Cached skill injection**: Today, when the same skill is used across multiple turns (rare but possible), it is re-injected in full each time. Mark the skill block with `cache_control` after the first injection so that subsequent turns read it from cache.
+- **Skill digest mode**: Inject a 200-token summary of the skill on first reference; full content only if the model requests it via a `get_skill_detail` tool call. Reduces cost for skills that don't end up being followed.
+- **Skill prefetching**: Before a known long session (e.g., auto-mode start), pre-inject all likely skills with `cache_control` so they're cached for the entire session.
+
+---
+
+## 7. Token Estimation Accuracy
+
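+**Current state** (`compaction.ts`, line 216): `chars / 4` heuristic. This underestimates token count for English prose (~3.5 chars/token, so the real count is ~14% higher than `chars / 4`) and underestimates it further for code with short identifiers or Unicode, where each token covers even fewer characters.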
+
+**Opportunity**: Use a proper tokenizer.
+- `@anthropic-ai/tokenizer` (tiktoken-compatible, ships with the SDK) — accurate but ~5ms per call
+- Tiered approach: use chars/4 for display; use proper tokenizer only for compaction threshold decisions (where accuracy matters)
+
+**Impact**: More accurate compaction timing, fewer unnecessary compactions, slightly better `COMPACTION_KEEP_RECENT_TOKENS` boundary placement.
+
+---
+
+## 8. Format: Markdown over XML for Internal Context
+
+**Current state**: The message pipeline uses `<skill>`, `<summary>`, `<compaction>` XML wrappers in several places. System prompt sections are largely prose Markdown.
+
+**Findings**: XML tags carry 15–40% more tokens than equivalent Markdown for the same semantic content, due to paired open/close tags. However, Claude was optimized for XML and shows higher accuracy on tasks requiring precise section parsing.
+
+**Recommendation**: Audit XML usage in the pipeline and convert to Markdown where the content is:
+- Non-nested (flat instructions, status messages)
+- Human-readable rather than machine-parsed by the model
+- Not requiring precise boundary detection
+
+Keep XML for: few-shot examples with ambiguous boundaries, skill content (requires precise isolation from surrounding text), compaction summaries that the model must treat as authoritative history.
+
+**Estimated savings**: 5–15% reduction in system prompt token count.
+
+---
+
+## 9. Dynamic Tool Set Delivery
+
+**Current state**: All tool definitions are included in every LLM request. Tool descriptions consume 60–80% of input tokens in static configurations. As new extensions register tools, the baseline grows linearly.
+
+**Opportunity** (higher complexity): Implement the three-function Dynamic Toolset pattern:
+1. `search_tools(query)` — semantic search over tool catalog
+2. `describe_tools(ids[])` — fetch full schemas on demand
+3. `execute_tool(id, params)` — unchanged execution
+
+Speakeasy measured 91–97% token reduction with 100% task success rate. Trade-off: 2–3x more tool calls, ~50% longer wall time. Net cost dramatically lower.
+
+**Feasibility for pi**: The tool registry (`packages/pi-coding-agent/src/core/tool-registry.ts`) already stores tool metadata separately from definitions. The primary engineering work is the semantic search index and the `describe_tools` / `search_tools` tool implementations.
+
+---
+
+## 10. Cost Attribution and Per-Phase Reporting
+
+**Current state**: `SessionManager.getUsageTotals()` accumulates cost across the entire session. No per-phase or per-agent breakdown is stored. Cost visibility is limited to the footer total and `GSD_SHOW_TOKEN_COST=1` per-turn display.
+
+**Opportunity**: Emit structured cost events that extensions can subscribe to:
+```typescript
+interface CostCheckpointEvent {
+  type: "cost_checkpoint";
+  label: string;          // "discuss-phase", "execute-slice-3"
+  deltaTokens: Usage;     // tokens since last checkpoint
+  cumulativeTokens: Usage;
+  cumulativeCost: number;
+}
+```
+
+The GSD extension could consume these events to surface per-milestone cost in `/gsd stats` and flag milestones that are disproportionately expensive — enabling budget-aware planning.
+
+---
+
+## Implementation Ordering (if pursued)
+
+| Priority | Item | Effort | Expected Impact |
+|----------|------|--------|-----------------|
+| 1 | Prompt caching (`cache_control`) | Low | 80–90% input cost reduction |
+| 2 | Earlier compaction threshold (70%) | Trivial | Reduces drift in long sessions |
+| 3 | Tool result truncation at write time | Low | Reduces context bloat between compactions |
+| 4 | Context file deduplication | Medium | Variable — high for multi-level AGENTS.md setups |
+| 5 | Observation masking (default `transformContext`) | Medium | 50%+ on long-running agents |
+| 6 | Token estimation (proper tokenizer) | Low | Accuracy improvement, minor cost impact |
+| 7 | Markdown over XML audit | Low | 5–15% system prompt reduction |
+| 8 | Skill caching with `cache_control` | Low | Meaningful for skill-heavy sessions |
+| 9 | Dynamic tool set delivery | High | 90%+ on large tool catalogs; major architecture change |
+| 10 | Per-phase cost attribution events | Medium | Visibility only; enables future budget routing |
--- a/docs/dev/pi-ui-tui/01-the-ui-architecture.md
+++ b/docs/dev/pi-ui-tui/01-the-ui-architecture.md
--- a/docs/dev/pi-ui-tui/02-the-component-interface-foundation-of-everything.md
+++ b/docs/dev/pi-ui-tui/02-the-component-interface-foundation-of-everything.md
--- a/docs/dev/pi-ui-tui/03-entry-points-how-ui-gets-on-screen.md
+++ b/docs/dev/pi-ui-tui/03-entry-points-how-ui-gets-on-screen.md
--- a/docs/dev/pi-ui-tui/04-built-in-dialog-methods.md
+++ b/docs/dev/pi-ui-tui/04-built-in-dialog-methods.md
--- a/docs/dev/pi-ui-tui/05-persistent-ui-elements.md
+++ b/docs/dev/pi-ui-tui/05-persistent-ui-elements.md
--- a/docs/dev/pi-ui-tui/06-ctx-ui-custom-full-custom-components.md
+++ b/docs/dev/pi-ui-tui/06-ctx-ui-custom-full-custom-components.md
--- a/docs/dev/pi-ui-tui/07-built-in-components-the-building-blocks.md
+++ b/docs/dev/pi-ui-tui/07-built-in-components-the-building-blocks.md
--- a/docs/dev/pi-ui-tui/08-high-level-components-from-pi-coding-agent.md
+++ b/docs/dev/pi-ui-tui/08-high-level-components-from-pi-coding-agent.md
--- a/docs/dev/pi-ui-tui/09-keyboard-input-how-to-handle-keys.md
+++ b/docs/dev/pi-ui-tui/09-keyboard-input-how-to-handle-keys.md
--- a/docs/dev/pi-ui-tui/10-line-width-the-cardinal-rule.md
+++ b/docs/dev/pi-ui-tui/10-line-width-the-cardinal-rule.md
--- a/docs/dev/pi-ui-tui/11-theming-colors-and-styles.md
+++ b/docs/dev/pi-ui-tui/11-theming-colors-and-styles.md
--- a/docs/dev/pi-ui-tui/12-overlays-floating-modals-and-panels.md
+++ b/docs/dev/pi-ui-tui/12-overlays-floating-modals-and-panels.md
--- a/docs/dev/pi-ui-tui/13-custom-editors-replacing-the-input.md
+++ b/docs/dev/pi-ui-tui/13-custom-editors-replacing-the-input.md
--- a/docs/dev/pi-ui-tui/14-tool-rendering-custom-tool-display.md
+++ b/docs/dev/pi-ui-tui/14-tool-rendering-custom-tool-display.md
--- a/Show more
+++ b/Show more