fix: wire bundled extension inventory

2026-05-05 00:03:47 +02:00 · 2026-05-05 00:03:47 +02:00 · 959e15ef42
commit 959e15ef42
parent 47c806d733
27 changed files with 1022 additions and 74 deletions
--- a/BUILD_PLAN_MILESTONE_MAP.md
+++ b/BUILD_PLAN_MILESTONE_MAP.md
@ -0,0 +1,66 @@
+# BUILD_PLAN → Milestone Map
+
+Every BUILD_PLAN.md tier item mapped to an existing or needed milestone. Items without milestones are gaps that must be resolved before v3 planning is complete.
+
+**Rule**: Every new milestone must cite which BUILD_PLAN tier/item it implements (D015).
+
+---
+
+## Tier 0 — Pi-mono ports → **M006**
+
+| Item | Status |
+|---|---|
+| HTML export escape | ✅ `701ec8fb8` |
+| Empty tools array fix | ✅ `58b1d7c60` |
+| Anthropic SSE unknown events | DEFERRED |
+| Long local-LLM SSE timeout | ✅ `d0907b6d8` |
+| Bedrock inference profile | ✅ `7c487bb60` |
+| Symlinked packages dedup | TODO |
+| ctx.ui.setWorkingVisible() | TODO |
+| Cloudflare Workers AI | TODO |
+| Azure Cognitive Services | TODO |
+| Custom Anthropic SSE parser | TODO |
+
+## Tier 0.5 — gsd-2 ports → **M006 + M007**
+
+17 items total, all mapped to M006 (critical fixes) or M007 (features). See BUILD_PLAN.md for full list.
+
+## Tier 1 — ESSENTIAL → **GAP: no milestones**
+
+| Item | Action |
+|---|---|
+| 1.1 Vault secret resolver | Create milestone |
+| 1.2 Singularity Memory integration | Create milestone |
+| 1.3 Schema reconciliation | Fold into M013 or create milestone |
+| 1.4 Config schema alignment | Fold into M013 or create milestone |
+
+## Tier 2 — STRONG → **ALL MAPPED**
+
+| Item | Milestone | Slice | References BUILD_PLAN |
+|---|---|---|---|
+| 2.1 Persistent agents v1 | M012 | S01-S05 | ✅ |
+| 2.2 Doc-sync sub-step | M009 | S08 | ✅ |
+| 2.3 Intent chapters | M013 | S08 | ✅ |
+| 2.4 PhaseReview 3-pass | M016 | S01-S02 | ✅ |
+| 2.5 turn_status marker | M013 | S09 | ✅ |
+| 2.6 last_error cap | M013 | S10 | ✅ |
+| 2.7 cost_micro_usd | M013 | S11 | ✅ |
+
+## Tier 3 — NICE → **Deferred by design**
+
+No milestones until Tier 2 completes.
+
+## Tier 4 — DEFER → **Deferred by design**
+
+No milestones until a deployment demands it.
+
+## Summary
+
+| Tier | Mapped | Gap |
+|---|---|---|
+| Tier 0 | 10 (M006) | 0 |
+| Tier 0.5 | 17 (M006+M007) | 0 |
+| **Tier 1** | **0** | **4 — need milestones** |
+| Tier 2 | 7 (M012, M009, M013, M016) | 0 |
+| Tier 3 | 0 | deferred |
+| Tier 4 | 0 | deferred |
--- a/FEATURES.md
+++ b/FEATURES.md
@ -348,15 +348,10 @@ The section below is generated from source declarations so this overview can sta
 Generated from `packages/mcp-server/src/workflow-tools.ts`.

 - `sf_complete_milestone`
- `sf_complete_slice`
- `sf_complete_task`
 - `sf_decision_save`
- `sf_generate_milestone_id`
 - `sf_journal_query`
- `sf_milestone_complete`
 - `sf_milestone_generate_id`
 - `sf_milestone_status`
- `sf_milestone_validate`
 - `sf_plan_milestone`
 - `sf_plan_slice`
 - `sf_plan_task`
@ -364,17 +359,11 @@ Generated from `packages/mcp-server/src/workflow-tools.ts`.
 - `sf_replan_slice`
 - `sf_requirement_save`
 - `sf_requirement_update`
- `sf_roadmap_reassess`
- `sf_save_decision`
 - `sf_save_gate_result`
- `sf_save_requirement`
 - `sf_skip_slice`
 - `sf_slice_complete`
- `sf_slice_replan`
 - `sf_summary_save`
 - `sf_task_complete`
- `sf_task_plan`
- `sf_update_requirement`
 - `sf_validate_milestone`

 ### Bundled Extensions
@ -382,20 +371,31 @@ Generated from `packages/mcp-server/src/workflow-tools.ts`.
 Generated from `src/resources/extensions/*/extension-manifest.json`.

 - `async-jobs` — [extension-manifest.json](src/resources/extensions/async-jobs/extension-manifest.json)
+- `aws-auth` — [extension-manifest.json](src/resources/extensions/aws-auth/extension-manifest.json)
 - `bg-shell` — [extension-manifest.json](src/resources/extensions/bg-shell/extension-manifest.json)
 - `browser-tools` — [extension-manifest.json](src/resources/extensions/browser-tools/extension-manifest.json)
+- `claude-code-cli` — [extension-manifest.json](src/resources/extensions/claude-code-cli/extension-manifest.json)
 - `context7` — [extension-manifest.json](src/resources/extensions/context7/extension-manifest.json)
 - `genai-proxy` — [extension-manifest.json](src/resources/extensions/genai-proxy/extension-manifest.json)
+- `github-sync` — [extension-manifest.json](src/resources/extensions/github-sync/extension-manifest.json)
 - `google-search` — [extension-manifest.json](src/resources/extensions/google-search/extension-manifest.json)
+- `guardrails` — [extension-manifest.json](src/resources/extensions/guardrails/extension-manifest.json)
 - `mac-tools` — [extension-manifest.json](src/resources/extensions/mac-tools/extension-manifest.json)
+- `mcp-client` — [extension-manifest.json](src/resources/extensions/mcp-client/extension-manifest.json)
 - `ollama` — [extension-manifest.json](src/resources/extensions/ollama/extension-manifest.json)
 - `remote-questions` — [extension-manifest.json](src/resources/extensions/remote-questions/extension-manifest.json)
 - `search-the-web` — [extension-manifest.json](src/resources/extensions/search-the-web/extension-manifest.json)
 - `sf` — [extension-manifest.json](src/resources/extensions/sf/extension-manifest.json)
+- `sf-inturn-guard` — [extension-manifest.json](src/resources/extensions/sf-inturn-guard/extension-manifest.json)
+- `sf-notify` — [extension-manifest.json](src/resources/extensions/sf-notify/extension-manifest.json)
+- `sf-permissions` — [extension-manifest.json](src/resources/extensions/sf-permissions/extension-manifest.json)
+- `sf-tui` — [extension-manifest.json](src/resources/extensions/sf-tui/extension-manifest.json)
+- `sf-usage-bar` — [extension-manifest.json](src/resources/extensions/sf-usage-bar/extension-manifest.json)
 - `slash-commands` — [extension-manifest.json](src/resources/extensions/slash-commands/extension-manifest.json)
 - `subagent` — [extension-manifest.json](src/resources/extensions/subagent/extension-manifest.json)
 - `ttsr` — [extension-manifest.json](src/resources/extensions/ttsr/extension-manifest.json)
 - `universal-config` — [extension-manifest.json](src/resources/extensions/universal-config/extension-manifest.json)
+- `vectordrive` — [extension-manifest.json](src/resources/extensions/vectordrive/extension-manifest.json)
 - `voice` — [extension-manifest.json](src/resources/extensions/voice/extension-manifest.json)

 ### Search Providers
@ -440,6 +440,9 @@ Generated from `packages/pi-ai/src/types.ts` (`KnownProvider`).
 - `vercel-ai-gateway`
 - `xai`
 - `xiaomi`
+- `xiaomi-token-plan-ams`
+- `xiaomi-token-plan-cn`
+- `xiaomi-token-plan-sgp`
 - `zai`

 <!-- GENERATED_FEATURE_INVENTORY_END -->
--- a/scripts/check-sf-extension-inventory.mjs
+++ b/scripts/check-sf-extension-inventory.mjs
@ -1,9 +1,10 @@
 import { execFileSync } from "node:child_process";
-import { readFileSync } from "node:fs";
+import { existsSync, readFileSync, readdirSync } from "node:fs";
 import { join, resolve } from "node:path";

 const repoRoot = resolve(import.meta.dirname, "..");
 const sfRoot = join(repoRoot, "src", "resources", "extensions", "sf");
+const extensionsRoot = join(repoRoot, "src", "resources", "extensions");
 const manifestPath = join(sfRoot, "extension-manifest.json");

 const RESOURCE_SOURCE_RE = /\.(?:js|mjs|cjs|json|md|yaml|yml|d\.ts)$/;
@ -25,6 +26,14 @@ function read(path) {
 	return readFileSync(path, "utf8");
 }

+/**
+ * Read and parse a JSON file, reporting any failure as `null`.
+ *
+ * Both read errors and parse errors are swallowed on purpose: callers only
+ * need to know whether usable JSON was obtained.
+ */
+function readJsonOrNull(path) {
+	let parsed = null;
+	try {
+		parsed = JSON.parse(read(path));
+	} catch {
+		// Unreadable or malformed JSON is treated as "no data".
+	}
+	return parsed;
+}
+
 function uniqueSorted(values) {
 	return [...new Set(values)].sort((a, b) => a.localeCompare(b));
 }
@ -57,6 +66,31 @@ function untrackedResourceSources() {
 		.filter((path) => RESOURCE_SOURCE_RE.test(path));
 }

+/**
+ * Decide whether a directory under the extensions root looks loadable.
+ *
+ * When package.json carries an object-valued `pi` field, that field is
+ * authoritative: the directory is loadable only if `pi.extensions` is a
+ * non-empty array. Otherwise the directory is loadable when it contains an
+ * index.js or index.ts entry point.
+ */
+function isLoadableExtensionDir(dirPath) {
+	const packageJsonPath = join(dirPath, "package.json");
+	const pkg = existsSync(packageJsonPath) ? readJsonOrNull(packageJsonPath) : null;
+	const piConfig = pkg?.pi;
+	if (piConfig && typeof piConfig === "object") {
+		const declared = piConfig.extensions;
+		return Array.isArray(declared) && declared.length > 0;
+	}
+	// No usable `pi` declaration: fall back to conventional entry points.
+	return ["index.js", "index.ts"].some((entry) => existsSync(join(dirPath, entry)));
+}
+
+/**
+ * List the names of loadable extension directories that have no
+ * extension-manifest.json, sorted with localeCompare.
+ */
+function manifestlessLoadableExtensions() {
+	const missing = [];
+	for (const entry of readdirSync(extensionsRoot, { withFileTypes: true })) {
+		if (!entry.isDirectory()) continue;
+		const dirPath = join(extensionsRoot, entry.name);
+		if (existsSync(join(dirPath, "extension-manifest.json"))) continue;
+		if (isLoadableExtensionDir(dirPath)) missing.push(entry.name);
+	}
+	return missing.sort((a, b) => a.localeCompare(b));
+}
+
 function parseManifest() {
 	const raw = JSON.parse(read(manifestPath));
 	return {
@ -146,6 +180,16 @@ function main() {
 		);
 	}

+	const manifestlessExtensions = manifestlessLoadableExtensions();
+	if (manifestlessExtensions.length > 0) {
+		failures.push(
+			failSection(
+				`Loadable bundled extensions missing extension-manifest.json (${manifestlessExtensions.length})`,
+				manifestlessExtensions,
+			),
+		);
+	}
+
 	const manifest = parseManifest();
 	const registeredTools = parseRegisteredTools();
 	const missingManifestTools = registeredTools.filter((tool) => !manifest.tools.includes(tool));
--- a/scripts/generate-features-inventory.mjs
+++ b/scripts/generate-features-inventory.mjs
@ -1,4 +1,4 @@
-import { readFileSync, readdirSync, writeFileSync } from "node:fs";
+import { existsSync, readFileSync, readdirSync, writeFileSync } from "node:fs";
 import { join, relative, resolve } from "node:path";

 const __filename = import.meta.filename;
@ -9,7 +9,10 @@ const featuresPath = join(repoRoot, "FEATURES.md");
 const workflowToolsPath = join(repoRoot, "packages", "mcp-server", "src", "workflow-tools.ts");
 const providersPath = join(repoRoot, "packages", "pi-ai", "src", "types.ts");
 const extensionsRoot = join(repoRoot, "src", "resources", "extensions");
-const searchProviderPath = join(repoRoot, "src", "resources", "extensions", "search-the-web", "provider.ts");
+const searchProviderPath = resolveExistingPath(
+  join(repoRoot, "src", "resources", "extensions", "search-the-web", "provider.ts"),
+  join(repoRoot, "src", "resources", "extensions", "search-the-web", "provider.js"),
+);

 export const START = "<!-- GENERATED_FEATURE_INVENTORY_START -->";
 export const END = "<!-- GENERATED_FEATURE_INVENTORY_END -->";
@ -18,6 +21,14 @@ function uniqueSorted(values) {
  return [...new Set(values)].sort((a, b) => a.localeCompare(b));
 }

+/**
+ * Return the first candidate path that exists on disk.
+ *
+ * Candidates are checked in argument order. Throws an Error listing every
+ * candidate when none of them exists.
+ */
+function resolveExistingPath(...paths) {
+  for (const candidate of paths) {
+    if (existsSync(candidate)) return candidate;
+  }
+  throw new Error(`None of these inventory source paths exist: ${paths.join(", ")}`);
+}
+
 export function parseWorkflowToolNames() {
  const src = readFileSync(workflowToolsPath, "utf8");
  const matches = [...src.matchAll(/server\.tool\(\s*"([^"]+)"/g)].map((m) => m[1]);
@ -50,12 +61,17 @@ export function parseBundledExtensions() {

 export function parseSearchProviders() {
  const src = readFileSync(searchProviderPath, "utf8");
+  const preferencesMatch = src.match(/const VALID_PREFERENCES = new Set\(\[([\s\S]*?)\]\)/);
+  const preferenceProviders = preferencesMatch
+    ? [...preferencesMatch[1].matchAll(/["']([^"']+)["']/g)].map((m) => m[1])
+    : [];
  const providers = [
+    ...preferenceProviders,
    ...src.matchAll(/providers\.push\('([^']+)'\)/g),
    ...src.matchAll(/provider\?: '([^']+)'/g),
    ...src.matchAll(/\|\s*"([^"]+)"/g),
  ]
-    .map((m) => m[1])
+    .map((m) => (typeof m === "string" ? m : m[1]))
    .filter((p) => p !== "combosearch" && p !== "minimax" && p !== "auto");
  return uniqueSorted(providers);
 }
--- a/src/resources/extensions/aws-auth/extension-manifest.json
+++ b/src/resources/extensions/aws-auth/extension-manifest.json
@ -0,0 +1,11 @@
+{
+	"id": "aws-auth",
+	"name": "AWS Auth Refresh",
+	"version": "1.0.0",
+	"description": "Refresh AWS credentials after Bedrock authentication failures and retry the turn",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"hooks": ["agent_end"]
+	}
+}
--- a/src/resources/extensions/claude-code-cli/extension-manifest.json
+++ b/src/resources/extensions/claude-code-cli/extension-manifest.json
@ -0,0 +1,9 @@
+{
+	"id": "claude-code-cli",
+	"name": "Claude Code CLI",
+	"version": "1.0.0",
+	"description": "Registers the Claude Code CLI as a local model provider",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {}
+}
--- a/src/resources/extensions/github-sync/extension-manifest.json
+++ b/src/resources/extensions/github-sync/extension-manifest.json
@ -0,0 +1,11 @@
+{
+	"id": "github-sync",
+	"name": "GitHub Sync",
+	"version": "1.0.0",
+	"description": "Sync SF milestones, slices, and tasks to GitHub tracking artifacts",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"commands": ["github-sync"]
+	}
+}
--- a/src/resources/extensions/guardrails/extension-manifest.json
+++ b/src/resources/extensions/guardrails/extension-manifest.json
@ -0,0 +1,12 @@
+{
+	"id": "guardrails",
+	"name": "Guardrails",
+	"version": "1.0.0",
+	"description": "Redact sensitive outputs and block dangerous file or shell actions",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"commands": ["safegit", "safegit-level", "safegit-status", "yolo"],
+		"hooks": ["session_start", "tool_call", "tool_result"]
+	}
+}
--- a/src/resources/extensions/mcp-client/extension-manifest.json
+++ b/src/resources/extensions/mcp-client/extension-manifest.json
@ -0,0 +1,12 @@
+{
+	"id": "mcp-client",
+	"name": "MCP Client",
+	"version": "1.0.0",
+	"description": "Discover MCP servers and register their tools as first-class agent tools",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"tools": ["mcp_call", "mcp_discover", "mcp_servers"],
+		"hooks": ["session_start", "session_shutdown", "session_switch"]
+	}
+}
--- a/src/resources/extensions/sf-inturn-guard/extension-manifest.json
+++ b/src/resources/extensions/sf-inturn-guard/extension-manifest.json
@ -0,0 +1,12 @@
+{
+	"id": "sf-inturn-guard",
+	"name": "SF In-Turn Guard",
+	"version": "1.0.0",
+	"description": "Detect duplicate tool calls and short retry loops inside one agent turn",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"commands": ["guard-status", "guard-toggle"],
+		"hooks": ["agent_start", "turn_start", "tool_call", "tool_result", "agent_end"]
+	}
+}
--- a/src/resources/extensions/sf-inturn-guard/index.js
+++ b/src/resources/extensions/sf-inturn-guard/index.js
@ -0,0 +1,248 @@
+/**
+ * sf-inturn-guard/index.js — Break degenerate agent loops within a single turn.
+ *
+ * Detects:
+ *   1. Exact duplicate loop — same tool + params ≥3 times in a row
+ *   2. Error retry loop — same tool fails ≥2 times with same params
+ *   3. Yo-yo pattern — alternating read/edit on the same file within 30 s
+ *
+ * Detectors 1 and 3 run when a tool call is recorded. Detector 2 runs when a
+ * failing tool result arrives, because a call is only known to have failed
+ * once its result is in.
+ *
+ * Intervenes via pi.sendMessage steer, which interrupts the current stream
+ * after the current tool finishes.
+ */
+export default function sfInturnGuard(pi) {
+  // Size cap of the rolling buffer of recorded tool calls.
+  const MAX_BUFFER = 10;
+  // Minimum number of turns between two interventions.
+  const COOLDOWN_TURNS = 3;
+  // A four-call read/edit alternation must fit in this window (ms) to count as a yo-yo.
+  const YOYO_WINDOW_MS = 30000;
+
+  const state = {
+    calls: [],
+    turnId: 0,
+    interventionCount: 0,
+    lastInterventionTurn: -1,
+    enabled: true,
+  };
+
+  // ── Helpers ───────────────────────────────────────────────────────────────
+
+  /**
+   * Record a tool call in the rolling buffer and return the new entry.
+   * The input is stored as a JSON string so calls can be compared by value.
+   */
+  function addCall(toolName, input) {
+    const entry = {
+      name: toolName,
+      input: JSON.stringify(input ?? {}),
+      ts: Date.now(),
+      isError: false, // flipped by the tool_result handler when the call fails
+      turnId: state.turnId,
+    };
+    state.calls.push(entry);
+    if (state.calls.length > MAX_BUFFER) state.calls.shift();
+    return entry;
+  }
+
+  /** Return up to the last `n` recorded calls that belong to the current turn. */
+  function recentInTurn(n) {
+    return state.calls.filter((c) => c.turnId === state.turnId).slice(-n);
+  }
+
+  /**
+   * Whether an intervention may be sent now: the guard must be enabled and,
+   * after the first intervention, COOLDOWN_TURNS turns must have passed.
+   */
+  function canIntervene() {
+    if (!state.enabled) return false;
+    if (state.interventionCount === 0) return true;
+    return (state.turnId - state.lastInterventionTurn) >= COOLDOWN_TURNS;
+  }
+
+  /** Send the loop-guard message to the agent. */
+  function sendIntervention(reason, detail) {
+    const text =
+      `⚠️ Loop guard: ${reason}\n\n` +
+      `${detail}\n\n` +
+      `Stop repeating the same action. If stuck, try a different approach or ask for help.`;
+    pi.sendMessage(
+      {
+        customType: "sf-inturn-guard",
+        content: text,
+        display: `[loop-guard] ${reason}`,
+      },
+      { triggerTurn: true },
+    );
+  }
+
+  /**
+   * Run the given detectors in order and, on the first hit, record and send
+   * one intervention. Does nothing while the guard is disabled or cooling down.
+   */
+  function runDetectors(detectors, ctx) {
+    if (!canIntervene()) return;
+    let detection = null;
+    for (const detect of detectors) {
+      detection = detect();
+      if (detection) break;
+    }
+    if (!detection) return;
+    state.interventionCount++;
+    state.lastInterventionTurn = state.turnId;
+    if (ctx?.hasUI) {
+      ctx.ui.notify(`🛑 ${detection.reason}`, "warning");
+    }
+    sendIntervention(detection.reason, detection.detail);
+  }
+
+  // ── Detectors ─────────────────────────────────────────────────────────────
+  // Each detector returns `{ reason, detail }` on a hit and `null` otherwise.
+
+  /** Detect the newest call being the 3rd (or later) identical call in a row. */
+  function detectDuplicateLoop() {
+    const recent = recentInTurn(5);
+    if (recent.length < 3) return null;
+    const last = recent[recent.length - 1];
+    let count = 1;
+    for (let i = recent.length - 2; i >= 0; i--) {
+      if (recent[i].name === last.name && recent[i].input === last.input) {
+        count++;
+      } else {
+        break;
+      }
+    }
+    if (count >= 3) {
+      return {
+        reason: `${last.name} repeated ${count}× with identical args`,
+        detail: `You've called \`${last.name}\` ${count} times with the same parameters. The result won't change.`,
+      };
+    }
+    return null;
+  }
+
+  /**
+   * Detect ≥2 consecutive failures of the same tool with the same input.
+   * Requires the newest call to already be marked failed, so it is only
+   * meaningful after the tool_result handler has processed that call.
+   */
+  function detectErrorRetry() {
+    const recent = recentInTurn(4);
+    if (recent.length < 2) return null;
+    const last = recent[recent.length - 1];
+    if (!last.isError) return null;
+    let count = 1;
+    for (let i = recent.length - 2; i >= 0; i--) {
+      const c = recent[i];
+      if (
+        c.name === last.name &&
+        c.input === last.input &&
+        c.isError
+      ) {
+        count++;
+      } else {
+        break;
+      }
+    }
+    if (count >= 2) {
+      return {
+        reason: `${last.name} failed ${count}× with identical args`,
+        detail: `\`${last.name}\` has failed ${count} times with the same parameters. Fix the root cause before retrying.`,
+      };
+    }
+    return null;
+  }
+
+  /**
+   * Detect four consecutive calls alternating between `read` and `edit` on
+   * the same `path` within YOYO_WINDOW_MS. Scans every four-call window in
+   * the last six in-turn calls, newest window first.
+   */
+  function detectYoYo() {
+    const recent = recentInTurn(6);
+    if (recent.length < 4) return null;
+    const n = recent.length;
+    const isReadEdit = (name) => name === "read" || name === "edit";
+    for (let start = n - 4; start >= 0; start--) {
+      const [c0, c1, c2, c3] = [
+        recent[start],
+        recent[start + 1],
+        recent[start + 2],
+        recent[start + 3],
+      ];
+      if (c3.ts - c0.ts > YOYO_WINDOW_MS) continue;
+      if (
+        !isReadEdit(c0.name) ||
+        !isReadEdit(c1.name) ||
+        !isReadEdit(c2.name) ||
+        !isReadEdit(c3.name)
+      ) {
+        continue;
+      }
+      // A-B-A-B shape with A !== B, i.e. read/edit/read/edit or the reverse.
+      if (c0.name === c2.name && c1.name === c3.name && c0.name !== c1.name) {
+        try {
+          const p0 = JSON.parse(c0.input).path ?? "";
+          const p1 = JSON.parse(c1.input).path ?? "";
+          const p2 = JSON.parse(c2.input).path ?? "";
+          const p3 = JSON.parse(c3.input).path ?? "";
+          if (p0 && p0 === p1 && p1 === p2 && p2 === p3) {
+            return {
+              reason: `Yo-yo pattern on \`${p0}\``,
+              detail: `You're alternating \`${c0.name}\` and \`${c1.name}\` on the same file. Decide the change, make it once, and move on.`,
+            };
+          }
+        } catch {
+          /* ignore parse errors */
+        }
+      }
+    }
+    return null;
+  }
+
+  // ── Event hooks ───────────────────────────────────────────────────────────
+
+  // A new agent run starts from a clean slate; `enabled` is left untouched so
+  // the guard-toggle setting carries over.
+  pi.on("agent_start", () => {
+    state.calls = [];
+    state.turnId = 0;
+    state.interventionCount = 0;
+    state.lastInterventionTurn = -1;
+  });
+
+  // The buffer needs no trimming here: addCall already caps it at MAX_BUFFER,
+  // and recentInTurn filters by turnId.
+  pi.on("turn_start", () => {
+    state.turnId++;
+  });
+
+  pi.on("tool_call", async (event, ctx) => {
+    addCall(event.toolName, event.input);
+
+    // Footer status
+    if (ctx.hasUI) {
+      const recent = recentInTurn(3)
+        .map((c) => c.name)
+        .join(" → ");
+      ctx.ui.setStatus("sf-inturn-guard", `${recent} | T${state.turnId}`);
+    }
+
+    // The call just recorded has no result yet, so only the detectors that
+    // do not depend on failure status can fire here.
+    runDetectors([detectDuplicateLoop, detectYoYo], ctx);
+  });
+
+  pi.on("tool_result", async (event, ctx) => {
+    if (!event.isError) return;
+    // Attribute the failure to the most recent in-turn call of the same tool.
+    let marked = false;
+    for (let i = state.calls.length - 1; i >= 0; i--) {
+      const c = state.calls[i];
+      if (c.turnId === state.turnId && c.name === event.toolName) {
+        c.isError = true;
+        marked = true;
+        break;
+      }
+    }
+    // Failure status is now known, so the error-retry detector can run.
+    if (marked) runDetectors([detectErrorRetry], ctx);
+  });
+
+  pi.on("agent_end", () => {
+    state.calls = [];
+    if (state.interventionCount > 0) {
+      pi.appendEntry("sf-inturn-guard", {
+        interventions: state.interventionCount,
+        lastInterventionTurn: state.lastInterventionTurn,
+      });
+    }
+  });
+
+  // ── Commands ──────────────────────────────────────────────────────────────
+
+  pi.registerCommand("guard-toggle", {
+    description: "Toggle in-turn loop guard on/off",
+    handler: async (_args, ctx) => {
+      state.enabled = !state.enabled;
+      const status = state.enabled ? "ON" : "OFF";
+      if (ctx.hasUI) {
+        ctx.ui.notify(
+          `🛡 In-turn guard ${status}`,
+          state.enabled ? "info" : "warning",
+        );
+      }
+    },
+  });
+
+  pi.registerCommand("guard-status", {
+    description: "Show in-turn guard statistics",
+    handler: async (_args, ctx) => {
+      const lines = [
+        "╭─ In-Turn Guard ─╮",
+        `Enabled: ${state.enabled ? "yes" : "no"}`,
+        `Turn: ${state.turnId}`,
+        `Interventions: ${state.interventionCount}`,
+        `Last: ${state.lastInterventionTurn >= 0 ? `turn ${state.lastInterventionTurn}` : "never"}`,
+        "╰─────────────────╯",
+      ];
+      if (ctx.hasUI) ctx.ui.notify(lines.join("\n"), "info");
+    },
+  });
+}
--- a/src/resources/extensions/sf-notify/extension-manifest.json
+++ b/src/resources/extensions/sf-notify/extension-manifest.json
@ -0,0 +1,19 @@
+{
+	"id": "sf-notify",
+	"name": "SF Notify",
+	"version": "1.0.0",
+	"description": "Send completion and attention notifications for long-running agent work",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"commands": [
+			"notify-beep",
+			"notify-focus",
+			"notify-save-global",
+			"notify-say",
+			"notify-status",
+			"notify-threshold"
+		],
+		"hooks": ["session_start", "agent_start", "tool_result", "agent_end"]
+	}
+}
--- a/src/resources/extensions/sf-permissions/extension-manifest.json
+++ b/src/resources/extensions/sf-permissions/extension-manifest.json
@ -0,0 +1,12 @@
+{
+	"id": "sf-permissions",
+	"name": "SF Permissions",
+	"version": "1.0.0",
+	"description": "Enforce layered permission levels for shell, file, and skill-scoped tool use",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"commands": ["permission", "permission-mode"],
+		"hooks": ["session_start", "before_agent_start", "agent_end", "tool_call", "tool_result"]
+	}
+}
--- a/src/resources/extensions/sf-tui/extension-manifest.json
+++ b/src/resources/extensions/sf-tui/extension-manifest.json
@ -0,0 +1,23 @@
+{
+	"id": "sf-tui",
+	"name": "SF TUI",
+	"version": "1.0.0",
+	"description": "Adds SF-specific header, footer, prompt stash, color, emoji, and marketplace UI controls",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"commands": [
+			"color",
+			"color-char",
+			"color-config",
+			"color-next",
+			"color-set",
+			"emoji",
+			"emoji-config",
+			"emoji-history",
+			"emoji-set"
+		],
+		"hooks": ["session_start", "session_switch", "before_agent_start", "tool_result", "agent_start", "agent_end"],
+		"shortcuts": ["Ctrl+Alt+H", "Ctrl+Shift+H", "Ctrl+Alt+M"]
+	}
+}
--- a/src/resources/extensions/sf-usage-bar/extension-manifest.json
+++ b/src/resources/extensions/sf-usage-bar/extension-manifest.json
@ -0,0 +1,11 @@
+{
+	"id": "sf-usage-bar",
+	"name": "SF Usage Bar",
+	"version": "1.0.0",
+	"description": "Shows configured AI provider usage windows and service status",
+	"tier": "bundled",
+	"requires": { "platform": ">=2.29.0" },
+	"provides": {
+		"commands": ["usage"]
+	}
+}
--- a/src/resources/extensions/sf/milestone-quality.js
+++ b/src/resources/extensions/sf/milestone-quality.js
@ -5,10 +5,33 @@ const PLACEHOLDER_VALUES = new Set([
    "missing weighted synthesis.",
    "missing confidence by area.",
 ]);
+
+/**
+ * Minimum character count for a milestone ceremony field to be considered
+ * "purposeful". Vision meeting fields below this length typically lack the
+ * specific evidence, risks, or subsystem references each role requires.
+ *
+ * Purpose: prevent rubber-stamp ceremony fields from passing validation.
+ * Consumer: inspectMilestoneRoadmapMarkdown.
+ */
+const MIN_PURPOSEFUL_LENGTH = 80;
+
 function isMeaningful(value) {
    const normalized = (value ?? "").trim().toLowerCase();
    return normalized.length > 0 && !PLACEHOLDER_VALUES.has(normalized);
 }
+
+/**
+ * Report whether a milestone ceremony field is long enough to count as
+ * purposeful: its trimmed text must reach MIN_PURPOSEFUL_LENGTH characters.
+ * A missing (null/undefined) value is treated as the empty string.
+ *
+ * Purpose: enforce depth contracts for vision meeting roles.
+ * Consumer: inspectMilestoneRoadmapMarkdown (advisory warnings).
+ */
+function isPurposefulVisionField(value) {
+    return (value ?? "").trim().length >= MIN_PURPOSEFUL_LENGTH;
+}
 function normalizeVisionMeetingRoute(value) {
    const firstLine = (value ?? "")
        .split(/\r?\n/)
@ -138,6 +161,21 @@ export function inspectMilestoneRoadmapMarkdown(content) {
    const blockingIssue = getVisionAlignmentBlockingIssue(meeting);
    if (blockingIssue)
        issues.push(blockingIssue);
+    // Advisory depth checks — warn on shallow ceremony fields that pass presence
+    if (isMeaningful(meeting.partner) && !isPurposefulVisionField(meeting.partner))
+        issues.push("shallow partner review — cite specific evidence (source files, prior milestones, production incidents)");
+    if (isMeaningful(meeting.combatant) && !isPurposefulVisionField(meeting.combatant))
+        issues.push("shallow combatant review — name specific risks with concrete failure scenarios");
+    if (isMeaningful(meeting.architect) && !isPurposefulVisionField(meeting.architect))
+        issues.push("shallow architect review — name affected subsystems, coupling points, and dependency rationale");
+    if (isMeaningful(meeting.researcher) && !isPurposefulVisionField(meeting.researcher))
+        issues.push("shallow researcher review — cite at least one external source or comparable system");
+    if (isMeaningful(meeting.weightedSynthesis) && !isPurposefulVisionField(meeting.weightedSynthesis))
+        issues.push("shallow weighted synthesis — must identify the strongest additions, cuts, and sequencing changes");
+    // BUILD_PLAN reference check (advisory — not a hard block for legacy milestones)
+    if (!content.includes("BUILD_PLAN")) {
+        issues.push("missing BUILD_PLAN.md reference — new milestones should cite which BUILD_PLAN tier/item they implement (D015)");
+    }
    return { issues };
 }
 export function getMilestonePlanBlockingIssue(content) {
--- a/src/resources/extensions/sf/plan-quality.js
+++ b/src/resources/extensions/sf/plan-quality.js
@ -6,10 +6,35 @@ const PLACEHOLDER_VALUES = new Set([
    "missing combatant review.",
    "missing architect review.",
 ]);
+
+/**
+ * Minimum character count for a ceremony field to be considered "purposeful"
+ * (not just present). Below this threshold, the contribution is too short to
+ * contain specific evidence, risks, or subsystem references.
+ *
+ * Purpose: prevent rubber-stamp ceremony fields from passing validation.
+ * Consumer: inspectSlicePlanMarkdown, hasStructuredPlanningMeeting.
+ */
+const MIN_PURPOSEFUL_LENGTH = 80;
+
 function isMeaningfulReviewBody(value) {
    const normalized = (value ?? "").trim().toLowerCase();
    return normalized.length > 0 && !PLACEHOLDER_VALUES.has(normalized);
 }
+
+/**
+ * Report whether a ceremony field is long enough to count as purposeful: its
+ * trimmed text must reach MIN_PURPOSEFUL_LENGTH characters. Length is used as
+ * a proxy for depth: a partner that cites no evidence, a combatant that names
+ * no specific risks, or an architect that names no subsystems will typically
+ * fall under the threshold. A missing value is treated as the empty string.
+ *
+ * Purpose: enforce depth contracts, not just presence.
+ * Consumer: inspectSlicePlanMarkdown (advisory warnings, not hard blocks).
+ */
+function isPurposefulReviewBody(value) {
+    return (value ?? "").trim().length >= MIN_PURPOSEFUL_LENGTH;
+}
 function normalizePlanningMeetingRoute(value) {
    const firstLine = (value ?? "")
        .split(/\r?\n/)
@ -86,10 +111,16 @@ export function inspectSlicePlanMarkdown(content) {
    const architect = extractSubsection(adversarialSection, "Architect Review");
    if (!isMeaningfulReviewBody(partner))
        issues.push("missing partner review");
+    else if (!isPurposefulReviewBody(partner))
+        issues.push("shallow partner review — cite specific evidence (file paths, test gaps, prior learnings)");
    if (!isMeaningfulReviewBody(combatant))
        issues.push("missing combatant review");
+    else if (!isPurposefulReviewBody(combatant))
+        issues.push("shallow combatant review — name specific risks with concrete failure scenarios");
    if (!isMeaningfulReviewBody(architect))
        issues.push("missing architect review");
+    else if (!isPurposefulReviewBody(architect))
+        issues.push("shallow architect review — name affected subsystems and coupling points");
    const planningMeeting = extractSection(content, "Planning Meeting");
    if (planningMeeting) {
        const trigger = extractSubsection(planningMeeting, "Trigger");
--- a/src/resources/extensions/sf/prompts/discuss-headless.md
+++ b/src/resources/extensions/sf/prompts/discuss-headless.md
@ -194,14 +194,7 @@ For multi-milestone projects, requirements should span the full vision. Requirem

 ## PM Strategy Memory

-Before writing milestone artifacts, write or update `.sf/PM-STRATEGY.md` with your analysis so far:
- **Diagnosis**: the core challenge (Rumelt)
- **Opportunity Map**: table of top opportunities with RICE scores and cannonball/lead-bullet classification
- **Jobs Analysis**: whose functional/emotional jobs this product serves
- **Guiding Policies**: principles governing decisions (e.g. "tests before features")
- **What Was Deferred**: explicitly out-of-scope items and why
-
-This file is the project's product strategy memory. Future agents read it to understand what's been decided strategically. **Write it even if brief — a short entry is better than none.**
+Research findings that shaped planning decisions are saved via `sf_summary_save` with `artifact_type: "RESEARCH"`. The orchestrator persists them to both DB and disk. Do not create separate strategy files in `.sf/`.

 ## Scope Assessment

@ -233,7 +226,7 @@ In a single pass:
 Preserve the specification's exact terminology, emphasis, and specific framing. Do not paraphrase domain-specific language into generics. If the spec said "craft feel," write "craft feel" — not "high-quality user experience." The context file is downstream agents' only window into this conversation — flattening specifics into generics loses the signal that shaped every decision.

 4. If `depth_verification_{{milestoneId}}_confirm` was confirmed, write `{{contextPath}}` — use the **Context** output template below. Preserve key risks, unknowns, existing codebase constraints, integration points, and relevant requirements surfaced during research. Include an "Assumptions" section documenting every judgment call.
-5. If depth verification was not confirmed, write `.sf/milestones/{{milestoneId}}/{{milestoneId}}-CONTEXT-DRAFT.md` instead. Include the project-knowledge evidence, confidence level, assumptions, open questions, and what must be researched next. Do **not** call `sf_plan_milestone`. End with: "Milestone {{milestoneId}} drafted for discussion."
+5. If depth verification was not confirmed, call `sf_summary_save` with `artifact_type: "CONTEXT-DRAFT"` and the draft content as `content` — the tool writes `.sf/milestones/{{milestoneId}}/{{milestoneId}}-CONTEXT-DRAFT.md` to disk. Include the project-knowledge evidence, confidence level, assumptions, open questions, and what must be researched next. Do **not** call `sf_plan_milestone`. End with: "Milestone {{milestoneId}} drafted for discussion."
 6. Only after confirmed final context, call `sf_plan_milestone` to create the roadmap. Decompose into demoable vertical slices with risk, depends, demo sentences, proof strategy, verification classes, milestone definition of done, requirement coverage, and a boundary map. If the milestone crosses multiple runtime boundaries, include an explicit final integration slice that proves the assembled system works end-to-end in a real environment. Use the **Roadmap** output template below to structure the tool call parameters.
 7. For each architectural or pattern decision, call `sf_decision_save` — the tool auto-assigns IDs and regenerates `.sf/DECISIONS.md` automatically.
 8. {{commitInstruction}}
@ -302,7 +295,7 @@ After deciding each milestone's readiness, immediately write or update `.sf/DISC
 }
 ```

-Write this file AFTER each gate decision, not just at the end. Update `gates_completed` incrementally. The system reads this file and BLOCKS auto-start if `gates_completed < total`.
+Write this file AFTER each gate decision, not just at the end. Update `gates_completed` incrementally. The system reads this file and BLOCKS auto-start if `gates_completed < total`. *(This is transient session state, not a tracked deliverable — it lives in `.sf/` during the discussion and is cleaned up afterward.)*

 For single-milestone projects, do NOT write this file.

--- a/src/resources/extensions/sf/prompts/discuss.md
+++ b/src/resources/extensions/sf/prompts/discuss.md
@ -404,7 +404,7 @@ Each context file (full or draft) should be rich enough that a future agent enco

 #### Milestone Gate Tracking (MANDATORY for multi-milestone)

-After EVERY Phase 3 gate decision, immediately write or update `.sf/DISCUSSION-MANIFEST.json` with the cumulative state. This file is mechanically validated by the system before auto-mode starts — if gates are incomplete, auto-mode will NOT start.
+After EVERY Phase 3 gate decision, immediately write or update `.sf/DISCUSSION-MANIFEST.json` with the cumulative state. This file is mechanically validated by the system before auto-mode starts — if gates are incomplete, auto-mode will NOT start. *(This is transient session state, not a tracked deliverable — it lives in `.sf/` during the discussion and is cleaned up afterward.)*

 ```json
 {
--- a/src/resources/extensions/sf/prompts/doctor-heal.md
+++ b/src/resources/extensions/sf/prompts/doctor-heal.md
@ -7,7 +7,7 @@ Rules:
 2. Read before edit.
 3. Prefer fixing authoritative artifacts over masking warnings.
 4. For missing summaries or UAT files, generate the real artifact from existing slice/task context when possible — do not leave placeholders if you can reconstruct the real content.
-5. For a missing milestone `CONTEXT.md` when the milestone is already past `pre-planning` (phase is `executing`, `summarizing`, `validating-milestone`, or `completing-milestone`): the artifact was skipped during bootstrap and must be reconstructed before execution can resume. Read `PROJECT.md`, `REQUIREMENTS.md`, the milestone's `ROADMAP.md`, and any slice-level context on disk, then write `.sf/milestones/{{milestoneId}}/{{milestoneId}}-CONTEXT.md` with the real context. Do not leave a stub — the plan gate will reject it on the next cycle.
+5. For a missing milestone `CONTEXT.md` when the milestone is already past `pre-planning` (phase is `executing`, `summarizing`, `validating-milestone`, or `completing-milestone`): the artifact was skipped during bootstrap and must be reconstructed before execution can resume. Read `PROJECT.md`, `REQUIREMENTS.md`, the milestone's `ROADMAP.md`, and any slice-level context on disk, then call `sf_summary_save` with `artifact_type: "CONTEXT"` and the reconstructed context as `content` — the tool writes `.sf/milestones/{{milestoneId}}/{{milestoneId}}-CONTEXT.md` to disk and persists it to the DB. Do not leave a stub — the plan gate will reject it on the next cycle.
 6. After each repair cluster, verify the relevant invariant directly from disk.
 7. When done, rerun `/sf doctor {{doctorCommandSuffix}}` mentally by ensuring the remaining issue set for this scope is reduced or cleared.
 8. Do NOT query `.sf/sf.db` directly via `sqlite3` or `node -e require('better-sqlite3')` — use `sf_milestone_status` to inspect DB state. Direct access bypasses the WAL connection owned by the engine and can corrupt in-flight writes.
--- a/src/resources/extensions/sf/prompts/guided-research-project.md
+++ b/src/resources/extensions/sf/prompts/guided-research-project.md
@ -47,6 +47,8 @@ Prompt:
 > - **Open questions** (anything where the user's choice will materially shape the architecture)
 >
 > Use `resolve_library` / `get_library_docs` for library docs. Use web search sparingly (2–3 queries). Cite sources where versions matter. Mark confidence per recommendation: high / medium / low.
+>
+> *(Note: `.sf/research/` files are transient working state used during planning. They inform the roadmap but are not promoted to docs/.)*

 ### Task 2 — Features research → `.sf/research/FEATURES.md`

--- a/src/resources/extensions/sf/prompts/plan-milestone.md
+++ b/src/resources/extensions/sf/prompts/plan-milestone.md
@ -6,13 +6,13 @@ You are executing SF auto-mode.

 Your working directory is `{{workingDirectory}}`. All file reads, writes, and shell commands MUST operate relative to this directory. Do NOT `cd` to any other directory.

-All relevant context has been preloaded below — start working immediately without re-reading these files.
+All relevant context has been preloaded below - start working immediately without re-reading these files.

 {{inlinedContext}}

 ## Your Role in the Pipeline

-You are the first deep look at this milestone. You have full tool access — explore the codebase, look up docs, investigate technology choices. Your job is to understand the landscape and then strategically decompose the work into demoable slices.
+You are the first deep look at this milestone. You have full tool access - explore the codebase, look up docs, investigate technology choices. Your job is to understand the landscape and then strategically decompose the work into demoable slices.

 After you finish, each slice goes through its own plan → execute cycle. Slice planners decompose into tasks. Executors build each task. Your roadmap sets the strategic frame for all of them.

@ -20,9 +20,9 @@ After you finish, each slice goes through its own plan → execute cycle. Slice

 Read the milestone title, the user's stated intent, and any inlined research above. Ask: does this milestone span unfamiliar territory, multiple risky integrations, or genuinely ambiguous scope? Or is it a focused well-understood feature decomposable into a handful of standard slices?

- **Deep planning** — multi-area milestone, novel architecture, multiple viable decomposition strategies. Explore broadly, write the full strategic frame, plan multiple slices with risk-driven ordering and explicit cross-slice dependencies.
- **Targeted planning** — known territory but new to this codebase, or moderate complexity. Explore the relevant areas, plan 2–4 slices, write the requirement-mapping but skip the long architectural narration.
- **Light planning** — well-scoped feature using established patterns. Plan **1–2 slices** with clear demos. Don't synthesize 4 slices to fill a template; use the Single-Slice Fast Path below if applicable. Skip Risks/Architecture sections that don't apply.
+- **Deep planning** - multi-area milestone, novel architecture, multiple viable decomposition strategies. Explore broadly, write the full strategic frame, plan multiple slices with risk-driven ordering and explicit cross-slice dependencies.
+- **Targeted planning** - known territory but new to this codebase, or moderate complexity. Explore the relevant areas, plan 2-4 slices, write the requirement-mapping but skip the long architectural narration.
+- **Light planning** - well-scoped feature using established patterns. Plan **1-2 slices** with clear demos. Don't synthesize 4 slices to fill a template; use the Single-Slice Fast Path below if applicable. Skip Risks/Architecture sections that don't apply.

 An honest "this milestone is one slice, here's the demo" beats a fabricated 5-slice decomposition for work that doesn't have it.

@ -31,7 +31,7 @@ An honest "this milestone is one slice, here's the demo" beats a fabricated 5-sl
 Before decomposing, build your understanding:

 1. **Codebase exploration.** Use native `lsp` first for symbol lookup, references, and cross-file navigation. For small/familiar codebases, use `rg`, `find`, and targeted reads. For large or unfamiliar codebases, use `scout` to build a broad map efficiently before diving in.
-2. **Library docs — DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library. Fall back to `resolve_library` / `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. Context7 free tier is capped at 1000 req/month — spend those on cases DeepWiki can't cover. Skip both for libraries already used in this codebase.
+2. **Library docs - DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library. Fall back to `resolve_library` / `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. Context7 free tier is capped at 1000 req/month - spend those on cases DeepWiki can't cover. Skip both for libraries already used in this codebase.
 3. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}}
 4. **Requirements analysis.** If `.sf/REQUIREMENTS.md` exists, research against it. Identify which Active requirements are table stakes, likely omissions, overbuilt risks, or domain-standard behaviors.
 5. **Comparable systems and nuanced stakeholder scan.** Before locking the roadmap, research what similar products, OSS tools, and production teams do in this category. Surface table stakes, common failure modes, customer expectations, business constraints, and differentiators worth preserving.
@ -51,19 +51,22 @@ Before decomposing, build your understanding:

 If milestone research exists (inlined above), trust those findings and skip redundant exploration. If findings are significant and no research file exists yet, write `{{researchOutputPath}}`.

-Narrate your decomposition reasoning — why you're grouping work this way, what risks are driving the order, what verification strategy you're choosing and why. Use complete sentences rather than planner shorthand or fragmentary notes.
+Narrate your decomposition reasoning - why you're grouping work this way, what risks are driving the order, what verification strategy you're choosing and why. Use complete sentences rather than planner shorthand or fragmentary notes.

 Before you persist the roadmap, run a bounded **Vision Alignment Meeting** as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses:
- **Product Manager:** what is the real product move and what should the roadmap prove?
- **User Advocate:** what must matter for the user experience and trust surface?
- **Customer Panel:** multiple likely customer viewpoints, not a single flattened “user”.
- **Business:** wedge, retention, expansion path, or viability concerns.
- **Researcher:** comparable products, OSS tools, market expectations, DeepWiki/Context7 findings, and focused web research.
- **Delivery Lead:** smallest credible milestone sequence and scope cuts.
- **Partner:** strongest case for the roadmap.
- **Combatant:** why the roadmap is wrong, overbuilt, or solving the wrong thing.
- **Architect:** system-fit and sequencing synthesis.
- **Moderator:** weigh the claims after the meeting; do NOT majority-vote.
+
+Each role has a **purpose** (why it exists in the ceremony) and a **depth contract** (what a meaningful contribution must contain). A contribution that restates the roadmap title without evidence, specific risks, or concrete subsystem references is not meaningful - it is a rubber stamp.
+
+- **Product Manager:** what is the real product move and what should the roadmap prove? *Must identify the specific user pain point, the proposed capability, and what "shipped" looks like end-to-end.*
+- **User Advocate:** what must matter for the user experience and trust surface? *Must name at least one concrete user scenario where the current system fails or frustrates.*
+- **Customer Panel:** multiple likely customer viewpoints, not a single flattened "user". *Must present at least 2 distinct customer perspectives with different priorities or constraints.*
+- **Business:** wedge, retention, expansion path, or viability concerns. *Must name at least one specific failure mode this milestone addresses and the cost of not addressing it.*
+- **Researcher:** comparable products, OSS tools, market expectations, DeepWiki/Context7 findings, and focused web research. *Must cite at least one external source or comparable system by name with a specific finding.*
+- **Delivery Lead:** smallest credible milestone sequence and scope cuts. *Must state the critical path with task counts and wall-clock estimates, and identify what could be cut if timeboxed.*
+- **Partner:** strongest case for the roadmap. *Must cite specific evidence (source files, prior milestones, production incidents, or research findings) and name the single highest-impact slice with reasoning.*
+- **Combatant:** why the roadmap is wrong, overbuilt, or solving the wrong thing. *Must name at least 2 specific risks with concrete failure scenarios (not vague "might be hard"), and propose a mitigation or alternative for each.*
+- **Architect:** system-fit and sequencing synthesis. *Must name the specific subsystems affected, identify coupling points between them, and explain why the proposed dependency ordering is correct (or propose a better one).*
+- **Moderator:** weigh the claims after the meeting; do NOT majority-vote. *Must make an explicit decision that resolves at least one disagreement between participants, and state what was approved vs what was modified or deferred.*

 Every participant may propose roadmap additions, removals, sequencing changes, or deferrals. The roadmap is driven by the **weighted synthesis**, not raw vote count. Record:
 - `trigger`
@ -78,13 +81,14 @@ If confidence stays low after research and weighted discussion, assume the miles
 Then:
 1. Use the **Roadmap** output template from the inlined context above
 2. {{skillActivation}}
-3. Create the roadmap: decompose into demoable vertical slices — as many as the work genuinely needs, no more. A simple feature might be 1 slice. Don't decompose for decomposition's sake.
+3. Create the roadmap: decompose into demoable vertical slices - as many as the work genuinely needs, no more. A simple feature might be 1 slice. Don't decompose for decomposition's sake.
 4. Order by risk (high-risk first)
-5. Call `sf_plan_milestone` to persist the milestone planning fields, slice rows, and **horizontal checklist** in the DB-backed planning path. Do **not** write `{{outputPath}}`, `ROADMAP.md`, or other planning artifacts manually — the planning tool owns roadmap rendering and persistence.
-6. If planning produced structural decisions (e.g. slice ordering rationale, technology choices, scope exclusions), call `sf_decision_save` for each decision — the tool auto-assigns IDs and regenerates `.sf/DECISIONS.md` automatically.
+5. Call `sf_plan_milestone` to persist the milestone planning fields, slice rows, and **horizontal checklist** in the DB-backed planning path. Do **not** write `{{outputPath}}`, `ROADMAP.md`, or other planning artifacts manually - the planning tool owns roadmap rendering and persistence.
+6. If planning produced structural decisions (e.g. slice ordering rationale, technology choices, scope exclusions), call `sf_decision_save` for each decision - the tool auto-assigns IDs and regenerates `.sf/DECISIONS.md` automatically.

 ## Requirement Mapping Rules

+- **BUILD_PLAN reference required (D015).** Every new milestone must cite which BUILD_PLAN.md tier/item it implements. Include "BUILD_PLAN.md Tier X.Y" or "BUILD_PLAN §Tier X" in the vision text. If the milestone is not from BUILD_PLAN (e.g. a bugfix milestone), explicitly state "Not from BUILD_PLAN — <reason>" in the vision. Existing milestones without this reference are grandfathered; new ones must have it.
 - Every Active requirement relevant to this milestone must be in one of these states by the end of planning: mapped to a slice, explicitly deferred, blocked with reason, or moved out of scope.
 - Each requirement should have one accountable primary owner and may have supporting slices.
 - Product-facing milestones should cover launchability, primary user loop, continuity, and failure visibility when relevant.
@ -96,30 +100,30 @@ Then:

 Apply these when decomposing and ordering slices:

- **Risk-first means proof-first.** The earliest slices should prove the hardest thing works by shipping the real feature through the uncertain path. If auth is the risk, the first slice ships a real login page with real session handling that a user can actually use — not a CLI command that returns "authenticated: true". Proof is the shipped feature working. There is no separate "proof" artifact. Do not plan spikes, proof-of-concept slices, or validation-only slices — the proof is the real feature, built through the risky path.
- **Every slice is vertical, demoable, and shippable.** Every slice ships real, user-facing functionality. "Demoable" means the intended user can exercise the capability through its real interface — for a web app that's the UI, for a CLI tool that's the terminal, for an API that's a consuming client or curl. The test is: can someone *use* it, not just *assert* it passes. A slice that only proves something but doesn't ship real working code is not a slice — restructure it.
+- **Risk-first means proof-first.** The earliest slices should prove the hardest thing works by shipping the real feature through the uncertain path. If auth is the risk, the first slice ships a real login page with real session handling that a user can actually use - not a CLI command that returns "authenticated: true". Proof is the shipped feature working. There is no separate "proof" artifact. Do not plan spikes, proof-of-concept slices, or validation-only slices - the proof is the real feature, built through the risky path.
+- **Every slice is vertical, demoable, and shippable.** Every slice ships real, user-facing functionality. "Demoable" means the intended user can exercise the capability through its real interface - for a web app that's the UI, for a CLI tool that's the terminal, for an API that's a consuming client or curl. The test is: can someone *use* it, not just *assert* it passes. A slice that only proves something but doesn't ship real working code is not a slice - restructure it.
 - **Brownfield bias.** When planning against an existing codebase, ground slices in existing modules, conventions, and seams. Prefer extending real patterns over inventing new ones.
- **Each slice should establish something downstream slices can depend on.** Think about what stable surface this slice creates for later work — an API, a data shape, a proven integration path.
- **Avoid foundation-only slices.** If a slice doesn't produce something demoable end-to-end, it's probably a layer, not a vertical slice. Restructure it. Exception: if the infrastructure *is* the product surface (a new protocol, extension API, or provider interface), the slice is vertical by definition — the downstream consumer is the demo.
- **Verification-first.** When planning slices, know what "done" looks like before detailing implementation. Each slice's demo line should describe concrete, verifiable evidence — not vague "it works" claims.
+- **Each slice should establish something downstream slices can depend on.** Think about what stable surface this slice creates for later work - an API, a data shape, a proven integration path.
+- **Avoid foundation-only slices.** If a slice doesn't produce something demoable end-to-end, it's probably a layer, not a vertical slice. Restructure it. Exception: if the infrastructure *is* the product surface (a new protocol, extension API, or provider interface), the slice is vertical by definition - the downstream consumer is the demo.
+- **Verification-first.** When planning slices, know what "done" looks like before detailing implementation. Each slice's demo line should describe concrete, verifiable evidence - not vague "it works" claims.
 - **Plan for integrated reality, not just local proof.** Distinguish contract proof from live integration proof. If the milestone involves multiple runtime boundaries, one slice must explicitly prove the assembled system through the real entrypoint or runtime path.
 - **Truthful demo lines only.** If a slice is proven by fixtures or tests only, say so. Do not phrase harness-level proof as if the user can already perform the live end-to-end behavior unless that has actually been exercised.
 - **Completion must imply capability.** If every slice in this roadmap were completed exactly as written, the milestone's promised outcome should actually work at the proof level claimed. Do not write slices that can all be checked off while the user-visible capability still does not exist.
 - **Don't invent risks.** If the project is straightforward, skip the proof strategy and just ship value in smart order. Not everything has major unknowns.
- **Ship features, not proofs.** A completed slice should leave the product in a state where the new capability is actually usable through its real interface. A login flow slice ends with a working login page, not a middleware function. An API slice ends with endpoints that return real data from a real store, not hardcoded fixtures. A dashboard slice ends with a real dashboard rendering real data, not a component that renders mock props. If a slice can't ship the real thing yet because a dependency isn't built, it should ship with realistic stubs that are clearly marked for replacement — but the user-facing surface must be real.
- **Dependency format is comma-separated, never range syntax.** Write `depends:[S01,S02,S03]` — not `depends:[S01-S03]`. Range syntax is not a valid format and permanently blocks the slice.
- **Ambition matches the milestone.** The number and depth of slices should match the milestone's ambition. A milestone promising "core platform with auth, data model, and primary user loop" should have enough slices to actually deliver all three as working features — not two proof-of-concept slices and a note that "the rest will come in the next milestone." If the milestone's context promises an outcome, the roadmap must deliver it.
- **Right-size the decomposition.** Match slice count to actual complexity. If the work is small enough to build and verify in one pass, it's one slice — don't split it into three just because you can identify sub-steps. Multiple requirements can share a single slice. Conversely, don't cram genuinely independent capabilities into one slice just to keep the count low. Let the work dictate the structure.
+- **Ship features, not proofs.** A completed slice should leave the product in a state where the new capability is actually usable through its real interface. A login flow slice ends with a working login page, not a middleware function. An API slice ends with endpoints that return real data from a real store, not hardcoded fixtures. A dashboard slice ends with a real dashboard rendering real data, not a component that renders mock props. If a slice can't ship the real thing yet because a dependency isn't built, it should ship with realistic stubs that are clearly marked for replacement - but the user-facing surface must be real.
+- **Dependency format is comma-separated, never range syntax.** Write `depends:[S01,S02,S03]` - not `depends:[S01-S03]`. Range syntax is not a valid format and permanently blocks the slice.
+- **Ambition matches the milestone.** The number and depth of slices should match the milestone's ambition. A milestone promising "core platform with auth, data model, and primary user loop" should have enough slices to actually deliver all three as working features - not two proof-of-concept slices and a note that "the rest will come in the next milestone." If the milestone's context promises an outcome, the roadmap must deliver it.
+- **Right-size the decomposition.** Match slice count to actual complexity. If the work is small enough to build and verify in one pass, it's one slice - don't split it into three just because you can identify sub-steps. Multiple requirements can share a single slice. Conversely, don't cram genuinely independent capabilities into one slice just to keep the count low. Let the work dictate the structure.

 ## Single-Slice Fast Path

-If the roadmap has only one slice, also plan the slice and its tasks inline during this unit — don't leave them for a separate planning session.
+If the roadmap has only one slice, also plan the slice and its tasks inline during this unit - don't leave them for a separate planning session.

 1. After `sf_plan_milestone` returns, immediately call `sf_plan_slice` for S01 with the full task breakdown
 2. Use the **Slice Plan** and **Task Plan** output templates from the inlined context above to structure the tool call parameters
-3. For simple slices, keep the plan lean — omit Proof Level, Integration Closure, and Observability sections if they would all be "none". Executable verification commands are sufficient.
+3. For simple slices, keep the plan lean - omit Proof Level, Integration Closure, and Observability sections if they would all be "none". Executable verification commands are sufficient.

-Do **not** write plan files manually — use the DB-backed tools so state stays consistent.
+Do **not** write plan files manually - use the DB-backed tools so state stays consistent.

 ## Secret Forecasting

@ -129,17 +133,17 @@ If this milestone requires any external API keys or secrets:

 1. Use the **Secrets Manifest** output template from the inlined context above for the expected format
 2. Write `{{secretsOutputPath}}` listing every predicted secret as an H3 section with:
-   - **Service** — the external service name
-   - **Dashboard** — direct URL to the console/dashboard page where the key is created (not a generic homepage)
-   - **Format hint** — what the key looks like (e.g. `sk-...`, `ghp_...`, 40-char hex, UUID)
-   - **Status** — always `pending` during planning
-   - **Destination** — `dotenv`, `vercel`, or `convex` depending on where the key will be consumed
+   - **Service** - the external service name
+   - **Dashboard** - direct URL to the console/dashboard page where the key is created (not a generic homepage)
+   - **Format hint** - what the key looks like (e.g. `sk-...`, `ghp_...`, 40-char hex, UUID)
+   - **Status** - always `pending` during planning
+   - **Destination** - `dotenv`, `vercel`, or `convex` depending on where the key will be consumed
   - Numbered step-by-step guidance for obtaining the key (navigate to dashboard → create project → generate key → copy)

-If this milestone does not require any external API keys or secrets, skip this step entirely — do not create an empty manifest.
+If this milestone does not require any external API keys or secrets, skip this step entirely - do not create an empty manifest.

 ### Report sf-internal observations

-If during this unit you observe sf-the-tool friction — ambiguous prompts, missing context, misleading instructions, surprising behavior, prompt-quality issues, or improvement ideas — file them via `sf_self_report` before sealing the unit. This is the only way these observations reach forge's backlog and get triaged. Over-reporting is preferred to under-reporting; dedup happens later. Do NOT use this to file bugs in the user's project; only sf-the-tool itself. Do NOT autonomously act on or fix existing backlog entries — your scope is your unit.
+If during this unit you observe sf-the-tool friction - ambiguous prompts, missing context, misleading instructions, surprising behavior, prompt-quality issues, or improvement ideas - file them via `sf_self_report` before sealing the unit. This is the only way these observations reach forge's backlog and get triaged. Over-reporting is preferred to under-reporting; dedup happens later. Do NOT use this to file bugs in the user's project; only sf-the-tool itself. Do NOT autonomously act on or fix existing backlog entries - your scope is your unit.

 When done, say: "Milestone {{milestoneId}} planned."
--- a/src/resources/extensions/sf/prompts/plan-slice.md
+++ b/src/resources/extensions/sf/prompts/plan-slice.md
@ -88,11 +88,11 @@ Then:
   - **Inputs and Expected Output must list concrete backtick-wrapped file paths** (e.g. `` `src/types.ts` ``). These are machine-parsed to derive task dependencies — vague prose without paths breaks parallel execution. Every task must have at least one output file path.
   - Observability Impact section **only if the task touches runtime boundaries, async flows, or error paths** — omit it otherwise
   - Swarm guidance when relevant: if a task can safely split into 2-3 independent execution shards, say so in the task plan's Steps or Description with explicit file/directory ownership per shard. If the work touches shared interfaces, lockfiles, migrations, generated artifacts, or sequence-dependent code, state that it should execute single-agent.
-7. **Run adversarial review before persisting the plan.** Record all three lenses in the `adversarialReview` payload you send to `sf_plan_slice`:
-   - **Partner:** strongest case for why this plan is sufficient, grounded in the actual code and evidence you explored.
-   - **Combatant:** attack the premise first. Name at least 3 plausible alternative root causes, failure modes, or plan-shape mistakes, plus the cheapest falsifier for each.
-   - **Architect:** after reading partner + combatant, state the system-fit risk, sequencing risk, or missing integration proof.
-   - If any of the three reviews expose a problem, change the plan before persisting it. Do not treat the review as commentary-only.
+7. **Run adversarial review before persisting the plan.** Record all three lenses in the `adversarialReview` payload you send to `sf_plan_slice`. Each role has a purpose and depth contract — a review that agrees without raising specific objections is a rubber stamp, not a review.
+   - **Partner:** strongest case for why this plan is sufficient. *Must cite specific evidence from the code you explored — file paths, function names, test coverage gaps, or prior slice learnings. Not just "the plan looks good."*
+   - **Combatant:** attack the premise first. *Must name at least 3 plausible alternative root causes, failure modes, or plan-shape mistakes. Each must come with a concrete scenario (not "might fail") and its cheapest falsifier.*
+   - **Architect:** after reading partner + combatant, state the system-fit risk, sequencing risk, or missing integration proof. *Must name the specific subsystems and coupling points this plan touches, and identify at least one integration seam that could break.*
+   - If any of the three reviews expose a problem, change the plan before persisting it. Do not treat the review as commentary-only. If the combatant only agrees, you did not push hard enough.
 8. **Always record a bounded planning meeting before persistence.** This is deeper for standard/heavy slices, low-confidence plans, multiple plausible approaches, or automatic feature planning where PM framing matters. For simple slices, keep each role to one concise line explaining why the slice is simple. Record it in the required `planningMeeting` payload:
   - **Trigger:** why the meeting was needed
   - **Product Manager:** diagnosis, user value, scope cut, and what would count as a useful increment
--- a/src/resources/extensions/sf/prompts/research-milestone.md
+++ b/src/resources/extensions/sf/prompts/research-milestone.md
@ -62,9 +62,7 @@ Research the codebase and relevant technologies. Narrate key findings and surpri
 **You MUST call `sf_summary_save` with the research content before finishing.**
 After `sf_summary_save` succeeds, do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool. The orchestrator dispatches planner units after research.

-After saving research, update `.sf/PM-STRATEGY.md` — append new findings to the Opportunity Map and Guiding Policies sections. If the file doesn't exist yet, create it. This is the project's persistent PM memory — research findings that shaped planning decisions belong here.
-
-### Report sf-internal observations
+When done, say only: "Milestone {{milestoneId}} researched."

 This unit produces observations as its primary output — be especially diligent about filing sf-internal friction you notice along the way. If during this unit you observe sf-the-tool friction — ambiguous prompts, missing context, misleading instructions, surprising behavior, prompt-quality issues, or improvement ideas — file them via `sf_self_report` before sealing the unit. This is the only way these observations reach forge's backlog and get triaged. Over-reporting is preferred to under-reporting; dedup happens later. Do NOT use this to file bugs in the user's project; only sf-the-tool itself. Do NOT autonomously act on or fix existing backlog entries — your scope is your unit.

--- a/src/resources/extensions/sf/tests/milestone-quality-ceremony.test.ts
+++ b/src/resources/extensions/sf/tests/milestone-quality-ceremony.test.ts
@ -0,0 +1,213 @@
+import { describe, expect, it } from "vitest";
+
+/**
+ * Tests for milestone-quality.js vision meeting depth validation.
+ *
+ * Purpose: verify that vision meeting validators enforce not just presence but
+ * purposeful depth — rubber-stamp contributions must trigger shallow warnings.
+ *
+ * Consumer: milestone planning pipeline (plan-milestone tool, roadmap inspection).
+ */
+
+/** Lazily imports the module under test; each test awaits this and destructures the exports it needs. */
+const importMilestoneQuality = async () =>
+  import("../milestone-quality.js");
+
+describe("milestone-quality vision meeting depth", () => {
+  const makeValidMeeting = () => ({
+    trigger: "Comparative research identified eight high-signal improvements for the autonomous workflow engine",
+    pm: "Eight improvements from two production systems at scale. Doctor-first catches drift, adversarial verification catches bugs command-exit misses.",
+    userAdvocate: "Users experience wasted sessions on stale state, noisy memory context, and unpredictable autonomous execution — each improvement maps to a specific pain point.",
+    customerPanel: "Power users need reliability. Casual users need quality output. DevOps users need CI integration. Each group has different priorities.",
+    business: "Each finding addresses a real failure mode observed in production. Adversarial verification catches the 'tests pass but bug persists' class.",
+    researcher: "Both Keel (spoke-sh/keel) and Claude Code (Anthropic) are open-source. BUILD_PLAN.md Tier 2.1 references verified.",
+    deliveryLead: "Foundation tier first (S01), then independent improvements in parallel, then architecture tier (S04), then dependent tier last.",
+    partner: "Claude Code synthesis gate from coordinatorMode.ts is the single most impactful change. Keel doctor-first catches drift that wastes dispatch cycles.",
+    combatant: "Risk 1: S04 state machine refactor always takes longer — auto-dispatch.ts is most coupled. Risk 2: adversarial verification 2-3x token cost. Risk 3: sub-tasks are schema migration.",
+    architect: "Three dependency tiers. Foundation (S01-S03, S05, S08) independent. Architecture (S04) refactors auto-dispatch.ts. Dependent (S06, S07) built on S04.",
+    moderator: "Approve full scope. S04 approved with latency budget. S08 needs complexity threshold. Monitor S04 closely — reassess at 3 dispatch cycles.",
+    weightedSynthesis: "Foundation ships first, architecture second, dependent last. Critical path: S01 → S04 → S06/S07. Wall-clock: 5-8 dispatch cycles.",
+    confidenceByArea: "doctor: 0.9 | quality: 0.85 | battery: 0.7 | two-lane: 0.65 | hard-cutover: 0.9 | sub-tasks: 0.7 | fork-spawn: 0.75 | adversarial: 0.8",
+    recommendedRoute: "planning",
+  });
+
+  describe("hasStructuredVisionAlignmentMeeting", () => {
+    it("accepts_valid_meeting", async () => {
+      const { hasStructuredVisionAlignmentMeeting } = await importMilestoneQuality();
+      expect(hasStructuredVisionAlignmentMeeting(makeValidMeeting())).toBe(true);
+    });
+
+    it("rejects_null_meeting", async () => {
+      const { hasStructuredVisionAlignmentMeeting } = await importMilestoneQuality();
+      expect(hasStructuredVisionAlignmentMeeting(null)).toBe(false);
+    });
+
+    it("rejects_meeting_with_empty_fields", async () => {
+      const { hasStructuredVisionAlignmentMeeting } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), partner: "" };
+      expect(hasStructuredVisionAlignmentMeeting(meeting)).toBe(false);
+    });
+
+    it("rejects_meeting_with_placeholder_fields", async () => {
+      const { hasStructuredVisionAlignmentMeeting } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), combatant: "not provided." };
+      expect(hasStructuredVisionAlignmentMeeting(meeting)).toBe(false);
+    });
+
+    it("rejects_invalid_route", async () => {
+      const { hasStructuredVisionAlignmentMeeting } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), recommendedRoute: "executing" };
+      expect(hasStructuredVisionAlignmentMeeting(meeting)).toBe(false);
+    });
+
+    it("accepts_researching_route", async () => {
+      const { hasStructuredVisionAlignmentMeeting } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), recommendedRoute: "researching" };
+      expect(hasStructuredVisionAlignmentMeeting(meeting)).toBe(true);
+    });
+  });
+
+  describe("inspectMilestoneRoadmapMarkdown depth warnings", () => {
+    /** Renders a roadmap document: title, one "###" subsection per vision-meeting field, then a slice overview heading. */
+    const makeRoadmapMarkdown = (meeting: ReturnType<typeof makeValidMeeting>): string =>
+      `# M015: Test Milestone\n\n## Vision Alignment Meeting
+
+### Trigger
+${meeting.trigger}
+
+### Product Manager
+${meeting.pm}
+
+### User Advocate
+${meeting.userAdvocate}
+
+### Customer Panel
+${meeting.customerPanel}
+
+### Business
+${meeting.business}
+
+### Researcher
+${meeting.researcher}
+
+### Delivery Lead
+${meeting.deliveryLead}
+
+### Partner
+${meeting.partner}
+
+### Combatant
+${meeting.combatant}
+
+### Architect
+${meeting.architect}
+
+### Moderator
+${meeting.moderator}
+
+### Weighted Synthesis
+${meeting.weightedSynthesis}
+
+### Confidence By Area
+${meeting.confidenceByArea}
+
+### Recommended Route
+${meeting.recommendedRoute}
+
+## Slice Overview\n`;
+
+    it("flags_shallow_partner", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), partner: "Looks good" };
+      const result = inspectMilestoneRoadmapMarkdown(makeRoadmapMarkdown(meeting));
+      expect(result.issues).toContain(
+        "shallow partner review — cite specific evidence (source files, prior milestones, production incidents)",
+      );
+    });
+
+    it("flags_shallow_combatant", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), combatant: "Seems fine" };
+      const result = inspectMilestoneRoadmapMarkdown(makeRoadmapMarkdown(meeting));
+      expect(result.issues).toContain(
+        "shallow combatant review — name specific risks with concrete failure scenarios",
+      );
+    });
+
+    it("flags_shallow_architect", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), architect: "Agree" };
+      const result = inspectMilestoneRoadmapMarkdown(makeRoadmapMarkdown(meeting));
+      expect(result.issues).toContain(
+        "shallow architect review — name affected subsystems, coupling points, and dependency rationale",
+      );
+    });
+
+    it("flags_shallow_researcher", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), researcher: "Looks reasonable" };
+      const result = inspectMilestoneRoadmapMarkdown(makeRoadmapMarkdown(meeting));
+      expect(result.issues).toContain(
+        "shallow researcher review — cite at least one external source or comparable system",
+      );
+    });
+
+    it("flags_shallow_weighted_synthesis", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      const meeting = { ...makeValidMeeting(), weightedSynthesis: "Approved" };
+      const result = inspectMilestoneRoadmapMarkdown(makeRoadmapMarkdown(meeting));
+      expect(result.issues).toContain(
+        "shallow weighted synthesis — must identify the strongest additions, cuts, and sequencing changes",
+      );
+    });
+
+    it("passes_with_purposeful_content", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      const result = inspectMilestoneRoadmapMarkdown(makeRoadmapMarkdown(makeValidMeeting()));
+      const shallowIssues = result.issues.filter((i) => i.startsWith("shallow"));
+      expect(shallowIssues).toHaveLength(0);
+    });
+
+    it("still_flags_missing_meeting_section", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      const content = "# M015: Test\n\nNo meeting here.\n";
+      const result = inspectMilestoneRoadmapMarkdown(content);
+      expect(result.issues).toContain("missing vision alignment meeting");
+    });
+
+    it("flags_missing_BUILD_PLAN_reference", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      // Content with no BUILD_PLAN reference anywhere
+      const meeting = makeValidMeeting();
+      const content = makeRoadmapMarkdown(meeting).replaceAll("BUILD_PLAN", "XBLDPLAN");
+      const result = inspectMilestoneRoadmapMarkdown(content);
+      expect(result.issues).toContain(
+        "missing BUILD_PLAN.md reference — new milestones should cite which BUILD_PLAN tier/item they implement (D015)",
+      );
+    });
+
+    it("passes_with_BUILD_PLAN_reference_in_vision", async () => {
+      const { inspectMilestoneRoadmapMarkdown } = await importMilestoneQuality();
+      // M012-style content that references BUILD_PLAN (makeValidMeeting has it in researcher)
+      const meeting = makeValidMeeting();
+      const content = makeRoadmapMarkdown(meeting);
+      const result = inspectMilestoneRoadmapMarkdown(content);
+      const buildPlanIssues = result.issues.filter((i) => i.includes("BUILD_PLAN.md reference"));
+      expect(buildPlanIssues).toHaveLength(0);
+    });
+  });
+
+  describe("getVisionAlignmentBlockingIssue", () => {
+    it("returns_null_for_valid_meeting", async () => {
+      const { getVisionAlignmentBlockingIssue } = await importMilestoneQuality();
+      expect(getVisionAlignmentBlockingIssue(makeValidMeeting())).toBeNull();
+    });
+
+    it("returns_issue_for_missing_fields", async () => {
+      const { getVisionAlignmentBlockingIssue } = await importMilestoneQuality();
+      expect(getVisionAlignmentBlockingIssue(null)).toBe("missing vision alignment meeting");
+      const meeting = { ...makeValidMeeting(), trigger: "" };
+      expect(getVisionAlignmentBlockingIssue(meeting)).toBe("missing vision meeting trigger");
+    });
+  });
+});
--- a/src/resources/extensions/sf/tests/plan-quality-ceremony.test.ts
+++ b/src/resources/extensions/sf/tests/plan-quality-ceremony.test.ts
@ -0,0 +1,157 @@
+import { describe, expect, it } from "vitest";
+
+/**
+ * Tests for plan-quality.js ceremony depth validation.
+ *
+ * Purpose: verify that ceremony validators enforce not just presence but
+ * purposeful depth — rubber-stamp contributions like "This is fine" must
+ * trigger shallow warnings even though they pass presence checks.
+ *
+ * Consumer: slice planning pipeline (plan-slice tool, plan quality inspection).
+ */
+
+// Dynamic import defers loading until a test runs. NOTE(review): nothing here resets the module cache, so repeat calls presumably share one module instance.
+async function importPlanQuality() {
+  return import("../plan-quality.js");
+}
+
+describe("plan-quality ceremony depth", () => {
+  describe("hasCompleteAdversarialReview", () => {
+    it("rejects_missing_fields", async () => {
+      const { hasCompleteAdversarialReview } = await importPlanQuality();
+      expect(hasCompleteAdversarialReview(null)).toBe(false);
+      expect(hasCompleteAdversarialReview({})).toBe(false);
+      expect(hasCompleteAdversarialReview({ partner: "", combatant: "ok", architect: "ok" })).toBe(false);
+    });
+
+    it("accepts_present_fields_even_if_shallow", async () => {
+      const { hasCompleteAdversarialReview } = await importPlanQuality();
+      // Shallow but present — presence check passes
+      expect(hasCompleteAdversarialReview({
+        partner: "Looks good",
+        combatant: "Seems fine",
+        architect: "Agree",
+      })).toBe(true);
+    });
+
+    it("accepts_purposeful_fields", async () => {
+      const { hasCompleteAdversarialReview } = await importPlanQuality();
+      expect(hasCompleteAdversarialReview({
+        partner: "The plan correctly identifies the coupling between auto-dispatch.ts and phases.ts. File paths are specific and the approach is grounded in existing patterns.",
+        combatant: "Risk 1: state machine refactor will take longer due to coupling in auto-dispatch.ts. Risk 2: schema migration breaks existing parsers. Risk 3: synthesis gate adds latency.",
+        architect: "Three subsystems affected: dispatch controller (auto-dispatch.ts), planning artifacts (PLAN.md), and verification gate. Coupling point: dispatch ↔ verification.",
+      })).toBe(true);
+    });
+  });
+
+  describe("inspectSlicePlanMarkdown depth warnings", () => {
+    /** Builds a minimal slice plan whose adversarial-review section holds the three given review bodies verbatim. */
+    const makePlanMarkdown = (partner: string, combatant: string, architect: string): string => `# Slice Plan\n
+## Adversarial Review
+
+### Partner Review
+${partner}
+
+### Combatant Review
+${combatant}
+
+### Architect Review
+${architect}
+`;
+
+    it("flags_shallow_partner_review", async () => {
+      const { inspectSlicePlanMarkdown } = await importPlanQuality();
+      const content = makePlanMarkdown(
+        "Looks good to me",
+        "Risk: might be hard. Risk: could break. Risk: unknown. Mitigation: be careful and test thoroughly before merging.",
+        "System-fit: the dispatch controller (auto-dispatch.ts) and planning artifacts (PLAN.md) are affected. Coupling point: dispatch ↔ verification gate.",
+      );
+      const result = inspectSlicePlanMarkdown(content);
+      expect(result.issues).toContain(
+        "shallow partner review — cite specific evidence (file paths, test gaps, prior learnings)",
+      );
+    });
+
+    it("flags_shallow_combatant_review", async () => {
+      const { inspectSlicePlanMarkdown } = await importPlanQuality();
+      const content = makePlanMarkdown(
+        "Strong plan grounded in the existing auto-dispatch.ts patterns. File paths are specific and verification strategy is sound.",
+        "This is fine",
+        "System-fit: the dispatch controller (auto-dispatch.ts) and planning artifacts (PLAN.md) are affected. Coupling point: dispatch ↔ verification gate.",
+      );
+      const result = inspectSlicePlanMarkdown(content);
+      expect(result.issues).toContain(
+        "shallow combatant review — name specific risks with concrete failure scenarios",
+      );
+    });
+
+    it("flags_shallow_architect_review", async () => {
+      const { inspectSlicePlanMarkdown } = await importPlanQuality();
+      const content = makePlanMarkdown(
+        "Strong plan grounded in the existing auto-dispatch.ts patterns. File paths are specific and verification strategy is sound.",
+        "Risk 1: state machine refactor takes longer. Risk 2: schema migration breaks parsers. Risk 3: latency overhead from synthesis gate. Mitigation: timebox and simplify.",
+        "Agree with plan",
+      );
+      const result = inspectSlicePlanMarkdown(content);
+      expect(result.issues).toContain(
+        "shallow architect review — name affected subsystems and coupling points",
+      );
+    });
+
+    it("passes_with_purposeful_reviews", async () => {
+      const { inspectSlicePlanMarkdown } = await importPlanQuality();
+      const content = makePlanMarkdown(
+        "Strong plan grounded in the existing auto-dispatch.ts patterns. File paths are specific and verification strategy is sound.",
+        "Risk 1: state machine refactor takes longer due to coupling. Risk 2: schema migration breaks parsers. Risk 3: latency overhead. Mitigation: timebox.",
+        "Three subsystems affected: dispatch controller, planning artifacts, and verification gate. Coupling point: dispatch ↔ verification.",
+      );
+      const result = inspectSlicePlanMarkdown(content);
+      const shallowIssues = result.issues.filter((i) => i.startsWith("shallow"));
+      expect(shallowIssues).toHaveLength(0);
+    });
+
+    it("still_flags_missing_sections", async () => {
+      const { inspectSlicePlanMarkdown } = await importPlanQuality();
+      const content = "# Slice Plan\n\nNo adversarial review section.\n";
+      const result = inspectSlicePlanMarkdown(content);
+      expect(result.issues).toContain("missing adversarial review");
+    });
+  });
+
+  describe("hasStructuredPlanningMeeting", () => {
+    const makeValidMeeting = () => ({
+      trigger: "Planning needed for slice decomposition",
+      pm: "User value is clear — this improves auto-mode reliability by preventing context oscillation",
+      researcher: "Claude Code coordinatorMode.ts shows the synthesis gate pattern works at Anthropic scale",
+      partner: "Strong plan grounded in existing auto-dispatch.ts patterns with specific file paths and evidence",
+      combatant: "Risk 1: state machine refactor takes longer. Risk 2: schema migration breaks parsers. Risk 3: latency.",
+      architect: "Dispatch controller, planning artifacts, and verification gate affected. Coupling: dispatch ↔ verification.",
+      moderator: "Approve with timebox on S04. Combatant's latency concern addressed by adding synthesis latency budget.",
+      confidenceSummary: "High confidence on foundation tier, medium on architecture tier due to state machine coupling",
+      recommendedRoute: "planning",
+    });
+
+    it("accepts_valid_meeting", async () => {
+      const { hasStructuredPlanningMeeting } = await importPlanQuality();
+      expect(hasStructuredPlanningMeeting(makeValidMeeting())).toBe(true);
+    });
+
+    it("rejects_missing_meeting", async () => {
+      const { hasStructuredPlanningMeeting } = await importPlanQuality();
+      expect(hasStructuredPlanningMeeting(null)).toBe(false);
+      expect(hasStructuredPlanningMeeting(undefined)).toBe(false);
+    });
+
+    it("rejects_invalid_route", async () => {
+      const { hasStructuredPlanningMeeting } = await importPlanQuality();
+      const meeting = { ...makeValidMeeting(), recommendedRoute: "executing" };
+      expect(hasStructuredPlanningMeeting(meeting)).toBe(false);
+    });
+
+    it("accepts_researching_route", async () => {
+      const { hasStructuredPlanningMeeting } = await importPlanQuality();
+      const meeting = { ...makeValidMeeting(), recommendedRoute: "researching" };
+      expect(hasStructuredPlanningMeeting(meeting)).toBe(true);
+    });
+  });
+});
--- a/src/tests/features-inventory-generator.test.ts
+++ b/src/tests/features-inventory-generator.test.ts
@ -25,6 +25,9 @@ test("features inventory generator surfaces expected workflow tool, extension, s
 	assert.ok(extensions.includes("sf"));
 	assert.ok(extensions.includes("search-the-web"));
 	assert.ok(extensions.includes("subagent"));
+	assert.ok(extensions.includes("guardrails"));
+	assert.ok(extensions.includes("sf-permissions"));
+	assert.ok(extensions.includes("sf-inturn-guard"));

 	assert.deepEqual(searchProviders, [
 		"brave",