fix(sf): harden auto loops and skill sandbox
This commit is contained in:
parent
d742602454
commit
0c7c4eca5b
24 changed files with 1024 additions and 58 deletions
|
|
@ -25,7 +25,7 @@
|
|||
},
|
||||
"scripts": {
|
||||
"build": "tsc",
|
||||
"test": "node --test dist/mcp-server.test.js"
|
||||
"test": "vitest run packages/mcp-server/src --root ../.. --config vitest.config.ts"
|
||||
},
|
||||
"dependencies": {
|
||||
"@modelcontextprotocol/sdk": "^1.27.1",
|
||||
|
|
|
|||
|
|
@ -41,6 +41,7 @@ import fs from "node:fs";
|
|||
import os from "node:os";
|
||||
import path from "node:path";
|
||||
import type { ExtensionAPI } from "@singularity-forge/pi-coding-agent";
|
||||
import { minimatch } from "minimatch";
|
||||
import {
|
||||
classifyCommand,
|
||||
invalidateConfigCache,
|
||||
|
|
@ -481,12 +482,218 @@ export function handleSessionStart(state: PermissionState, ctx: any): void {
|
|||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// SKILL SANDBOX — allowed-tools enforcement
|
||||
// ============================================================================
|
||||
|
||||
/** Active skill-declared allowed tool patterns for the current session. */
|
||||
let activeSkillAllowedTools: readonly string[] | null = null;
|
||||
|
||||
/** Name of the skill that declared the active allowed-tools patterns. */
|
||||
let activeSkillName: string | null = null;
|
||||
|
||||
/**
|
||||
* Set the allowed tool patterns for the current session.
|
||||
* Called when a skill with allowed-tools is activated.
|
||||
*/
|
||||
export function setAllowedTools(
|
||||
patterns: readonly string[],
|
||||
skillName?: string,
|
||||
): void {
|
||||
activeSkillAllowedTools = patterns.length > 0 ? [...patterns] : null;
|
||||
activeSkillName = patterns.length > 0 ? (skillName ?? null) : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Clear the allowed tool patterns for the current session.
|
||||
* Called when a skill is deactivated or at session end.
|
||||
*/
|
||||
export function clearAllowedTools(): void {
  // Null both fields so checkSkillSandbox sees no active sandbox and allows
  // every bash command through to the normal permission checks.
  activeSkillAllowedTools = null;
  activeSkillName = null;
}
|
||||
|
||||
/**
|
||||
* Extract the command pattern from a Bash(...) allowed-tools entry.
|
||||
*
|
||||
* Purpose: skill sandbox matching and error messages need the command pattern
|
||||
* without the outer tool syntax.
|
||||
* Consumer: matchesAllowedToolPattern and blocked-command diagnostics.
|
||||
*/
|
||||
function extractBashAllowedPattern(pattern: string): string | null {
|
||||
const trimmedPattern = pattern.trim();
|
||||
const match = trimmedPattern.match(/^Bash\((.+)\)$/i);
|
||||
if (!match) return null;
|
||||
return match[1].trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Parse an allowed-tools pattern and check if a command matches.
|
||||
*
|
||||
* Purpose: enforce `allowed-tools: Bash(npm run:*)` as a command-prefix rule,
|
||||
* where the colon separates the command prefix from its wildcard arguments.
|
||||
* Consumer: handleBashToolCall before normal permission checks run.
|
||||
*/
|
||||
export function matchesAllowedToolPattern(
|
||||
pattern: string,
|
||||
command: string,
|
||||
): boolean {
|
||||
const commandPattern = extractBashAllowedPattern(pattern);
|
||||
if (!commandPattern) return false;
|
||||
|
||||
const trimmedCommand = command.trim().replace(/\s+/g, " ");
|
||||
if (commandPattern === "*") return true;
|
||||
if (commandPattern.endsWith(":*")) {
|
||||
const prefix = commandPattern.slice(0, -2).trim().replace(/\s+/g, " ");
|
||||
return trimmedCommand === prefix || trimmedCommand.startsWith(`${prefix} `);
|
||||
}
|
||||
if (!/[*?[\]]/.test(commandPattern)) {
|
||||
return trimmedCommand === commandPattern.trim().replace(/\s+/g, " ");
|
||||
}
|
||||
return minimatch(trimmedCommand, commandPattern);
|
||||
}
|
||||
|
||||
function formatAllowedPatternList(patterns: readonly string[]): string {
|
||||
const displayPatterns = patterns.map(
|
||||
(pattern) => extractBashAllowedPattern(pattern) ?? pattern.trim(),
|
||||
);
|
||||
return JSON.stringify(displayPatterns);
|
||||
}
|
||||
|
||||
function unquoteFrontmatterValue(value: string): string {
|
||||
const trimmed = value.trim();
|
||||
if (
|
||||
(trimmed.startsWith('"') && trimmed.endsWith('"')) ||
|
||||
(trimmed.startsWith("'") && trimmed.endsWith("'"))
|
||||
) {
|
||||
return trimmed.slice(1, -1).trim();
|
||||
}
|
||||
return trimmed;
|
||||
}
|
||||
|
||||
function splitAllowedTools(value: string): string[] {
|
||||
return value
|
||||
.split(/\s*,\s*/)
|
||||
.map((entry) => unquoteFrontmatterValue(entry))
|
||||
.filter(Boolean);
|
||||
}
|
||||
|
||||
function readSkillAllowedTools(
|
||||
filePath: string,
|
||||
fallbackName?: string,
|
||||
): { skillName: string; patterns: string[] } | null {
|
||||
let raw: string;
|
||||
try {
|
||||
raw = fs.readFileSync(filePath, "utf-8");
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
|
||||
const frontmatter = raw.match(/^---\r?\n([\s\S]*?)\r?\n---/);
|
||||
if (!frontmatter) return null;
|
||||
|
||||
const name =
|
||||
frontmatter[1]
|
||||
.match(/^name:\s*(.+)$/m)?.[1]
|
||||
.trim()
|
||||
.replace(/^["']|["']$/g, "") ||
|
||||
fallbackName ||
|
||||
path.basename(path.dirname(filePath));
|
||||
const allowedTools = frontmatter[1].match(/^allowed-tools:\s*(.+)$/m)?.[1];
|
||||
if (!allowedTools) return { skillName: name, patterns: [] };
|
||||
|
||||
return {
|
||||
skillName: name,
|
||||
patterns: splitAllowedTools(allowedTools),
|
||||
};
|
||||
}
|
||||
|
||||
function extractSkillBlocks(text: string): Array<{ name: string; location: string }> {
|
||||
const blocks: Array<{ name: string; location: string }> = [];
|
||||
const blockRe = /<skill\s+name="([^"]+)"\s+location="([^"]+)">/g;
|
||||
let match: RegExpExecArray | null;
|
||||
while ((match = blockRe.exec(text)) !== null) {
|
||||
blocks.push({ name: match[1], location: match[2] });
|
||||
}
|
||||
return blocks;
|
||||
}
|
||||
|
||||
/**
|
||||
* Activate skill sandbox rules from rendered skill XML blocks.
|
||||
*
|
||||
* Purpose: connect real skill invocation paths (`/skill:name` expansion and
|
||||
* the Skill tool result) to bash sandbox enforcement for the active turn.
|
||||
* Consumer: sf-permissions before_agent_start and Skill tool_result hooks.
|
||||
*/
|
||||
export function activateAllowedToolsFromSkillText(text: string): void {
|
||||
const blocks = extractSkillBlocks(text);
|
||||
if (blocks.length === 0) {
|
||||
clearAllowedTools();
|
||||
return;
|
||||
}
|
||||
|
||||
const names: string[] = [];
|
||||
const patterns: string[] = [];
|
||||
for (const block of blocks) {
|
||||
const loaded = readSkillAllowedTools(block.location, block.name);
|
||||
if (!loaded) continue;
|
||||
names.push(loaded.skillName);
|
||||
patterns.push(...loaded.patterns);
|
||||
}
|
||||
|
||||
if (patterns.length > 0) {
|
||||
setAllowedTools(patterns, names.join(", "));
|
||||
} else {
|
||||
clearAllowedTools();
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a bash command is allowed by the active skill's allowed-tools.
|
||||
* Returns a block result if the command is not allowed, undefined if allowed.
|
||||
*/
|
||||
function checkSkillSandbox(
|
||||
command: string,
|
||||
): { block: true; reason: string } | undefined {
|
||||
if (activeSkillAllowedTools === null) return undefined;
|
||||
if (activeSkillAllowedTools.length === 0) return undefined;
|
||||
|
||||
const trimmedCommand = command.trim();
|
||||
const matched = activeSkillAllowedTools.some((pattern) =>
|
||||
matchesAllowedToolPattern(pattern, trimmedCommand),
|
||||
);
|
||||
|
||||
if (matched) return undefined;
|
||||
|
||||
const skillLabel = activeSkillName ? `Allowed by: ${activeSkillName}` : "";
|
||||
return {
|
||||
block: true,
|
||||
reason:
|
||||
`Command "${trimmedCommand}" blocked by skill sandbox. ` +
|
||||
`Allowed patterns: ${formatAllowedPatternList(activeSkillAllowedTools)}` +
|
||||
(skillLabel ? `\n${skillLabel}` : ""),
|
||||
};
|
||||
}
|
||||
|
||||
/** Handle bash tool_call - check permission and prompt if needed */
|
||||
export async function handleBashToolCall(
|
||||
state: PermissionState,
|
||||
command: string,
|
||||
ctx: any,
|
||||
): Promise<{ block: true; reason: string } | undefined> {
|
||||
// Skill sandbox check — applied before permission level checks
|
||||
const sandboxResult = checkSkillSandbox(command);
|
||||
if (sandboxResult) {
|
||||
// Log the block for observability
|
||||
const logMsg = `Blocked "${command.trim()}" — no pattern matched ${formatAllowedPatternList(activeSkillAllowedTools ?? [])}`;
|
||||
if (ctx?.logInfo) {
|
||||
ctx.logInfo("skill-sandbox", logMsg);
|
||||
} else if (ctx?.ui?.notify) {
|
||||
ctx.ui.notify(`skill-sandbox: ${logMsg}`, "warning");
|
||||
}
|
||||
return sandboxResult;
|
||||
}
|
||||
|
||||
if (state.currentLevel === "bypassed") return undefined;
|
||||
|
||||
const classification = classifyCommand(command);
|
||||
|
|
@ -638,9 +845,18 @@ export default function (pi: ExtensionAPI) {
|
|||
});
|
||||
|
||||
pi.on("session_start", async (_event, ctx) => {
|
||||
clearAllowedTools();
|
||||
handleSessionStart(state, ctx);
|
||||
});
|
||||
|
||||
pi.on("before_agent_start", async (event) => {
|
||||
activateAllowedToolsFromSkillText(event.prompt);
|
||||
});
|
||||
|
||||
pi.on("agent_end", async () => {
|
||||
clearAllowedTools();
|
||||
});
|
||||
|
||||
pi.on("tool_call", async (event, ctx) => {
|
||||
if (event.toolName === "bash") {
|
||||
return handleBashToolCall(state, event.input.command as string, ctx);
|
||||
|
|
@ -657,4 +873,14 @@ export default function (pi: ExtensionAPI) {
|
|||
|
||||
return undefined;
|
||||
});
|
||||
|
||||
pi.on("tool_result", async (event) => {
|
||||
if (event.toolName !== "Skill" || event.isError) return undefined;
|
||||
const text = event.content
|
||||
.filter((part) => part.type === "text")
|
||||
.map((part) => part.text)
|
||||
.join("\n");
|
||||
activateAllowedToolsFromSkillText(text);
|
||||
return undefined;
|
||||
});
|
||||
}
|
||||
|
|
|
|||
|
|
@ -0,0 +1,156 @@
|
|||
import assert from "node:assert/strict";
|
||||
import { test } from "vitest";
|
||||
import {
|
||||
clearAllowedTools,
|
||||
createInitialState,
|
||||
handleBashToolCall,
|
||||
matchesAllowedToolPattern,
|
||||
setAllowedTools,
|
||||
} from "../index.ts";
|
||||
|
||||
test("matchesAllowedToolPattern allows exact match", () => {
|
||||
assert.equal(matchesAllowedToolPattern("Bash(pwd)", "pwd"), true);
|
||||
assert.equal(matchesAllowedToolPattern("Bash(pwd)", "pwdx"), false);
|
||||
});
|
||||
|
||||
test("matchesAllowedToolPattern allows prefix wildcard", () => {
|
||||
assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm run build"), true);
|
||||
assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm run test --watch"), true);
|
||||
assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm install"), false);
|
||||
assert.equal(matchesAllowedToolPattern("Bash(agent-browser:*)", "agent-browser open https://example.com"), true);
|
||||
});
|
||||
|
||||
test("matchesAllowedToolPattern allows bare wildcard", () => {
|
||||
assert.equal(matchesAllowedToolPattern("Bash(*)", "anything here"), true);
|
||||
});
|
||||
|
||||
test("matchesAllowedToolPattern allows glob with minimatch", () => {
|
||||
assert.equal(matchesAllowedToolPattern("Bash(npm run *)", "npm run build"), true);
|
||||
assert.equal(matchesAllowedToolPattern("Bash(npm run *)", "npm run"), false);
|
||||
});
|
||||
|
||||
test("matchesAllowedToolPattern is case-insensitive for Bash wrapper", () => {
|
||||
assert.equal(matchesAllowedToolPattern("bash(pwd)", "pwd"), true);
|
||||
assert.equal(matchesAllowedToolPattern("BASH(pwd)", "pwd"), true);
|
||||
});
|
||||
|
||||
test("handleBashToolCall blocks non-matching command when allowed-tools active", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
setAllowedTools(["Bash(npm run:*)"], "build-skill");
|
||||
|
||||
try {
|
||||
const blocked = await handleBashToolCall(state, "rm -rf /", {});
|
||||
assert.equal(blocked?.block, true);
|
||||
assert.match(blocked?.reason ?? "", /Command "rm -rf \/" blocked by skill sandbox/);
|
||||
assert.match(blocked?.reason ?? "", /Allowed patterns: \["npm run:\*"\]/);
|
||||
assert.match(blocked?.reason ?? "", /Allowed by: build-skill/);
|
||||
} finally {
|
||||
clearAllowedTools();
|
||||
}
|
||||
});
|
||||
|
||||
test("handleBashToolCall allows matching command when allowed-tools active", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
setAllowedTools(["Bash(npm run:*)"], "build-skill");
|
||||
|
||||
try {
|
||||
const result = await handleBashToolCall(state, "npm run build", {});
|
||||
assert.equal(result, undefined);
|
||||
} finally {
|
||||
clearAllowedTools();
|
||||
}
|
||||
});
|
||||
|
||||
test("handleBashToolCall allows any command when no allowed-tools set", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
clearAllowedTools();
|
||||
|
||||
const result = await handleBashToolCall(state, "rm -rf /", {});
|
||||
assert.equal(result, undefined);
|
||||
});
|
||||
|
||||
test("handleBashToolCall allows command matching any of multiple patterns", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
setAllowedTools(["Bash(npm run:*)", "Bash(pwd)"], "multi-skill");
|
||||
|
||||
try {
|
||||
assert.equal(await handleBashToolCall(state, "npm run test", {}), undefined);
|
||||
assert.equal(await handleBashToolCall(state, "pwd", {}), undefined);
|
||||
|
||||
const blocked = await handleBashToolCall(state, "ls -la", {});
|
||||
assert.equal(blocked?.block, true);
|
||||
} finally {
|
||||
clearAllowedTools();
|
||||
}
|
||||
});
|
||||
|
||||
test("handleBashToolCall logs block via logInfo when available", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
setAllowedTools(["Bash(npm run:*)"], "build-skill");
|
||||
|
||||
const logs: Array<{ tag: string; msg: string }> = [];
|
||||
const ctx = {
|
||||
logInfo: (tag: string, msg: string) => logs.push({ tag, msg }),
|
||||
};
|
||||
|
||||
try {
|
||||
await handleBashToolCall(state, "rm -rf /", ctx);
|
||||
assert.equal(logs.length, 1);
|
||||
assert.equal(logs[0].tag, "skill-sandbox");
|
||||
assert.match(logs[0].msg, /Blocked "rm -rf \/" — no pattern matched/);
|
||||
} finally {
|
||||
clearAllowedTools();
|
||||
}
|
||||
});
|
||||
|
||||
test("handleBashToolCall falls back to ui.notify when logInfo unavailable", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
setAllowedTools(["Bash(npm run:*)"], "build-skill");
|
||||
|
||||
const notifications: Array<{ msg: string; level: string }> = [];
|
||||
const ctx = {
|
||||
ui: {
|
||||
notify: (msg: string, level: string) => notifications.push({ msg, level }),
|
||||
},
|
||||
};
|
||||
|
||||
try {
|
||||
await handleBashToolCall(state, "rm -rf /", ctx);
|
||||
assert.equal(notifications.length, 1);
|
||||
assert.match(notifications[0].msg, /skill-sandbox: Blocked "rm -rf \/" — no pattern matched/);
|
||||
assert.equal(notifications[0].level, "warning");
|
||||
} finally {
|
||||
clearAllowedTools();
|
||||
}
|
||||
});
|
||||
|
||||
test("handleBashToolCall does not log when no sandbox active", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
clearAllowedTools();
|
||||
|
||||
const logs: Array<{ tag: string; msg: string }> = [];
|
||||
const ctx = {
|
||||
logInfo: (tag: string, msg: string) => logs.push({ tag, msg }),
|
||||
};
|
||||
|
||||
await handleBashToolCall(state, "rm -rf /", ctx);
|
||||
assert.equal(logs.length, 0);
|
||||
});
|
||||
|
||||
test("setAllowedTools with empty array clears state", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
setAllowedTools(["Bash(pwd)"], "test-skill");
|
||||
setAllowedTools([], "test-skill");
|
||||
|
||||
const result = await handleBashToolCall(state, "rm -rf /", {});
|
||||
assert.equal(result, undefined);
|
||||
clearAllowedTools();
|
||||
});
|
||||
|
|
@ -47,7 +47,7 @@ import { getRtkSessionSavings } from "../shared/rtk-session-stats.js";
|
|||
import { deactivateSF } from "../shared/sf-phase-state.js";
|
||||
import { clearActivityLogState } from "./activity-log.js";
|
||||
import { atomicWriteSync } from "./atomic-write.js";
|
||||
import { AutoSession } from "./auto/session.js";
|
||||
import { AutoSession, getAutoSession } from "./auto/session.js";
|
||||
// import { startSliceParallel } from "./slice-parallel-orchestrator.js"; (decoy for legacy regex tests)
|
||||
import {
|
||||
getBudgetAlertLevel,
|
||||
|
|
@ -235,7 +235,7 @@ export {
|
|||
//
|
||||
// Tests in auto-session-encapsulation.test.ts enforce this invariant.
|
||||
// ─────────────────────────────────────────────────────────────────────────────
|
||||
const s = new AutoSession();
|
||||
const s = getAutoSession();
|
||||
|
||||
/** Throttle STATE.md rebuilds — at most once per 30 seconds */
|
||||
const _STATE_REBUILD_MIN_INTERVAL_MS = 30_000;
|
||||
|
|
@ -515,6 +515,28 @@ export function setCurrentDispatchedModelId(
|
|||
s.currentDispatchedModelId = model ? `${model.provider}/${model.id}` : null;
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark the current research unit as terminal after saving its RESEARCH artifact.
|
||||
*
|
||||
* Purpose: prevent a research unit that already produced its durable artifact
|
||||
* from drifting into planner tools before the orchestrator dispatches planning.
|
||||
* Consumer: register-hooks tool_result handling for sf_summary_save.
|
||||
*/
|
||||
export function markResearchTerminalTransition(): void {
|
||||
getAutoSession().researchTerminalTransition = true;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return whether the current unit has already crossed its research terminal transition.
|
||||
*
|
||||
* Purpose: planning-tool guards can reject post-summary planning calls without
|
||||
* reading runtime files or duplicating unit state.
|
||||
* Consumer: register-hooks tool_call enforcement for research units.
|
||||
*/
|
||||
export function hasResearchTerminalTransition(): boolean {
|
||||
return getAutoSession().researchTerminalTransition;
|
||||
}
|
||||
|
||||
// Tool tracking — delegates to auto-tool-tracking.ts
|
||||
export function markToolStart(toolCallId: string, toolName?: string): void {
|
||||
_markToolStart(toolCallId, s.active, toolName);
|
||||
|
|
|
|||
|
|
@ -1936,6 +1936,7 @@ export async function runUnitPhase(
|
|||
// unit in the same Node process (see workflow-logger.ts module header).
|
||||
_resetLogs();
|
||||
s.currentUnit = { type: unitType, id: unitId, startedAt: Date.now() };
|
||||
s.researchTerminalTransition = false;
|
||||
s.lastGitActionFailure = null;
|
||||
s.lastGitActionStatus = null;
|
||||
setCurrentPhase(unitType);
|
||||
|
|
|
|||
|
|
@ -17,7 +17,9 @@ import type { AgentEndEvent, ErrorContext, UnitResult } from "./types.js";
|
|||
// A single module-level resolve function scoped to the current unit execution.
|
||||
// No queue — if an agent_end arrives with no pending resolver, it is dropped
|
||||
// (logged as warning). This is simpler and safer than the previous session-
|
||||
// scoped pendingResolve + pendingAgentEndQueue pattern.
|
||||
// scoped pendingResolve + pendingAgentEndQueue pattern. Late duplicate
|
||||
// agent_end events are ignored because the first event already resolved the
|
||||
// unit and a stale duplicate must not trip the idle watchdog.
|
||||
|
||||
let _currentResolve: ((result: UnitResult) => void) | null = null;
|
||||
let _sessionSwitchInFlight = false;
|
||||
|
|
@ -45,9 +47,10 @@ export function _clearCurrentResolve(): void {
|
|||
* in-flight unit promise. One-shot: the resolver is nulled before calling
|
||||
* to prevent double-resolution from model fallback retries.
|
||||
*
|
||||
* If called when no resolver is registered (event arrived between loop
|
||||
* iterations or during session switch), throws so the bug surfaces immediately
|
||||
* instead of silently dropping the event and causing the auto-loop to hang.
|
||||
* If called when no resolver is registered, the event is stale relative to the
|
||||
* current unit lifecycle and is ignored. runUnit registers the resolver before
|
||||
* dispatching the turn, so a no-pending event is either a duplicate or a late
|
||||
* event from a previous session.
|
||||
*/
|
||||
export function resolveAgentEnd(event: AgentEndEvent): void {
|
||||
if (_sessionSwitchInFlight) {
|
||||
|
|
@ -60,12 +63,7 @@ export function resolveAgentEnd(event: AgentEndEvent): void {
|
|||
_currentResolve = null;
|
||||
r({ status: "completed", event });
|
||||
} else {
|
||||
// Throw instead of silent drop so the ordering bug surfaces immediately
|
||||
// rather than via a timeout in cold-cache runs (#race-condition-silent-event-drop).
|
||||
throw new Error(
|
||||
"resolveAgentEnd called with no pending unit — agent_end arrived before _setCurrentResolve. " +
|
||||
"Check event ordering: resolveAgentEnd must be called after runUnit sets the resolver.",
|
||||
);
|
||||
debugLog("resolveAgentEnd", { status: "ignored-no-pending-resolve" });
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -76,6 +76,23 @@ export const STUB_RECOVERY_THRESHOLD = 2;
|
|||
export const MAX_LIFETIME_DISPATCHES = 6;
|
||||
export const NEW_SESSION_TIMEOUT_MS = 120_000;
|
||||
|
||||
// ─── Singleton ───────────────────────────────────────────────────────────────
|
||||
|
||||
let _autoSessionInstance: AutoSession | null = null;
|
||||
|
||||
/** Get or create the singleton AutoSession instance. */
|
||||
export function getAutoSession(): AutoSession {
|
||||
if (!_autoSessionInstance) {
|
||||
_autoSessionInstance = new AutoSession();
|
||||
}
|
||||
return _autoSessionInstance;
|
||||
}
|
||||
|
||||
/** Reset the singleton instance (used in tests). */
|
||||
export function resetAutoSession(): void {
  // Drop the cached instance; the next getAutoSession() call rebuilds it.
  _autoSessionInstance = null;
}
|
||||
|
||||
// ─── AutoSession ─────────────────────────────────────────────────────────────
|
||||
|
||||
export class AutoSession {
|
||||
|
|
@ -241,6 +258,15 @@ export class AutoSession {
|
|||
// the milestone finishes (or resquash runs).
|
||||
milestoneStartShas: Map<string, string> = new Map();
|
||||
|
||||
// ── Research unit terminal transition ──────────────────────────────────
|
||||
/**
|
||||
* Set to true when a research unit (research-slice/research-milestone)
|
||||
* successfully saves its RESEARCH artifact via sf_summary_save.
|
||||
* Subsequent planning tool calls are blocked to prevent post-artifact drift
|
||||
* where the agent continues into milestone/slice/task planning.
|
||||
*/
|
||||
researchTerminalTransition = false;
|
||||
|
||||
// ── Signal handler ───────────────────────────────────────────────────────
|
||||
sigtermHandler: (() => void) | null = null;
|
||||
|
||||
|
|
@ -364,6 +390,9 @@ export class AutoSession {
|
|||
|
||||
this.milestoneStartShas = new Map();
|
||||
|
||||
// Research terminal transition
|
||||
this.researchTerminalTransition = false;
|
||||
|
||||
// Signal handler
|
||||
this.sigtermHandler = null;
|
||||
|
||||
|
|
|
|||
|
|
@ -10,10 +10,12 @@ import { formatTokenCount } from "../../shared/format-utils.js";
|
|||
import { saveActivityLog } from "../activity-log.js";
|
||||
import {
|
||||
getAutoDashboardData,
|
||||
hasResearchTerminalTransition,
|
||||
isAutoActive,
|
||||
isAutoPaused,
|
||||
markToolEnd,
|
||||
markToolStart,
|
||||
markResearchTerminalTransition,
|
||||
recordToolInvocationError,
|
||||
} from "../auto.js";
|
||||
import {
|
||||
|
|
@ -481,6 +483,41 @@ export function registerHooks(
|
|||
return { block: true, reason: loopCheck.reason };
|
||||
}
|
||||
|
||||
// ── Research unit terminal transition enforcement ─────────────────────
|
||||
// After a research unit (research-slice/research-milestone) successfully
|
||||
// saves its RESEARCH artifact via sf_summary_save, the tool returns
|
||||
// terminal_transition: true. We track this and block subsequent planning
|
||||
// tool calls to prevent post-artifact drift (e.g. calling sf_plan_milestone
|
||||
// after research is complete). This addresses sf-moocx6m5-ij630a.
|
||||
if (isAutoActive()) {
|
||||
const dash = getAutoDashboardData();
|
||||
const currentUnit = dash.currentUnit;
|
||||
if (
|
||||
currentUnit &&
|
||||
(currentUnit.type === "research-slice" ||
|
||||
currentUnit.type === "research-milestone")
|
||||
) {
|
||||
if (hasResearchTerminalTransition()) {
|
||||
const planningTools = new Set([
|
||||
"sf_plan_milestone",
|
||||
"sf_plan_slice",
|
||||
"sf_plan_task",
|
||||
"sf_milestone_generate_id",
|
||||
"sf_replan_slice",
|
||||
"sf_reassess_roadmap",
|
||||
]);
|
||||
if (planningTools.has(event.toolName)) {
|
||||
return {
|
||||
block: true,
|
||||
reason:
|
||||
`Research unit terminal transition: ${currentUnit.type} ${currentUnit.id} has already completed its RESEARCH artifact. ` +
|
||||
`Planning tools (${event.toolName}) are blocked. The orchestrator will dispatch planner units after research.`,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ── Discussion gate enforcement: track pending gate questions ─────────
|
||||
// Only gate-shaped ask_user_questions calls should block execution.
|
||||
// The gate stays pending until the user selects the approval option.
|
||||
|
|
@ -641,6 +678,17 @@ export function registerHooks(
|
|||
|
||||
pi.on("tool_result", async (event) => {
|
||||
if (isAutoActive()) {
|
||||
if (
|
||||
event.toolName === "sf_summary_save" &&
|
||||
event.details &&
|
||||
typeof event.details === "object" &&
|
||||
(event.details as { terminal_transition?: unknown })
|
||||
.terminal_transition === true &&
|
||||
(event.details as { unit_type?: unknown }).unit_type === "research"
|
||||
) {
|
||||
markResearchTerminalTransition();
|
||||
}
|
||||
|
||||
const steer = observeMemorySleeperToolResult(event);
|
||||
if (steer) {
|
||||
pi.sendMessage(
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ const TOP_LEVEL_SUBCOMMANDS = [
|
|||
cmd: "autonomous",
|
||||
desc: "Autonomous mode — research, plan, execute, commit, repeat",
|
||||
},
|
||||
{ cmd: "auto", desc: "Alias for /sf autonomous" },
|
||||
{ cmd: "stop", desc: "Stop autonomous mode gracefully" },
|
||||
{
|
||||
cmd: "pause",
|
||||
|
|
|
|||
|
|
@ -27,7 +27,7 @@ type CompletionMap = Record<string, readonly SfCommandDefinition[]>;
|
|||
* Comprehensive description of all available SF commands for help text.
|
||||
*/
|
||||
export const SF_COMMAND_DESCRIPTION =
|
||||
"SF — Singularity Forge: /sf help|start|templates|next|autonomous|auto|stop|pause|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|prefs|config|keys|hooks|run-hook|skill-health|doctor|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|eval-review";
|
||||
"SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|prefs|config|keys|hooks|run-hook|skill-health|doctor|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|eval-review";
|
||||
|
||||
/**
|
||||
* Top-level SF subcommands with descriptions.
|
||||
|
|
@ -39,10 +39,6 @@ export const TOP_LEVEL_SUBCOMMANDS: readonly SfCommandDefinition[] = [
|
|||
cmd: "autonomous",
|
||||
desc: "Autonomous mode — continuous loop, never asks user (self-resolves or stops with blocker)",
|
||||
},
|
||||
{
|
||||
cmd: "auto",
|
||||
desc: "Auto mode — continuous loop, can ask when blocked",
|
||||
},
|
||||
{ cmd: "stop", desc: "Stop autonomous mode gracefully" },
|
||||
{
|
||||
cmd: "pause",
|
||||
|
|
|
|||
|
|
@ -34,7 +34,6 @@ export function showHelp(ctx: ExtensionCommandContext, args = ""): void {
|
|||
" /sf start <tpl> Start a workflow template",
|
||||
" /sf Run next unit (same as /sf next)",
|
||||
" /sf autonomous Run all queued product units continuously",
|
||||
" /sf auto Alias for /sf autonomous",
|
||||
" /sf pause Pause autonomous mode",
|
||||
" /sf stop Stop autonomous mode gracefully",
|
||||
"",
|
||||
|
|
@ -71,7 +70,6 @@ export function showHelp(ctx: ExtensionCommandContext, args = ""): void {
|
|||
" /sf Run next unit in step mode (same as /sf next)",
|
||||
" /sf next Execute next task, then pause [--dry-run] [--verbose]",
|
||||
" /sf autonomous Run all queued product units continuously [--verbose]",
|
||||
" /sf auto Alias for /sf autonomous",
|
||||
" /sf stop Stop autonomous mode gracefully",
|
||||
" /sf pause Pause autonomous mode (preserves state, /sf autonomous to resume)",
|
||||
" /sf discuss Start guided milestone/slice discussion",
|
||||
|
|
|
|||
|
|
@ -210,6 +210,7 @@ interface LegacySkillFrontmatter {
|
|||
name?: string;
|
||||
description?: string;
|
||||
'disable-model-invocation'?: boolean;
|
||||
'allowed-tools'?: string;
|
||||
[key: string]: unknown;
|
||||
}
|
||||
|
||||
|
|
@ -248,9 +249,17 @@ function loadFromLegacySkill(
|
|||
return { component: null, diagnostics };
|
||||
}
|
||||
|
||||
const allowedTools = frontmatter['allowed-tools']
|
||||
? String(frontmatter['allowed-tools'])
|
||||
.split(',')
|
||||
.map((s) => s.trim())
|
||||
.filter(Boolean)
|
||||
: undefined;
|
||||
|
||||
const spec: SkillSpec = {
|
||||
prompt: 'SKILL.md',
|
||||
disableModelInvocation: frontmatter['disable-model-invocation'] === true,
|
||||
allowedTools,
|
||||
};
|
||||
|
||||
const id = computeComponentId(name);
|
||||
|
|
|
|||
|
|
@ -63,6 +63,9 @@ export interface SkillSpec {
|
|||
|
||||
/** If true, skill is excluded from LLM system prompt (invoke-only). */
|
||||
disableModelInvocation?: boolean;
|
||||
|
||||
/** Allowed tool patterns declared by the skill (e.g. "Bash(npm run:*)"). */
|
||||
allowedTools?: string[];
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
|
|
|
|||
|
|
@ -62,6 +62,86 @@ function invalidateCaches(): void {
|
|||
clearParseCache();
|
||||
}
|
||||
|
||||
/**
|
||||
* Render a model-provided list entry without corrupting ordered lists.
|
||||
*
|
||||
* Purpose: keep generated plan markdown lint-clean when planners provide
|
||||
* numbered success criteria such as `1. Build the contract`.
|
||||
* Consumer: slice plan renderers that turn DB planning fields into Markdown.
|
||||
*/
|
||||
function renderListEntry(entry: string): string {
|
||||
const trimmed = entry.trim();
|
||||
const orderedBullet = trimmed.match(/^[-*+]\s+(\d+)[.)]\s+(.+)$/);
|
||||
if (orderedBullet) {
|
||||
return `${orderedBullet[1]}. ${orderedBullet[2].trim()}`;
|
||||
}
|
||||
|
||||
const ordered = trimmed.match(/^(\d+)[.)]\s+(.+)$/);
|
||||
if (ordered) {
|
||||
return `${ordered[1]}. ${ordered[2].trim()}`;
|
||||
}
|
||||
|
||||
if (/^[-*+]\s+\S/.test(trimmed)) {
|
||||
return trimmed;
|
||||
}
|
||||
|
||||
return `- ${trimmed}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Surround ATX headings in model-provided markdown with blank lines.
|
||||
*
|
||||
* Purpose: generated artifacts pass the runtime content validator even when
|
||||
* LLM-authored task descriptions contain `### Step` subsections.
|
||||
* Consumer: slice and task plan renderers before writing Markdown artifacts.
|
||||
*/
|
||||
function normalizeMarkdownBlockSpacing(text: string): string {
|
||||
const sourceLines = text.trim().replace(/\r\n/g, "\n").split("\n");
|
||||
const output: string[] = [];
|
||||
let inFence = false;
|
||||
|
||||
for (let i = 0; i < sourceLines.length; i++) {
|
||||
const line = sourceLines[i];
|
||||
const trimmed = line.trim();
|
||||
const fence = /^(```|~~~)/.test(trimmed);
|
||||
const heading = !inFence && /^#{1,6}\s+\S/.test(trimmed);
|
||||
|
||||
if (heading && output.length > 0 && output[output.length - 1]?.trim()) {
|
||||
output.push("");
|
||||
}
|
||||
|
||||
output.push(line);
|
||||
|
||||
if (fence) {
|
||||
inFence = !inFence;
|
||||
}
|
||||
|
||||
const next = sourceLines[i + 1];
|
||||
if (heading && next !== undefined && next.trim()) {
|
||||
output.push("");
|
||||
}
|
||||
}
|
||||
|
||||
return output.join("\n").trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Append model-provided markdown as an indented child block.
|
||||
*
|
||||
* Purpose: prevent task description subsections from escaping into top-level
|
||||
* slice-plan structure while preserving useful step detail for executors.
|
||||
* Consumer: the `## Tasks` section of slice plans.
|
||||
*/
|
||||
function appendIndentedMarkdownBlock(
|
||||
lines: string[],
|
||||
text: string,
|
||||
indent = " ",
|
||||
): void {
|
||||
for (const line of normalizeMarkdownBlockSpacing(text).split("\n")) {
|
||||
lines.push(line.trim() ? `${indent}${line}` : "");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Load artifact content from DB first, falling back to reading from disk.
|
||||
* On disk fallback, stores the content in the artifacts table for future use.
|
||||
|
|
@ -284,7 +364,7 @@ function renderTaskPlanMarkdown(
|
|||
lines.push("");
|
||||
|
||||
if (task.description.trim()) {
|
||||
lines.push(task.description.trim());
|
||||
lines.push(normalizeMarkdownBlockSpacing(task.description));
|
||||
lines.push("");
|
||||
}
|
||||
|
||||
|
|
@ -375,7 +455,7 @@ function renderSlicePlanMarkdown(
|
|||
.split(/\n+/)
|
||||
.map((entry) => entry.trim())
|
||||
.filter(Boolean)) {
|
||||
lines.push(line.startsWith("-") ? line : `- ${line}`);
|
||||
lines.push(renderListEntry(line));
|
||||
}
|
||||
} else {
|
||||
lines.push("- Complete the planned slice outcomes.");
|
||||
|
|
@ -529,7 +609,7 @@ function renderSlicePlanMarkdown(
|
|||
`- [${done}] **${task.id}: ${task.title || task.id}**${estimate}`,
|
||||
);
|
||||
if (task.description.trim()) {
|
||||
lines.push(` ${task.description.trim()}`);
|
||||
appendIndentedMarkdownBlock(lines, task.description);
|
||||
}
|
||||
if (task.files.length > 0) {
|
||||
lines.push(
|
||||
|
|
|
|||
|
|
@ -143,7 +143,6 @@ Templates showing the expected format for each artifact type are in:
|
|||
|
||||
- `/sf` - contextual wizard
|
||||
- `/sf autonomous` - auto-execute (fresh context per task)
|
||||
- `/sf auto` - alias for `/sf autonomous`
|
||||
- `/sf stop` - stop auto-mode
|
||||
- `/sf status` - progress dashboard overlay
|
||||
- `/sf queue` - queue future milestones (safe while auto-mode is running)
|
||||
|
|
|
|||
100
src/resources/extensions/sf/tests/allowed-tools-sandbox.test.ts
Normal file
100
src/resources/extensions/sf/tests/allowed-tools-sandbox.test.ts
Normal file
|
|
@ -0,0 +1,100 @@
|
|||
import assert from "node:assert/strict";
|
||||
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { test } from "vitest";
|
||||
import {
|
||||
activateAllowedToolsFromSkillText,
|
||||
clearAllowedTools,
|
||||
createInitialState,
|
||||
handleBashToolCall,
|
||||
matchesAllowedToolPattern,
|
||||
setAllowedTools,
|
||||
} from "../../sf-permissions/index.ts";
|
||||
|
||||
test("allowed-tools sandbox matches Bash prefix patterns against real commands", () => {
|
||||
assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm run build"), true);
|
||||
assert.equal(
|
||||
matchesAllowedToolPattern("Bash(agent-browser:*)", "agent-browser open https://example.com"),
|
||||
true,
|
||||
);
|
||||
assert.equal(matchesAllowedToolPattern("Bash(pwd)", "pwd"), true);
|
||||
assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm install"), false);
|
||||
});
|
||||
|
||||
test("allowed-tools sandbox blocks non-matching bash before permission prompts", async () => {
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "bypassed";
|
||||
setAllowedTools(["Bash(npm run:*)"], "build-skill");
|
||||
|
||||
try {
|
||||
assert.equal(
|
||||
await handleBashToolCall(state, "npm run build", {}),
|
||||
undefined,
|
||||
"matching command is allowed even when permission mode is bypassed",
|
||||
);
|
||||
|
||||
const blocked = await handleBashToolCall(state, "rm -rf /", {
|
||||
ui: { notify: () => undefined },
|
||||
});
|
||||
assert.equal(blocked?.block, true);
|
||||
assert.match(
|
||||
blocked?.reason ?? "",
|
||||
/Command "rm -rf \/" blocked by skill sandbox/,
|
||||
);
|
||||
assert.match(blocked?.reason ?? "", /Allowed patterns: \["npm run:\*"\]/);
|
||||
assert.match(blocked?.reason ?? "", /Allowed by: build-skill/);
|
||||
} finally {
|
||||
clearAllowedTools();
|
||||
}
|
||||
});
|
||||
|
||||
test("allowed-tools sandbox activates from real skill block location", async () => {
|
||||
const dir = mkdtempSync(join(tmpdir(), "sf-allowed-tools-"));
|
||||
const skillDir = join(dir, "agent-browser");
|
||||
const skillPath = join(skillDir, "SKILL.md");
|
||||
mkdirSync(skillDir, { recursive: true });
|
||||
writeFileSync(
|
||||
skillPath,
|
||||
[
|
||||
"---",
|
||||
"name: agent-browser",
|
||||
"description: Browser automation.",
|
||||
"allowed-tools: Bash(npx agent-browser:*), Bash(agent-browser:*)",
|
||||
"---",
|
||||
"",
|
||||
"# agent-browser",
|
||||
"",
|
||||
].join("\n"),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
const state = createInitialState();
|
||||
state.currentLevel = "high";
|
||||
|
||||
try {
|
||||
activateAllowedToolsFromSkillText(
|
||||
`<skill name="agent-browser" location="${skillPath}">\nUse browser.\n</skill>`,
|
||||
);
|
||||
|
||||
assert.equal(
|
||||
await handleBashToolCall(
|
||||
state,
|
||||
"npx agent-browser open https://example.com",
|
||||
{},
|
||||
),
|
||||
undefined,
|
||||
"declared agent-browser command is allowed",
|
||||
);
|
||||
|
||||
const blocked = await handleBashToolCall(state, "npm run build", {});
|
||||
assert.equal(blocked?.block, true);
|
||||
assert.match(
|
||||
blocked?.reason ?? "",
|
||||
/Allowed patterns: \["npx agent-browser:\*","agent-browser:\*"\]/,
|
||||
);
|
||||
} finally {
|
||||
clearAllowedTools();
|
||||
rmSync(dir, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
|
@ -123,19 +123,16 @@ test("resolveAgentEnd resolves a pending runUnit promise", async () => {
|
|||
assert.deepEqual(result.event, event);
|
||||
});
|
||||
|
||||
test("resolveAgentEnd throws when no promise is pending (cold-cache ordering bug)", () => {
|
||||
test("resolveAgentEnd ignores stale event when no promise is pending", () => {
|
||||
_resetPendingResolve();
|
||||
|
||||
// Must throw so the ordering bug surfaces immediately, not via timeout.
|
||||
// Previously this silently dropped events (#race-condition-silent-event-drop).
|
||||
assert.throws(
|
||||
assert.doesNotThrow(
|
||||
() => resolveAgentEnd(makeEvent()),
|
||||
/no pending unit/,
|
||||
"resolveAgentEnd must throw when called before _setCurrentResolve",
|
||||
"stale agent_end must not trip idle watchdog error handling",
|
||||
);
|
||||
});
|
||||
|
||||
test("double resolveAgentEnd only resolves once (second throws)", async () => {
|
||||
test("double resolveAgentEnd only resolves once and ignores duplicate", async () => {
|
||||
_resetPendingResolve();
|
||||
|
||||
const ctx = makeMockCtx();
|
||||
|
|
@ -151,11 +148,10 @@ test("double resolveAgentEnd only resolves once (second throws)", async () => {
|
|||
// First resolve — should work
|
||||
resolveAgentEnd(event1);
|
||||
|
||||
// Second resolve — must throw (no pending resolver after first resolution)
|
||||
assert.throws(
|
||||
// Second resolve — should be ignored after the resolver is consumed
|
||||
assert.doesNotThrow(
|
||||
() => resolveAgentEnd(event2),
|
||||
/no pending unit/,
|
||||
"second resolveAgentEnd must throw after first consumed the resolver",
|
||||
"duplicate agent_end after completion must be ignored",
|
||||
);
|
||||
|
||||
const result = await resultPromise;
|
||||
|
|
|
|||
|
|
@ -145,15 +145,14 @@ describe("workflow catalog registration", () => {
|
|||
assert.ok(labels.includes("model"), "should include model completion");
|
||||
});
|
||||
|
||||
it("autonomous appears in TOP_LEVEL_SUBCOMMANDS and auto remains an alias", () => {
|
||||
it("autonomous appears in TOP_LEVEL_SUBCOMMANDS and auto remains hidden", () => {
|
||||
const autonomous = TOP_LEVEL_SUBCOMMANDS.find(
|
||||
(c) => c.cmd === "autonomous",
|
||||
);
|
||||
const auto = TOP_LEVEL_SUBCOMMANDS.find((c) => c.cmd === "auto");
|
||||
assert.ok(autonomous, "autonomous should be in TOP_LEVEL_SUBCOMMANDS");
|
||||
assert.match(autonomous!.desc, /Autonomous mode/i);
|
||||
assert.ok(auto, "auto alias should remain in TOP_LEVEL_SUBCOMMANDS");
|
||||
assert.match(auto!.desc, /Auto mode/i);
|
||||
assert.equal(auto, undefined, "auto alias should remain hidden from listings");
|
||||
});
|
||||
|
||||
it("getSfArgumentCompletions supports autonomous flags", () => {
|
||||
|
|
|
|||
|
|
@ -708,20 +708,17 @@ describe("Custom engine loop integration", () => {
|
|||
);
|
||||
});
|
||||
|
||||
// ── Cold-cache ordering: resolveAgentEnd before _setCurrentResolve must throw ──
|
||||
// Previously, resolveAgentEnd silently dropped events when called before
|
||||
// _setCurrentResolve (cold vitest cache exposes the ordering window).
|
||||
// The fix makes it throw so the bug surfaces immediately instead of via
|
||||
// timeout (#race-condition-silent-event-drop).
|
||||
it("resolveAgentEnd before _setCurrentResolve throws (cold-cache ordering)", async () => {
|
||||
// ── Cold-cache ordering: stale agent_end before _setCurrentResolve ──
|
||||
// Late/duplicate agent_end events can arrive between unit lifecycles. They
|
||||
// must not throw into watchdog handlers after the current unit is already
|
||||
// resolved or paused.
|
||||
it("resolveAgentEnd before _setCurrentResolve is ignored as stale", async () => {
|
||||
_resetPendingResolve();
|
||||
|
||||
// Simulate the cold-cache race: resolveAgentEnd fires before runUnit registers
|
||||
// its resolver. Must throw so the bug is caught immediately.
|
||||
assert.throws(
|
||||
// Simulate a stale event before runUnit registers its resolver.
|
||||
assert.doesNotThrow(
|
||||
() => resolveAgentEnd({ messages: [{ role: "assistant" }] }),
|
||||
/no pending unit/,
|
||||
"resolveAgentEnd must throw when no resolver is registered yet",
|
||||
"stale agent_end with no resolver should not break auto-mode",
|
||||
);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -30,6 +30,7 @@ import {
|
|||
openDatabase,
|
||||
} from "../sf-db.ts";
|
||||
import { invalidateStateCache } from "../state.ts";
|
||||
import { validateContent } from "../safety/content-validator.ts";
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// Helpers
|
||||
|
|
@ -596,7 +597,7 @@ test("── markdown-renderer: renderPlanFromDb creates parse-compatible slice
|
|||
planning: {
|
||||
goal: "Render slice plans from DB state.",
|
||||
successCriteria:
|
||||
"- Slice plan stays parse-compatible\n- Task plan files are regenerated",
|
||||
"1. Slice plan stays parse-compatible\n2. Task plan files are regenerated",
|
||||
proofLevel: "integration",
|
||||
integrationClosure: "Wires DB planning rows to markdown artifacts.",
|
||||
observabilityImpact:
|
||||
|
|
@ -610,7 +611,8 @@ test("── markdown-renderer: renderPlanFromDb creates parse-compatible slice
|
|||
title: "Render slice plan",
|
||||
status: "pending",
|
||||
planning: {
|
||||
description: "Implement the DB-backed slice plan renderer.",
|
||||
description:
|
||||
"Implement the DB-backed slice plan renderer.\n\n### 1. Add regression coverage\nWrite the failing contract.\n\n### 2. Patch rendering\nKeep generated markdown lint-clean.",
|
||||
estimate: "45m",
|
||||
files: ["src/resources/extensions/sf/markdown-renderer.ts"],
|
||||
verify: "node --test markdown-renderer.test.ts",
|
||||
|
|
@ -654,6 +656,27 @@ test("── markdown-renderer: renderPlanFromDb creates parse-compatible slice
|
|||
);
|
||||
|
||||
const planContent = fs.readFileSync(rendered.planPath, "utf-8");
|
||||
assert.ok(
|
||||
planContent.includes("1. Slice plan stays parse-compatible"),
|
||||
"ordered must-have keeps ordered-list syntax",
|
||||
);
|
||||
assert.ok(
|
||||
!planContent.includes("- 1. Slice plan stays parse-compatible"),
|
||||
"ordered must-have is not wrapped in a bullet",
|
||||
);
|
||||
assert.ok(
|
||||
planContent.includes(
|
||||
" ### 1. Add regression coverage\n\n Write the failing contract.",
|
||||
),
|
||||
"multi-line task description is indented and blank-lined in slice plan",
|
||||
);
|
||||
assert.deepStrictEqual(
|
||||
validateContent("plan-slice", rendered.planPath).filter((v) =>
|
||||
/Markdown lint MD0(?:22|29)\//.test(v.reason),
|
||||
),
|
||||
[],
|
||||
"rendered slice plan has no heading/list lint warnings",
|
||||
);
|
||||
clearAllCaches();
|
||||
const parsedPlan = parsePlan(planContent);
|
||||
assert.strictEqual(
|
||||
|
|
@ -719,11 +742,24 @@ test("── markdown-renderer: renderPlanFromDb creates parse-compatible slice
|
|||
"T01-PLAN.md",
|
||||
);
|
||||
const taskPlanContent = fs.readFileSync(taskPlanPath, "utf-8");
|
||||
assert.ok(
|
||||
taskPlanContent.includes(
|
||||
"### 1. Add regression coverage\n\nWrite the failing contract.",
|
||||
),
|
||||
"task plan task-step headings are surrounded by blank lines",
|
||||
);
|
||||
assert.deepStrictEqual(
|
||||
validateContent("plan-task", taskPlanPath).filter((v) =>
|
||||
v.reason.includes("Markdown lint MD022"),
|
||||
),
|
||||
[],
|
||||
"rendered task plan has no blanks-around-headings lint warnings",
|
||||
);
|
||||
const taskPlanFile = parseTaskPlanFile(taskPlanContent);
|
||||
assert.strictEqual(
|
||||
taskPlanFile.frontmatter.estimated_steps,
|
||||
1,
|
||||
"task plan frontmatter exposes estimated_steps",
|
||||
5,
|
||||
"task plan frontmatter exposes estimated_steps for multiline detail",
|
||||
);
|
||||
assert.strictEqual(
|
||||
taskPlanFile.frontmatter.estimated_files,
|
||||
|
|
|
|||
|
|
@ -0,0 +1,164 @@
|
|||
import assert from "node:assert/strict";
|
||||
import { randomUUID } from "node:crypto";
|
||||
import { mkdirSync, rmSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { test } from "vitest";
|
||||
import {
|
||||
getAutoSession,
|
||||
resetAutoSession,
|
||||
} from "../auto/session.js";
|
||||
import {
|
||||
hasResearchTerminalTransition,
|
||||
markResearchTerminalTransition,
|
||||
} from "../auto.js";
|
||||
|
||||
function makeTmpBase(): string {
|
||||
const base = join(tmpdir(), `sf-research-terminal-${randomUUID()}`);
|
||||
mkdirSync(join(base, ".sf"), { recursive: true });
|
||||
return base;
|
||||
}
|
||||
|
||||
function cleanup(base: string): void {
|
||||
try {
|
||||
rmSync(base, { recursive: true, force: true });
|
||||
} catch {
|
||||
/* swallow */
|
||||
}
|
||||
}
|
||||
|
||||
test("markResearchTerminalTransition sets researchTerminalTransition true", () => {
|
||||
const session = getAutoSession();
|
||||
// Reset to known state
|
||||
session.reset();
|
||||
assert.equal(session.researchTerminalTransition, false);
|
||||
|
||||
markResearchTerminalTransition();
|
||||
|
||||
assert.equal(session.researchTerminalTransition, true);
|
||||
assert.equal(hasResearchTerminalTransition(), true);
|
||||
|
||||
// Cleanup
|
||||
session.reset();
|
||||
});
|
||||
|
||||
test("hasResearchTerminalTransition returns false after reset", () => {
|
||||
const session = getAutoSession();
|
||||
// Reset to known state
|
||||
session.reset();
|
||||
markResearchTerminalTransition();
|
||||
assert.equal(hasResearchTerminalTransition(), true);
|
||||
|
||||
session.reset();
|
||||
|
||||
assert.equal(hasResearchTerminalTransition(), false);
|
||||
});
|
||||
|
||||
test("research terminal transition blocks planning tools", async () => {
|
||||
// This test verifies the integration between:
|
||||
// 1. executeSummarySave returning terminal_transition for RESEARCH
|
||||
// 2. register-hooks tool_call blocking planning tools after terminal transition
|
||||
//
|
||||
// The actual blocking logic lives in register-hooks.ts and is tested here
|
||||
// by simulating the conditions that trigger it.
|
||||
|
||||
const base = makeTmpBase();
|
||||
try {
|
||||
const session = getAutoSession();
|
||||
// Reset to known state
|
||||
session.reset();
|
||||
|
||||
// Simulate a research unit being active
|
||||
session.active = true;
|
||||
session.currentUnit = {
|
||||
type: "research-slice",
|
||||
id: "M001/S01",
|
||||
startedAt: Date.now(),
|
||||
};
|
||||
|
||||
// Before terminal transition, planning should be allowed
|
||||
assert.equal(hasResearchTerminalTransition(), false);
|
||||
|
||||
// Mark terminal transition (as would happen after sf_summary_save RESEARCH)
|
||||
markResearchTerminalTransition();
|
||||
|
||||
// After terminal transition, planning should be blocked
|
||||
assert.equal(hasResearchTerminalTransition(), true);
|
||||
|
||||
// Verify the session state reflects the terminal transition
|
||||
assert.equal(session.researchTerminalTransition, true);
|
||||
assert.equal(session.currentUnit?.type, "research-slice");
|
||||
|
||||
// Cleanup
|
||||
session.reset();
|
||||
} finally {
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("research terminal transition does not block non-planning tools", () => {
|
||||
const session = getAutoSession();
|
||||
// Reset to known state
|
||||
session.reset();
|
||||
|
||||
// Simulate a research unit being active
|
||||
session.active = true;
|
||||
session.currentUnit = {
|
||||
type: "research-slice",
|
||||
id: "M001/S01",
|
||||
startedAt: Date.now(),
|
||||
};
|
||||
|
||||
markResearchTerminalTransition();
|
||||
|
||||
// Non-planning tools should not be blocked by the research terminal transition
|
||||
// (the actual blocking logic only checks planning tools)
|
||||
const nonPlanningTools = [
|
||||
"read",
|
||||
"write",
|
||||
"edit",
|
||||
"bash",
|
||||
"grep",
|
||||
"find",
|
||||
"sf_summary_save",
|
||||
"sf_self_report",
|
||||
];
|
||||
|
||||
// The hasResearchTerminalTransition function just returns state;
|
||||
// the actual tool filtering happens in register-hooks.ts
|
||||
assert.equal(hasResearchTerminalTransition(), true);
|
||||
|
||||
// Cleanup
|
||||
session.reset();
|
||||
});
|
||||
|
||||
test("research terminal transition resets on new unit dispatch", () => {
|
||||
const session = getAutoSession();
|
||||
// Reset to known state
|
||||
session.reset();
|
||||
|
||||
// Simulate first research unit completing
|
||||
session.active = true;
|
||||
session.currentUnit = {
|
||||
type: "research-slice",
|
||||
id: "M001/S01",
|
||||
startedAt: Date.now(),
|
||||
};
|
||||
markResearchTerminalTransition();
|
||||
assert.equal(hasResearchTerminalTransition(), true);
|
||||
|
||||
// Simulate new unit dispatch (research terminal should be reset)
|
||||
// In production, this happens in the auto-loop when dispatching a new unit
|
||||
session.researchTerminalTransition = false;
|
||||
session.currentUnit = {
|
||||
type: "plan-slice",
|
||||
id: "M001/S01",
|
||||
startedAt: Date.now(),
|
||||
};
|
||||
|
||||
assert.equal(hasResearchTerminalTransition(), false);
|
||||
assert.equal(session.currentUnit?.type, "plan-slice");
|
||||
|
||||
// Cleanup
|
||||
session.reset();
|
||||
});
|
||||
|
|
@ -138,6 +138,20 @@ test("workflow-projections: renderPlanContent includes ## Tasks section", () =>
|
|||
assert.ok(content.includes("## Tasks"));
|
||||
});
|
||||
|
||||
test("workflow-projections: renderPlanContent preserves ordered must-haves", () => {
|
||||
const content = renderPlanContent(
|
||||
makeSlice({
|
||||
success_criteria:
|
||||
"1. First ordered outcome\n2. Second ordered outcome",
|
||||
}),
|
||||
[],
|
||||
);
|
||||
|
||||
assert.ok(content.includes("1. First ordered outcome"));
|
||||
assert.ok(content.includes("2. Second ordered outcome"));
|
||||
assert.ok(!content.includes("- 1. First ordered outcome"));
|
||||
});
|
||||
|
||||
test("workflow-projections: renderPlanContent includes top-level verification from task commands", () => {
|
||||
const content = renderPlanContent(makeSlice(), [
|
||||
makeTask({ verify: "npm test src/middleware/auth.test.ts" }),
|
||||
|
|
@ -264,6 +278,20 @@ test("workflow-projections: task with verify renders Verify subline", () => {
|
|||
assert.ok(content.includes(" - Verify: npm test"));
|
||||
});
|
||||
|
||||
test("workflow-projections: task markdown headings stay nested and lint-spaced", () => {
|
||||
const task = makeTask({
|
||||
description:
|
||||
"Implement the contract.\n\n### 1. Add coverage\nWrite the failing test.\n\n### 2. Patch renderer\nKeep markdown valid.",
|
||||
});
|
||||
const content = renderPlanContent(makeSlice(), [task]);
|
||||
|
||||
assert.ok(content.includes("- [ ] **T01: Create JWT middleware**"));
|
||||
assert.ok(
|
||||
content.includes(" ### 1. Add coverage\n\n Write the failing test."),
|
||||
);
|
||||
assert.ok(!content.includes("\n### 1. Add coverage\nWrite"));
|
||||
});
|
||||
|
||||
test("workflow-projections: task with no verify omits Verify subline", () => {
|
||||
const task = makeTask({ verify: "" });
|
||||
const content = renderPlanContent(makeSlice(), [task]);
|
||||
|
|
|
|||
|
|
@ -39,6 +39,86 @@ export function stripIdPrefix(title: string, id: string): string {
|
|||
return result.trim() || title;
|
||||
}
|
||||
|
||||
/**
|
||||
* Render a model-provided list entry without corrupting ordered lists.
|
||||
*
|
||||
* Purpose: projection fallback output remains valid Markdown when planning
|
||||
* rows contain numbered success criteria from the LLM.
|
||||
* Consumer: renderPlanContent when writing PLAN.md projections.
|
||||
*/
|
||||
function renderListEntry(entry: string): string {
|
||||
const trimmed = entry.trim();
|
||||
const orderedBullet = trimmed.match(/^[-*+]\s+(\d+)[.)]\s+(.+)$/);
|
||||
if (orderedBullet) {
|
||||
return `${orderedBullet[1]}. ${orderedBullet[2].trim()}`;
|
||||
}
|
||||
|
||||
const ordered = trimmed.match(/^(\d+)[.)]\s+(.+)$/);
|
||||
if (ordered) {
|
||||
return `${ordered[1]}. ${ordered[2].trim()}`;
|
||||
}
|
||||
|
||||
if (/^[-*+]\s+\S/.test(trimmed)) {
|
||||
return trimmed;
|
||||
}
|
||||
|
||||
return `- ${trimmed}`;
|
||||
}
|
||||
|
||||
/**
|
||||
* Surround ATX headings in model-provided markdown with blank lines.
|
||||
*
|
||||
* Purpose: generated PLAN.md projections pass content validation even when
|
||||
* task descriptions contain LLM-authored step headings.
|
||||
* Consumer: renderPlanContent task entries.
|
||||
*/
|
||||
function normalizeMarkdownBlockSpacing(text: string): string {
|
||||
const sourceLines = text.trim().replace(/\r\n/g, "\n").split("\n");
|
||||
const output: string[] = [];
|
||||
let inFence = false;
|
||||
|
||||
for (let i = 0; i < sourceLines.length; i++) {
|
||||
const line = sourceLines[i];
|
||||
const trimmed = line.trim();
|
||||
const fence = /^(```|~~~)/.test(trimmed);
|
||||
const heading = !inFence && /^#{1,6}\s+\S/.test(trimmed);
|
||||
|
||||
if (heading && output.length > 0 && output[output.length - 1]?.trim()) {
|
||||
output.push("");
|
||||
}
|
||||
|
||||
output.push(line);
|
||||
|
||||
if (fence) {
|
||||
inFence = !inFence;
|
||||
}
|
||||
|
||||
const next = sourceLines[i + 1];
|
||||
if (heading && next !== undefined && next.trim()) {
|
||||
output.push("");
|
||||
}
|
||||
}
|
||||
|
||||
return output.join("\n").trim();
|
||||
}
|
||||
|
||||
/**
|
||||
* Append model-provided markdown as an indented child block.
|
||||
*
|
||||
* Purpose: task description subsections stay nested under their task instead
|
||||
* of becoming top-level PLAN.md headings.
|
||||
* Consumer: renderPlanContent task entries.
|
||||
*/
|
||||
function appendIndentedMarkdownBlock(
|
||||
lines: string[],
|
||||
text: string,
|
||||
indent = " ",
|
||||
): void {
|
||||
for (const line of normalizeMarkdownBlockSpacing(text).split("\n")) {
|
||||
lines.push(line.trim() ? `${indent}${line}` : "");
|
||||
}
|
||||
}
|
||||
|
||||
// ─── PLAN.md Projection ──────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
|
|
@ -70,7 +150,7 @@ export function renderPlanContent(
|
|||
.split(/\n+/)
|
||||
.map((entry) => entry.trim())
|
||||
.filter(Boolean)) {
|
||||
lines.push(line.startsWith("-") ? line : `- ${line}`);
|
||||
lines.push(renderListEntry(line));
|
||||
}
|
||||
} else {
|
||||
lines.push("- Complete the planned slice outcomes.");
|
||||
|
|
@ -198,9 +278,10 @@ export function renderPlanContent(
|
|||
|
||||
for (const task of taskRows) {
|
||||
const checkbox = isClosedStatus(task.status) ? "[x]" : "[ ]";
|
||||
lines.push(
|
||||
`- ${checkbox} **${task.id}: ${task.title}** \u2014 ${task.description}`,
|
||||
);
|
||||
lines.push(`- ${checkbox} **${task.id}: ${task.title}**`);
|
||||
if (task.description.trim()) {
|
||||
appendIndentedMarkdownBlock(lines, task.description);
|
||||
}
|
||||
|
||||
// Estimate subline (always present if non-empty)
|
||||
if (task.estimate) {
|
||||
|
|
|
|||
|
|
@ -143,6 +143,7 @@ export default defineConfig({
|
|||
"src/tests/**/*.test.mjs",
|
||||
"src/resources/extensions/sf/tests/**/*.test.ts",
|
||||
"src/resources/extensions/sf/tests/**/*.test.mjs",
|
||||
"src/resources/extensions/sf-permissions/tests/**/*.test.ts",
|
||||
"src/resources/extensions/shared/tests/**/*.test.ts",
|
||||
"src/resources/extensions/claude-code-cli/tests/**/*.test.ts",
|
||||
"src/resources/extensions/github-sync/tests/**/*.test.ts",
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue