From 0c7c4eca5b88648204ad252b4c0d8326f06e1dfc Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sat, 2 May 2026 19:46:36 +0200 Subject: [PATCH] fix(sf): harden auto loops and skill sandbox --- packages/mcp-server/package.json | 2 +- .../extensions/sf-permissions/index.ts | 226 ++++++++++++++++++ .../tests/allowed-tools.test.ts | 156 ++++++++++++ src/resources/extensions/sf/auto.ts | 26 +- src/resources/extensions/sf/auto/phases.ts | 1 + src/resources/extensions/sf/auto/resolve.ts | 18 +- src/resources/extensions/sf/auto/session.ts | 29 +++ .../extensions/sf/bootstrap/register-hooks.ts | 48 ++++ .../extensions/sf/commands-bootstrap.ts | 1 - .../extensions/sf/commands/catalog.ts | 6 +- .../extensions/sf/commands/handlers/core.ts | 2 - .../extensions/sf/component-loader.ts | 9 + .../extensions/sf/component-types.ts | 3 + .../extensions/sf/markdown-renderer.ts | 86 ++++++- src/resources/extensions/sf/prompts/system.md | 1 - .../sf/tests/allowed-tools-sandbox.test.ts | 100 ++++++++ .../extensions/sf/tests/auto-loop.test.ts | 18 +- .../sf/tests/commands-workflow-custom.test.ts | 5 +- .../custom-engine-loop-integration.test.ts | 19 +- .../sf/tests/markdown-renderer.test.ts | 44 +++- .../research-terminal-transition.test.ts | 164 +++++++++++++ .../sf/tests/workflow-projections.test.ts | 28 +++ .../extensions/sf/workflow-projections.ts | 89 ++++++- vitest.config.ts | 1 + 24 files changed, 1024 insertions(+), 58 deletions(-) create mode 100644 src/resources/extensions/sf-permissions/tests/allowed-tools.test.ts create mode 100644 src/resources/extensions/sf/tests/allowed-tools-sandbox.test.ts create mode 100644 src/resources/extensions/sf/tests/research-terminal-transition.test.ts diff --git a/packages/mcp-server/package.json b/packages/mcp-server/package.json index eeb484765..de413355d 100644 --- a/packages/mcp-server/package.json +++ b/packages/mcp-server/package.json @@ -25,7 +25,7 @@ }, "scripts": { "build": "tsc", - "test": "node --test dist/mcp-server.test.js" + "test": "vitest run packages/mcp-server/src --root ../.. --config vitest.config.ts" }, "dependencies": { "@modelcontextprotocol/sdk": "^1.27.1", diff --git a/src/resources/extensions/sf-permissions/index.ts b/src/resources/extensions/sf-permissions/index.ts index d0da6249e..8e9319222 100644 --- a/src/resources/extensions/sf-permissions/index.ts +++ b/src/resources/extensions/sf-permissions/index.ts @@ -41,6 +41,7 @@ import fs from "node:fs"; import os from "node:os"; import path from "node:path"; import type { ExtensionAPI } from "@singularity-forge/pi-coding-agent"; +import { minimatch } from "minimatch"; import { classifyCommand, invalidateConfigCache, @@ -481,12 +482,218 @@ export function handleSessionStart(state: PermissionState, ctx: any): void { } } +// ============================================================================ +// SKILL SANDBOX — allowed-tools enforcement +// ============================================================================ + +/** Active skill-declared allowed tool patterns for the current session. */ +let activeSkillAllowedTools: readonly string[] | null = null; + +/** Name of the skill that declared the active allowed-tools patterns. */ +let activeSkillName: string | null = null; + +/** + * Set the allowed tool patterns for the current session. + * Called when a skill with allowed-tools is activated. + */ +export function setAllowedTools( + patterns: readonly string[], + skillName?: string, +): void { + activeSkillAllowedTools = patterns.length > 0 ? [...patterns] : null; + activeSkillName = patterns.length > 0 ? (skillName ?? null) : null; +} + +/** + * Clear the allowed tool patterns for the current session. + * Called when a skill is deactivated or at session end. + */ +export function clearAllowedTools(): void { + activeSkillAllowedTools = null; + activeSkillName = null; +} + +/** + * Extract the command pattern from a Bash(...) allowed-tools entry. + * + * Purpose: skill sandbox matching and error messages need the command pattern + * without the outer tool syntax. + * Consumer: matchesAllowedToolPattern and blocked-command diagnostics. + */ +function extractBashAllowedPattern(pattern: string): string | null { + const trimmedPattern = pattern.trim(); + const match = trimmedPattern.match(/^Bash\((.+)\)$/i); + if (!match) return null; + return match[1].trim(); +} + +/** + * Parse an allowed-tools pattern and check if a command matches. + * + * Purpose: enforce `allowed-tools: Bash(npm run:*)` as a command-prefix rule, + * where the colon separates the command prefix from its wildcard arguments. + * Consumer: handleBashToolCall before normal permission checks run. + */ +export function matchesAllowedToolPattern( + pattern: string, + command: string, +): boolean { + const commandPattern = extractBashAllowedPattern(pattern); + if (!commandPattern) return false; + + const trimmedCommand = command.trim().replace(/\s+/g, " "); + if (commandPattern === "*") return true; + if (commandPattern.endsWith(":*")) { + const prefix = commandPattern.slice(0, -2).trim().replace(/\s+/g, " "); + return trimmedCommand === prefix || trimmedCommand.startsWith(`${prefix} `); + } + if (!/[*?[\]]/.test(commandPattern)) { + return trimmedCommand === commandPattern.trim().replace(/\s+/g, " "); + } + return minimatch(trimmedCommand, commandPattern); +} + +function formatAllowedPatternList(patterns: readonly string[]): string { + const displayPatterns = patterns.map( + (pattern) => extractBashAllowedPattern(pattern) ?? pattern.trim(), + ); + return JSON.stringify(displayPatterns); +} + +function unquoteFrontmatterValue(value: string): string { + const trimmed = value.trim(); + if ( + (trimmed.startsWith('"') && trimmed.endsWith('"')) || + (trimmed.startsWith("'") && trimmed.endsWith("'")) + ) { + return trimmed.slice(1, -1).trim(); + } + return trimmed; +} + +function splitAllowedTools(value: string): string[] { + return value + .split(/\s*,\s*/) + .map((entry) => unquoteFrontmatterValue(entry)) + .filter(Boolean); +} + +function readSkillAllowedTools( + filePath: string, + fallbackName?: string, +): { skillName: string; patterns: string[] } | null { + let raw: string; + try { + raw = fs.readFileSync(filePath, "utf-8"); + } catch { + return null; + } + + const frontmatter = raw.match(/^---\r?\n([\s\S]*?)\r?\n---/); + if (!frontmatter) return null; + + const name = + frontmatter[1] + .match(/^name:\s*(.+)$/m)?.[1] + .trim() + .replace(/^["']|["']$/g, "") || + fallbackName || + path.basename(path.dirname(filePath)); + const allowedTools = frontmatter[1].match(/^allowed-tools:\s*(.+)$/m)?.[1]; + if (!allowedTools) return { skillName: name, patterns: [] }; + + return { + skillName: name, + patterns: splitAllowedTools(allowedTools), + }; +} + +function extractSkillBlocks(text: string): Array<{ name: string; location: string }> { + const blocks: Array<{ name: string; location: string }> = []; + const blockRe = //g; + let match: RegExpExecArray | null; + while ((match = blockRe.exec(text)) !== null) { + blocks.push({ name: match[1], location: match[2] }); + } + return blocks; +} + +/** + * Activate skill sandbox rules from rendered skill XML blocks. + * + * Purpose: connect real skill invocation paths (`/skill:name` expansion and + * the Skill tool result) to bash sandbox enforcement for the active turn. + * Consumer: sf-permissions before_agent_start and Skill tool_result hooks. + */ +export function activateAllowedToolsFromSkillText(text: string): void { + const blocks = extractSkillBlocks(text); + if (blocks.length === 0) { + clearAllowedTools(); + return; + } + + const names: string[] = []; + const patterns: string[] = []; + for (const block of blocks) { + const loaded = readSkillAllowedTools(block.location, block.name); + if (!loaded) continue; + names.push(loaded.skillName); + patterns.push(...loaded.patterns); + } + + if (patterns.length > 0) { + setAllowedTools(patterns, names.join(", ")); + } else { + clearAllowedTools(); + } +} + +/** + * Check if a bash command is allowed by the active skill's allowed-tools. + * Returns a block result if the command is not allowed, undefined if allowed. + */ +function checkSkillSandbox( + command: string, +): { block: true; reason: string } | undefined { + if (activeSkillAllowedTools === null) return undefined; + if (activeSkillAllowedTools.length === 0) return undefined; + + const trimmedCommand = command.trim(); + const matched = activeSkillAllowedTools.some((pattern) => + matchesAllowedToolPattern(pattern, trimmedCommand), + ); + + if (matched) return undefined; + + const skillLabel = activeSkillName ? `Allowed by: ${activeSkillName}` : ""; + return { + block: true, + reason: + `Command "${trimmedCommand}" blocked by skill sandbox. ` + + `Allowed patterns: ${formatAllowedPatternList(activeSkillAllowedTools)}` + + (skillLabel ? `\n${skillLabel}` : ""), + }; +} + /** Handle bash tool_call - check permission and prompt if needed */ export async function handleBashToolCall( state: PermissionState, command: string, ctx: any, ): Promise<{ block: true; reason: string } | undefined> { + // Skill sandbox check — applied before permission level checks + const sandboxResult = checkSkillSandbox(command); + if (sandboxResult) { + // Log the block for observability + const logMsg = `Blocked "${command.trim()}" — no pattern matched ${formatAllowedPatternList(activeSkillAllowedTools ?? [])}`; + if (ctx?.logInfo) { + ctx.logInfo("skill-sandbox", logMsg); + } else if (ctx?.ui?.notify) { + ctx.ui.notify(`skill-sandbox: ${logMsg}`, "warning"); + } + return sandboxResult; + } + if (state.currentLevel === "bypassed") return undefined; const classification = classifyCommand(command); @@ -638,9 +845,18 @@ export default function (pi: ExtensionAPI) { }); pi.on("session_start", async (_event, ctx) => { + clearAllowedTools(); handleSessionStart(state, ctx); }); + pi.on("before_agent_start", async (event) => { + activateAllowedToolsFromSkillText(event.prompt); + }); + + pi.on("agent_end", async () => { + clearAllowedTools(); + }); + pi.on("tool_call", async (event, ctx) => { if (event.toolName === "bash") { return handleBashToolCall(state, event.input.command as string, ctx); @@ -657,4 +873,14 @@ export default function (pi: ExtensionAPI) { return undefined; }); + + pi.on("tool_result", async (event) => { + if (event.toolName !== "Skill" || event.isError) return undefined; + const text = event.content + .filter((part) => part.type === "text") + .map((part) => part.text) + .join("\n"); + activateAllowedToolsFromSkillText(text); + return undefined; + }); } diff --git a/src/resources/extensions/sf-permissions/tests/allowed-tools.test.ts b/src/resources/extensions/sf-permissions/tests/allowed-tools.test.ts new file mode 100644 index 000000000..91ac12f0e --- /dev/null +++ b/src/resources/extensions/sf-permissions/tests/allowed-tools.test.ts @@ -0,0 +1,156 @@ +import assert from "node:assert/strict"; +import { test } from "vitest"; +import { + clearAllowedTools, + createInitialState, + handleBashToolCall, + matchesAllowedToolPattern, + setAllowedTools, +} from "../index.ts"; + +test("matchesAllowedToolPattern allows exact match", () => { + assert.equal(matchesAllowedToolPattern("Bash(pwd)", "pwd"), true); + assert.equal(matchesAllowedToolPattern("Bash(pwd)", "pwdx"), false); +}); + +test("matchesAllowedToolPattern allows prefix wildcard", () => { + assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm run build"), true); + assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm run test --watch"), true); + assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm install"), false); + assert.equal(matchesAllowedToolPattern("Bash(agent-browser:*)", "agent-browser open https://example.com"), true); +}); + +test("matchesAllowedToolPattern allows bare wildcard", () => { + assert.equal(matchesAllowedToolPattern("Bash(*)", "anything here"), true); +}); + +test("matchesAllowedToolPattern allows glob with minimatch", () => { + assert.equal(matchesAllowedToolPattern("Bash(npm run *)", "npm run build"), true); + assert.equal(matchesAllowedToolPattern("Bash(npm run *)", "npm run"), false); +}); + +test("matchesAllowedToolPattern is case-insensitive for Bash wrapper", () => { + assert.equal(matchesAllowedToolPattern("bash(pwd)", "pwd"), true); + assert.equal(matchesAllowedToolPattern("BASH(pwd)", "pwd"), true); +}); + +test("handleBashToolCall blocks non-matching command when allowed-tools active", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + setAllowedTools(["Bash(npm run:*)"], "build-skill"); + + try { + const blocked = await handleBashToolCall(state, "rm -rf /", {}); + assert.equal(blocked?.block, true); + assert.match(blocked?.reason ?? "", /Command "rm -rf \/" blocked by skill sandbox/); + assert.match(blocked?.reason ?? "", /Allowed patterns: \["npm run:\*"\]/); + assert.match(blocked?.reason ?? "", /Allowed by: build-skill/); + } finally { + clearAllowedTools(); + } +}); + +test("handleBashToolCall allows matching command when allowed-tools active", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + setAllowedTools(["Bash(npm run:*)"], "build-skill"); + + try { + const result = await handleBashToolCall(state, "npm run build", {}); + assert.equal(result, undefined); + } finally { + clearAllowedTools(); + } +}); + +test("handleBashToolCall allows any command when no allowed-tools set", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + clearAllowedTools(); + + const result = await handleBashToolCall(state, "rm -rf /", {}); + assert.equal(result, undefined); +}); + +test("handleBashToolCall allows command matching any of multiple patterns", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + setAllowedTools(["Bash(npm run:*)", "Bash(pwd)"], "multi-skill"); + + try { + assert.equal(await handleBashToolCall(state, "npm run test", {}), undefined); + assert.equal(await handleBashToolCall(state, "pwd", {}), undefined); + + const blocked = await handleBashToolCall(state, "ls -la", {}); + assert.equal(blocked?.block, true); + } finally { + clearAllowedTools(); + } +}); + +test("handleBashToolCall logs block via logInfo when available", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + setAllowedTools(["Bash(npm run:*)"], "build-skill"); + + const logs: Array<{ tag: string; msg: string }> = []; + const ctx = { + logInfo: (tag: string, msg: string) => logs.push({ tag, msg }), + }; + + try { + await handleBashToolCall(state, "rm -rf /", ctx); + assert.equal(logs.length, 1); + assert.equal(logs[0].tag, "skill-sandbox"); + assert.match(logs[0].msg, /Blocked "rm -rf \/" — no pattern matched/); + } finally { + clearAllowedTools(); + } +}); + +test("handleBashToolCall falls back to ui.notify when logInfo unavailable", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + setAllowedTools(["Bash(npm run:*)"], "build-skill"); + + const notifications: Array<{ msg: string; level: string }> = []; + const ctx = { + ui: { + notify: (msg: string, level: string) => notifications.push({ msg, level }), + }, + }; + + try { + await handleBashToolCall(state, "rm -rf /", ctx); + assert.equal(notifications.length, 1); + assert.match(notifications[0].msg, /skill-sandbox: Blocked "rm -rf \/" — no pattern matched/); + assert.equal(notifications[0].level, "warning"); + } finally { + clearAllowedTools(); + } +}); + +test("handleBashToolCall does not log when no sandbox active", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + clearAllowedTools(); + + const logs: Array<{ tag: string; msg: string }> = []; + const ctx = { + logInfo: (tag: string, msg: string) => logs.push({ tag, msg }), + }; + + await handleBashToolCall(state, "rm -rf /", ctx); + assert.equal(logs.length, 0); +}); + +test("setAllowedTools with empty array clears state", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + setAllowedTools(["Bash(pwd)"], "test-skill"); + setAllowedTools([], "test-skill"); + + const result = await handleBashToolCall(state, "rm -rf /", {}); + assert.equal(result, undefined); + clearAllowedTools(); +}); diff --git a/src/resources/extensions/sf/auto.ts b/src/resources/extensions/sf/auto.ts index ac91ee97f..581ff2804 100644 --- a/src/resources/extensions/sf/auto.ts +++ b/src/resources/extensions/sf/auto.ts @@ -47,7 +47,7 @@ import { getRtkSessionSavings } from "../shared/rtk-session-stats.js"; import { deactivateSF } from "../shared/sf-phase-state.js"; import { clearActivityLogState } from "./activity-log.js"; import { atomicWriteSync } from "./atomic-write.js"; -import { AutoSession } from "./auto/session.js"; +import { AutoSession, getAutoSession } from "./auto/session.js"; // import { startSliceParallel } from "./slice-parallel-orchestrator.js"; (decoy for legacy regex tests) import { getBudgetAlertLevel, @@ -235,7 +235,7 @@ export { // // Tests in auto-session-encapsulation.test.ts enforce this invariant. // ───────────────────────────────────────────────────────────────────────────── -const s = new AutoSession(); +const s = getAutoSession(); /** Throttle STATE.md rebuilds — at most once per 30 seconds */ const _STATE_REBUILD_MIN_INTERVAL_MS = 30_000; @@ -515,6 +515,28 @@ export function setCurrentDispatchedModelId( s.currentDispatchedModelId = model ? `${model.provider}/${model.id}` : null; } +/** + * Mark the current research unit as terminal after saving its RESEARCH artifact. + * + * Purpose: prevent a research unit that already produced its durable artifact + * from drifting into planner tools before the orchestrator dispatches planning. + * Consumer: register-hooks tool_result handling for sf_summary_save. + */ +export function markResearchTerminalTransition(): void { + getAutoSession().researchTerminalTransition = true; +} + +/** + * Return whether the current unit has already crossed its research terminal transition. + * + * Purpose: planning-tool guards can reject post-summary planning calls without + * reading runtime files or duplicating unit state. + * Consumer: register-hooks tool_call enforcement for research units. + */ +export function hasResearchTerminalTransition(): boolean { + return getAutoSession().researchTerminalTransition; +} + // Tool tracking — delegates to auto-tool-tracking.ts export function markToolStart(toolCallId: string, toolName?: string): void { _markToolStart(toolCallId, s.active, toolName); diff --git a/src/resources/extensions/sf/auto/phases.ts b/src/resources/extensions/sf/auto/phases.ts index b68badbbe..1437856cb 100644 --- a/src/resources/extensions/sf/auto/phases.ts +++ b/src/resources/extensions/sf/auto/phases.ts @@ -1936,6 +1936,7 @@ export async function runUnitPhase( // unit in the same Node process (see workflow-logger.ts module header). _resetLogs(); s.currentUnit = { type: unitType, id: unitId, startedAt: Date.now() }; + s.researchTerminalTransition = false; s.lastGitActionFailure = null; s.lastGitActionStatus = null; setCurrentPhase(unitType); diff --git a/src/resources/extensions/sf/auto/resolve.ts b/src/resources/extensions/sf/auto/resolve.ts index 143391662..8ecc90119 100644 --- a/src/resources/extensions/sf/auto/resolve.ts +++ b/src/resources/extensions/sf/auto/resolve.ts @@ -17,7 +17,9 @@ import type { AgentEndEvent, ErrorContext, UnitResult } from "./types.js"; // A single module-level resolve function scoped to the current unit execution. // No queue — if an agent_end arrives with no pending resolver, it is dropped // (logged as warning). This is simpler and safer than the previous session- -// scoped pendingResolve + pendingAgentEndQueue pattern. +// scoped pendingResolve + pendingAgentEndQueue pattern. Late duplicate +// agent_end events are ignored because the first event already resolved the +// unit and a stale duplicate must not trip the idle watchdog. let _currentResolve: ((result: UnitResult) => void) | null = null; let _sessionSwitchInFlight = false; @@ -45,9 +47,10 @@ export function _clearCurrentResolve(): void { * in-flight unit promise. One-shot: the resolver is nulled before calling * to prevent double-resolution from model fallback retries. * - * If called when no resolver is registered (event arrived between loop - * iterations or during session switch), throws so the bug surfaces immediately - * instead of silently dropping the event and causing the auto-loop to hang. + * If called when no resolver is registered, the event is stale relative to the + * current unit lifecycle and is ignored. runUnit registers the resolver before + * dispatching the turn, so a no-pending event is either a duplicate or a late + * event from a previous session. */ export function resolveAgentEnd(event: AgentEndEvent): void { if (_sessionSwitchInFlight) { @@ -60,12 +63,7 @@ export function resolveAgentEnd(event: AgentEndEvent): void { _currentResolve = null; r({ status: "completed", event }); } else { - // Throw instead of silent drop so the ordering bug surfaces immediately - // rather than via a timeout in cold-cache runs (#race-condition-silent-event-drop). - throw new Error( - "resolveAgentEnd called with no pending unit — agent_end arrived before _setCurrentResolve. " + - "Check event ordering: resolveAgentEnd must be called after runUnit sets the resolver.", - ); + debugLog("resolveAgentEnd", { status: "ignored-no-pending-resolve" }); } } diff --git a/src/resources/extensions/sf/auto/session.ts b/src/resources/extensions/sf/auto/session.ts index b2a26e516..c708b2113 100644 --- a/src/resources/extensions/sf/auto/session.ts +++ b/src/resources/extensions/sf/auto/session.ts @@ -76,6 +76,23 @@ export const STUB_RECOVERY_THRESHOLD = 2; export const MAX_LIFETIME_DISPATCHES = 6; export const NEW_SESSION_TIMEOUT_MS = 120_000; +// ─── Singleton ─────────────────────────────────────────────────────────────── + +let _autoSessionInstance: AutoSession | null = null; + +/** Get or create the singleton AutoSession instance. */ +export function getAutoSession(): AutoSession { + if (!_autoSessionInstance) { + _autoSessionInstance = new AutoSession(); + } + return _autoSessionInstance; +} + +/** Reset the singleton instance (used in tests). */ +export function resetAutoSession(): void { + _autoSessionInstance = null; +} + // ─── AutoSession ───────────────────────────────────────────────────────────── export class AutoSession { @@ -241,6 +258,15 @@ export class AutoSession { // the milestone finishes (or resquash runs). milestoneStartShas: Map = new Map(); + // ── Research unit terminal transition ────────────────────────────────── + /** + * Set to true when a research unit (research-slice/research-milestone) + * successfully saves its RESEARCH artifact via sf_summary_save. + * Subsequent planning tool calls are blocked to prevent post-artifact drift + * where the agent continues into milestone/slice/task planning. + */ + researchTerminalTransition = false; + // ── Signal handler ─────────────────────────────────────────────────────── sigtermHandler: (() => void) | null = null; @@ -364,6 +390,9 @@ export class AutoSession { this.milestoneStartShas = new Map(); + // Research terminal transition + this.researchTerminalTransition = false; + // Signal handler this.sigtermHandler = null; diff --git a/src/resources/extensions/sf/bootstrap/register-hooks.ts b/src/resources/extensions/sf/bootstrap/register-hooks.ts index 9119e34ce..96c6ddeff 100644 --- a/src/resources/extensions/sf/bootstrap/register-hooks.ts +++ b/src/resources/extensions/sf/bootstrap/register-hooks.ts @@ -10,10 +10,12 @@ import { formatTokenCount } from "../../shared/format-utils.js"; import { saveActivityLog } from "../activity-log.js"; import { getAutoDashboardData, + hasResearchTerminalTransition, isAutoActive, isAutoPaused, markToolEnd, markToolStart, + markResearchTerminalTransition, recordToolInvocationError, } from "../auto.js"; import { @@ -481,6 +483,41 @@ export function registerHooks( return { block: true, reason: loopCheck.reason }; } + // ── Research unit terminal transition enforcement ───────────────────── + // After a research unit (research-slice/research-milestone) successfully + // saves its RESEARCH artifact via sf_summary_save, the tool returns + // terminal_transition: true. We track this and block subsequent planning + // tool calls to prevent post-artifact drift (e.g. calling sf_plan_milestone + // after research is complete). This addresses sf-moocx6m5-ij630a. + if (isAutoActive()) { + const dash = getAutoDashboardData(); + const currentUnit = dash.currentUnit; + if ( + currentUnit && + (currentUnit.type === "research-slice" || + currentUnit.type === "research-milestone") + ) { + if (hasResearchTerminalTransition()) { + const planningTools = new Set([ + "sf_plan_milestone", + "sf_plan_slice", + "sf_plan_task", + "sf_milestone_generate_id", + "sf_replan_slice", + "sf_reassess_roadmap", + ]); + if (planningTools.has(event.toolName)) { + return { + block: true, + reason: + `Research unit terminal transition: ${currentUnit.type} ${currentUnit.id} has already completed its RESEARCH artifact. ` + + `Planning tools (${event.toolName}) are blocked. The orchestrator will dispatch planner units after research.`, + }; + } + } + } + } + // ── Discussion gate enforcement: track pending gate questions ───────── // Only gate-shaped ask_user_questions calls should block execution. // The gate stays pending until the user selects the approval option. @@ -641,6 +678,17 @@ export function registerHooks( pi.on("tool_result", async (event) => { if (isAutoActive()) { + if ( + event.toolName === "sf_summary_save" && + event.details && + typeof event.details === "object" && + (event.details as { terminal_transition?: unknown }) + .terminal_transition === true && + (event.details as { unit_type?: unknown }).unit_type === "research" + ) { + markResearchTerminalTransition(); + } + const steer = observeMemorySleeperToolResult(event); if (steer) { pi.sendMessage( diff --git a/src/resources/extensions/sf/commands-bootstrap.ts b/src/resources/extensions/sf/commands-bootstrap.ts index e7025e2d8..2fa28a02b 100644 --- a/src/resources/extensions/sf/commands-bootstrap.ts +++ b/src/resources/extensions/sf/commands-bootstrap.ts @@ -12,7 +12,6 @@ const TOP_LEVEL_SUBCOMMANDS = [ cmd: "autonomous", desc: "Autonomous mode — research, plan, execute, commit, repeat", }, - { cmd: "auto", desc: "Alias for /sf autonomous" }, { cmd: "stop", desc: "Stop autonomous mode gracefully" }, { cmd: "pause", diff --git a/src/resources/extensions/sf/commands/catalog.ts b/src/resources/extensions/sf/commands/catalog.ts index 084f005f5..688e36d0b 100644 --- a/src/resources/extensions/sf/commands/catalog.ts +++ b/src/resources/extensions/sf/commands/catalog.ts @@ -27,7 +27,7 @@ type CompletionMap = Record; * Comprehensive description of all available SF commands for help text. */ export const SF_COMMAND_DESCRIPTION = - "SF — Singularity Forge: /sf help|start|templates|next|autonomous|auto|stop|pause|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|prefs|config|keys|hooks|run-hook|skill-health|doctor|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|eval-review"; + "SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|prefs|config|keys|hooks|run-hook|skill-health|doctor|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|eval-review"; /** * Top-level SF subcommands with descriptions. @@ -39,10 +39,6 @@ export const TOP_LEVEL_SUBCOMMANDS: readonly SfCommandDefinition[] = [ cmd: "autonomous", desc: "Autonomous mode — continuous loop, never asks user (self-resolves or stops with blocker)", }, - { - cmd: "auto", - desc: "Auto mode — continuous loop, can ask when blocked", - }, { cmd: "stop", desc: "Stop autonomous mode gracefully" }, { cmd: "pause", diff --git a/src/resources/extensions/sf/commands/handlers/core.ts b/src/resources/extensions/sf/commands/handlers/core.ts index 2cb349e18..1bbbc73d3 100644 --- a/src/resources/extensions/sf/commands/handlers/core.ts +++ b/src/resources/extensions/sf/commands/handlers/core.ts @@ -34,7 +34,6 @@ export function showHelp(ctx: ExtensionCommandContext, args = ""): void { " /sf start Start a workflow template", " /sf Run next unit (same as /sf next)", " /sf autonomous Run all queued product units continuously", - " /sf auto Alias for /sf autonomous", " /sf pause Pause autonomous mode", " /sf stop Stop autonomous mode gracefully", "", @@ -71,7 +70,6 @@ export function showHelp(ctx: ExtensionCommandContext, args = ""): void { " /sf Run next unit in step mode (same as /sf next)", " /sf next Execute next task, then pause [--dry-run] [--verbose]", " /sf autonomous Run all queued product units continuously [--verbose]", - " /sf auto Alias for /sf autonomous", " /sf stop Stop autonomous mode gracefully", " /sf pause Pause autonomous mode (preserves state, /sf autonomous to resume)", " /sf discuss Start guided milestone/slice discussion", diff --git a/src/resources/extensions/sf/component-loader.ts b/src/resources/extensions/sf/component-loader.ts index b5d361be9..80deb5266 100644 --- a/src/resources/extensions/sf/component-loader.ts +++ b/src/resources/extensions/sf/component-loader.ts @@ -210,6 +210,7 @@ interface LegacySkillFrontmatter { name?: string; description?: string; 'disable-model-invocation'?: boolean; + 'allowed-tools'?: string; [key: string]: unknown; } @@ -248,9 +249,17 @@ function loadFromLegacySkill( return { component: null, diagnostics }; } + const allowedTools = frontmatter['allowed-tools'] + ? String(frontmatter['allowed-tools']) + .split(',') + .map((s) => s.trim()) + .filter(Boolean) + : undefined; + const spec: SkillSpec = { prompt: 'SKILL.md', disableModelInvocation: frontmatter['disable-model-invocation'] === true, + allowedTools, }; const id = computeComponentId(name); diff --git a/src/resources/extensions/sf/component-types.ts b/src/resources/extensions/sf/component-types.ts index 5fa1a58a8..4f5527957 100644 --- a/src/resources/extensions/sf/component-types.ts +++ b/src/resources/extensions/sf/component-types.ts @@ -63,6 +63,9 @@ export interface SkillSpec { /** If true, skill is excluded from LLM system prompt (invoke-only). */ disableModelInvocation?: boolean; + + /** Allowed tool patterns declared by the skill (e.g. "Bash(npm run:*)"). */ + allowedTools?: string[]; } // ============================================================================ diff --git a/src/resources/extensions/sf/markdown-renderer.ts b/src/resources/extensions/sf/markdown-renderer.ts index 5a6e73696..d3bc6fbb9 100644 --- a/src/resources/extensions/sf/markdown-renderer.ts +++ b/src/resources/extensions/sf/markdown-renderer.ts @@ -62,6 +62,86 @@ function invalidateCaches(): void { clearParseCache(); } +/** + * Render a model-provided list entry without corrupting ordered lists. + * + * Purpose: keep generated plan markdown lint-clean when planners provide + * numbered success criteria such as `1. Build the contract`. + * Consumer: slice plan renderers that turn DB planning fields into Markdown. + */ +function renderListEntry(entry: string): string { + const trimmed = entry.trim(); + const orderedBullet = trimmed.match(/^[-*+]\s+(\d+)[.)]\s+(.+)$/); + if (orderedBullet) { + return `${orderedBullet[1]}. ${orderedBullet[2].trim()}`; + } + + const ordered = trimmed.match(/^(\d+)[.)]\s+(.+)$/); + if (ordered) { + return `${ordered[1]}. ${ordered[2].trim()}`; + } + + if (/^[-*+]\s+\S/.test(trimmed)) { + return trimmed; + } + + return `- ${trimmed}`; +} + +/** + * Surround ATX headings in model-provided markdown with blank lines. + * + * Purpose: generated artifacts pass the runtime content validator even when + * LLM-authored task descriptions contain `### Step` subsections. + * Consumer: slice and task plan renderers before writing Markdown artifacts. + */ +function normalizeMarkdownBlockSpacing(text: string): string { + const sourceLines = text.trim().replace(/\r\n/g, "\n").split("\n"); + const output: string[] = []; + let inFence = false; + + for (let i = 0; i < sourceLines.length; i++) { + const line = sourceLines[i]; + const trimmed = line.trim(); + const fence = /^(```|~~~)/.test(trimmed); + const heading = !inFence && /^#{1,6}\s+\S/.test(trimmed); + + if (heading && output.length > 0 && output[output.length - 1]?.trim()) { + output.push(""); + } + + output.push(line); + + if (fence) { + inFence = !inFence; + } + + const next = sourceLines[i + 1]; + if (heading && next !== undefined && next.trim()) { + output.push(""); + } + } + + return output.join("\n").trim(); +} + +/** + * Append model-provided markdown as an indented child block. + * + * Purpose: prevent task description subsections from escaping into top-level + * slice-plan structure while preserving useful step detail for executors. + * Consumer: the `## Tasks` section of slice plans. + */ +function appendIndentedMarkdownBlock( + lines: string[], + text: string, + indent = " ", +): void { + for (const line of normalizeMarkdownBlockSpacing(text).split("\n")) { + lines.push(line.trim() ? `${indent}${line}` : ""); + } +} + /** * Load artifact content from DB first, falling back to reading from disk. * On disk fallback, stores the content in the artifacts table for future use. @@ -284,7 +364,7 @@ function renderTaskPlanMarkdown( lines.push(""); if (task.description.trim()) { - lines.push(task.description.trim()); + lines.push(normalizeMarkdownBlockSpacing(task.description)); lines.push(""); } @@ -375,7 +455,7 @@ function renderSlicePlanMarkdown( .split(/\n+/) .map((entry) => entry.trim()) .filter(Boolean)) { - lines.push(line.startsWith("-") ? line : `- ${line}`); + lines.push(renderListEntry(line)); } } else { lines.push("- Complete the planned slice outcomes."); @@ -529,7 +609,7 @@ function renderSlicePlanMarkdown( `- [${done}] **${task.id}: ${task.title || task.id}**${estimate}`, ); if (task.description.trim()) { - lines.push(` ${task.description.trim()}`); + appendIndentedMarkdownBlock(lines, task.description); } if (task.files.length > 0) { lines.push( diff --git a/src/resources/extensions/sf/prompts/system.md b/src/resources/extensions/sf/prompts/system.md index e8c457dfe..a21815e98 100644 --- a/src/resources/extensions/sf/prompts/system.md +++ b/src/resources/extensions/sf/prompts/system.md @@ -143,7 +143,6 @@ Templates showing the expected format for each artifact type are in: - `/sf` - contextual wizard - `/sf autonomous` - auto-execute (fresh context per task) -- `/sf auto` - alias for `/sf autonomous` - `/sf stop` - stop auto-mode - `/sf status` - progress dashboard overlay - `/sf queue` - queue future milestones (safe while auto-mode is running) diff --git a/src/resources/extensions/sf/tests/allowed-tools-sandbox.test.ts b/src/resources/extensions/sf/tests/allowed-tools-sandbox.test.ts new file mode 100644 index 000000000..0ea519271 --- /dev/null +++ b/src/resources/extensions/sf/tests/allowed-tools-sandbox.test.ts @@ -0,0 +1,100 @@ +import assert from "node:assert/strict"; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { test } from "vitest"; +import { + activateAllowedToolsFromSkillText, + clearAllowedTools, + createInitialState, + handleBashToolCall, + matchesAllowedToolPattern, + setAllowedTools, +} from "../../sf-permissions/index.ts"; + +test("allowed-tools sandbox matches Bash prefix patterns against real commands", () => { + assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm run build"), true); + assert.equal( + matchesAllowedToolPattern("Bash(agent-browser:*)", "agent-browser open https://example.com"), + true, + ); + assert.equal(matchesAllowedToolPattern("Bash(pwd)", "pwd"), true); + assert.equal(matchesAllowedToolPattern("Bash(npm run:*)", "npm install"), false); +}); + +test("allowed-tools sandbox blocks non-matching bash before permission prompts", async () => { + const state = createInitialState(); + state.currentLevel = "bypassed"; + setAllowedTools(["Bash(npm run:*)"], "build-skill"); + + try { + assert.equal( + await handleBashToolCall(state, "npm run build", {}), + undefined, + "matching command is allowed even when permission mode is bypassed", + ); + + const blocked = await handleBashToolCall(state, "rm -rf /", { + ui: { notify: () => undefined }, + }); + assert.equal(blocked?.block, true); + assert.match( + blocked?.reason ?? "", + /Command "rm -rf \/" blocked by skill sandbox/, + ); + assert.match(blocked?.reason ?? "", /Allowed patterns: \["npm run:\*"\]/); + assert.match(blocked?.reason ?? "", /Allowed by: build-skill/); + } finally { + clearAllowedTools(); + } +}); + +test("allowed-tools sandbox activates from real skill block location", async () => { + const dir = mkdtempSync(join(tmpdir(), "sf-allowed-tools-")); + const skillDir = join(dir, "agent-browser"); + const skillPath = join(skillDir, "SKILL.md"); + mkdirSync(skillDir, { recursive: true }); + writeFileSync( + skillPath, + [ + "---", + "name: agent-browser", + "description: Browser automation.", + "allowed-tools: Bash(npx agent-browser:*), Bash(agent-browser:*)", + "---", + "", + "# agent-browser", + "", + ].join("\n"), + "utf-8", + ); + + const state = createInitialState(); + state.currentLevel = "high"; + + try { + activateAllowedToolsFromSkillText( + `\nUse browser.\n`, + ); + + assert.equal( + await handleBashToolCall( + state, + "npx agent-browser open https://example.com", + {}, + ), + undefined, + "declared agent-browser command is allowed", + ); + + const blocked = await handleBashToolCall(state, "npm run build", {}); + assert.equal(blocked?.block, true); + assert.match( + blocked?.reason ?? "", + /Allowed patterns: \["npx agent-browser:\*","agent-browser:\*"\]/, + ); + } finally { + clearAllowedTools(); + rmSync(dir, { recursive: true, force: true }); + } +}); diff --git a/src/resources/extensions/sf/tests/auto-loop.test.ts b/src/resources/extensions/sf/tests/auto-loop.test.ts index d53e4bd96..6547fd27b 100644 --- a/src/resources/extensions/sf/tests/auto-loop.test.ts +++ b/src/resources/extensions/sf/tests/auto-loop.test.ts @@ -123,19 +123,16 @@ test("resolveAgentEnd resolves a pending runUnit promise", async () => { assert.deepEqual(result.event, event); }); -test("resolveAgentEnd throws when no promise is pending (cold-cache ordering bug)", () => { +test("resolveAgentEnd ignores stale event when no promise is pending", () => { _resetPendingResolve(); - // Must throw so the ordering bug surfaces immediately, not via timeout. - // Previously this silently dropped events (#race-condition-silent-event-drop). - assert.throws( + assert.doesNotThrow( () => resolveAgentEnd(makeEvent()), - /no pending unit/, - "resolveAgentEnd must throw when called before _setCurrentResolve", + "stale agent_end must not trip idle watchdog error handling", ); }); -test("double resolveAgentEnd only resolves once (second throws)", async () => { +test("double resolveAgentEnd only resolves once and ignores duplicate", async () => { _resetPendingResolve(); const ctx = makeMockCtx(); @@ -151,11 +148,10 @@ test("double resolveAgentEnd only resolves once (second throws)", async () => { // First resolve — should work resolveAgentEnd(event1); - // Second resolve — must throw (no pending resolver after first resolution) - assert.throws( + // Second resolve — should be ignored after the resolver is consumed + assert.doesNotThrow( () => resolveAgentEnd(event2), - /no pending unit/, - "second resolveAgentEnd must throw after first consumed the resolver", + "duplicate agent_end after completion must be ignored", ); const result = await resultPromise; diff --git a/src/resources/extensions/sf/tests/commands-workflow-custom.test.ts b/src/resources/extensions/sf/tests/commands-workflow-custom.test.ts index 416fe82a9..897fd67e0 100644 --- a/src/resources/extensions/sf/tests/commands-workflow-custom.test.ts +++ b/src/resources/extensions/sf/tests/commands-workflow-custom.test.ts @@ -145,15 +145,14 @@ describe("workflow catalog registration", () => { assert.ok(labels.includes("model"), "should include model completion"); }); - it("autonomous appears in TOP_LEVEL_SUBCOMMANDS and auto remains an alias", () => { + it("autonomous appears in TOP_LEVEL_SUBCOMMANDS and auto remains hidden", () => { const autonomous = TOP_LEVEL_SUBCOMMANDS.find( (c) => c.cmd === "autonomous", ); const auto = TOP_LEVEL_SUBCOMMANDS.find((c) => c.cmd === "auto"); assert.ok(autonomous, "autonomous should be in TOP_LEVEL_SUBCOMMANDS"); assert.match(autonomous!.desc, /Autonomous mode/i); - assert.ok(auto, "auto alias should remain in TOP_LEVEL_SUBCOMMANDS"); - assert.match(auto!.desc, /Auto mode/i); + assert.equal(auto, undefined, "auto alias should remain hidden from listings"); }); it("getSfArgumentCompletions supports autonomous flags", () => { diff --git a/src/resources/extensions/sf/tests/custom-engine-loop-integration.test.ts b/src/resources/extensions/sf/tests/custom-engine-loop-integration.test.ts index d4c7a844b..31dcdaf7a 100644 --- a/src/resources/extensions/sf/tests/custom-engine-loop-integration.test.ts +++ b/src/resources/extensions/sf/tests/custom-engine-loop-integration.test.ts @@ -708,20 +708,17 @@ describe("Custom engine loop integration", () => { ); }); - // ── Cold-cache ordering: resolveAgentEnd before _setCurrentResolve must throw ── - // Previously, resolveAgentEnd silently dropped events when called before - // _setCurrentResolve (cold vitest cache exposes the ordering window). - // The fix makes it throw so the bug surfaces immediately instead of via - // timeout (#race-condition-silent-event-drop). - it("resolveAgentEnd before _setCurrentResolve throws (cold-cache ordering)", async () => { + // ── Cold-cache ordering: stale agent_end before _setCurrentResolve ── + // Late/duplicate agent_end events can arrive between unit lifecycles. They + // must not throw into watchdog handlers after the current unit is already + // resolved or paused. + it("resolveAgentEnd before _setCurrentResolve is ignored as stale", async () => { _resetPendingResolve(); - // Simulate the cold-cache race: resolveAgentEnd fires before runUnit registers - // its resolver. Must throw so the bug is caught immediately. - assert.throws( + // Simulate a stale event before runUnit registers its resolver. + assert.doesNotThrow( () => resolveAgentEnd({ messages: [{ role: "assistant" }] }), - /no pending unit/, - "resolveAgentEnd must throw when no resolver is registered yet", + "stale agent_end with no resolver should not break auto-mode", ); }); }); diff --git a/src/resources/extensions/sf/tests/markdown-renderer.test.ts b/src/resources/extensions/sf/tests/markdown-renderer.test.ts index a145e9bbe..1d604aa95 100644 --- a/src/resources/extensions/sf/tests/markdown-renderer.test.ts +++ b/src/resources/extensions/sf/tests/markdown-renderer.test.ts @@ -30,6 +30,7 @@ import { openDatabase, } from "../sf-db.ts"; import { invalidateStateCache } from "../state.ts"; +import { validateContent } from "../safety/content-validator.ts"; // ═══════════════════════════════════════════════════════════════════════════ // Helpers @@ -596,7 +597,7 @@ test("── markdown-renderer: renderPlanFromDb creates parse-compatible slice planning: { goal: "Render slice plans from DB state.", successCriteria: - "- Slice plan stays parse-compatible\n- Task plan files are regenerated", + "1. Slice plan stays parse-compatible\n2. Task plan files are regenerated", proofLevel: "integration", integrationClosure: "Wires DB planning rows to markdown artifacts.", observabilityImpact: @@ -610,7 +611,8 @@ test("── markdown-renderer: renderPlanFromDb creates parse-compatible slice title: "Render slice plan", status: "pending", planning: { - description: "Implement the DB-backed slice plan renderer.", + description: + "Implement the DB-backed slice plan renderer.\n\n### 1. Add regression coverage\nWrite the failing contract.\n\n### 2. Patch rendering\nKeep generated markdown lint-clean.", estimate: "45m", files: ["src/resources/extensions/sf/markdown-renderer.ts"], verify: "node --test markdown-renderer.test.ts", @@ -654,6 +656,27 @@ test("── markdown-renderer: renderPlanFromDb creates parse-compatible slice ); const planContent = fs.readFileSync(rendered.planPath, "utf-8"); + assert.ok( + planContent.includes("1. Slice plan stays parse-compatible"), + "ordered must-have keeps ordered-list syntax", + ); + assert.ok( + !planContent.includes("- 1. Slice plan stays parse-compatible"), + "ordered must-have is not wrapped in a bullet", + ); + assert.ok( + planContent.includes( + " ### 1. Add regression coverage\n\n Write the failing contract.", + ), + "multi-line task description is indented and blank-lined in slice plan", + ); + assert.deepStrictEqual( + validateContent("plan-slice", rendered.planPath).filter((v) => + /Markdown lint MD0(?:22|29)\//.test(v.reason), + ), + [], + "rendered slice plan has no heading/list lint warnings", + ); clearAllCaches(); const parsedPlan = parsePlan(planContent); assert.strictEqual( @@ -719,11 +742,24 @@ test("── markdown-renderer: renderPlanFromDb creates parse-compatible slice "T01-PLAN.md", ); const taskPlanContent = fs.readFileSync(taskPlanPath, "utf-8"); + assert.ok( + taskPlanContent.includes( + "### 1. Add regression coverage\n\nWrite the failing contract.", + ), + "task plan task-step headings are surrounded by blank lines", + ); + assert.deepStrictEqual( + validateContent("plan-task", taskPlanPath).filter((v) => + v.reason.includes("Markdown lint MD022"), + ), + [], + "rendered task plan has no blanks-around-headings lint warnings", + ); const taskPlanFile = parseTaskPlanFile(taskPlanContent); assert.strictEqual( taskPlanFile.frontmatter.estimated_steps, - 1, - "task plan frontmatter exposes estimated_steps", + 5, + "task plan frontmatter exposes estimated_steps for multiline detail", ); assert.strictEqual( taskPlanFile.frontmatter.estimated_files, diff --git a/src/resources/extensions/sf/tests/research-terminal-transition.test.ts b/src/resources/extensions/sf/tests/research-terminal-transition.test.ts new file mode 100644 index 000000000..10464d2e2 --- /dev/null +++ b/src/resources/extensions/sf/tests/research-terminal-transition.test.ts @@ -0,0 +1,164 @@ +import assert from "node:assert/strict"; +import { randomUUID } from "node:crypto"; +import { mkdirSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { test } from "vitest"; +import { + getAutoSession, + resetAutoSession, +} from "../auto/session.js"; +import { + hasResearchTerminalTransition, + markResearchTerminalTransition, +} from "../auto.js"; + +function makeTmpBase(): string { + const base = join(tmpdir(), `sf-research-terminal-${randomUUID()}`); + mkdirSync(join(base, ".sf"), { recursive: true }); + return base; +} + +function cleanup(base: string): void { + try { + rmSync(base, { recursive: true, force: true }); + } catch { + /* swallow */ + } +} + +test("markResearchTerminalTransition sets researchTerminalTransition true", () => { + const session = getAutoSession(); + // Reset to known state + session.reset(); + assert.equal(session.researchTerminalTransition, false); + + markResearchTerminalTransition(); + + assert.equal(session.researchTerminalTransition, true); + assert.equal(hasResearchTerminalTransition(), true); + + // Cleanup + session.reset(); +}); + +test("hasResearchTerminalTransition returns false after reset", () => { + const session = getAutoSession(); + // Reset to known state + session.reset(); + markResearchTerminalTransition(); + assert.equal(hasResearchTerminalTransition(), true); + + session.reset(); + + assert.equal(hasResearchTerminalTransition(), false); +}); + +test("research terminal transition blocks planning tools", async () => { + // This test verifies the integration between: + // 1. executeSummarySave returning terminal_transition for RESEARCH + // 2. register-hooks tool_call blocking planning tools after terminal transition + // + // The actual blocking logic lives in register-hooks.ts and is tested here + // by simulating the conditions that trigger it. + + const base = makeTmpBase(); + try { + const session = getAutoSession(); + // Reset to known state + session.reset(); + + // Simulate a research unit being active + session.active = true; + session.currentUnit = { + type: "research-slice", + id: "M001/S01", + startedAt: Date.now(), + }; + + // Before terminal transition, planning should be allowed + assert.equal(hasResearchTerminalTransition(), false); + + // Mark terminal transition (as would happen after sf_summary_save RESEARCH) + markResearchTerminalTransition(); + + // After terminal transition, planning should be blocked + assert.equal(hasResearchTerminalTransition(), true); + + // Verify the session state reflects the terminal transition + assert.equal(session.researchTerminalTransition, true); + assert.equal(session.currentUnit?.type, "research-slice"); + + // Cleanup + session.reset(); + } finally { + cleanup(base); + } +}); + +test("research terminal transition does not block non-planning tools", () => { + const session = getAutoSession(); + // Reset to known state + session.reset(); + + // Simulate a research unit being active + session.active = true; + session.currentUnit = { + type: "research-slice", + id: "M001/S01", + startedAt: Date.now(), + }; + + markResearchTerminalTransition(); + + // Non-planning tools should not be blocked by the research terminal transition + // (the actual blocking logic only checks planning tools) + const nonPlanningTools = [ + "read", + "write", + "edit", + "bash", + "grep", + "find", + "sf_summary_save", + "sf_self_report", + ]; + + // The hasResearchTerminalTransition function just returns state; + // the actual tool filtering happens in register-hooks.ts + assert.equal(hasResearchTerminalTransition(), true); + + // Cleanup + session.reset(); +}); + +test("research terminal transition resets on new unit dispatch", () => { + const session = getAutoSession(); + // Reset to known state + session.reset(); + + // Simulate first research unit completing + session.active = true; + session.currentUnit = { + type: "research-slice", + id: "M001/S01", + startedAt: Date.now(), + }; + markResearchTerminalTransition(); + assert.equal(hasResearchTerminalTransition(), true); + + // Simulate new unit dispatch (research terminal should be reset) + // In production, this happens in the auto-loop when dispatching a new unit + session.researchTerminalTransition = false; + session.currentUnit = { + type: "plan-slice", + id: "M001/S01", + startedAt: Date.now(), + }; + + assert.equal(hasResearchTerminalTransition(), false); + assert.equal(session.currentUnit?.type, "plan-slice"); + + // Cleanup + session.reset(); +}); diff --git a/src/resources/extensions/sf/tests/workflow-projections.test.ts b/src/resources/extensions/sf/tests/workflow-projections.test.ts index e06bf5df4..dd1c5d02b 100644 --- a/src/resources/extensions/sf/tests/workflow-projections.test.ts +++ b/src/resources/extensions/sf/tests/workflow-projections.test.ts @@ -138,6 +138,20 @@ test("workflow-projections: renderPlanContent includes ## Tasks section", () => assert.ok(content.includes("## Tasks")); }); +test("workflow-projections: renderPlanContent preserves ordered must-haves", () => { + const content = renderPlanContent( + makeSlice({ + success_criteria: + "1. First ordered outcome\n2. Second ordered outcome", + }), + [], + ); + + assert.ok(content.includes("1. First ordered outcome")); + assert.ok(content.includes("2. Second ordered outcome")); + assert.ok(!content.includes("- 1. First ordered outcome")); +}); + test("workflow-projections: renderPlanContent includes top-level verification from task commands", () => { const content = renderPlanContent(makeSlice(), [ makeTask({ verify: "npm test src/middleware/auth.test.ts" }), @@ -264,6 +278,20 @@ test("workflow-projections: task with verify renders Verify subline", () => { assert.ok(content.includes(" - Verify: npm test")); }); +test("workflow-projections: task markdown headings stay nested and lint-spaced", () => { + const task = makeTask({ + description: + "Implement the contract.\n\n### 1. Add coverage\nWrite the failing test.\n\n### 2. Patch renderer\nKeep markdown valid.", + }); + const content = renderPlanContent(makeSlice(), [task]); + + assert.ok(content.includes("- [ ] **T01: Create JWT middleware**")); + assert.ok( + content.includes(" ### 1. Add coverage\n\n Write the failing test."), + ); + assert.ok(!content.includes("\n### 1. Add coverage\nWrite")); +}); + test("workflow-projections: task with no verify omits Verify subline", () => { const task = makeTask({ verify: "" }); const content = renderPlanContent(makeSlice(), [task]); diff --git a/src/resources/extensions/sf/workflow-projections.ts b/src/resources/extensions/sf/workflow-projections.ts index bf00c1c35..226e91421 100644 --- a/src/resources/extensions/sf/workflow-projections.ts +++ b/src/resources/extensions/sf/workflow-projections.ts @@ -39,6 +39,86 @@ export function stripIdPrefix(title: string, id: string): string { return result.trim() || title; } +/** + * Render a model-provided list entry without corrupting ordered lists. + * + * Purpose: projection fallback output remains valid Markdown when planning + * rows contain numbered success criteria from the LLM. + * Consumer: renderPlanContent when writing PLAN.md projections. + */ +function renderListEntry(entry: string): string { + const trimmed = entry.trim(); + const orderedBullet = trimmed.match(/^[-*+]\s+(\d+)[.)]\s+(.+)$/); + if (orderedBullet) { + return `${orderedBullet[1]}. ${orderedBullet[2].trim()}`; + } + + const ordered = trimmed.match(/^(\d+)[.)]\s+(.+)$/); + if (ordered) { + return `${ordered[1]}. ${ordered[2].trim()}`; + } + + if (/^[-*+]\s+\S/.test(trimmed)) { + return trimmed; + } + + return `- ${trimmed}`; +} + +/** + * Surround ATX headings in model-provided markdown with blank lines. + * + * Purpose: generated PLAN.md projections pass content validation even when + * task descriptions contain LLM-authored step headings. + * Consumer: renderPlanContent task entries. + */ +function normalizeMarkdownBlockSpacing(text: string): string { + const sourceLines = text.trim().replace(/\r\n/g, "\n").split("\n"); + const output: string[] = []; + let inFence = false; + + for (let i = 0; i < sourceLines.length; i++) { + const line = sourceLines[i]; + const trimmed = line.trim(); + const fence = /^(```|~~~)/.test(trimmed); + const heading = !inFence && /^#{1,6}\s+\S/.test(trimmed); + + if (heading && output.length > 0 && output[output.length - 1]?.trim()) { + output.push(""); + } + + output.push(line); + + if (fence) { + inFence = !inFence; + } + + const next = sourceLines[i + 1]; + if (heading && next !== undefined && next.trim()) { + output.push(""); + } + } + + return output.join("\n").trim(); +} + +/** + * Append model-provided markdown as an indented child block. + * + * Purpose: task description subsections stay nested under their task instead + * of becoming top-level PLAN.md headings. + * Consumer: renderPlanContent task entries. + */ +function appendIndentedMarkdownBlock( + lines: string[], + text: string, + indent = " ", +): void { + for (const line of normalizeMarkdownBlockSpacing(text).split("\n")) { + lines.push(line.trim() ? `${indent}${line}` : ""); + } +} + // ─── PLAN.md Projection ────────────────────────────────────────────────── /** @@ -70,7 +150,7 @@ export function renderPlanContent( .split(/\n+/) .map((entry) => entry.trim()) .filter(Boolean)) { - lines.push(line.startsWith("-") ? line : `- ${line}`); + lines.push(renderListEntry(line)); } } else { lines.push("- Complete the planned slice outcomes."); @@ -198,9 +278,10 @@ export function renderPlanContent( for (const task of taskRows) { const checkbox = isClosedStatus(task.status) ? "[x]" : "[ ]"; - lines.push( - `- ${checkbox} **${task.id}: ${task.title}** \u2014 ${task.description}`, - ); + lines.push(`- ${checkbox} **${task.id}: ${task.title}**`); + if (task.description.trim()) { + appendIndentedMarkdownBlock(lines, task.description); + } // Estimate subline (always present if non-empty) if (task.estimate) { diff --git a/vitest.config.ts b/vitest.config.ts index 160dc4ba2..713ec4cd3 100644 --- a/vitest.config.ts +++ b/vitest.config.ts @@ -143,6 +143,7 @@ export default defineConfig({ "src/tests/**/*.test.mjs", "src/resources/extensions/sf/tests/**/*.test.ts", "src/resources/extensions/sf/tests/**/*.test.mjs", + "src/resources/extensions/sf-permissions/tests/**/*.test.ts", "src/resources/extensions/shared/tests/**/*.test.ts", "src/resources/extensions/claude-code-cli/tests/**/*.test.ts", "src/resources/extensions/github-sync/tests/**/*.test.ts",