fix(sf): contain research unit dispatch

2026-05-02 14:23:01 +02:00 · 2026-05-02 14:23:01 +02:00 · a8f0c63b0a
commit a8f0c63b0a
parent 64b46fcb8a
10 changed files with 242 additions and 53 deletions
--- a/src/resources/extensions/sf/auto-direct-dispatch.ts
+++ b/src/resources/extensions/sf/auto-direct-dispatch.ts
@ -20,6 +20,7 @@ import {
 	buildResearchSlicePrompt,
 	buildRunUatPrompt,
 } from "./auto-prompts.js";
+import { scopeActiveToolsForUnitType } from "./constants.js";
 import { loadFile } from "./files.js";
 import { parseRoadmap } from "./parsers.js";
 import {
@ -336,8 +337,24 @@ export async function dispatchDirectPhase(
 		ctx.ui.notify("Session creation cancelled.", "warning");
 		return;
 	}
-	pi.sendMessage(
-		{ customType: "sf-dispatch", content: prompt, display: false },
-		{ triggerTurn: true },
-	);
+	let savedTools: string[] | null = null;
+	if (
+		typeof pi.getActiveTools === "function" &&
+		typeof pi.setActiveTools === "function"
+	) {
+		const currentTools = pi.getActiveTools();
+		const scopedTools = scopeActiveToolsForUnitType(unitType, currentTools);
+		if (scopedTools.length !== currentTools.length) {
+			savedTools = currentTools;
+			pi.setActiveTools(scopedTools);
+		}
+	}
+	try {
+		pi.sendMessage(
+			{ customType: "sf-dispatch", content: prompt, display: false },
+			{ triggerTurn: true },
+		);
+	} finally {
+		if (savedTools) pi.setActiveTools(savedTools);
+	}
 }
--- a/src/resources/extensions/sf/auto/run-unit.ts
+++ b/src/resources/extensions/sf/auto/run-unit.ts
@ -8,13 +8,24 @@ import type {
 	ExtensionAPI,
 	ExtensionContext,
 } from "@singularity-forge/pi-coding-agent";
+import {
+	collectSessionTokenUsage,
+	collectWorktreeFingerprint,
+	countChangedFiles,
+	resetRunawayGuardState,
+} from "../auto-runaway-guard.js";
+import { scopeActiveToolsForUnitType } from "../constants.js";
 import { debugLog } from "../debug-logger.js";
 import {
 	resolveAutoSupervisorConfig,
 	resolvePersistModelChanges,
 } from "../preferences.js";
 import { logWarning } from "../workflow-logger.js";
-import { _clearCurrentResolve, _setCurrentResolve, _setSessionSwitchInFlight } from "./resolve.js";
+import {
+	_clearCurrentResolve,
+	_setCurrentResolve,
+	_setSessionSwitchInFlight,
+} from "./resolve.js";
 import type { AutoSession } from "./session.js";
 import { NEW_SESSION_TIMEOUT_MS } from "./session.js";
 import {
@ -89,13 +100,10 @@ export async function runUnit(
 			}
 		});
 		const timeoutPromise = new Promise<{ cancelled: true }>((resolve) => {
-			sessionTimeoutHandle = setTimeout(
-				() => {
-					sessionAbortController.abort();
-					resolve({ cancelled: true });
-				},
-				NEW_SESSION_TIMEOUT_MS,
-			);
+			sessionTimeoutHandle = setTimeout(() => {
+				sessionAbortController.abort();
+				resolve({ cancelled: true });
+			}, NEW_SESSION_TIMEOUT_MS);
 		});
 		sessionResult = await Promise.race([sessionPromise, timeoutPromise]);
 	} catch (sessionErr) {
@ -203,6 +211,15 @@ export async function runUnit(
 		}
 	}

+	// Refresh the runaway baseline after newSession(). Resumed sessions recover
+	// old context during session creation; taking the baseline before that makes
+	// historical tokens look like budget spent by this unit.
+	resetRunawayGuardState(unitType, unitId, {
+		sessionTokens: collectSessionTokenUsage(ctx),
+		changedFiles: countChangedFiles(s.basePath),
+		worktreeFingerprint: collectWorktreeFingerprint(s.basePath),
+	});
+
 	// ── Send the prompt ──
 	debugLog("runUnit", { phase: "send-message", unitType, unitId });

@ -213,10 +230,34 @@ export async function runUnit(
 	const capturedTurnGen = getCurrentTurnGeneration();

 	const requestDispatchedAt = Date.now();
-	pi.sendMessage(
-		{ customType: "sf-auto", content: prompt, display: s.verbose },
-		{ triggerTurn: true },
-	);
+	let savedTools: string[] | null = null;
+	if (
+		typeof pi.getActiveTools === "function" &&
+		typeof pi.setActiveTools === "function"
+	) {
+		const currentTools = pi.getActiveTools();
+		const scopedTools = scopeActiveToolsForUnitType(unitType, currentTools);
+		if (scopedTools.length !== currentTools.length) {
+			savedTools = currentTools;
+			pi.setActiveTools(scopedTools);
+			debugLog("unit-tool-scoping", {
+				unitType,
+				before: currentTools.length,
+				after: scopedTools.length,
+				removed: currentTools.length - scopedTools.length,
+			});
+		}
+	}
+	try {
+		pi.sendMessage(
+			{ customType: "sf-auto", content: prompt, display: s.verbose },
+			{ triggerTurn: true },
+		);
+	} finally {
+		if (savedTools) {
+			pi.setActiveTools(savedTools);
+		}
+	}

 	// ── Await agent_end with absolute timeout (H4 fix) ──
 	// If supervision fails to resolve unitPromise within 30s, treat as cancelled.
--- a/src/resources/extensions/sf/constants.ts
+++ b/src/resources/extensions/sf/constants.ts
@ -63,3 +63,61 @@ export const DISCUSS_TOOLS_ALLOWLIST: readonly string[] = [
 	"sf_requirement_update",
 	"sf_update_requirement",
 ];
+
+/**
+ * SF tools allowed during research units.
+ *
+ * Purpose: keep research turns in their lane. A research agent writes a
+ * RESEARCH artifact and may report SF friction, but planning tools belong to
+ * later planner units. Allowing milestone/slice planning tools in research
+ * turns lets a saved research artifact drift into speculative planning and
+ * can keep the unit alive until runaway supervision pauses it.
+ *
+ * Consumer: guided-flow.ts and auto/run-unit.ts when narrowing SF tools for
+ * research-milestone and research-slice turns.
+ */
+export const RESEARCH_TOOLS_ALLOWLIST: readonly string[] = [
+	"sf_summary_save",
+	"sf_save_summary",
+	"sf_self_report",
+];
+
+/**
+ * Return the SF tool allowlist for a workflow unit, or null when the full SF
+ * tool set is appropriate.
+ *
+ * Purpose: centralize per-unit active-tool narrowing so guided and auto
+ * dispatches enforce the same phase boundaries.
+ *
+ * Consumer: guided-flow.ts and auto/run-unit.ts before sending a unit prompt.
+ */
+export function getSfToolsAllowlistForUnitType(
+	unitType: string | undefined,
+): readonly string[] | null {
+	if (!unitType) return null;
+	if (unitType.startsWith("discuss-")) return DISCUSS_TOOLS_ALLOWLIST;
+	if (unitType === "research-milestone" || unitType === "research-slice") {
+		return RESEARCH_TOOLS_ALLOWLIST;
+	}
+	return null;
+}
+
+/**
+ * Narrow active tools to the SF tools allowed for the current unit while
+ * preserving all non-SF tools.
+ *
+ * Purpose: hide out-of-phase SF mutation tools from the model without
+ * removing builtin or third-party tools needed to complete the unit.
+ *
+ * Consumer: guided-flow.ts and auto/run-unit.ts tool-scope guards.
+ */
+export function scopeActiveToolsForUnitType(
+	unitType: string | undefined,
+	currentTools: readonly string[],
+): string[] {
+	const allowlist = getSfToolsAllowlistForUnitType(unitType);
+	if (!allowlist) return [...currentTools];
+	return currentTools.filter(
+		(t) => !t.startsWith("sf_") || allowlist.includes(t),
+	);
+}
--- a/src/resources/extensions/sf/guided-flow.ts
+++ b/src/resources/extensions/sf/guided-flow.ts
@ -21,13 +21,13 @@ import type {
 	ExtensionContext,
 } from "@singularity-forge/pi-coding-agent";
 import { showConfirm, showNextAction } from "../shared/tui.js";
-import { resolveExpectedArtifactPath, startAutoDetached } from "./auto.js";
 import { ensureAgenticDocsScaffold } from "./agentic-docs-scaffold.js";
+import { resolveExpectedArtifactPath, startAutoDetached } from "./auto.js";
 import { selectAndApplyModel } from "./auto-model-selection.js";
 import { buildSkillActivationBlock } from "./auto-prompts.js";
 import { invalidateAllCaches } from "./cache.js";
 import { ensureSiftIndexWarmup } from "./code-intelligence.js";
-import { DISCUSS_TOOLS_ALLOWLIST } from "./constants.js";
+import { scopeActiveToolsForUnitType } from "./constants.js";
 import { clearLock } from "./crash-recovery.js";
 import { debugLog } from "./debug-logger.js";
 import { detectProjectState } from "./detection.js";
@ -522,27 +522,25 @@ async function dispatchWorkflow(
 		}
 	}

-	// Scope tools for discuss flows (#2949).
+	// Scope tools for bounded workflow phases (#2949, research containment).
 	// Providers with grammar-based constrained decoding (xAI/Grok) return
 	// "Grammar is too complex" when the combined tool schema is too large.
-	// Discuss flows only need a small subset of SF tools — strip the heavy
-	// planning/execution/completion tools to keep the grammar within limits.
+	// Some phases also only need a small subset of SF tools; strip
+	// out-of-phase planning/execution/completion tools to keep turns bounded.
 	let savedTools: string[] | null = null;
-	if (unitType?.startsWith("discuss-")) {
+	if (unitType) {
 		const currentTools = pi.getActiveTools();
-		savedTools = currentTools;
-		// Keep all non-SF tools (builtins, other extensions) and only the
-		// SF tools on the discuss allowlist.
-		const scopedTools = currentTools.filter(
-			(t) => !t.startsWith("sf_") || DISCUSS_TOOLS_ALLOWLIST.includes(t),
-		);
-		pi.setActiveTools(scopedTools);
-		debugLog("discuss-tool-scoping", {
-			unitType,
-			before: currentTools.length,
-			after: scopedTools.length,
-			removed: currentTools.length - scopedTools.length,
-		});
+		const scopedTools = scopeActiveToolsForUnitType(unitType, currentTools);
+		if (scopedTools.length !== currentTools.length) {
+			savedTools = currentTools;
+			pi.setActiveTools(scopedTools);
+			debugLog("unit-tool-scoping", {
+				unitType,
+				before: currentTools.length,
+				after: scopedTools.length,
+				removed: currentTools.length - scopedTools.length,
+			});
+		}
 	}

 	const workflowPath =
@ -786,7 +784,10 @@ export function bootstrapProject(basePath: string): void {
 	ensureGitignore(basePath);
 	ensurePreferences(basePath);
 	ensureAgenticDocsScaffold(basePath);
-	ensureSiftIndexWarmup(basePath, loadEffectiveSFPreferences()?.preferences?.codebase);
+	ensureSiftIndexWarmup(
+		basePath,
+		loadEffectiveSFPreferences()?.preferences?.codebase,
+	);
 	untrackRuntimeFiles(basePath);
 }

--- a/src/resources/extensions/sf/prompts/guided-research-slice.md
+++ b/src/resources/extensions/sf/prompts/guided-research-slice.md
@ -1,4 +1,4 @@
-Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB.
+Research slice {{sliceId}} ("{{sliceTitle}}") of milestone {{milestoneId}}. Read `.sf/DECISIONS.md` if it exists — respect existing decisions, don't contradict them. Read `.sf/REQUIREMENTS.md` if it exists — identify which Active requirements this slice owns or supports and target research toward risks, unknowns, and constraints that could affect delivery of those requirements. {{skillActivation}} Use native `lsp` first for symbol lookup, references, and cross-file navigation. For direct text inspection use `rg`/`find` for targeted reads, or `scout` if the area is broad or unfamiliar. If there are 2-3 independent unknowns, use a research swarm with parallel `scout`/`researcher` subagents and synthesize their findings here; do not swarm narrow sequence-dependent research. Check libraries DeepWiki-first: `ask_question` / `read_wiki_structure` / `read_wiki_contents` for any GitHub-hosted library; fall back to `resolve_library` / `get_library_docs` (Context7, capped at 1000 req/month free) for npm/pypi/crates packages DeepWiki doesn't have. Skip both for libraries already used in this codebase. Use the **Research** output template below. Call `sf_summary_save` with `milestone_id: {{milestoneId}}`, `slice_id: {{sliceId}}`, `artifact_type: "RESEARCH"`, and the research content — the tool writes the file to disk and persists to DB. After `sf_summary_save` succeeds, stop immediately; do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool.

 **You are the scout.** A planner agent reads your output in a fresh context to decompose this slice into tasks. Write for the planner — surface key files, where the work divides naturally, what to build first, and how to verify. If the research doc is vague, the planner re-explores code you already read. If it's precise, the planner decomposes immediately.

--- a/src/resources/extensions/sf/prompts/research-milestone.md
+++ b/src/resources/extensions/sf/prompts/research-milestone.md
@ -52,6 +52,7 @@ Then research the codebase and relevant technologies. Narrate key findings and s
 **Research is advisory, not auto-binding.** Surface candidate requirements clearly instead of silently expanding scope.

 **You MUST call `sf_summary_save` with the research content before finishing.**
+After `sf_summary_save` succeeds, do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool. The orchestrator dispatches planner units after research.

 After saving research, update `.sf/PM-STRATEGY.md` — append new findings to the Opportunity Map and Guiding Policies sections. If the file doesn't exist yet, create it. This is the project's persistent PM memory — research findings that shaped planning decisions belong here.

@ -59,4 +60,4 @@ After saving research, update `.sf/PM-STRATEGY.md` — append new findings to th

 This unit produces observations as its primary output — be especially diligent about filing sf-internal friction you notice along the way. If during this unit you observe sf-the-tool friction — ambiguous prompts, missing context, misleading instructions, surprising behavior, prompt-quality issues, or improvement ideas — file them via `sf_self_report` before sealing the unit. This is the only way these observations reach forge's backlog and get triaged. Over-reporting is preferred to under-reporting; dedup happens later. Do NOT use this to file bugs in the user's project; only sf-the-tool itself. Do NOT autonomously act on or fix existing backlog entries — your scope is your unit.

-When done, say: "Milestone {{milestoneId}} researched."
+When done, say only: "Milestone {{milestoneId}} researched."
--- a/src/resources/extensions/sf/prompts/research-slice.md
+++ b/src/resources/extensions/sf/prompts/research-slice.md
@ -54,9 +54,10 @@ Research what this slice needs. Narrate key findings and surprises as you go —
 The slice directory already exists at `{{slicePath}}/`. Do NOT mkdir.

 **You MUST call `sf_summary_save` with the research content before finishing.**
+After `sf_summary_save` succeeds, stop immediately. Do **not** call `sf_milestone_generate_id`, `sf_plan_milestone`, `sf_plan_slice`, `sf_plan_task`, or any planning/creation tool. The orchestrator dispatches planner units after research.

 ### Report sf-internal observations

 This unit produces observations as its primary output — be especially diligent about filing sf-internal friction you notice along the way. If during this unit you observe sf-the-tool friction — ambiguous prompts, missing context, misleading instructions, surprising behavior, prompt-quality issues, or improvement ideas — file them via `sf_self_report` before sealing the unit. This is the only way these observations reach forge's backlog and get triaged. Over-reporting is preferred to under-reporting; dedup happens later. Do NOT use this to file bugs in the user's project; only sf-the-tool itself. Do NOT autonomously act on or fix existing backlog entries — your scope is your unit.

-When done, say: "Slice {{sliceId}} researched."
+When done, say only: "Slice {{sliceId}} researched."
--- a/src/resources/extensions/sf/tests/discuss-tool-scope-leak.test.ts
+++ b/src/resources/extensions/sf/tests/discuss-tool-scope-leak.test.ts
@ -18,10 +18,13 @@
 import assert from "node:assert/strict";
 import { readFileSync } from "node:fs";
 import { dirname, join } from "node:path";
-import { describe, test } from 'vitest';
 import { fileURLToPath } from "node:url";
+import { describe, test } from "vitest";

-import { DISCUSS_TOOLS_ALLOWLIST } from "../constants.ts";
+import {
+	DISCUSS_TOOLS_ALLOWLIST,
+	RESEARCH_TOOLS_ALLOWLIST,
+} from "../constants.ts";

 const __dirname = dirname(fileURLToPath(import.meta.url));
 const guidedFlowSource = readFileSync(
@ -37,11 +40,10 @@ describe("#3616 — discuss tool scoping must not leak across sessions", () => {
 		);
 	});

-	test("tool scoping only activates for discuss-* unit types", () => {
-		// The guard must be: if (unitType?.startsWith("discuss-"))
+	test("tool scoping activates through the shared per-unit helper", () => {
 		assert.ok(
-			guidedFlowSource.includes('unitType?.startsWith("discuss-")'),
-			"tool scoping should only trigger for discuss-* unit types",
+			guidedFlowSource.includes("scopeActiveToolsForUnitType"),
+			"tool scoping should go through the shared per-unit helper",
 		);
 	});

@ -54,6 +56,17 @@ describe("#3616 — discuss tool scoping must not leak across sessions", () => {
 		);
 	});

+	test("research scope excludes planning tools", () => {
+		assert.ok(
+			!RESEARCH_TOOLS_ALLOWLIST.includes("sf_plan_milestone"),
+			"sf_plan_milestone should be excluded from research scope",
+		);
+		assert.ok(
+			!RESEARCH_TOOLS_ALLOWLIST.includes("sf_milestone_generate_id"),
+			"sf_milestone_generate_id should be excluded from research scope",
+		);
+	});
+
 	test("newSession() in agent-session.ts has defense against tool narrowing persistence", () => {
 		const agentSessionSource = readFileSync(
 			join(process.cwd(), "packages/pi-coding-agent/src/core/agent-session.ts"),
--- a/src/resources/extensions/sf/tests/discuss-tool-scoping.test.ts
+++ b/src/resources/extensions/sf/tests/discuss-tool-scoping.test.ts
@ -17,10 +17,14 @@
 import assert from "node:assert/strict";
 import { readFileSync } from "node:fs";
 import { dirname, join } from "node:path";
-import { describe, test } from 'vitest';
 import { fileURLToPath } from "node:url";
+import { describe, test } from "vitest";

-import { DISCUSS_TOOLS_ALLOWLIST } from "../constants.ts";
+import {
+	DISCUSS_TOOLS_ALLOWLIST,
+	RESEARCH_TOOLS_ALLOWLIST,
+	scopeActiveToolsForUnitType,
+} from "../constants.ts";

 const __dirname = dirname(fileURLToPath(import.meta.url));
 const promptsDir = join(__dirname, "..", "prompts");
@ -118,16 +122,54 @@ describe("discuss tool scoping (#2949)", () => {
 		);
 	});

-	test("dispatchWorkflow source code scopes tools for discuss unit types", () => {
+	test("dispatchWorkflow source code scopes tools for bounded unit types", () => {
 		const source = readFileSync(guidedFlowPath, "utf-8");
-		// Verify that dispatchWorkflow references the allowlist for tool scoping
 		assert.ok(
-			source.includes("DISCUSS_TOOLS_ALLOWLIST"),
-			"guided-flow.ts should reference DISCUSS_TOOLS_ALLOWLIST for tool scoping",
+			source.includes("scopeActiveToolsForUnitType"),
+			"guided-flow.ts should use shared per-unit tool scoping",
 		);
 		assert.ok(
 			source.includes("setActiveTools"),
-			"guided-flow.ts should call setActiveTools to scope tools during discuss",
+			"guided-flow.ts should call setActiveTools to scope tools",
 		);
 	});
 });
+
+describe("research tool scoping", () => {
+	test("RESEARCH_TOOLS_ALLOWLIST permits only summary save and self-report SF tools", () => {
+		assert.deepEqual(RESEARCH_TOOLS_ALLOWLIST, [
+			"sf_summary_save",
+			"sf_save_summary",
+			"sf_self_report",
+		]);
+		for (const planningTool of [
+			"sf_milestone_generate_id",
+			"sf_plan_milestone",
+			"sf_plan_slice",
+			"sf_plan_task",
+		]) {
+			assert.ok(
+				!RESEARCH_TOOLS_ALLOWLIST.includes(planningTool),
+				`research scope should exclude ${planningTool}`,
+			);
+		}
+	});
+
+	test("scopeActiveToolsForUnitType removes planning tools from research turns", () => {
+		const scoped = scopeActiveToolsForUnitType("research-slice", [
+			"read",
+			"bash",
+			"sf_summary_save",
+			"sf_self_report",
+			"sf_milestone_generate_id",
+			"sf_plan_milestone",
+			"sf_plan_slice",
+		]);
+		assert.deepEqual(scoped, [
+			"read",
+			"bash",
+			"sf_summary_save",
+			"sf_self_report",
+		]);
+	});
+});
--- a/src/resources/extensions/sf/tests/prompt-contracts.test.ts
+++ b/src/resources/extensions/sf/tests/prompt-contracts.test.ts
@ -1,7 +1,7 @@
 import assert from "node:assert/strict";
 import { readFileSync } from "node:fs";
 import { join } from "node:path";
-import { test } from 'vitest';
+import { test } from "vitest";

 const promptsDir = join(process.cwd(), "src/resources/extensions/sf/prompts");
 const skillsDir = join(process.cwd(), "src/resources/extensions/sf/skills");
@ -286,6 +286,21 @@ test("guided-resume-task prompt preserves recovery state until work is supersede
 	assert.doesNotMatch(prompt, /Delete the continue file after reading it/i);
 });

+test("research prompts stop after saving and forbid planning tools", () => {
+	for (const name of [
+		"research-milestone",
+		"research-slice",
+		"guided-research-slice",
+	] as const) {
+		const prompt = readPrompt(name);
+		assert.match(prompt, /After `sf_summary_save` succeeds/i);
+		assert.match(prompt, /do \*\*not\*\* call `sf_milestone_generate_id`/i);
+		assert.match(prompt, /`sf_plan_milestone`/);
+		assert.match(prompt, /`sf_plan_slice`/);
+		assert.match(prompt, /planning\/creation tool/i);
+	}
+});
+
 // ─── Prompt migration: execute-task → sf_complete_task ───────────────

 test("execute-task prompt references sf_complete_task tool", () => {