fix: make auto recovery evidence-driven

2026-04-29 19:45:43 +02:00 · 2026-04-29 19:45:43 +02:00 · aa70e1db56
commit aa70e1db56
parent 2ed1638153
5 changed files with 304 additions and 24 deletions
--- a/src/headless-events.ts
+++ b/src/headless-events.ts
@ -54,9 +54,11 @@ export function mapStatusToExitCode(status: string): number {
 /**
 * Detect genuine auto-mode termination notifications.
 *
- * Only matches the actual stop signals emitted by stopAuto():
+ * Only matches the actual stop/pause signals emitted by stopAuto()/pauseAuto():
 *   "Auto-mode stopped..."
 *   "Step-mode stopped..."
+ *   "Auto-mode paused..."
+ *   "Step-mode paused..."
 *
 * Does NOT match progress notifications that happen to contain words like
 * "complete" or "stopped" (e.g., "Override resolved — rewrite-docs completed",
@ -64,7 +66,12 @@ export function mapStatusToExitCode(status: string): number {
 *
 * Blocked detection is separate — checked via isBlockedNotification.
 */
-export const TERMINAL_PREFIXES = ["auto-mode stopped", "step-mode stopped"];
+export const TERMINAL_PREFIXES = [
+	"auto-mode stopped",
+	"step-mode stopped",
+	"auto-mode paused",
+	"step-mode paused",
+];

 /**
 * Idle timeout for short, single-shot commands (status, queue, history, etc.).
@ -115,7 +122,11 @@ export function isBlockedNotification(event: Record<string, unknown>): boolean {
 		return false;
 	const message = String(event.message ?? "").toLowerCase();
 	// Blocked notifications come through stopAuto as "Auto-mode stopped (Blocked: ...)"
-	return message.includes("blocked:");
+	return (
+		message.includes("blocked:") ||
+		message.startsWith("auto-mode paused") ||
+		message.startsWith("step-mode paused")
+	);
 }

 export function isMilestoneReadyNotification(
--- a/src/resources/extensions/sf/auto-timeout-recovery.ts
+++ b/src/resources/extensions/sf/auto-timeout-recovery.ts
@ -4,7 +4,8 @@
 * and blocker placeholder generation.
 */

-import { existsSync } from "node:fs";
+import { existsSync, readFileSync } from "node:fs";
+import { relative } from "node:path";
 import type {
 	ExtensionAPI,
 	ExtensionContext,
@ -13,8 +14,18 @@ import { resolveAgentEnd } from "./auto-loop.js";
 import {
 	diagnoseExpectedArtifact,
 	resolveExpectedArtifactPath,
+	verifyExpectedArtifact,
 	writeBlockerPlaceholder,
 } from "./auto-recovery.js";
+import { parseRoadmap as parseLegacyRoadmap } from "./parsers-legacy.js";
+import {
+	relMilestoneFile,
+	relSliceFile,
+	resolveMilestoneFile,
+	resolveSliceFile,
+} from "./paths.js";
+import { getSlice, isDbAvailable } from "./sf-db.js";
+import { parseUnitId } from "./unit-id.js";
 import {
 	formatExecuteTaskRecoveryStatus,
 	inspectExecuteTaskDurability,
@ -29,6 +40,92 @@ export interface RecoveryContext {
 	unitRecoveryCount: Map<string, number>;
 }

+export interface UnitRecoveryStatus {
+	expected: string;
+	verified: boolean;
+	missing: string[];
+}
+
+function relToBase(basePath: string, path: string): string {
+	const rel = relative(basePath, path);
+	return rel && !rel.startsWith("..") ? rel : path;
+}
+
+function formatSliceDbStatus(status: string | undefined): string {
+	return status ? `DB slice status is "${status}" (expected complete)` : "";
+}
+
+export function inspectUnitRecoveryStatus(
+	unitType: string,
+	unitId: string,
+	basePath: string,
+): UnitRecoveryStatus {
+	const expected =
+		diagnoseExpectedArtifact(unitType, unitId, basePath) ??
+		"required durable artifact";
+	const missing: string[] = [];
+	const artifactPath = resolveExpectedArtifactPath(unitType, unitId, basePath);
+	if (!artifactPath) {
+		missing.push(`artifact path could not be resolved for ${expected}`);
+	} else if (!existsSync(artifactPath)) {
+		missing.push(`artifact missing (${relToBase(basePath, artifactPath)})`);
+	}
+
+	if (unitType === "complete-slice") {
+		const { milestone: mid, slice: sid } = parseUnitId(unitId);
+		if (mid && sid) {
+			const uatPath = resolveSliceFile(basePath, mid, sid, "UAT");
+			if (!uatPath || !existsSync(uatPath)) {
+				missing.push(
+					`UAT missing (${relSliceFile(basePath, mid, sid, "UAT")})`,
+				);
+			}
+
+			const dbSlice = getSlice(mid, sid);
+			if (dbSlice) {
+				const dbStatus = formatSliceDbStatus(dbSlice.status);
+				if (dbStatus && dbSlice.status !== "complete") missing.push(dbStatus);
+			} else if (!isDbAvailable()) {
+				const roadmapPath = resolveMilestoneFile(basePath, mid, "ROADMAP");
+				if (!roadmapPath || !existsSync(roadmapPath)) {
+					missing.push(
+						`roadmap missing (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
+					);
+				} else {
+					try {
+						const roadmap = parseLegacyRoadmap(
+							readFileSync(roadmapPath, "utf-8"),
+						);
+						const slice = roadmap.slices.find((s) => s.id === sid);
+						if (slice && !slice.done) {
+							missing.push(
+								`roadmap checkbox not marked [x] (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
+							);
+						}
+					} catch {
+						missing.push(
+							`roadmap could not be parsed (${relMilestoneFile(basePath, mid, "ROADMAP")})`,
+						);
+					}
+				}
+			}
+		}
+	}
+
+	const verified = verifyExpectedArtifact(unitType, unitId, basePath);
+	if (!verified && missing.length === 0) {
+		missing.push(`artifact verification failed for ${expected}`);
+	}
+
+	return { expected, verified, missing };
+}
+
+export function formatUnitRecoveryStatus(status: UnitRecoveryStatus): string {
+	return status.missing.length > 0
+		? status.missing.join("; ")
+		: "all durable artifacts present";
+}
+
 export async function recoverTimedOutUnit(
 	ctx: ExtensionContext,
 	pi: ExtensionAPI,
@ -169,21 +266,19 @@ export async function recoverTimedOutUnit(
 		return "paused";
 	}

-	const expected =
-		diagnoseExpectedArtifact(unitType, unitId, basePath) ??
-		"required durable artifact";
+	const status = inspectUnitRecoveryStatus(unitType, unitId, basePath);
+	const diagnostic = formatUnitRecoveryStatus(status);

-	// Check if the artifact already exists on disk — agent may have written it
-	// without signaling completion.
-	const artifactPath = resolveExpectedArtifactPath(unitType, unitId, basePath);
-	if (artifactPath && existsSync(artifactPath)) {
+	// Check full unit durability — the primary artifact alone is not enough for
+	// units such as complete-slice, which also require UAT and state transition.
+	if (status.verified) {
 		writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, {
 			phase: "finalized",
 			recoveryAttempts: recoveryAttempts + 1,
 			lastRecoveryReason: reason,
 		});
 		ctx.ui.notify(
-			`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing. (attempt ${attemptNumber})`,
+			`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} durable state already verified. Advancing. (attempt ${attemptNumber})`,
 			"info",
 		);
 		unitRecoveryCount.delete(recoveryKey);
@ -208,19 +303,21 @@ export async function recoverTimedOutUnit(
 					`**FINAL ${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — last chance before skip.**`,
 					`You are still executing ${unitType} ${unitId}.`,
 					`Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts} — next failure skips this unit.`,
-					`Expected durable output: ${expected}.`,
-					"You MUST write the artifact file NOW, even if incomplete.",
-					"Write whatever you have — partial research, preliminary findings, best-effort analysis.",
-					"A partial artifact is infinitely better than no artifact.",
+					`Expected durable output: ${status.expected}.`,
+					`Current durability status: ${diagnostic}.`,
+					"Repair only the missing durability items listed above.",
+					"You MUST write the missing artifact/state NOW, even if incomplete.",
+					"Write whatever you have — partial research, preliminary findings, best-effort analysis — and record the remaining blocker explicitly.",
 					"If you are truly blocked, write the file with a BLOCKER section explaining why.",
 				]
 			: [
 					`**${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — stay in auto-mode.**`,
 					`You are still executing ${unitType} ${unitId}.`,
 					`Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts}.`,
-					`Expected durable output: ${expected}.`,
-					"Stop broad exploration.",
-					"Write the required artifact now.",
+					`Expected durable output: ${status.expected}.`,
+					`Current durability status: ${diagnostic}.`,
+					"Stop broad exploration and do not redo work that is already durable.",
+					"Repair only the missing durability items listed above.",
 					"If blocked, write the partial artifact and explicitly record the blocker instead of going silent.",
 				];

@ -233,7 +330,7 @@ export async function recoverTimedOutUnit(
 			{ triggerTurn: true, deliverAs: "steer" },
 		);
 		ctx.ui.notify(
-			`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${attemptNumber}, session ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
+			`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to repair ${diagnostic} (attempt ${attemptNumber}, session ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
 			"warning",
 		);
 		return "recovered";
@ -250,7 +347,7 @@ export async function recoverTimedOutUnit(
 			lastRecoveryReason: reason,
 		});
 		ctx.ui.notify(
-			`Milestone ${unitId} ${reason}-recovery exhausted ${maxRecoveryAttempts} attempt(s) — worktree branch preserved. Re-run /sf auto once blockers are resolved.`,
+			`Milestone ${unitId} ${reason}-recovery exhausted ${maxRecoveryAttempts} attempt(s): ${diagnostic}. Worktree branch preserved. Re-run /sf auto once blockers are resolved.`,
 			"error",
 		);
 		return "paused";
@ -262,7 +359,7 @@ export async function recoverTimedOutUnit(
 		unitType,
 		unitId,
 		basePath,
-		`${reason} recovery exhausted ${maxRecoveryAttempts} attempts without producing the artifact.`,
+		`${reason} recovery exhausted ${maxRecoveryAttempts} attempts. Status: ${diagnostic}.`,
 	);

 	if (placeholder) {
--- a/src/resources/extensions/sf/tests/auto-timeout-recovery.test.ts
+++ b/src/resources/extensions/sf/tests/auto-timeout-recovery.test.ts
@ -0,0 +1,117 @@
+import assert from "node:assert/strict";
+import {
+	mkdirSync,
+	mkdtempSync,
+	readFileSync,
+	rmSync,
+	writeFileSync,
+} from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { test } from "node:test";
+import {
+	formatUnitRecoveryStatus,
+	inspectUnitRecoveryStatus,
+	type RecoveryContext,
+	recoverTimedOutUnit,
+} from "../auto-timeout-recovery.ts";
+import { closeDatabase, isDbAvailable } from "../sf-db.ts";
+
+const ROADMAP_INCOMPLETE = `# M001: Test Milestone
+
+## Slices
+
+- [ ] **S01: Test Slice** \`risk:low\`
+> After this: something works
+`;
+
+function createCompleteSliceFixture(): string {
+	const base = mkdtempSync(join(tmpdir(), "sf-timeout-recovery-test-"));
+	const sliceDir = join(base, ".sf", "milestones", "M001", "slices", "S01");
+	mkdirSync(sliceDir, { recursive: true });
+	writeFileSync(
+		join(base, ".sf", "milestones", "M001", "M001-ROADMAP.md"),
+		ROADMAP_INCOMPLETE,
+		"utf-8",
+	);
+	writeFileSync(join(sliceDir, "S01-SUMMARY.md"), "# Summary\n", "utf-8");
+	return base;
+}
+
+function makeRecoveryContext(base: string): RecoveryContext {
+	return {
+		basePath: base,
+		verbose: false,
+		currentUnitStartedAt: Date.now(),
+		unitRecoveryCount: new Map(),
+	};
+}
+
+test("inspectUnitRecoveryStatus reports complete-slice gaps even when SUMMARY exists", () => {
+	if (isDbAvailable()) closeDatabase();
+	const base = createCompleteSliceFixture();
+	try {
+		const status = inspectUnitRecoveryStatus(
+			"complete-slice",
+			"M001/S01",
+			base,
+		);
+		const diagnostic = formatUnitRecoveryStatus(status);
+
+		assert.equal(status.verified, false);
+		assert.ok(
+			!diagnostic.includes("S01-SUMMARY.md"),
+			"existing SUMMARY must not be reported as missing",
+		);
+		assert.match(diagnostic, /UAT missing/);
+		assert.match(diagnostic, /roadmap checkbox not marked \[x\]/);
+	} finally {
+		rmSync(base, { recursive: true, force: true });
+	}
+});
+
+test("recoverTimedOutUnit steers complete-slice with exact missing durability items", async () => {
+	if (isDbAvailable()) closeDatabase();
+	const base = createCompleteSliceFixture();
+	const sent: string[] = [];
+	const notifications: string[] = [];
+	try {
+		const result = await recoverTimedOutUnit(
+			{
+				ui: {
+					notify: (message: string) => notifications.push(message),
+				},
+			} as any,
+			{
+				sendMessage: (message: { content: string }) => {
+					sent.push(message.content);
+				},
+			} as any,
+			"complete-slice",
+			"M001/S01",
+			"idle",
+			makeRecoveryContext(base),
+		);
+
+		assert.equal(result, "recovered");
+		assert.equal(sent.length, 1, "should steer instead of finalizing");
+		assert.match(sent[0], /Current durability status:/);
+		assert.match(sent[0], /UAT missing/);
+		assert.match(sent[0], /roadmap checkbox not marked \[x\]/);
+		assert.ok(
+			notifications.some((n) => n.includes("to repair")),
+			"notification should explain the exact repair target",
+		);
+		assert.ok(
+			notifications.every((n) => !n.includes("durable state already verified")),
+			"summary-only state must not be treated as recovered",
+		);
+		const runtimeRecord = readFileSync(
+			join(base, ".sf", "runtime", "units", "complete-slice-M001-S01.json"),
+			"utf-8",
+		);
+		assert.match(runtimeRecord, /"phase": "recovered"/);
+	} finally {
+		rmSync(base, { recursive: true, force: true });
+	}
+});
--- a/src/tests/headless-detection.test.ts
+++ b/src/tests/headless-detection.test.ts
@ -15,7 +15,12 @@ import test from "node:test";

 // ─── Extracted detection logic (mirrors headless.ts) ────────────────────────

-const TERMINAL_PREFIXES = ["auto-mode stopped", "step-mode stopped"];
+const TERMINAL_PREFIXES = [
+	"auto-mode stopped",
+	"step-mode stopped",
+	"auto-mode paused",
+	"step-mode paused",
+];

 function isTerminalNotification(event: Record<string, unknown>): boolean {
 	if (event.type !== "extension_ui_request" || event.method !== "notify")
@ -28,7 +33,11 @@ function isBlockedNotification(event: Record<string, unknown>): boolean {
 	if (event.type !== "extension_ui_request" || event.method !== "notify")
 		return false;
 	const message = String(event.message ?? "").toLowerCase();
-	return message.includes("blocked:");
+	return (
+		message.includes("blocked:") ||
+		message.startsWith("auto-mode paused") ||
+		message.startsWith("step-mode paused")
+	);
 }

 function makeNotify(message: string): Record<string, unknown> {
@ -71,6 +80,16 @@ test("detects 'Step-mode stopped.' as terminal", () => {
 	assert.ok(isTerminalNotification(makeNotify("Step-mode stopped.")));
 });

+test("detects provider-error auto pause as terminal", () => {
+	assert.ok(
+		isTerminalNotification(
+			makeNotify(
+				"Auto-mode paused due to provider error: Schema overload: consecutive tool validation failures exceeded cap",
+			),
+		),
+	);
+});
+
 // ─── False positives that previously triggered early exit (#879) ────────────

 test("does NOT match 'All slices are complete — nothing to discuss.'", () => {
@ -150,6 +169,17 @@ test("detects inline 'Blocked:' message", () => {
 	);
 });

+test("detects auto pause as blocked for headless callers", () => {
+	assert.ok(
+		isBlockedNotification(
+			makeNotify(
+				"Auto-mode paused due to provider error: Schema overload: consecutive tool validation failures exceeded cap",
+			),
+		),
+	);
+	assert.ok(isBlockedNotification(makeNotify("Auto-mode paused (Escape).")));
+});
+
 test("does NOT match 'blocked' without colon (avoids false positives)", () => {
 	assert.ok(
 		!isBlockedNotification(
--- a/src/tests/headless-events.test.ts
+++ b/src/tests/headless-events.test.ts
@ -194,7 +194,9 @@ import {
 	EXIT_CANCELLED,
 	EXIT_ERROR,
 	EXIT_SUCCESS,
+	isBlockedNotification,
 	isInteractiveHeadlessTool,
+	isTerminalNotification,
 	mapStatusToExitCode,
 	shouldArmHeadlessIdleTimeout,
 } from "../headless-events.js";
@ -246,6 +248,29 @@ test("isInteractiveHeadlessTool: non-interactive tools stay false", () => {
 	assert.equal(isInteractiveHeadlessTool(undefined), false);
 });

+test("isTerminalNotification: auto pause is terminal for headless callers", () => {
+	assert.equal(
+		isTerminalNotification({
+			type: "extension_ui_request",
+			method: "notify",
+			message:
+				"Auto-mode paused due to provider error: Schema overload: consecutive tool validation failures exceeded cap",
+		}),
+		true,
+	);
+});
+
+test("isBlockedNotification: auto pause exits as blocked", () => {
+	assert.equal(
+		isBlockedNotification({
+			type: "extension_ui_request",
+			method: "notify",
+			message: "Auto-mode paused (Escape).",
+		}),
+		true,
+	);
+});
+
 test("shouldArmHeadlessIdleTimeout: arms after tool calls when no interactive tool is in flight", () => {
 	assert.equal(shouldArmHeadlessIdleTimeout(1, 0), true);
 	assert.equal(shouldArmHeadlessIdleTimeout(3, 0), true);