From aa70e1db56db617f221ff2926e744f1f98ea51db Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Wed, 29 Apr 2026 19:45:43 +0200 Subject: [PATCH] fix: make auto recovery evidence-driven --- src/headless-events.ts | 17 ++- .../extensions/sf/auto-timeout-recovery.ts | 135 +++++++++++++++--- .../sf/tests/auto-timeout-recovery.test.ts | 117 +++++++++++++++ src/tests/headless-detection.test.ts | 34 ++++- src/tests/headless-events.test.ts | 25 ++++ 5 files changed, 304 insertions(+), 24 deletions(-) create mode 100644 src/resources/extensions/sf/tests/auto-timeout-recovery.test.ts diff --git a/src/headless-events.ts b/src/headless-events.ts index 213553b02..76d684424 100644 --- a/src/headless-events.ts +++ b/src/headless-events.ts @@ -54,9 +54,11 @@ export function mapStatusToExitCode(status: string): number { /** * Detect genuine auto-mode termination notifications. * - * Only matches the actual stop signals emitted by stopAuto(): + * Only matches the actual stop/pause signals emitted by stopAuto()/pauseAuto(): * "Auto-mode stopped..." * "Step-mode stopped..." + * "Auto-mode paused..." + * "Step-mode paused..." * * Does NOT match progress notifications that happen to contain words like * "complete" or "stopped" (e.g., "Override resolved — rewrite-docs completed", @@ -64,7 +66,12 @@ export function mapStatusToExitCode(status: string): number { * * Blocked detection is separate — checked via isBlockedNotification. */ -export const TERMINAL_PREFIXES = ["auto-mode stopped", "step-mode stopped"]; +export const TERMINAL_PREFIXES = [ + "auto-mode stopped", + "step-mode stopped", + "auto-mode paused", + "step-mode paused", +]; /** * Idle timeout for short, single-shot commands (status, queue, history, etc.). @@ -115,7 +122,11 @@ export function isBlockedNotification(event: Record): boolean { return false; const message = String(event.message ?? "").toLowerCase(); // Blocked notifications come through stopAuto as "Auto-mode stopped (Blocked: ...)" - return message.includes("blocked:"); + return ( + message.includes("blocked:") || + message.startsWith("auto-mode paused") || + message.startsWith("step-mode paused") + ); } export function isMilestoneReadyNotification( diff --git a/src/resources/extensions/sf/auto-timeout-recovery.ts b/src/resources/extensions/sf/auto-timeout-recovery.ts index 5fb31b162..64d744a02 100644 --- a/src/resources/extensions/sf/auto-timeout-recovery.ts +++ b/src/resources/extensions/sf/auto-timeout-recovery.ts @@ -4,7 +4,8 @@ * and blocker placeholder generation. */ -import { existsSync } from "node:fs"; +import { existsSync, readFileSync } from "node:fs"; +import { relative } from "node:path"; import type { ExtensionAPI, ExtensionContext, @@ -13,8 +14,18 @@ import { resolveAgentEnd } from "./auto-loop.js"; import { diagnoseExpectedArtifact, resolveExpectedArtifactPath, + verifyExpectedArtifact, writeBlockerPlaceholder, } from "./auto-recovery.js"; +import { parseRoadmap as parseLegacyRoadmap } from "./parsers-legacy.js"; +import { + relMilestoneFile, + relSliceFile, + resolveMilestoneFile, + resolveSliceFile, +} from "./paths.js"; +import { getSlice, isDbAvailable } from "./sf-db.js"; +import { parseUnitId } from "./unit-id.js"; import { formatExecuteTaskRecoveryStatus, inspectExecuteTaskDurability, @@ -29,6 +40,92 @@ export interface RecoveryContext { unitRecoveryCount: Map; } +export interface UnitRecoveryStatus { + expected: string; + verified: boolean; + missing: string[]; +} + +function relToBase(basePath: string, path: string): string { + const rel = relative(basePath, path); + return rel && !rel.startsWith("..") ? rel : path; +} + +function formatSliceDbStatus(status: string | undefined): string { + return status ? `DB slice status is "${status}" (expected complete)` : ""; +} + +export function inspectUnitRecoveryStatus( + unitType: string, + unitId: string, + basePath: string, +): UnitRecoveryStatus { + const expected = + diagnoseExpectedArtifact(unitType, unitId, basePath) ?? + "required durable artifact"; + const missing: string[] = []; + const artifactPath = resolveExpectedArtifactPath(unitType, unitId, basePath); + if (!artifactPath) { + missing.push(`artifact path could not be resolved for ${expected}`); + } else if (!existsSync(artifactPath)) { + missing.push(`artifact missing (${relToBase(basePath, artifactPath)})`); + } + + if (unitType === "complete-slice") { + const { milestone: mid, slice: sid } = parseUnitId(unitId); + if (mid && sid) { + const uatPath = resolveSliceFile(basePath, mid, sid, "UAT"); + if (!uatPath || !existsSync(uatPath)) { + missing.push( + `UAT missing (${relSliceFile(basePath, mid, sid, "UAT")})`, + ); + } + + const dbSlice = getSlice(mid, sid); + if (dbSlice) { + const dbStatus = formatSliceDbStatus(dbSlice.status); + if (dbStatus && dbSlice.status !== "complete") missing.push(dbStatus); + } else if (!isDbAvailable()) { + const roadmapPath = resolveMilestoneFile(basePath, mid, "ROADMAP"); + if (!roadmapPath || !existsSync(roadmapPath)) { + missing.push( + `roadmap missing (${relMilestoneFile(basePath, mid, "ROADMAP")})`, + ); + } else { + try { + const roadmap = parseLegacyRoadmap( + readFileSync(roadmapPath, "utf-8"), + ); + const slice = roadmap.slices.find((s) => s.id === sid); + if (slice && !slice.done) { + missing.push( + `roadmap checkbox not marked [x] (${relMilestoneFile(basePath, mid, "ROADMAP")})`, + ); + } + } catch { + missing.push( + `roadmap could not be parsed (${relMilestoneFile(basePath, mid, "ROADMAP")})`, + ); + } + } + } + } + } + + const verified = verifyExpectedArtifact(unitType, unitId, basePath); + if (!verified && missing.length === 0) { + missing.push(`artifact verification failed for ${expected}`); + } + + return { expected, verified, missing }; +} + +export function formatUnitRecoveryStatus(status: UnitRecoveryStatus): string { + return status.missing.length > 0 + ? status.missing.join("; ") + : "all durable artifacts present"; +} + export async function recoverTimedOutUnit( ctx: ExtensionContext, pi: ExtensionAPI, @@ -169,21 +266,19 @@ export async function recoverTimedOutUnit( return "paused"; } - const expected = - diagnoseExpectedArtifact(unitType, unitId, basePath) ?? - "required durable artifact"; + const status = inspectUnitRecoveryStatus(unitType, unitId, basePath); + const diagnostic = formatUnitRecoveryStatus(status); - // Check if the artifact already exists on disk — agent may have written it - // without signaling completion. - const artifactPath = resolveExpectedArtifactPath(unitType, unitId, basePath); - if (artifactPath && existsSync(artifactPath)) { + // Check full unit durability — the primary artifact alone is not enough for + // units such as complete-slice, which also require UAT and state transition. + if (status.verified) { writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnitStartedAt, { phase: "finalized", recoveryAttempts: recoveryAttempts + 1, lastRecoveryReason: reason, }); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing. (attempt ${attemptNumber})`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} durable state already verified. Advancing. (attempt ${attemptNumber})`, "info", ); unitRecoveryCount.delete(recoveryKey); @@ -208,19 +303,21 @@ export async function recoverTimedOutUnit( `**FINAL ${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — last chance before skip.**`, `You are still executing ${unitType} ${unitId}.`, `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts} — next failure skips this unit.`, - `Expected durable output: ${expected}.`, - "You MUST write the artifact file NOW, even if incomplete.", - "Write whatever you have — partial research, preliminary findings, best-effort analysis.", - "A partial artifact is infinitely better than no artifact.", + `Expected durable output: ${status.expected}.`, + `Current durability status: ${diagnostic}.`, + "Repair only the missing durability items listed above.", + "You MUST write the missing artifact/state NOW, even if incomplete.", + "Write whatever you have — partial research, preliminary findings, best-effort analysis — and record the remaining blocker explicitly.", "If you are truly blocked, write the file with a BLOCKER section explaining why.", ] : [ `**${reason === "idle" ? "IDLE" : "HARD TIMEOUT"} RECOVERY — stay in auto-mode.**`, `You are still executing ${unitType} ${unitId}.`, `Recovery attempt ${recoveryAttempts + 1} of ${maxRecoveryAttempts}.`, - `Expected durable output: ${expected}.`, - "Stop broad exploration.", - "Write the required artifact now.", + `Expected durable output: ${status.expected}.`, + `Current durability status: ${diagnostic}.`, + "Stop broad exploration and do not redo work that is already durable.", + "Repair only the missing durability items listed above.", "If blocked, write the partial artifact and explicitly record the blocker instead of going silent.", ]; @@ -233,7 +330,7 @@ export async function recoverTimedOutUnit( { triggerTurn: true, deliverAs: "steer" }, ); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${attemptNumber}, session ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to repair ${diagnostic} (attempt ${attemptNumber}, session ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`, "warning", ); return "recovered"; @@ -250,7 +347,7 @@ export async function recoverTimedOutUnit( lastRecoveryReason: reason, }); ctx.ui.notify( - `Milestone ${unitId} ${reason}-recovery exhausted ${maxRecoveryAttempts} attempt(s) — worktree branch preserved. Re-run /sf auto once blockers are resolved.`, + `Milestone ${unitId} ${reason}-recovery exhausted ${maxRecoveryAttempts} attempt(s): ${diagnostic}. Worktree branch preserved. Re-run /sf auto once blockers are resolved.`, "error", ); return "paused"; @@ -262,7 +359,7 @@ export async function recoverTimedOutUnit( unitType, unitId, basePath, - `${reason} recovery exhausted ${maxRecoveryAttempts} attempts without producing the artifact.`, + `${reason} recovery exhausted ${maxRecoveryAttempts} attempts. Status: ${diagnostic}.`, ); if (placeholder) { diff --git a/src/resources/extensions/sf/tests/auto-timeout-recovery.test.ts b/src/resources/extensions/sf/tests/auto-timeout-recovery.test.ts new file mode 100644 index 000000000..4cf3a6bbb --- /dev/null +++ b/src/resources/extensions/sf/tests/auto-timeout-recovery.test.ts @@ -0,0 +1,117 @@ +import assert from "node:assert/strict"; +import { + mkdirSync, + mkdtempSync, + readFileSync, + rmSync, + writeFileSync, +} from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { test } from "node:test"; +import { + formatUnitRecoveryStatus, + inspectUnitRecoveryStatus, + type RecoveryContext, + recoverTimedOutUnit, +} from "../auto-timeout-recovery.ts"; +import { closeDatabase, isDbAvailable } from "../sf-db.ts"; + +const ROADMAP_INCOMPLETE = `# M001: Test Milestone + +## Slices + +- [ ] **S01: Test Slice** \`risk:low\` +> After this: something works +`; + +function createCompleteSliceFixture(): string { + const base = mkdtempSync(join(tmpdir(), "sf-timeout-recovery-test-")); + const sliceDir = join(base, ".sf", "milestones", "M001", "slices", "S01"); + mkdirSync(sliceDir, { recursive: true }); + writeFileSync( + join(base, ".sf", "milestones", "M001", "M001-ROADMAP.md"), + ROADMAP_INCOMPLETE, + "utf-8", + ); + writeFileSync(join(sliceDir, "S01-SUMMARY.md"), "# Summary\n", "utf-8"); + return base; +} + +function makeRecoveryContext(base: string): RecoveryContext { + return { + basePath: base, + verbose: false, + currentUnitStartedAt: Date.now(), + unitRecoveryCount: new Map(), + }; +} + +test("inspectUnitRecoveryStatus reports complete-slice gaps even when SUMMARY exists", () => { + if (isDbAvailable()) closeDatabase(); + const base = createCompleteSliceFixture(); + try { + const status = inspectUnitRecoveryStatus( + "complete-slice", + "M001/S01", + base, + ); + const diagnostic = formatUnitRecoveryStatus(status); + + assert.equal(status.verified, false); + assert.ok( + !diagnostic.includes("S01-SUMMARY.md"), + "existing SUMMARY must not be reported as missing", + ); + assert.match(diagnostic, /UAT missing/); + assert.match(diagnostic, /roadmap checkbox not marked \[x\]/); + } finally { + rmSync(base, { recursive: true, force: true }); + } +}); + +test("recoverTimedOutUnit steers complete-slice with exact missing durability items", async () => { + if (isDbAvailable()) closeDatabase(); + const base = createCompleteSliceFixture(); + const sent: string[] = []; + const notifications: string[] = []; + try { + const result = await recoverTimedOutUnit( + { + ui: { + notify: (message: string) => notifications.push(message), + }, + } as any, + { + sendMessage: (message: { content: string }) => { + sent.push(message.content); + }, + } as any, + "complete-slice", + "M001/S01", + "idle", + makeRecoveryContext(base), + ); + + assert.equal(result, "recovered"); + assert.equal(sent.length, 1, "should steer instead of finalizing"); + assert.match(sent[0], /Current durability status:/); + assert.match(sent[0], /UAT missing/); + assert.match(sent[0], /roadmap checkbox not marked \[x\]/); + assert.ok( + notifications.some((n) => n.includes("to repair")), + "notification should explain the exact repair target", + ); + assert.ok( + notifications.every((n) => !n.includes("durable state already verified")), + "summary-only state must not be treated as recovered", + ); + const runtimeRecord = readFileSync( + join(base, ".sf", "runtime", "units", "complete-slice-M001-S01.json"), + "utf-8", + ); + assert.match(runtimeRecord, /"phase": "recovered"/); + } finally { + rmSync(base, { recursive: true, force: true }); + } +}); diff --git a/src/tests/headless-detection.test.ts b/src/tests/headless-detection.test.ts index ce07a6ae9..838907853 100644 --- a/src/tests/headless-detection.test.ts +++ b/src/tests/headless-detection.test.ts @@ -15,7 +15,12 @@ import test from "node:test"; // ─── Extracted detection logic (mirrors headless.ts) ──────────────────────── -const TERMINAL_PREFIXES = ["auto-mode stopped", "step-mode stopped"]; +const TERMINAL_PREFIXES = [ + "auto-mode stopped", + "step-mode stopped", + "auto-mode paused", + "step-mode paused", +]; function isTerminalNotification(event: Record): boolean { if (event.type !== "extension_ui_request" || event.method !== "notify") @@ -28,7 +33,11 @@ function isBlockedNotification(event: Record): boolean { if (event.type !== "extension_ui_request" || event.method !== "notify") return false; const message = String(event.message ?? "").toLowerCase(); - return message.includes("blocked:"); + return ( + message.includes("blocked:") || + message.startsWith("auto-mode paused") || + message.startsWith("step-mode paused") + ); } function makeNotify(message: string): Record { @@ -71,6 +80,16 @@ test("detects 'Step-mode stopped.' as terminal", () => { assert.ok(isTerminalNotification(makeNotify("Step-mode stopped."))); }); +test("detects provider-error auto pause as terminal", () => { + assert.ok( + isTerminalNotification( + makeNotify( + "Auto-mode paused due to provider error: Schema overload: consecutive tool validation failures exceeded cap", + ), + ), + ); +}); + // ─── False positives that previously triggered early exit (#879) ──────────── test("does NOT match 'All slices are complete — nothing to discuss.'", () => { @@ -150,6 +169,17 @@ test("detects inline 'Blocked:' message", () => { ); }); +test("detects auto pause as blocked for headless callers", () => { + assert.ok( + isBlockedNotification( + makeNotify( + "Auto-mode paused due to provider error: Schema overload: consecutive tool validation failures exceeded cap", + ), + ), + ); + assert.ok(isBlockedNotification(makeNotify("Auto-mode paused (Escape)."))); +}); + test("does NOT match 'blocked' without colon (avoids false positives)", () => { assert.ok( !isBlockedNotification( diff --git a/src/tests/headless-events.test.ts b/src/tests/headless-events.test.ts index abd3eca57..6c5746927 100644 --- a/src/tests/headless-events.test.ts +++ b/src/tests/headless-events.test.ts @@ -194,7 +194,9 @@ import { EXIT_CANCELLED, EXIT_ERROR, EXIT_SUCCESS, + isBlockedNotification, isInteractiveHeadlessTool, + isTerminalNotification, mapStatusToExitCode, shouldArmHeadlessIdleTimeout, } from "../headless-events.js"; @@ -246,6 +248,29 @@ test("isInteractiveHeadlessTool: non-interactive tools stay false", () => { assert.equal(isInteractiveHeadlessTool(undefined), false); }); +test("isTerminalNotification: auto pause is terminal for headless callers", () => { + assert.equal( + isTerminalNotification({ + type: "extension_ui_request", + method: "notify", + message: + "Auto-mode paused due to provider error: Schema overload: consecutive tool validation failures exceeded cap", + }), + true, + ); +}); + +test("isBlockedNotification: auto pause exits as blocked", () => { + assert.equal( + isBlockedNotification({ + type: "extension_ui_request", + method: "notify", + message: "Auto-mode paused (Escape).", + }), + true, + ); +}); + test("shouldArmHeadlessIdleTimeout: arms after tool calls when no interactive tool is in flight", () => { assert.equal(shouldArmHeadlessIdleTimeout(1, 0), true); assert.equal(shouldArmHeadlessIdleTimeout(3, 0), true);