Merge pull request #3758 from jeremymcs/fix/pre-verification-timeout-guard

fix(gsd): add timeout guard around postUnitPreVerification
This commit is contained in:
Jeremy McSpadden 2026-04-07 21:28:21 -05:00 committed by GitHub
commit 0c62fada0e
6 changed files with 214 additions and 15 deletions

View file

@ -7,6 +7,9 @@
* Leaf module no imports from auto/ to avoid circular dependencies.
*/
/** Timeout for postUnitPreVerification in runFinalize (ms). */
export const FINALIZE_PRE_TIMEOUT_MS = 60_000;
/** Timeout for postUnitPostVerification in runFinalize (ms). */
export const FINALIZE_POST_TIMEOUT_MS = 60_000;

View file

@ -46,7 +46,7 @@ export async function autoLoop(
): Promise<void> {
debugLog("autoLoop", { phase: "enter" });
let iteration = 0;
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
let consecutiveErrors = 0;
const recentErrorMessages: string[] = [];
@ -247,7 +247,7 @@ export async function autoLoop(
// ── Phase 5: Finalize ───────────────────────────────────────────────
const finalizeResult = await runFinalize(ic, iterData, sidecarItem);
const finalizeResult = await runFinalize(ic, iterData, loopState, sidecarItem);
if (finalizeResult.action === "break") break;
if (finalizeResult.action === "continue") continue;

View file

@ -15,6 +15,7 @@ import type { PostUnitContext, PreVerificationOpts } from "../auto-post-unit.js"
import {
MAX_RECOVERY_CHARS,
BUDGET_THRESHOLDS,
MAX_FINALIZE_TIMEOUTS,
type PhaseResult,
type IterationContext,
type LoopState,
@ -33,7 +34,7 @@ import { gsdRoot } from "../paths.js";
import { atomicWriteSync } from "../atomic-write.js";
import { verifyExpectedArtifact, diagnoseExpectedArtifact, buildLoopRemediationSteps } from "../auto-recovery.js";
import { writeUnitRuntimeRecord } from "../unit-runtime.js";
import { withTimeout, FINALIZE_POST_TIMEOUT_MS } from "./finalize-timeout.js";
import { withTimeout, FINALIZE_PRE_TIMEOUT_MS, FINALIZE_POST_TIMEOUT_MS } from "./finalize-timeout.js";
import { getEligibleSlices } from "../slice-parallel-eligibility.js";
import { startSliceParallel } from "../slice-parallel-orchestrator.js";
import { isDbAvailable, getMilestoneSlices } from "../gsd-db.js";
@ -1427,6 +1428,7 @@ export async function runUnitPhase(
export async function runFinalize(
ic: IterationContext,
iterData: IterationData,
loopState: LoopState,
sidecarItem?: SidecarItem,
): Promise<PhaseResult> {
const { ctx, pi, s, deps } = ic;
@ -1450,13 +1452,58 @@ export async function runFinalize(
};
// Pre-verification processing (commit, doctor, state rebuild, etc.)
// Timeout guard: if postUnitPreVerification hangs (e.g., safety harness
// deadlock, browser teardown hang, worktree sync stall), force-continue
// after timeout so the auto-loop is not permanently frozen (#3757).
//
// On timeout, null out s.currentUnit so the timed-out task's late async
// mutations are harmless — postUnitPreVerification guards all side effects
// behind `if (s.currentUnit)`. The next iteration sets a fresh currentUnit.
// Sidecar items use lightweight pre-verification opts
const preVerificationOpts: PreVerificationOpts | undefined = sidecarItem
? sidecarItem.kind === "hook"
? { skipSettleDelay: true, skipWorktreeSync: true }
: { skipSettleDelay: true }
: undefined;
const preResult = await deps.postUnitPreVerification(postUnitCtx, preVerificationOpts);
const preUnitSnapshot = s.currentUnit
? { type: s.currentUnit.type, id: s.currentUnit.id, startedAt: s.currentUnit.startedAt }
: null;
const preResultGuard = await withTimeout(
deps.postUnitPreVerification(postUnitCtx, preVerificationOpts),
FINALIZE_PRE_TIMEOUT_MS,
"postUnitPreVerification",
);
if (preResultGuard.timedOut) {
// Detach session from the timed-out unit so late async completions
// cannot mutate state for the next unit (#3757).
s.currentUnit = null;
loopState.consecutiveFinalizeTimeouts++;
debugLog("autoLoop", {
phase: "pre-verification-timeout",
iteration: ic.iteration,
unitType: iterData.unitType,
unitId: iterData.unitId,
consecutiveTimeouts: loopState.consecutiveFinalizeTimeouts,
});
if (loopState.consecutiveFinalizeTimeouts >= MAX_FINALIZE_TIMEOUTS) {
ctx.ui.notify(
`postUnitPreVerification timed out ${loopState.consecutiveFinalizeTimeouts} consecutive times — stopping auto-mode to prevent budget waste`,
"error",
);
await deps.stopAuto(ctx, pi, `${loopState.consecutiveFinalizeTimeouts} consecutive finalize timeouts`);
return { action: "break", reason: "finalize-timeout-escalation" };
}
ctx.ui.notify(
`postUnitPreVerification timed out after ${FINALIZE_PRE_TIMEOUT_MS / 1000}s for ${iterData.unitType} ${iterData.unitId} (${loopState.consecutiveFinalizeTimeouts}/${MAX_FINALIZE_TIMEOUTS}) — continuing to next iteration`,
"warning",
);
return { action: "next", data: undefined as void };
}
const preResult = preResultGuard.value;
if (preResult === "dispatched") {
debugLog("autoLoop", {
phase: "exit",
@ -1525,14 +1572,29 @@ export async function runFinalize(
);
if (postResultGuard.timedOut) {
// Detach session from the timed-out unit so late async completions
// cannot mutate state for the next unit (#3757).
s.currentUnit = null;
loopState.consecutiveFinalizeTimeouts++;
debugLog("autoLoop", {
phase: "post-verification-timeout",
iteration: ic.iteration,
unitType: iterData.unitType,
unitId: iterData.unitId,
consecutiveTimeouts: loopState.consecutiveFinalizeTimeouts,
});
if (loopState.consecutiveFinalizeTimeouts >= MAX_FINALIZE_TIMEOUTS) {
ctx.ui.notify(
`postUnitPostVerification timed out ${loopState.consecutiveFinalizeTimeouts} consecutive times — stopping auto-mode to prevent budget waste`,
"error",
);
await deps.stopAuto(ctx, pi, `${loopState.consecutiveFinalizeTimeouts} consecutive finalize timeouts`);
return { action: "break", reason: "finalize-timeout-escalation" };
}
ctx.ui.notify(
`postUnitPostVerification timed out after ${FINALIZE_POST_TIMEOUT_MS / 1000}s for ${iterData.unitType} ${iterData.unitId} — continuing to next iteration`,
`postUnitPostVerification timed out after ${FINALIZE_POST_TIMEOUT_MS / 1000}s for ${iterData.unitType} ${iterData.unitId} (${loopState.consecutiveFinalizeTimeouts}/${MAX_FINALIZE_TIMEOUTS}) — continuing to next iteration`,
"warning",
);
return { action: "next", data: undefined as void };
@ -1554,6 +1616,9 @@ export async function runFinalize(
return { action: "break", reason: "step-wizard" };
}
// Both pre and post verification completed without timeout — reset counter
loopState.consecutiveFinalizeTimeouts = 0;
return { action: "next", data: undefined as void };
}

View file

@ -91,8 +91,13 @@ export interface IterationContext {
export interface LoopState {
recentUnits: Array<{ key: string; error?: string }>;
stuckRecoveryAttempts: number;
/** Consecutive finalize timeout count — stops auto-mode after threshold. */
consecutiveFinalizeTimeouts: number;
}
/** Max consecutive finalize timeouts before hard-stopping auto-mode. */
export const MAX_FINALIZE_TIMEOUTS = 3;
export interface PreDispatchData {
state: GSDState;
mid: string;

View file

@ -19,8 +19,10 @@
import { createTestContext } from "./test-helpers.ts";
import {
withTimeout,
FINALIZE_PRE_TIMEOUT_MS,
FINALIZE_POST_TIMEOUT_MS,
} from "../auto/finalize-timeout.ts";
import { MAX_FINALIZE_TIMEOUTS } from "../auto/types.ts";
const { assertTrue, assertEq, report } = createTestContext();
@ -78,6 +80,25 @@ const { assertTrue, assertEq, report } = createTestContext();
assertTrue(caught, "rejection should propagate");
}
// ═══ Test: FINALIZE_PRE_TIMEOUT_MS is defined and reasonable ═════════════════
{
console.log("\n=== #3757: pre-verification timeout constant is defined and reasonable ===");
assertTrue(
typeof FINALIZE_PRE_TIMEOUT_MS === "number",
"FINALIZE_PRE_TIMEOUT_MS should be a number",
);
assertTrue(
FINALIZE_PRE_TIMEOUT_MS >= 30_000,
`pre timeout should be >= 30s (got ${FINALIZE_PRE_TIMEOUT_MS}ms)`,
);
assertTrue(
FINALIZE_PRE_TIMEOUT_MS <= 120_000,
`pre timeout should be <= 120s (got ${FINALIZE_PRE_TIMEOUT_MS}ms)`,
);
}
// ═══ Test: FINALIZE_POST_TIMEOUT_MS is defined and reasonable ═════════════════
{
@ -113,4 +134,108 @@ const { assertTrue, assertEq, report } = createTestContext();
assertEq(result.timedOut, false, "should not time out");
}
// ═══ Test: runFinalize wraps BOTH pre and post verification with withTimeout ═
{
console.log("\n=== #3757: runFinalize wraps preVerification with timeout guard ===");
const { readFileSync } = await import("node:fs");
const phasesSource = readFileSync(
new URL("../auto/phases.ts", import.meta.url),
"utf-8",
);
// Find the runFinalize function body
const fnIdx = phasesSource.indexOf("export async function runFinalize(");
assertTrue(fnIdx > 0, "runFinalize function should exist in phases.ts");
const fnBody = phasesSource.slice(fnIdx, fnIdx + 8000);
// postUnitPreVerification must be wrapped in withTimeout
const preTimeoutIdx = fnBody.indexOf("withTimeout(");
assertTrue(preTimeoutIdx > 0, "withTimeout should appear in runFinalize");
const preVerIdx = fnBody.indexOf("postUnitPreVerification");
assertTrue(preVerIdx > 0, "postUnitPreVerification should appear in runFinalize");
// The first withTimeout should wrap postUnitPreVerification (not postUnitPostVerification)
const firstWithTimeout = fnBody.slice(preTimeoutIdx, preTimeoutIdx + 200);
assertTrue(
firstWithTimeout.includes("postUnitPreVerification"),
"first withTimeout in runFinalize should wrap postUnitPreVerification",
);
// postUnitPostVerification must also be wrapped
const postVerIdx = fnBody.indexOf("postUnitPostVerification");
assertTrue(postVerIdx > 0, "postUnitPostVerification should appear in runFinalize");
// Count withTimeout occurrences — should be at least 2 (pre + post)
const timeoutCount = (fnBody.match(/withTimeout\(/g) || []).length;
assertTrue(
timeoutCount >= 2,
`runFinalize should have at least 2 withTimeout guards (found ${timeoutCount})`,
);
}
// ═══ Test: MAX_FINALIZE_TIMEOUTS is defined and reasonable ═══════════════════
{
console.log("\n=== #3757: MAX_FINALIZE_TIMEOUTS is defined and reasonable ===");
assertTrue(
typeof MAX_FINALIZE_TIMEOUTS === "number",
"MAX_FINALIZE_TIMEOUTS should be a number",
);
assertTrue(
MAX_FINALIZE_TIMEOUTS >= 2,
`threshold should be >= 2 (got ${MAX_FINALIZE_TIMEOUTS})`,
);
assertTrue(
MAX_FINALIZE_TIMEOUTS <= 10,
`threshold should be <= 10 (got ${MAX_FINALIZE_TIMEOUTS})`,
);
}
// ═══ Test: timeout handlers escalate after consecutive timeouts ══════════════
{
console.log("\n=== #3757: timeout handlers escalate and detach currentUnit ===");
const { readFileSync } = await import("node:fs");
const phasesSource = readFileSync(
new URL("../auto/phases.ts", import.meta.url),
"utf-8",
);
const fnIdx = phasesSource.indexOf("export async function runFinalize(");
const fnBody = phasesSource.slice(fnIdx, fnIdx + 8000);
// Both timeout handlers should increment consecutiveFinalizeTimeouts
const incrementCount = (fnBody.match(/consecutiveFinalizeTimeouts\+\+/g) || []).length;
assertTrue(
incrementCount >= 2,
`should increment consecutiveFinalizeTimeouts in both pre and post handlers (found ${incrementCount})`,
);
// Both timeout handlers should check MAX_FINALIZE_TIMEOUTS for escalation
const escalationCount = (fnBody.match(/MAX_FINALIZE_TIMEOUTS/g) || []).length;
assertTrue(
escalationCount >= 2,
`should check MAX_FINALIZE_TIMEOUTS in both handlers (found ${escalationCount})`,
);
// Both timeout handlers should null out s.currentUnit to prevent late mutations
const detachCount = (fnBody.match(/s\.currentUnit\s*=\s*null/g) || []).length;
assertTrue(
detachCount >= 2,
`should detach s.currentUnit in both timeout handlers (found ${detachCount})`,
);
// Successful finalize should reset the counter
assertTrue(
fnBody.includes("consecutiveFinalizeTimeouts = 0"),
"should reset consecutiveFinalizeTimeouts on successful finalize",
);
}
report();

View file

@ -216,7 +216,7 @@ test("runDispatch emits dispatch-match with correct rule and flowId", async () =
mid: "M001",
midTitle: "Test Milestone",
};
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
const result = await runDispatch(ic, preData, loopState);
@ -248,7 +248,7 @@ test("runDispatch emits dispatch-stop when dispatch returns stop action", async
mid: "M001",
midTitle: "Test",
};
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
const result = await runDispatch(ic, preData, loopState);
assert.equal(result.action, "break");
@ -303,6 +303,7 @@ test("runDispatch checks prior-slice completion against the project root in work
const result = await runDispatch(ic, preData, {
recentUnits: [],
stuckRecoveryAttempts: 0,
consecutiveFinalizeTimeouts: 0,
});
assert.equal(result.action, "next");
@ -343,7 +344,7 @@ test("runUnitPhase emits unit-start and unit-end with causedBy reference", async
isRetry: false,
previousTier: undefined,
};
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
// Start runUnitPhase (it will block on runUnit internally)
const unitPromise = runUnitPhase(ic, iterData, loopState);
@ -400,7 +401,7 @@ test("all events from a mock iteration have monotonically increasing seq and sam
mid: "M001",
midTitle: "Test",
};
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
const dispatchResult = await runDispatch(ic, preData, loopState);
assert.equal(dispatchResult.action, "next");
@ -446,7 +447,7 @@ test("dispatch-match events include matchedRule field matching the rule name", a
midTitle: "Test",
};
await runDispatch(ic, preData, { recentUnits: [], stuckRecoveryAttempts: 0 });
await runDispatch(ic, preData, { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 });
const matchEvents = capture.events.filter(e => e.eventType === "dispatch-match");
assert.equal(matchEvents.length, 1);
@ -475,7 +476,7 @@ test("pre-dispatch-hook event is emitted when hooks fire", async () => {
midTitle: "Test",
};
await runDispatch(ic, preData, { recentUnits: [], stuckRecoveryAttempts: 0 });
await runDispatch(ic, preData, { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 });
const hookEvents = capture.events.filter(e => e.eventType === "pre-dispatch-hook");
assert.equal(hookEvents.length, 1, "should emit one pre-dispatch-hook event");
@ -497,7 +498,7 @@ test("terminal event is emitted on milestone-complete", async () => {
}) as any,
});
const ic = makeIC(deps);
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
const result = await runPreDispatch(ic, loopState);
assert.equal(result.action, "break");
@ -521,7 +522,7 @@ test("terminal event is emitted on blocked state", async () => {
}) as any,
});
const ic = makeIC(deps);
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
const result = await runPreDispatch(ic, loopState);
assert.equal(result.action, "break");
@ -550,7 +551,7 @@ test("milestone-transition event is emitted when milestone changes", async () =>
const ic = makeIC(deps);
// Session says current milestone is M001, but state will return M002
ic.s.currentMilestoneId = "M001";
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
await runPreDispatch(ic, loopState);
@ -580,7 +581,7 @@ test("unit-end event contains errorContext when unit is cancelled with structure
isRetry: false,
previousTier: undefined,
};
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0 };
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
const unitPromise = runUnitPhase(ic, iterData, loopState);
await new Promise(r => setTimeout(r, 50));