Merge pull request #3758 from jeremymcs/fix/pre-verification-timeout-guard
fix(gsd): add timeout guard around postUnitPreVerification
This commit is contained in:
commit
0c62fada0e
6 changed files with 214 additions and 15 deletions
|
|
@ -7,6 +7,9 @@
|
|||
* Leaf module — no imports from auto/ to avoid circular dependencies.
|
||||
*/
|
||||
|
||||
/** Timeout for postUnitPreVerification in runFinalize (ms). */
|
||||
export const FINALIZE_PRE_TIMEOUT_MS = 60_000;
|
||||
|
||||
/** Timeout for postUnitPostVerification in runFinalize (ms). */
|
||||
export const FINALIZE_POST_TIMEOUT_MS = 60_000;
|
||||
|
||||
|
|
|
|||
|
|
@ -46,7 +46,7 @@ export async function autoLoop(
|
|||
): Promise<void> {
|
||||
debugLog("autoLoop", { phase: "enter" });
|
||||
let iteration = 0;
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
let consecutiveErrors = 0;
|
||||
const recentErrorMessages: string[] = [];
|
||||
|
||||
|
|
@ -247,7 +247,7 @@ export async function autoLoop(
|
|||
|
||||
// ── Phase 5: Finalize ───────────────────────────────────────────────
|
||||
|
||||
const finalizeResult = await runFinalize(ic, iterData, sidecarItem);
|
||||
const finalizeResult = await runFinalize(ic, iterData, loopState, sidecarItem);
|
||||
if (finalizeResult.action === "break") break;
|
||||
if (finalizeResult.action === "continue") continue;
|
||||
|
||||
|
|
|
|||
|
|
@ -15,6 +15,7 @@ import type { PostUnitContext, PreVerificationOpts } from "../auto-post-unit.js"
|
|||
import {
|
||||
MAX_RECOVERY_CHARS,
|
||||
BUDGET_THRESHOLDS,
|
||||
MAX_FINALIZE_TIMEOUTS,
|
||||
type PhaseResult,
|
||||
type IterationContext,
|
||||
type LoopState,
|
||||
|
|
@ -33,7 +34,7 @@ import { gsdRoot } from "../paths.js";
|
|||
import { atomicWriteSync } from "../atomic-write.js";
|
||||
import { verifyExpectedArtifact, diagnoseExpectedArtifact, buildLoopRemediationSteps } from "../auto-recovery.js";
|
||||
import { writeUnitRuntimeRecord } from "../unit-runtime.js";
|
||||
import { withTimeout, FINALIZE_POST_TIMEOUT_MS } from "./finalize-timeout.js";
|
||||
import { withTimeout, FINALIZE_PRE_TIMEOUT_MS, FINALIZE_POST_TIMEOUT_MS } from "./finalize-timeout.js";
|
||||
import { getEligibleSlices } from "../slice-parallel-eligibility.js";
|
||||
import { startSliceParallel } from "../slice-parallel-orchestrator.js";
|
||||
import { isDbAvailable, getMilestoneSlices } from "../gsd-db.js";
|
||||
|
|
@ -1427,6 +1428,7 @@ export async function runUnitPhase(
|
|||
export async function runFinalize(
|
||||
ic: IterationContext,
|
||||
iterData: IterationData,
|
||||
loopState: LoopState,
|
||||
sidecarItem?: SidecarItem,
|
||||
): Promise<PhaseResult> {
|
||||
const { ctx, pi, s, deps } = ic;
|
||||
|
|
@ -1450,13 +1452,58 @@ export async function runFinalize(
|
|||
};
|
||||
|
||||
// Pre-verification processing (commit, doctor, state rebuild, etc.)
|
||||
// Timeout guard: if postUnitPreVerification hangs (e.g., safety harness
|
||||
// deadlock, browser teardown hang, worktree sync stall), force-continue
|
||||
// after timeout so the auto-loop is not permanently frozen (#3757).
|
||||
//
|
||||
// On timeout, null out s.currentUnit so the timed-out task's late async
|
||||
// mutations are harmless — postUnitPreVerification guards all side effects
|
||||
// behind `if (s.currentUnit)`. The next iteration sets a fresh currentUnit.
|
||||
// Sidecar items use lightweight pre-verification opts
|
||||
const preVerificationOpts: PreVerificationOpts | undefined = sidecarItem
|
||||
? sidecarItem.kind === "hook"
|
||||
? { skipSettleDelay: true, skipWorktreeSync: true }
|
||||
: { skipSettleDelay: true }
|
||||
: undefined;
|
||||
const preResult = await deps.postUnitPreVerification(postUnitCtx, preVerificationOpts);
|
||||
const preUnitSnapshot = s.currentUnit
|
||||
? { type: s.currentUnit.type, id: s.currentUnit.id, startedAt: s.currentUnit.startedAt }
|
||||
: null;
|
||||
const preResultGuard = await withTimeout(
|
||||
deps.postUnitPreVerification(postUnitCtx, preVerificationOpts),
|
||||
FINALIZE_PRE_TIMEOUT_MS,
|
||||
"postUnitPreVerification",
|
||||
);
|
||||
|
||||
if (preResultGuard.timedOut) {
|
||||
// Detach session from the timed-out unit so late async completions
|
||||
// cannot mutate state for the next unit (#3757).
|
||||
s.currentUnit = null;
|
||||
loopState.consecutiveFinalizeTimeouts++;
|
||||
debugLog("autoLoop", {
|
||||
phase: "pre-verification-timeout",
|
||||
iteration: ic.iteration,
|
||||
unitType: iterData.unitType,
|
||||
unitId: iterData.unitId,
|
||||
consecutiveTimeouts: loopState.consecutiveFinalizeTimeouts,
|
||||
});
|
||||
|
||||
if (loopState.consecutiveFinalizeTimeouts >= MAX_FINALIZE_TIMEOUTS) {
|
||||
ctx.ui.notify(
|
||||
`postUnitPreVerification timed out ${loopState.consecutiveFinalizeTimeouts} consecutive times — stopping auto-mode to prevent budget waste`,
|
||||
"error",
|
||||
);
|
||||
await deps.stopAuto(ctx, pi, `${loopState.consecutiveFinalizeTimeouts} consecutive finalize timeouts`);
|
||||
return { action: "break", reason: "finalize-timeout-escalation" };
|
||||
}
|
||||
|
||||
ctx.ui.notify(
|
||||
`postUnitPreVerification timed out after ${FINALIZE_PRE_TIMEOUT_MS / 1000}s for ${iterData.unitType} ${iterData.unitId} (${loopState.consecutiveFinalizeTimeouts}/${MAX_FINALIZE_TIMEOUTS}) — continuing to next iteration`,
|
||||
"warning",
|
||||
);
|
||||
return { action: "next", data: undefined as void };
|
||||
}
|
||||
|
||||
const preResult = preResultGuard.value;
|
||||
if (preResult === "dispatched") {
|
||||
debugLog("autoLoop", {
|
||||
phase: "exit",
|
||||
|
|
@ -1525,14 +1572,29 @@ export async function runFinalize(
|
|||
);
|
||||
|
||||
if (postResultGuard.timedOut) {
|
||||
// Detach session from the timed-out unit so late async completions
|
||||
// cannot mutate state for the next unit (#3757).
|
||||
s.currentUnit = null;
|
||||
loopState.consecutiveFinalizeTimeouts++;
|
||||
debugLog("autoLoop", {
|
||||
phase: "post-verification-timeout",
|
||||
iteration: ic.iteration,
|
||||
unitType: iterData.unitType,
|
||||
unitId: iterData.unitId,
|
||||
consecutiveTimeouts: loopState.consecutiveFinalizeTimeouts,
|
||||
});
|
||||
|
||||
if (loopState.consecutiveFinalizeTimeouts >= MAX_FINALIZE_TIMEOUTS) {
|
||||
ctx.ui.notify(
|
||||
`postUnitPostVerification timed out ${loopState.consecutiveFinalizeTimeouts} consecutive times — stopping auto-mode to prevent budget waste`,
|
||||
"error",
|
||||
);
|
||||
await deps.stopAuto(ctx, pi, `${loopState.consecutiveFinalizeTimeouts} consecutive finalize timeouts`);
|
||||
return { action: "break", reason: "finalize-timeout-escalation" };
|
||||
}
|
||||
|
||||
ctx.ui.notify(
|
||||
`postUnitPostVerification timed out after ${FINALIZE_POST_TIMEOUT_MS / 1000}s for ${iterData.unitType} ${iterData.unitId} — continuing to next iteration`,
|
||||
`postUnitPostVerification timed out after ${FINALIZE_POST_TIMEOUT_MS / 1000}s for ${iterData.unitType} ${iterData.unitId} (${loopState.consecutiveFinalizeTimeouts}/${MAX_FINALIZE_TIMEOUTS}) — continuing to next iteration`,
|
||||
"warning",
|
||||
);
|
||||
return { action: "next", data: undefined as void };
|
||||
|
|
@ -1554,6 +1616,9 @@ export async function runFinalize(
|
|||
return { action: "break", reason: "step-wizard" };
|
||||
}
|
||||
|
||||
// Both pre and post verification completed without timeout — reset counter
|
||||
loopState.consecutiveFinalizeTimeouts = 0;
|
||||
|
||||
return { action: "next", data: undefined as void };
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -91,8 +91,13 @@ export interface IterationContext {
|
|||
export interface LoopState {
|
||||
recentUnits: Array<{ key: string; error?: string }>;
|
||||
stuckRecoveryAttempts: number;
|
||||
/** Consecutive finalize timeout count — stops auto-mode after threshold. */
|
||||
consecutiveFinalizeTimeouts: number;
|
||||
}
|
||||
|
||||
/** Max consecutive finalize timeouts before hard-stopping auto-mode. */
|
||||
export const MAX_FINALIZE_TIMEOUTS = 3;
|
||||
|
||||
export interface PreDispatchData {
|
||||
state: GSDState;
|
||||
mid: string;
|
||||
|
|
|
|||
|
|
@ -19,8 +19,10 @@
|
|||
import { createTestContext } from "./test-helpers.ts";
|
||||
import {
|
||||
withTimeout,
|
||||
FINALIZE_PRE_TIMEOUT_MS,
|
||||
FINALIZE_POST_TIMEOUT_MS,
|
||||
} from "../auto/finalize-timeout.ts";
|
||||
import { MAX_FINALIZE_TIMEOUTS } from "../auto/types.ts";
|
||||
|
||||
const { assertTrue, assertEq, report } = createTestContext();
|
||||
|
||||
|
|
@ -78,6 +80,25 @@ const { assertTrue, assertEq, report } = createTestContext();
|
|||
assertTrue(caught, "rejection should propagate");
|
||||
}
|
||||
|
||||
// ═══ Test: FINALIZE_PRE_TIMEOUT_MS is defined and reasonable ═════════════════
|
||||
|
||||
{
|
||||
console.log("\n=== #3757: pre-verification timeout constant is defined and reasonable ===");
|
||||
|
||||
assertTrue(
|
||||
typeof FINALIZE_PRE_TIMEOUT_MS === "number",
|
||||
"FINALIZE_PRE_TIMEOUT_MS should be a number",
|
||||
);
|
||||
assertTrue(
|
||||
FINALIZE_PRE_TIMEOUT_MS >= 30_000,
|
||||
`pre timeout should be >= 30s (got ${FINALIZE_PRE_TIMEOUT_MS}ms)`,
|
||||
);
|
||||
assertTrue(
|
||||
FINALIZE_PRE_TIMEOUT_MS <= 120_000,
|
||||
`pre timeout should be <= 120s (got ${FINALIZE_PRE_TIMEOUT_MS}ms)`,
|
||||
);
|
||||
}
|
||||
|
||||
// ═══ Test: FINALIZE_POST_TIMEOUT_MS is defined and reasonable ═════════════════
|
||||
|
||||
{
|
||||
|
|
@ -113,4 +134,108 @@ const { assertTrue, assertEq, report } = createTestContext();
|
|||
assertEq(result.timedOut, false, "should not time out");
|
||||
}
|
||||
|
||||
// ═══ Test: runFinalize wraps BOTH pre and post verification with withTimeout ═
|
||||
|
||||
{
|
||||
console.log("\n=== #3757: runFinalize wraps preVerification with timeout guard ===");
|
||||
|
||||
const { readFileSync } = await import("node:fs");
|
||||
const phasesSource = readFileSync(
|
||||
new URL("../auto/phases.ts", import.meta.url),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
// Find the runFinalize function body
|
||||
const fnIdx = phasesSource.indexOf("export async function runFinalize(");
|
||||
assertTrue(fnIdx > 0, "runFinalize function should exist in phases.ts");
|
||||
|
||||
const fnBody = phasesSource.slice(fnIdx, fnIdx + 8000);
|
||||
|
||||
// postUnitPreVerification must be wrapped in withTimeout
|
||||
const preTimeoutIdx = fnBody.indexOf("withTimeout(");
|
||||
assertTrue(preTimeoutIdx > 0, "withTimeout should appear in runFinalize");
|
||||
|
||||
const preVerIdx = fnBody.indexOf("postUnitPreVerification");
|
||||
assertTrue(preVerIdx > 0, "postUnitPreVerification should appear in runFinalize");
|
||||
|
||||
// The first withTimeout should wrap postUnitPreVerification (not postUnitPostVerification)
|
||||
const firstWithTimeout = fnBody.slice(preTimeoutIdx, preTimeoutIdx + 200);
|
||||
assertTrue(
|
||||
firstWithTimeout.includes("postUnitPreVerification"),
|
||||
"first withTimeout in runFinalize should wrap postUnitPreVerification",
|
||||
);
|
||||
|
||||
// postUnitPostVerification must also be wrapped
|
||||
const postVerIdx = fnBody.indexOf("postUnitPostVerification");
|
||||
assertTrue(postVerIdx > 0, "postUnitPostVerification should appear in runFinalize");
|
||||
|
||||
// Count withTimeout occurrences — should be at least 2 (pre + post)
|
||||
const timeoutCount = (fnBody.match(/withTimeout\(/g) || []).length;
|
||||
assertTrue(
|
||||
timeoutCount >= 2,
|
||||
`runFinalize should have at least 2 withTimeout guards (found ${timeoutCount})`,
|
||||
);
|
||||
}
|
||||
|
||||
// ═══ Test: MAX_FINALIZE_TIMEOUTS is defined and reasonable ═══════════════════
|
||||
|
||||
{
|
||||
console.log("\n=== #3757: MAX_FINALIZE_TIMEOUTS is defined and reasonable ===");
|
||||
|
||||
assertTrue(
|
||||
typeof MAX_FINALIZE_TIMEOUTS === "number",
|
||||
"MAX_FINALIZE_TIMEOUTS should be a number",
|
||||
);
|
||||
assertTrue(
|
||||
MAX_FINALIZE_TIMEOUTS >= 2,
|
||||
`threshold should be >= 2 (got ${MAX_FINALIZE_TIMEOUTS})`,
|
||||
);
|
||||
assertTrue(
|
||||
MAX_FINALIZE_TIMEOUTS <= 10,
|
||||
`threshold should be <= 10 (got ${MAX_FINALIZE_TIMEOUTS})`,
|
||||
);
|
||||
}
|
||||
|
||||
// ═══ Test: timeout handlers escalate after consecutive timeouts ══════════════
|
||||
|
||||
{
|
||||
console.log("\n=== #3757: timeout handlers escalate and detach currentUnit ===");
|
||||
|
||||
const { readFileSync } = await import("node:fs");
|
||||
const phasesSource = readFileSync(
|
||||
new URL("../auto/phases.ts", import.meta.url),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
const fnIdx = phasesSource.indexOf("export async function runFinalize(");
|
||||
const fnBody = phasesSource.slice(fnIdx, fnIdx + 8000);
|
||||
|
||||
// Both timeout handlers should increment consecutiveFinalizeTimeouts
|
||||
const incrementCount = (fnBody.match(/consecutiveFinalizeTimeouts\+\+/g) || []).length;
|
||||
assertTrue(
|
||||
incrementCount >= 2,
|
||||
`should increment consecutiveFinalizeTimeouts in both pre and post handlers (found ${incrementCount})`,
|
||||
);
|
||||
|
||||
// Both timeout handlers should check MAX_FINALIZE_TIMEOUTS for escalation
|
||||
const escalationCount = (fnBody.match(/MAX_FINALIZE_TIMEOUTS/g) || []).length;
|
||||
assertTrue(
|
||||
escalationCount >= 2,
|
||||
`should check MAX_FINALIZE_TIMEOUTS in both handlers (found ${escalationCount})`,
|
||||
);
|
||||
|
||||
// Both timeout handlers should null out s.currentUnit to prevent late mutations
|
||||
const detachCount = (fnBody.match(/s\.currentUnit\s*=\s*null/g) || []).length;
|
||||
assertTrue(
|
||||
detachCount >= 2,
|
||||
`should detach s.currentUnit in both timeout handlers (found ${detachCount})`,
|
||||
);
|
||||
|
||||
// Successful finalize should reset the counter
|
||||
assertTrue(
|
||||
fnBody.includes("consecutiveFinalizeTimeouts = 0"),
|
||||
"should reset consecutiveFinalizeTimeouts on successful finalize",
|
||||
);
|
||||
}
|
||||
|
||||
report();
|
||||
|
|
|
|||
|
|
@ -216,7 +216,7 @@ test("runDispatch emits dispatch-match with correct rule and flowId", async () =
|
|||
mid: "M001",
|
||||
midTitle: "Test Milestone",
|
||||
};
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
|
||||
const result = await runDispatch(ic, preData, loopState);
|
||||
|
||||
|
|
@ -248,7 +248,7 @@ test("runDispatch emits dispatch-stop when dispatch returns stop action", async
|
|||
mid: "M001",
|
||||
midTitle: "Test",
|
||||
};
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
|
||||
const result = await runDispatch(ic, preData, loopState);
|
||||
assert.equal(result.action, "break");
|
||||
|
|
@ -303,6 +303,7 @@ test("runDispatch checks prior-slice completion against the project root in work
|
|||
const result = await runDispatch(ic, preData, {
|
||||
recentUnits: [],
|
||||
stuckRecoveryAttempts: 0,
|
||||
consecutiveFinalizeTimeouts: 0,
|
||||
});
|
||||
|
||||
assert.equal(result.action, "next");
|
||||
|
|
@ -343,7 +344,7 @@ test("runUnitPhase emits unit-start and unit-end with causedBy reference", async
|
|||
isRetry: false,
|
||||
previousTier: undefined,
|
||||
};
|
||||
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
|
||||
// Start runUnitPhase (it will block on runUnit internally)
|
||||
const unitPromise = runUnitPhase(ic, iterData, loopState);
|
||||
|
|
@ -400,7 +401,7 @@ test("all events from a mock iteration have monotonically increasing seq and sam
|
|||
mid: "M001",
|
||||
midTitle: "Test",
|
||||
};
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
const dispatchResult = await runDispatch(ic, preData, loopState);
|
||||
assert.equal(dispatchResult.action, "next");
|
||||
|
||||
|
|
@ -446,7 +447,7 @@ test("dispatch-match events include matchedRule field matching the rule name", a
|
|||
midTitle: "Test",
|
||||
};
|
||||
|
||||
await runDispatch(ic, preData, { recentUnits: [], stuckRecoveryAttempts: 0 });
|
||||
await runDispatch(ic, preData, { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 });
|
||||
|
||||
const matchEvents = capture.events.filter(e => e.eventType === "dispatch-match");
|
||||
assert.equal(matchEvents.length, 1);
|
||||
|
|
@ -475,7 +476,7 @@ test("pre-dispatch-hook event is emitted when hooks fire", async () => {
|
|||
midTitle: "Test",
|
||||
};
|
||||
|
||||
await runDispatch(ic, preData, { recentUnits: [], stuckRecoveryAttempts: 0 });
|
||||
await runDispatch(ic, preData, { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 });
|
||||
|
||||
const hookEvents = capture.events.filter(e => e.eventType === "pre-dispatch-hook");
|
||||
assert.equal(hookEvents.length, 1, "should emit one pre-dispatch-hook event");
|
||||
|
|
@ -497,7 +498,7 @@ test("terminal event is emitted on milestone-complete", async () => {
|
|||
}) as any,
|
||||
});
|
||||
const ic = makeIC(deps);
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
|
||||
const result = await runPreDispatch(ic, loopState);
|
||||
assert.equal(result.action, "break");
|
||||
|
|
@ -521,7 +522,7 @@ test("terminal event is emitted on blocked state", async () => {
|
|||
}) as any,
|
||||
});
|
||||
const ic = makeIC(deps);
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
|
||||
const result = await runPreDispatch(ic, loopState);
|
||||
assert.equal(result.action, "break");
|
||||
|
|
@ -550,7 +551,7 @@ test("milestone-transition event is emitted when milestone changes", async () =>
|
|||
const ic = makeIC(deps);
|
||||
// Session says current milestone is M001, but state will return M002
|
||||
ic.s.currentMilestoneId = "M001";
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
|
||||
await runPreDispatch(ic, loopState);
|
||||
|
||||
|
|
@ -580,7 +581,7 @@ test("unit-end event contains errorContext when unit is cancelled with structure
|
|||
isRetry: false,
|
||||
previousTier: undefined,
|
||||
};
|
||||
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0 };
|
||||
const loopState: LoopState = { recentUnits: [{ key: "execute-task/M001/S01/T01" }], stuckRecoveryAttempts: 0, consecutiveFinalizeTimeouts: 0 };
|
||||
|
||||
const unitPromise = runUnitPhase(ic, iterData, loopState);
|
||||
await new Promise(r => setTimeout(r, 50));
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue