diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index 7b52d2efe..7ce720edc 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -178,11 +178,13 @@ import { AutoSession, MAX_UNIT_DISPATCHES, STUB_RECOVERY_THRESHOLD, MAX_LIFETIME_DISPATCHES, MAX_CONSECUTIVE_SKIPS, DISPATCH_GAP_TIMEOUT_MS, MAX_SKIP_DEPTH, + NEW_SESSION_TIMEOUT_MS, DISPATCH_HANG_TIMEOUT_MS, } from "./auto/session.js"; import type { CompletedUnit, CurrentUnit, UnitRouting, StartModel, PendingVerificationRetry } from "./auto/session.js"; export { MAX_UNIT_DISPATCHES, STUB_RECOVERY_THRESHOLD, MAX_LIFETIME_DISPATCHES, MAX_CONSECUTIVE_SKIPS, DISPATCH_GAP_TIMEOUT_MS, MAX_SKIP_DEPTH, + NEW_SESSION_TIMEOUT_MS, DISPATCH_HANG_TIMEOUT_MS, } from "./auto/session.js"; export type { CompletedUnit, CurrentUnit, UnitRouting, StartModel } from "./auto/session.js"; @@ -1882,6 +1884,25 @@ export async function handleAgentEnd( return; } + // ── Dispatch with hang detection (#1073) ──────────────────────────────── + // Start a safety watchdog BEFORE calling dispatchNextUnit. If dispatch + // hangs at any await (newSession, model selection, etc.), the gap watchdog + // inside handleAgentEnd never fires because we never reach the check. + // This pre-dispatch watchdog ensures recovery even when dispatchNextUnit + // itself is permanently blocked. + const dispatchHangGuard = setTimeout(() => { + if (!s.active) return; + // dispatchNextUnit has been running for too long — it's likely hung. + // Start the gap watchdog which will retry dispatch from scratch. + if (!s.unitTimeoutHandle && !s.wrapupWarningHandle) { + ctx.ui.notify( + `Dispatch hang detected (${DISPATCH_HANG_TIMEOUT_MS / 1000}s without completion). Starting recovery watchdog.`, + "warning", + ); + startDispatchGapWatchdog(ctx, pi); + } + }, DISPATCH_HANG_TIMEOUT_MS); + try { await dispatchNextUnit(ctx, pi); } catch (dispatchErr) { @@ -1898,6 +1919,8 @@ export async function handleAgentEnd( // This gives transient issues (dirty working tree, branch state) time to settle. startDispatchGapWatchdog(ctx, pi); return; + } finally { + clearTimeout(dispatchHangGuard); } // If dispatchNextUnit returned normally but auto-mode is still s.active and @@ -3028,10 +3051,29 @@ async function dispatchNextUnit( // so the LLM doesn't have to get these right ensurePreconditions(unitType, unitId, s.basePath, state); - // Fresh session - const result = await s.cmdCtx!.newSession(); + // Fresh session — with timeout to prevent permanent hangs (#1073). + // If newSession() hangs (e.g., session manager deadlock, network issue), + // without this timeout the entire dispatch chain stalls permanently: no + // timeouts are set, no gap watchdog fires, and auto-mode is left active + // but idle until the user Ctrl+C's. + let result: { cancelled: boolean }; + try { + const sessionPromise = s.cmdCtx!.newSession(); + const timeoutPromise = new Promise<{ cancelled: true }>((resolve) => + setTimeout(() => resolve({ cancelled: true }), NEW_SESSION_TIMEOUT_MS), + ); + result = await Promise.race([sessionPromise, timeoutPromise]); + } catch (sessionErr) { + const msg = sessionErr instanceof Error ? sessionErr.message : String(sessionErr); + ctx.ui.notify(`Session creation failed: ${msg}. Retrying via watchdog.`, "error"); + throw new Error(`newSession() failed: ${msg}`); + } if (result.cancelled) { - await stopAuto(ctx, pi, "Session cancelled"); + ctx.ui.notify( + `Session creation timed out or was cancelled for ${unitType} ${unitId}. Will retry.`, + "warning", + ); + await stopAuto(ctx, pi, "Session creation failed"); return; } diff --git a/src/resources/extensions/gsd/auto/session.ts b/src/resources/extensions/gsd/auto/session.ts index 71642afff..70c6e8b65 100644 --- a/src/resources/extensions/gsd/auto/session.ts +++ b/src/resources/extensions/gsd/auto/session.ts @@ -60,6 +60,8 @@ export const MAX_LIFETIME_DISPATCHES = 6; export const MAX_CONSECUTIVE_SKIPS = 3; export const DISPATCH_GAP_TIMEOUT_MS = 5_000; export const MAX_SKIP_DEPTH = 20; +export const NEW_SESSION_TIMEOUT_MS = 30_000; +export const DISPATCH_HANG_TIMEOUT_MS = 60_000; // ─── AutoSession ───────────────────────────────────────────────────────────── diff --git a/src/resources/extensions/gsd/tests/dispatch-stall-guard.test.ts b/src/resources/extensions/gsd/tests/dispatch-stall-guard.test.ts new file mode 100644 index 000000000..7798f75b9 --- /dev/null +++ b/src/resources/extensions/gsd/tests/dispatch-stall-guard.test.ts @@ -0,0 +1,126 @@ +/** + * dispatch-stall-guard.test.ts — Verifies defensive guards against dispatch stalls (#1073). + * + * After a slice completes, dispatchNextUnit must reliably dispatch the next unit. + * These tests verify: + * 1. newSession() has timeout protection (prevents permanent hang if session creation stalls) + * 2. handleAgentEnd has a dispatch hang guard (catches dispatchNextUnit itself hanging) + * 3. Session timeout constants are exported for configurability + */ + +import test from "node:test"; +import assert from "node:assert/strict"; +import { readFileSync } from "node:fs"; +import { join, dirname } from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = dirname(fileURLToPath(import.meta.url)); +const AUTO_TS_PATH = join(__dirname, "..", "auto.ts"); +const SESSION_TS_PATH = join(__dirname, "..", "auto", "session.ts"); + +function getAutoTsSource(): string { + return readFileSync(AUTO_TS_PATH, "utf-8"); +} + +function getSessionTsSource(): string { + return readFileSync(SESSION_TS_PATH, "utf-8"); +} + +// ── Session timeout constants ─────────────────────────────────────────────── + +test("AutoSession exports NEW_SESSION_TIMEOUT_MS constant", () => { + const source = getSessionTsSource(); + assert.ok( + source.includes("NEW_SESSION_TIMEOUT_MS"), + "auto/session.ts must export NEW_SESSION_TIMEOUT_MS for newSession() timeout", + ); +}); + +test("AutoSession exports DISPATCH_HANG_TIMEOUT_MS constant", () => { + const source = getSessionTsSource(); + assert.ok( + source.includes("DISPATCH_HANG_TIMEOUT_MS"), + "auto/session.ts must export DISPATCH_HANG_TIMEOUT_MS for dispatch hang detection", + ); +}); + +test("NEW_SESSION_TIMEOUT_MS is a reasonable value (15-120 seconds)", () => { + const source = getSessionTsSource(); + const match = source.match(/NEW_SESSION_TIMEOUT_MS\s*=\s*(\d[\d_]*)/); + assert.ok(match, "NEW_SESSION_TIMEOUT_MS must have a numeric value"); + const value = parseInt(match![1]!.replace(/_/g, ""), 10); + assert.ok(value >= 15_000 && value <= 120_000, + `NEW_SESSION_TIMEOUT_MS must be 15-120s, got ${value}ms`, + ); +}); + +test("DISPATCH_HANG_TIMEOUT_MS is greater than NEW_SESSION_TIMEOUT_MS", () => { + const source = getSessionTsSource(); + const sessionMatch = source.match(/NEW_SESSION_TIMEOUT_MS\s*=\s*(\d[\d_]*)/); + const dispatchMatch = source.match(/DISPATCH_HANG_TIMEOUT_MS\s*=\s*(\d[\d_]*)/); + assert.ok(sessionMatch && dispatchMatch, "Both timeout constants must exist"); + const sessionTimeout = parseInt(sessionMatch![1]!.replace(/_/g, ""), 10); + const dispatchTimeout = parseInt(dispatchMatch![1]!.replace(/_/g, ""), 10); + assert.ok(dispatchTimeout > sessionTimeout, + `DISPATCH_HANG_TIMEOUT_MS (${dispatchTimeout}) must be > NEW_SESSION_TIMEOUT_MS (${sessionTimeout})`, + ); +}); + +// ── newSession() timeout in dispatchNextUnit ───────────────────────────────── + +test("dispatchNextUnit wraps newSession() with Promise.race timeout", () => { + const source = getAutoTsSource(); + // Search the full file — dispatchNextUnit is very large + assert.ok( + source.includes("Promise.race") && source.includes("NEW_SESSION_TIMEOUT_MS"), + "dispatchNextUnit must use Promise.race with NEW_SESSION_TIMEOUT_MS to timeout newSession() (#1073)", + ); +}); + +test("dispatchNextUnit handles newSession() timeout gracefully", () => { + const source = getAutoTsSource(); + // Must notify user when session times out + assert.ok( + source.includes("Session creation timed out") || source.includes("Session creation failed"), + "dispatchNextUnit must notify user when newSession() times out or fails (#1073)", + ); +}); + +// ── Dispatch hang guard in handleAgentEnd ──────────────────────────────────── + +test("handleAgentEnd has a dispatch hang guard before dispatchNextUnit", () => { + const source = getAutoTsSource(); + const fnIdx = source.indexOf("export async function handleAgentEnd"); + assert.ok(fnIdx > -1, "handleAgentEnd must exist"); + + // Find the section between step mode check and dispatchNextUnit call + const fnBlock = source.slice(fnIdx, source.indexOf("\n// ─── Step Mode", fnIdx + 100)); + assert.ok( + fnBlock.includes("DISPATCH_HANG_TIMEOUT_MS") || fnBlock.includes("dispatchHangGuard"), + "handleAgentEnd must have a dispatch hang guard before calling dispatchNextUnit (#1073)", + ); +}); + +test("dispatch hang guard is cleared in finally block", () => { + const source = getAutoTsSource(); + const fnIdx = source.indexOf("export async function handleAgentEnd"); + const fnBlock = source.slice(fnIdx, source.indexOf("\n// ─── Step Mode", fnIdx + 100)); + assert.ok( + fnBlock.includes("clearTimeout(dispatchHangGuard)"), + "dispatch hang guard must be cleared in finally block to prevent false alarms (#1073)", + ); +}); + +// ── Constants are imported in auto.ts ──────────────────────────────────────── + +test("auto.ts imports NEW_SESSION_TIMEOUT_MS and DISPATCH_HANG_TIMEOUT_MS", () => { + const source = getAutoTsSource(); + assert.ok( + source.includes("NEW_SESSION_TIMEOUT_MS"), + "auto.ts must import NEW_SESSION_TIMEOUT_MS from session.ts", + ); + assert.ok( + source.includes("DISPATCH_HANG_TIMEOUT_MS"), + "auto.ts must import DISPATCH_HANG_TIMEOUT_MS from session.ts", + ); +});