* fix: prevent summarizing phase stall by retrying dropped agent_end events (#1072) When handleAgentEnd dispatches a sub-unit (via hooks, triage, or quick-task early-dispatch paths) and that unit completes before handleAgentEnd returns, the resulting agent_end event is silently dropped by the reentrancy guard. This leaves auto-mode active but permanently stalled — no unit running, no watchdog set, process at high CPU doing nothing. Add a pendingAgentEndRetry flag to AutoSession that the reentrancy guard sets when it drops an agent_end event. The finally block in handleAgentEnd checks this flag and schedules a deferred retry via setImmediate, ensuring the completed unit's agent_end is always processed. * fix: add dispatch stall guards to prevent auto-mode pause after slice completion (#1073) After a slice completes all tasks, auto-mode can stall if newSession() hangs or dispatchNextUnit gets permanently blocked at any await point. The existing gap watchdog only fires AFTER dispatchNextUnit returns, so it cannot recover from hangs inside the function itself. - Wrap newSession() with Promise.race timeout (30s) to prevent permanent hangs from session manager deadlocks or network issues - Add pre-dispatch hang guard (60s) in handleAgentEnd that starts the gap watchdog if dispatchNextUnit hasn't completed — catches hangs at any await point (model selection, session creation, etc.) - Add better diagnostics: notify user when session creation times out or fails, with specific unit type/ID for debugging
This commit is contained in:
parent
b2befe3628
commit
288b399f88
3 changed files with 173 additions and 3 deletions
|
|
@ -178,11 +178,13 @@ import {
|
|||
AutoSession,
|
||||
MAX_UNIT_DISPATCHES, STUB_RECOVERY_THRESHOLD, MAX_LIFETIME_DISPATCHES,
|
||||
MAX_CONSECUTIVE_SKIPS, DISPATCH_GAP_TIMEOUT_MS, MAX_SKIP_DEPTH,
|
||||
NEW_SESSION_TIMEOUT_MS, DISPATCH_HANG_TIMEOUT_MS,
|
||||
} from "./auto/session.js";
|
||||
import type { CompletedUnit, CurrentUnit, UnitRouting, StartModel, PendingVerificationRetry } from "./auto/session.js";
|
||||
export {
|
||||
MAX_UNIT_DISPATCHES, STUB_RECOVERY_THRESHOLD, MAX_LIFETIME_DISPATCHES,
|
||||
MAX_CONSECUTIVE_SKIPS, DISPATCH_GAP_TIMEOUT_MS, MAX_SKIP_DEPTH,
|
||||
NEW_SESSION_TIMEOUT_MS, DISPATCH_HANG_TIMEOUT_MS,
|
||||
} from "./auto/session.js";
|
||||
export type { CompletedUnit, CurrentUnit, UnitRouting, StartModel } from "./auto/session.js";
|
||||
|
||||
|
|
@ -1882,6 +1884,25 @@ export async function handleAgentEnd(
|
|||
return;
|
||||
}
|
||||
|
||||
// ── Dispatch with hang detection (#1073) ────────────────────────────────
|
||||
// Start a safety watchdog BEFORE calling dispatchNextUnit. If dispatch
|
||||
// hangs at any await (newSession, model selection, etc.), the gap watchdog
|
||||
// inside handleAgentEnd never fires because we never reach the check.
|
||||
// This pre-dispatch watchdog ensures recovery even when dispatchNextUnit
|
||||
// itself is permanently blocked.
|
||||
const dispatchHangGuard = setTimeout(() => {
|
||||
if (!s.active) return;
|
||||
// dispatchNextUnit has been running for too long — it's likely hung.
|
||||
// Start the gap watchdog which will retry dispatch from scratch.
|
||||
if (!s.unitTimeoutHandle && !s.wrapupWarningHandle) {
|
||||
ctx.ui.notify(
|
||||
`Dispatch hang detected (${DISPATCH_HANG_TIMEOUT_MS / 1000}s without completion). Starting recovery watchdog.`,
|
||||
"warning",
|
||||
);
|
||||
startDispatchGapWatchdog(ctx, pi);
|
||||
}
|
||||
}, DISPATCH_HANG_TIMEOUT_MS);
|
||||
|
||||
try {
|
||||
await dispatchNextUnit(ctx, pi);
|
||||
} catch (dispatchErr) {
|
||||
|
|
@ -1898,6 +1919,8 @@ export async function handleAgentEnd(
|
|||
// This gives transient issues (dirty working tree, branch state) time to settle.
|
||||
startDispatchGapWatchdog(ctx, pi);
|
||||
return;
|
||||
} finally {
|
||||
clearTimeout(dispatchHangGuard);
|
||||
}
|
||||
|
||||
// If dispatchNextUnit returned normally but auto-mode is still s.active and
|
||||
|
|
@ -3028,10 +3051,29 @@ async function dispatchNextUnit(
|
|||
// so the LLM doesn't have to get these right
|
||||
ensurePreconditions(unitType, unitId, s.basePath, state);
|
||||
|
||||
// Fresh session
|
||||
const result = await s.cmdCtx!.newSession();
|
||||
// Fresh session — with timeout to prevent permanent hangs (#1073).
|
||||
// If newSession() hangs (e.g., session manager deadlock, network issue),
|
||||
// without this timeout the entire dispatch chain stalls permanently: no
|
||||
// timeouts are set, no gap watchdog fires, and auto-mode is left active
|
||||
// but idle until the user Ctrl+C's.
|
||||
let result: { cancelled: boolean };
|
||||
try {
|
||||
const sessionPromise = s.cmdCtx!.newSession();
|
||||
const timeoutPromise = new Promise<{ cancelled: true }>((resolve) =>
|
||||
setTimeout(() => resolve({ cancelled: true }), NEW_SESSION_TIMEOUT_MS),
|
||||
);
|
||||
result = await Promise.race([sessionPromise, timeoutPromise]);
|
||||
} catch (sessionErr) {
|
||||
const msg = sessionErr instanceof Error ? sessionErr.message : String(sessionErr);
|
||||
ctx.ui.notify(`Session creation failed: ${msg}. Retrying via watchdog.`, "error");
|
||||
throw new Error(`newSession() failed: ${msg}`);
|
||||
}
|
||||
if (result.cancelled) {
|
||||
await stopAuto(ctx, pi, "Session cancelled");
|
||||
ctx.ui.notify(
|
||||
`Session creation timed out or was cancelled for ${unitType} ${unitId}. Will retry.`,
|
||||
"warning",
|
||||
);
|
||||
await stopAuto(ctx, pi, "Session creation failed");
|
||||
return;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -60,6 +60,8 @@ export const MAX_LIFETIME_DISPATCHES = 6;
|
|||
export const MAX_CONSECUTIVE_SKIPS = 3;
|
||||
export const DISPATCH_GAP_TIMEOUT_MS = 5_000;
|
||||
export const MAX_SKIP_DEPTH = 20;
|
||||
export const NEW_SESSION_TIMEOUT_MS = 30_000;
|
||||
export const DISPATCH_HANG_TIMEOUT_MS = 60_000;
|
||||
|
||||
// ─── AutoSession ─────────────────────────────────────────────────────────────
|
||||
|
||||
|
|
|
|||
126
src/resources/extensions/gsd/tests/dispatch-stall-guard.test.ts
Normal file
126
src/resources/extensions/gsd/tests/dispatch-stall-guard.test.ts
Normal file
|
|
@ -0,0 +1,126 @@
|
|||
/**
|
||||
* dispatch-stall-guard.test.ts — Verifies defensive guards against dispatch stalls (#1073).
|
||||
*
|
||||
* After a slice completes, dispatchNextUnit must reliably dispatch the next unit.
|
||||
* These tests verify:
|
||||
* 1. newSession() has timeout protection (prevents permanent hang if session creation stalls)
|
||||
* 2. handleAgentEnd has a dispatch hang guard (catches dispatchNextUnit itself hanging)
|
||||
* 3. Session timeout constants are exported for configurability
|
||||
*/
|
||||
|
||||
import test from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { join, dirname } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
const AUTO_TS_PATH = join(__dirname, "..", "auto.ts");
|
||||
const SESSION_TS_PATH = join(__dirname, "..", "auto", "session.ts");
|
||||
|
||||
function getAutoTsSource(): string {
|
||||
return readFileSync(AUTO_TS_PATH, "utf-8");
|
||||
}
|
||||
|
||||
function getSessionTsSource(): string {
|
||||
return readFileSync(SESSION_TS_PATH, "utf-8");
|
||||
}
|
||||
|
||||
// ── Session timeout constants ───────────────────────────────────────────────
|
||||
|
||||
test("AutoSession exports NEW_SESSION_TIMEOUT_MS constant", () => {
|
||||
const source = getSessionTsSource();
|
||||
assert.ok(
|
||||
source.includes("NEW_SESSION_TIMEOUT_MS"),
|
||||
"auto/session.ts must export NEW_SESSION_TIMEOUT_MS for newSession() timeout",
|
||||
);
|
||||
});
|
||||
|
||||
test("AutoSession exports DISPATCH_HANG_TIMEOUT_MS constant", () => {
|
||||
const source = getSessionTsSource();
|
||||
assert.ok(
|
||||
source.includes("DISPATCH_HANG_TIMEOUT_MS"),
|
||||
"auto/session.ts must export DISPATCH_HANG_TIMEOUT_MS for dispatch hang detection",
|
||||
);
|
||||
});
|
||||
|
||||
test("NEW_SESSION_TIMEOUT_MS is a reasonable value (15-120 seconds)", () => {
|
||||
const source = getSessionTsSource();
|
||||
const match = source.match(/NEW_SESSION_TIMEOUT_MS\s*=\s*(\d[\d_]*)/);
|
||||
assert.ok(match, "NEW_SESSION_TIMEOUT_MS must have a numeric value");
|
||||
const value = parseInt(match![1]!.replace(/_/g, ""), 10);
|
||||
assert.ok(value >= 15_000 && value <= 120_000,
|
||||
`NEW_SESSION_TIMEOUT_MS must be 15-120s, got ${value}ms`,
|
||||
);
|
||||
});
|
||||
|
||||
test("DISPATCH_HANG_TIMEOUT_MS is greater than NEW_SESSION_TIMEOUT_MS", () => {
|
||||
const source = getSessionTsSource();
|
||||
const sessionMatch = source.match(/NEW_SESSION_TIMEOUT_MS\s*=\s*(\d[\d_]*)/);
|
||||
const dispatchMatch = source.match(/DISPATCH_HANG_TIMEOUT_MS\s*=\s*(\d[\d_]*)/);
|
||||
assert.ok(sessionMatch && dispatchMatch, "Both timeout constants must exist");
|
||||
const sessionTimeout = parseInt(sessionMatch![1]!.replace(/_/g, ""), 10);
|
||||
const dispatchTimeout = parseInt(dispatchMatch![1]!.replace(/_/g, ""), 10);
|
||||
assert.ok(dispatchTimeout > sessionTimeout,
|
||||
`DISPATCH_HANG_TIMEOUT_MS (${dispatchTimeout}) must be > NEW_SESSION_TIMEOUT_MS (${sessionTimeout})`,
|
||||
);
|
||||
});
|
||||
|
||||
// ── newSession() timeout in dispatchNextUnit ─────────────────────────────────
|
||||
|
||||
test("dispatchNextUnit wraps newSession() with Promise.race timeout", () => {
|
||||
const source = getAutoTsSource();
|
||||
// Search the full file — dispatchNextUnit is very large
|
||||
assert.ok(
|
||||
source.includes("Promise.race") && source.includes("NEW_SESSION_TIMEOUT_MS"),
|
||||
"dispatchNextUnit must use Promise.race with NEW_SESSION_TIMEOUT_MS to timeout newSession() (#1073)",
|
||||
);
|
||||
});
|
||||
|
||||
test("dispatchNextUnit handles newSession() timeout gracefully", () => {
|
||||
const source = getAutoTsSource();
|
||||
// Must notify user when session times out
|
||||
assert.ok(
|
||||
source.includes("Session creation timed out") || source.includes("Session creation failed"),
|
||||
"dispatchNextUnit must notify user when newSession() times out or fails (#1073)",
|
||||
);
|
||||
});
|
||||
|
||||
// ── Dispatch hang guard in handleAgentEnd ────────────────────────────────────
|
||||
|
||||
test("handleAgentEnd has a dispatch hang guard before dispatchNextUnit", () => {
|
||||
const source = getAutoTsSource();
|
||||
const fnIdx = source.indexOf("export async function handleAgentEnd");
|
||||
assert.ok(fnIdx > -1, "handleAgentEnd must exist");
|
||||
|
||||
// Find the section between step mode check and dispatchNextUnit call
|
||||
const fnBlock = source.slice(fnIdx, source.indexOf("\n// ─── Step Mode", fnIdx + 100));
|
||||
assert.ok(
|
||||
fnBlock.includes("DISPATCH_HANG_TIMEOUT_MS") || fnBlock.includes("dispatchHangGuard"),
|
||||
"handleAgentEnd must have a dispatch hang guard before calling dispatchNextUnit (#1073)",
|
||||
);
|
||||
});
|
||||
|
||||
test("dispatch hang guard is cleared in finally block", () => {
|
||||
const source = getAutoTsSource();
|
||||
const fnIdx = source.indexOf("export async function handleAgentEnd");
|
||||
const fnBlock = source.slice(fnIdx, source.indexOf("\n// ─── Step Mode", fnIdx + 100));
|
||||
assert.ok(
|
||||
fnBlock.includes("clearTimeout(dispatchHangGuard)"),
|
||||
"dispatch hang guard must be cleared in finally block to prevent false alarms (#1073)",
|
||||
);
|
||||
});
|
||||
|
||||
// ── Constants are imported in auto.ts ────────────────────────────────────────
|
||||
|
||||
test("auto.ts imports NEW_SESSION_TIMEOUT_MS and DISPATCH_HANG_TIMEOUT_MS", () => {
|
||||
const source = getAutoTsSource();
|
||||
assert.ok(
|
||||
source.includes("NEW_SESSION_TIMEOUT_MS"),
|
||||
"auto.ts must import NEW_SESSION_TIMEOUT_MS from session.ts",
|
||||
);
|
||||
assert.ok(
|
||||
source.includes("DISPATCH_HANG_TIMEOUT_MS"),
|
||||
"auto.ts must import DISPATCH_HANG_TIMEOUT_MS from session.ts",
|
||||
);
|
||||
});
|
||||
Loading…
Add table
Reference in a new issue