fix(triage-apply): 8-minute watchdog on agent dispatch subprocess
Observed 2026-05-14: a triage --apply run hung for 33+ minutes because
the spawned subagent process stalled (provider SDK call without its own
timeout) and defaultAgentRunner had no watchdog — it waited indefinitely
on proc.on("close").
Adds a per-dispatch watchdog (default 8 min, override via
SF_TRIAGE_AGENT_TIMEOUT_MS env). On expiry it sends SIGTERM, then
SIGKILL after a 5s grace period if the process is still alive. The
runner resolves at expiry, without waiting for the process to exit,
with ok=false / exitCode=124 (the exit status GNU coreutils `timeout`
uses) so the trust / review / mutation gates surface the failure as a
real outcome instead of a silent stall.
Provider-agnostic: the timeout protects the orchestrator regardless of
which model the router picks. Operators running long-context provider
calls can bump the env var; default 8min matches runTriage /
runReflection's existing completeSimple timeout.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
7cb1eef948
commit
b19096800b
1 changed files with 60 additions and 2 deletions
|
|
@ -362,6 +362,26 @@ function buildSfPrintLaunchArgs(
|
|||
return { command: sfBinPath, args: baseArgs };
|
||||
}
|
||||
|
||||
/**
 * Largest delay Node's timers accept (2^31 - 1 ms, roughly 24.8 days).
 * `setTimeout` coerces anything larger to a 1 ms delay, which would turn a
 * "very long" override into an instant timeout, so overrides are clamped
 * to this value.
 */
const MAX_AGENT_TIMEOUT_MS = 2_147_483_647;

/** Timeout used when no valid override is supplied: 8 minutes. */
const FALLBACK_AGENT_TIMEOUT_MS = 8 * 60 * 1000;

/**
 * Turn the raw SF_TRIAGE_AGENT_TIMEOUT_MS value into a usable timer delay.
 *
 * Only a plain positive decimal integer (optional leading `+`, surrounding
 * whitespace ignored) is accepted. Anything else — unset, empty, zero,
 * negative, or malformed input such as "8m" or "1e6" — yields the 8-minute
 * fallback. `Number.parseInt` is deliberately not used on its own: it stops
 * at the first non-digit, so "8m" would silently become an 8 ms timeout.
 *
 * @param raw - The environment variable's value, or undefined when unset.
 * @returns A delay in milliseconds between 1 and MAX_AGENT_TIMEOUT_MS.
 */
function parseAgentTimeoutMs(raw: string | undefined): number {
  const text = (raw ?? "").trim();
  if (!/^\+?\d+$/.test(text)) return FALLBACK_AGENT_TIMEOUT_MS;
  const parsed = Number(text);
  if (!Number.isFinite(parsed) || parsed <= 0) return FALLBACK_AGENT_TIMEOUT_MS;
  // Clamp rather than reject: an operator asking for "longer than the
  // timer can express" should get the longest supported wait.
  return Math.min(parsed, MAX_AGENT_TIMEOUT_MS);
}

/**
 * Default per-agent timeout: 8 minutes. Long enough for a real LLM reasoning
 * pass + tool calls; short enough that a hung gemini OAuth or stalled
 * provider doesn't lock the whole triage flow indefinitely. Operators can
 * override via SF_TRIAGE_AGENT_TIMEOUT_MS env var (milliseconds); the value
 * is read once, when this module is loaded.
 *
 * The earlier version had no timeout at all — `defaultAgentRunner` waited
 * forever on `proc.on("close")`, so a single hung subagent dispatch
 * blocked the orchestrator until manual kill (observed 2026-05-14:
 * 33-minute-stuck triage --apply caused by an unresponsive provider).
 */
const DEFAULT_AGENT_TIMEOUT_MS = parseAgentTimeoutMs(
  process.env.SF_TRIAGE_AGENT_TIMEOUT_MS,
);
|
||||
|
||||
async function defaultAgentRunner(
|
||||
agent: AgentConfig,
|
||||
task: string,
|
||||
|
|
@ -405,6 +425,42 @@ async function defaultAgentRunner(
|
|||
});
|
||||
let stdout = "";
|
||||
let stderr = "";
|
||||
let settled = false;
|
||||
const settle = (result: AgentRunResult) => {
|
||||
if (settled) return;
|
||||
settled = true;
|
||||
resolve(result);
|
||||
};
|
||||
// Watchdog: send SIGTERM at the timeout, escalate to SIGKILL
|
||||
// 5s later if the process didn't exit. The result is reported
|
||||
// as ok=false with a clear "timed out" stderr so the trust /
|
||||
// review gate sees a real failure (not a silent stall).
|
||||
const watchdog = setTimeout(() => {
|
||||
if (settled) return;
|
||||
try {
|
||||
proc.kill("SIGTERM");
|
||||
} catch {
|
||||
/* already dead */
|
||||
}
|
||||
const kill = setTimeout(() => {
|
||||
try {
|
||||
proc.kill("SIGKILL");
|
||||
} catch {
|
||||
/* already dead */
|
||||
}
|
||||
}, 5_000);
|
||||
// Resolve immediately on watchdog fire so the orchestrator
|
||||
// can proceed to the next gate; the kill is best-effort
|
||||
// cleanup of the abandoned subprocess.
|
||||
kill.unref?.();
|
||||
settle({
|
||||
ok: false,
|
||||
output: stdout,
|
||||
stderr: `${agent.name} timed out after ${DEFAULT_AGENT_TIMEOUT_MS}ms (configure SF_TRIAGE_AGENT_TIMEOUT_MS to extend)`,
|
||||
exitCode: 124, // POSIX convention for timeout
|
||||
});
|
||||
}, DEFAULT_AGENT_TIMEOUT_MS);
|
||||
watchdog.unref?.();
|
||||
proc.stdout.on("data", (chunk) => {
|
||||
stdout += chunk.toString();
|
||||
});
|
||||
|
|
@ -412,7 +468,8 @@ async function defaultAgentRunner(
|
|||
stderr += chunk.toString();
|
||||
});
|
||||
proc.on("error", (err) => {
|
||||
resolve({
|
||||
clearTimeout(watchdog);
|
||||
settle({
|
||||
ok: false,
|
||||
output: stdout,
|
||||
stderr: err instanceof Error ? err.message : String(err),
|
||||
|
|
@ -420,7 +477,8 @@ async function defaultAgentRunner(
|
|||
});
|
||||
});
|
||||
proc.on("close", (code) => {
|
||||
resolve({
|
||||
clearTimeout(watchdog);
|
||||
settle({
|
||||
ok: (code ?? 1) === 0,
|
||||
output: stdout.trim(),
|
||||
stderr: stderr.trim(),
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue