diff --git a/src/headless-triage.ts b/src/headless-triage.ts index 4fa88d58e..4d87a896c 100644 --- a/src/headless-triage.ts +++ b/src/headless-triage.ts @@ -362,6 +362,26 @@ function buildSfPrintLaunchArgs( return { command: sfBinPath, args: baseArgs }; } +/** + * Default per-agent timeout: 8 minutes. Long enough for a real LLM reasoning + * pass + tool calls; short enough that a hung gemini OAuth or stalled + * provider doesn't lock the whole triage flow indefinitely. Operators can + * override via SF_TRIAGE_AGENT_TIMEOUT_MS env var; values above 2^31-1 ms are clamped to that maximum because Node's setTimeout treats any larger delay as 1 ms. + * + * The earlier version had no timeout at all — `defaultAgentRunner` waited + * forever on `proc.on("close")`, so a single hung subagent dispatch + * blocked the orchestrator until manual kill (observed 2026-05-14: + * 33-minute-stuck triage --apply caused by an unresponsive provider). + */ +const DEFAULT_AGENT_TIMEOUT_MS = (() => { + const fromEnv = Number.parseInt( + process.env.SF_TRIAGE_AGENT_TIMEOUT_MS ?? "", + 10, + ); + if (Number.isFinite(fromEnv) && fromEnv > 0) return Math.min(fromEnv, 2_147_483_647); + return 8 * 60 * 1000; +})(); + async function defaultAgentRunner( agent: AgentConfig, task: string, @@ -405,6 +425,42 @@ async function defaultAgentRunner( }); let stdout = ""; let stderr = ""; + let settled = false; + const settle = (result: AgentRunResult) => { + if (settled) return; + settled = true; + resolve(result); + }; + // Watchdog: send SIGTERM at the timeout, escalate to SIGKILL + // 5s later if the process didn't exit. The result is reported + // as ok=false with a clear "timed out" stderr so the trust / + // review gate sees a real failure (not a silent stall). 
+ const watchdog = setTimeout(() => { + if (settled) return; + try { + proc.kill("SIGTERM"); + } catch { + /* already dead */ + } + const kill = setTimeout(() => { + try { + proc.kill("SIGKILL"); + } catch { + /* already dead */ + } + }, 5_000); + // Resolve immediately on watchdog fire so the orchestrator + // can proceed to the next gate; the kill is best-effort + // cleanup of the abandoned subprocess. + kill.unref?.(); + settle({ + ok: false, + output: stdout, + stderr: `${agent.name} timed out after ${DEFAULT_AGENT_TIMEOUT_MS}ms (configure SF_TRIAGE_AGENT_TIMEOUT_MS to extend)`, + exitCode: 124, // POSIX convention for timeout + }); + }, DEFAULT_AGENT_TIMEOUT_MS); + watchdog.unref?.(); proc.stdout.on("data", (chunk) => { stdout += chunk.toString(); }); @@ -412,7 +468,8 @@ async function defaultAgentRunner( stderr += chunk.toString(); }); proc.on("error", (err) => { - resolve({ + clearTimeout(watchdog); + settle({ ok: false, output: stdout, stderr: err instanceof Error ? err.message : String(err), @@ -420,7 +477,8 @@ async function defaultAgentRunner( }); }); proc.on("close", (code) => { - resolve({ + clearTimeout(watchdog); + settle({ ok: (code ?? 1) === 0, output: stdout.trim(), stderr: stderr.trim(),