fix(triage-apply): 8-minute watchdog on agent dispatch subprocess

Observed 2026-05-14: a triage --apply run hung for 33+ minutes because the spawned subagent process stalled (provider SDK call without its own timeout) and defaultAgentRunner had no watchdog — it waited indefinitely on proc.on("close"). Adds a per-dispatch watchdog (default 8 min, override via SF_TRIAGE_AGENT_TIMEOUT_MS env). On expiry: SIGTERM → 5s grace → SIGKILL. Resolves immediately with ok=false / exitCode=124 (POSIX timeout convention) so the trust / review / mutation gates surface the failure as a real outcome instead of a silent stall. Provider-agnostic: the timeout protects the orchestrator regardless of which model the router picks. Operators running long-context provider calls can bump the env var; default 8min matches runTriage / runReflection's existing completeSimple timeout. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 20:28:05 +02:00 · 2026-05-14 20:28:05 +02:00 · b19096800b
commit b19096800b
parent 7cb1eef948
1 changed files with 60 additions and 2 deletions
--- a/src/headless-triage.ts
+++ b/src/headless-triage.ts
@ -362,6 +362,26 @@ function buildSfPrintLaunchArgs(
 	return { command: sfBinPath, args: baseArgs };
 }

+/**
+ * Per-agent dispatch timeout in milliseconds (default: 8 minutes). Long
+ * enough for a real LLM reasoning pass + tool calls; short enough that a
+ * hung Gemini OAuth or stalled provider doesn't lock the whole triage flow.
+ * Override via SF_TRIAGE_AGENT_TIMEOUT_MS: a positive base-10 integer of
+ * milliseconds; anything else (unset, "8m", "1e6", "0") uses the default.
+ * Values above 2^31-1 are clamped, since Node's setTimeout treats a larger
+ * delay as 1ms, which would make the watchdog fire immediately.
+ *
+ * Before this existed, `defaultAgentRunner` waited forever on
+ * `proc.on("close")` (observed 2026-05-14: 33-minute-stuck triage --apply
+ * caused by an unresponsive provider).
+ */
+const DEFAULT_AGENT_TIMEOUT_MS = (() => {
+	const raw = (process.env.SF_TRIAGE_AGENT_TIMEOUT_MS ?? "").trim();
+	// Digits only: parseInt alone would read "8m" as 8 and "1e6" as 1.
+	if (!/^\d+$/.test(raw) || Number(raw) <= 0) return 8 * 60 * 1000;
+	return Math.min(Number(raw), 2_147_483_647);
+})();
+
 async function defaultAgentRunner(
 	agent: AgentConfig,
 	task: string,
@ -405,6 +425,42 @@ async function defaultAgentRunner(
 			});
 			let stdout = "";
 			let stderr = "";
+			let settled = false;
+			const settle = (result: AgentRunResult) => {
+				if (settled) return;
+				settled = true;
+				resolve(result);
+			};
+			// Watchdog: send SIGTERM at the timeout, escalate to SIGKILL
+			// 5s later if the process didn't exit. The result is reported
+			// as ok=false with a clear "timed out" stderr so the trust /
+			// review gate sees a real failure (not a silent stall).
+			const watchdog = setTimeout(() => {
+				if (settled) return;
+				try {
+					proc.kill("SIGTERM");
+				} catch {
+					/* already dead */
+				}
+				const kill = setTimeout(() => {
+					try {
+						proc.kill("SIGKILL");
+					} catch {
+						/* already dead */
+					}
+				}, 5_000);
+				// Resolve immediately on watchdog fire so the orchestrator
+				// can proceed to the next gate; the kill is best-effort
+				// cleanup of the abandoned subprocess.
+				kill.unref?.();
+				settle({
+					ok: false,
+					output: stdout,
+					stderr: `${agent.name} timed out after ${DEFAULT_AGENT_TIMEOUT_MS}ms (configure SF_TRIAGE_AGENT_TIMEOUT_MS to extend)`,
+					exitCode: 124, // POSIX convention for timeout
+				});
+			}, DEFAULT_AGENT_TIMEOUT_MS);
+			watchdog.unref?.();
 			proc.stdout.on("data", (chunk) => {
 				stdout += chunk.toString();
 			});
@ -412,7 +468,8 @@ async function defaultAgentRunner(
 				stderr += chunk.toString();
 			});
 			proc.on("error", (err) => {
-				resolve({
+				clearTimeout(watchdog);
+				settle({
 					ok: false,
 					output: stdout,
 					stderr: err instanceof Error ? err.message : String(err),
@ -420,7 +477,8 @@ async function defaultAgentRunner(
 				});
 			});
 			proc.on("close", (code) => {
-				resolve({
+				clearTimeout(watchdog);
+				settle({
 					ok: (code ?? 1) === 0,
 					output: stdout.trim(),
 					stderr: stderr.trim(),