From 67e5ac9db1e79563255f8086d12090dc27353a7c Mon Sep 17 00:00:00 2001
From: Mikael Hugo <mikkihugo@users.noreply.github.com>
Date: Sat, 16 May 2026 20:40:17 +0200
Subject: [PATCH] diag(subagent-runner): per-phase timing + stuck-watchdog for
 sf-mp8e02m1-zpk903
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds visible diagnostics to runSubagent so the next time the
"session initialized but no LLM call" bug fires, the log identifies
which setup phase hangs.

Phases instrumented:
  - resourceLoader.reload()
  - createAgentSession()
  - bindExtensions(runLifecycle=...)
  - session.prompt() entry → return

Output format (stderr, prefixed with [subagent:<name>]):
  phase=resourceLoader.reload 23ms
  phase=createAgentSession 142ms
  phase=bindExtensions 89ms runLifecycle=true
  phase=session.prompt-entered taskLen=8421 timeoutMs=480000 noOutputMs=180000
  phase=session.prompt-returned 16234ms          ← normal completion
  STUCK phase=<X> 10000ms (no completion signal ...)   ← when watchdog fires

Each phase has a soft 10s watchdog that emits a STUCK line if the
await doesn't complete in time. The watchdog never aborts — just
surfaces visibility. Existing timeoutMs / noOutputTimeoutMs handle
actual termination.

This is investigation infrastructure for the third prompt-never-sent
seam (coding-agent/subagent-runner). The agent-runner.js seam
(sf-mp8g4rcd-w01tkh) was fixed in commit 8ee4d8358 with bounded
retries. This commit doesn't fix the underlying bug — it makes the
bug self-reporting next time it fires so operator and autonomous
loop both get actionable signal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .../coding-agent/src/core/subagent-runner.ts  | 59 ++++++++++++++++++-
 1 file changed, 58 insertions(+), 1 deletion(-)
diff --git a/packages/coding-agent/src/core/subagent-runner.ts b/packages/coding-agent/src/core/subagent-runner.ts
index 8ce6da0db..42a496346 100644
--- a/packages/coding-agent/src/core/subagent-runner.ts
+++ b/packages/coding-agent/src/core/subagent-runner.ts
@@ -130,6 +130,26 @@ export async function runSubagent(
 	const timeoutMs = options?.timeoutMs ?? DEFAULT_SUBAGENT_TIMEOUT_MS;
 	const noOutputTimeoutMs = options?.noOutputTimeoutMs ?? 0;
 
+	// Per-phase timing diagnostics for sf-mp8e02m1-zpk903: the prompt-never-sent
+	// bug surfaces here as "session initialized but no LLM call". Each phase
+	// logs start/end so when an iteration silently hangs, the log identifies
+	// which step is stuck. Also wraps each await in a soft watchdog that emits
+	// a "STUCK in phase X" warning after 10s — no abort, just visibility so
+	// the auto-supervisor can correlate with idle-halt errors.
+	const phaseStartedAt = (): number => Date.now();
+	const logPhase = (phase: string, startedAt: number, extra?: string) => {
+		const ms = Date.now() - startedAt;
+		const tail = extra ? ` ${extra}` : "";
+		process.stderr.write(`[subagent:${name}] phase=${phase} ${ms}ms${tail}\n`);
+	};
+	const phaseWatchdog = (phase: string, startedAt: number): NodeJS.Timeout =>
+		setTimeout(() => {
+			const ms = Date.now() - startedAt;
+			process.stderr.write(
+				`[subagent:${name}] STUCK phase=${phase} ${ms}ms (no completion signal — see sf-mp8e02m1-zpk903 for context)\n`,
+			);
+		}, 10_000);
+
 	// Build an isolated resource loader with the caller's system prompt appended.
 	const agentDir = getAgentDir();
 	const settingsManager = SettingsManager.create(cwd, agentDir);
@@ -139,10 +159,16 @@ export async function runSubagent(
 		settingsManager,
 		appendSystemPrompt: config.systemPrompt,
 	});
+	let phaseAt = phaseStartedAt();
+	let watchdog = phaseWatchdog("resourceLoader.reload", phaseAt);
 	await resourceLoader.reload();
+	clearTimeout(watchdog);
+	logPhase("resourceLoader.reload", phaseAt);
 
 	// Create a fresh in-memory session so it never touches the user's session files.
 	// Tool filtering is applied below via setActiveToolsByName after session creation.
+	phaseAt = phaseStartedAt();
+	watchdog = phaseWatchdog("createAgentSession", phaseAt);
 	const { session, modelFallbackMessage } = await createAgentSession({
 		cwd,
 		agentDir,
@@ -151,6 +177,8 @@ export async function runSubagent(
 		settingsManager,
 		persistModelChanges: false,
 	});
+	clearTimeout(watchdog);
+	logPhase("createAgentSession", phaseAt);
 
 	if (modelFallbackMessage) {
 		// Non-fatal: log but continue with whatever model was resolved.
@@ -202,6 +230,8 @@ export async function runSubagent(
 	if (config.permissionLevel) {
 		process.env.SF_PERMISSION_LEVEL = config.permissionLevel;
 	}
+	phaseAt = phaseStartedAt();
+	watchdog = phaseWatchdog("bindExtensions", phaseAt);
 	try {
 		await session.bindExtensions({
 			uiContext: createSubagentUIContext(),
@@ -212,6 +242,12 @@ export async function runSubagent(
 			},
 			runLifecycle: Boolean(config.permissionLevel),
 		});
+		clearTimeout(watchdog);
+		logPhase(
+			"bindExtensions",
+			phaseAt,
+			`runLifecycle=${Boolean(config.permissionLevel)}`,
+		);
 	} finally {
 		if (config.permissionLevel) {
 			if (previousPermissionLevel === undefined) {
@@ -281,6 +317,18 @@ export async function runSubagent(
 
 	const promptTask = `Task: ${task}`;
 
+	// Final diagnostic point: about to call session.prompt. If this never logs
+	// "phase=session.prompt-returned" and the subagent hangs, the LLM dispatch
+	// path itself is the silent abort seam (likely a runExtensionHooks=false
+	// path A/D bypass via slash/sf-route prefix expansion, or a model-dispatch
+	// hang). Combined with the no-output watchdog below, operator gets a clear
+	// "which phase" diagnostic for any future hang report.
+	process.stderr.write(
+		`[subagent:${name}] phase=session.prompt-entered taskLen=${promptTask.length} timeoutMs=${timeoutMs} noOutputMs=${noOutputTimeoutMs}\n`,
+	);
+	const promptEnteredAt = phaseStartedAt();
+	const promptWatchdog = phaseWatchdog("session.prompt", promptEnteredAt);
+
 	try {
 		if (timeoutMs === 0 && noOutputTimeoutMs === 0 && !options?.signal) {
 			// Fast path: no watchdog, no cancellation.
@@ -288,6 +336,8 @@ export async function runSubagent(
 				runExtensionHooks: false,
 			});
 			await promptPromise;
+			clearTimeout(promptWatchdog);
+			logPhase("session.prompt-returned", promptEnteredAt);
 			cleanup();
 			return { ok: true, output: extractFinalOutput(), exitCode: 0 };
 		}
@@ -364,11 +414,18 @@ export async function runSubagent(
 		const promptPromise = session.prompt(promptTask, {
 			runExtensionHooks: false,
 		});
-		competitors.unshift(promptPromise.then(() => ({}) as RaceResult));
+		competitors.unshift(
+			promptPromise.then(() => {
+				clearTimeout(promptWatchdog);
+				logPhase("session.prompt-returned", promptEnteredAt);
+				return {} as RaceResult;
+			}),
+		);
 
 		const result = await Promise.race(competitors);
 
 		cleanup();
+		clearTimeout(promptWatchdog);
 
 		if (result.timedOut) {
 			return {