diag(subagent-runner): per-phase timing + stuck-watchdog for sf-mp8e02m1-zpk903

Adds visible diagnostics to runSubagent so the next time the "session initialized but no LLM call" bug fires, the log identifies which setup phase hangs. Phases instrumented: - resourceLoader.reload() - createAgentSession() - bindExtensions(runLifecycle=...) - session.prompt() entry → return Output format (stderr, prefixed with [subagent:<name>]): phase=resourceLoader.reload 23ms phase=createAgentSession 142ms phase=bindExtensions 89ms runLifecycle=true phase=session.prompt-entered taskLen=8421 timeoutMs=480000 noOutputMs=180000 phase=session.prompt-returned 16234ms ← normal completion STUCK phase=<X> 10000ms (no completion signal ...) ← when watchdog fires Each phase has a soft 10s watchdog that emits a STUCK line if the await doesn't complete in time. The watchdog never aborts — just surfaces visibility. Existing timeoutMs / noOutputTimeoutMs handle actual termination. This is investigation infrastructure for the third prompt-never-sent seam (coding-agent/subagent-runner). The agent-runner.js seam (sf-mp8g4rcd-w01tkh) was fixed in commit 8ee4d8358 with bounded retries. This commit doesn't fix the underlying bug — it makes the bug self-reporting next time it fires so operator and autonomous loop both get actionable signal. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-16 20:40:17 +02:00 · 2026-05-16 20:40:17 +02:00 · 67e5ac9db1
commit 67e5ac9db1
parent 8ee4d83581
1 changed files with 58 additions and 1 deletions
--- a/packages/coding-agent/src/core/subagent-runner.ts
+++ b/packages/coding-agent/src/core/subagent-runner.ts
@ -130,6 +130,26 @@ export async function runSubagent(
 	const timeoutMs = options?.timeoutMs ?? DEFAULT_SUBAGENT_TIMEOUT_MS;
 	const noOutputTimeoutMs = options?.noOutputTimeoutMs ?? 0;

+	// Per-phase timing diagnostics for sf-mp8e02m1-zpk903: the prompt-never-sent
+	// bug surfaces here as "session initialized but no LLM call". Each phase
+	// logs start/end so when an iteration silently hangs, the log identifies
+	// which step is stuck. Also wraps each await in a soft watchdog that emits
+	// a "STUCK in phase X" warning after 10s — no abort, just visibility so
+	// the auto-supervisor can correlate with idle-halt errors.
+	const phaseStartedAt = (): number => Date.now();
+	const logPhase = (phase: string, startedAt: number, extra?: string) => {
+		const ms = Date.now() - startedAt;
+		const tail = extra ? ` ${extra}` : "";
+		process.stderr.write(`[subagent:${name}] phase=${phase} ${ms}ms${tail}\n`);
+	};
+	const phaseWatchdog = (phase: string, startedAt: number): NodeJS.Timeout =>
+		setTimeout(() => {
+			const ms = Date.now() - startedAt;
+			process.stderr.write(
+				`[subagent:${name}] STUCK phase=${phase} ${ms}ms (no completion signal — see sf-mp8e02m1-zpk903 for context)\n`,
+			);
+		}, 10_000);
+
 	// Build an isolated resource loader with the caller's system prompt appended.
 	const agentDir = getAgentDir();
 	const settingsManager = SettingsManager.create(cwd, agentDir);
@ -139,10 +159,16 @@ export async function runSubagent(
 		settingsManager,
 		appendSystemPrompt: config.systemPrompt,
 	});
+	let phaseAt = phaseStartedAt();
+	let watchdog = phaseWatchdog("resourceLoader.reload", phaseAt);
 	await resourceLoader.reload();
+	clearTimeout(watchdog);
+	logPhase("resourceLoader.reload", phaseAt);

 	// Create a fresh in-memory session so it never touches the user's session files.
 	// Tool filtering is applied below via setActiveToolsByName after session creation.
+	phaseAt = phaseStartedAt();
+	watchdog = phaseWatchdog("createAgentSession", phaseAt);
 	const { session, modelFallbackMessage } = await createAgentSession({
 		cwd,
 		agentDir,
@ -151,6 +177,8 @@ export async function runSubagent(
 		settingsManager,
 		persistModelChanges: false,
 	});
+	clearTimeout(watchdog);
+	logPhase("createAgentSession", phaseAt);

 	if (modelFallbackMessage) {
 		// Non-fatal: log but continue with whatever model was resolved.
@ -202,6 +230,8 @@ export async function runSubagent(
 	if (config.permissionLevel) {
 		process.env.SF_PERMISSION_LEVEL = config.permissionLevel;
 	}
+	phaseAt = phaseStartedAt();
+	watchdog = phaseWatchdog("bindExtensions", phaseAt);
 	try {
 		await session.bindExtensions({
 			uiContext: createSubagentUIContext(),
@ -212,6 +242,12 @@ export async function runSubagent(
 			},
 			runLifecycle: Boolean(config.permissionLevel),
 		});
+		clearTimeout(watchdog);
+		logPhase(
+			"bindExtensions",
+			phaseAt,
+			`runLifecycle=${Boolean(config.permissionLevel)}`,
+		);
 	} finally {
 		if (config.permissionLevel) {
 			if (previousPermissionLevel === undefined) {
@ -281,6 +317,18 @@ export async function runSubagent(

 	const promptTask = `Task: ${task}`;

+	// Final diagnostic point: about to call session.prompt. If this never logs
+	// "phase=session.prompt-returned" and the subagent hangs, the LLM dispatch
+	// path itself is the silent abort seam (likely a runExtensionHooks=false
+	// path A/D bypass via slash/sf-route prefix expansion, or a model-dispatch
+	// hang). Combined with the no-output watchdog below, operator gets a clear
+	// "which phase" diagnostic for any future hang report.
+	process.stderr.write(
+		`[subagent:${name}] phase=session.prompt-entered taskLen=${promptTask.length} timeoutMs=${timeoutMs} noOutputMs=${noOutputTimeoutMs}\n`,
+	);
+	const promptEnteredAt = phaseStartedAt();
+	const promptWatchdog = phaseWatchdog("session.prompt", promptEnteredAt);
+
 	try {
 		if (timeoutMs === 0 && noOutputTimeoutMs === 0 && !options?.signal) {
 			// Fast path: no watchdog, no cancellation.
@ -288,6 +336,8 @@ export async function runSubagent(
 				runExtensionHooks: false,
 			});
 			await promptPromise;
+			clearTimeout(promptWatchdog);
+			logPhase("session.prompt-returned", promptEnteredAt);
 			cleanup();
 			return { ok: true, output: extractFinalOutput(), exitCode: 0 };
 		}
@ -364,11 +414,18 @@ export async function runSubagent(
 		const promptPromise = session.prompt(promptTask, {
 			runExtensionHooks: false,
 		});
-		competitors.unshift(promptPromise.then(() => ({}) as RaceResult));
+		competitors.unshift(
+			promptPromise.then(() => {
+				clearTimeout(promptWatchdog);
+				logPhase("session.prompt-returned", promptEnteredAt);
+				return {} as RaceResult;
+			}),
+		);

 		const result = await Promise.race(competitors);

 		cleanup();
+		clearTimeout(promptWatchdog);

 		if (result.timedOut) {
 			return {