From 67e5ac9db1e79563255f8086d12090dc27353a7c Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sat, 16 May 2026 20:40:17 +0200 Subject: [PATCH] diag(subagent-runner): per-phase timing + stuck-watchdog for sf-mp8e02m1-zpk903 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds visible diagnostics to runSubagent so the next time the "session initialized but no LLM call" bug fires, the log identifies which setup phase hangs. Phases instrumented: - resourceLoader.reload() - createAgentSession() - bindExtensions(runLifecycle=...) - session.prompt() entry → return Output format (stderr, prefixed with [subagent:]): phase=resourceLoader.reload 23ms phase=createAgentSession 142ms phase=bindExtensions 89ms runLifecycle=true phase=session.prompt-entered taskLen=8421 timeoutMs=480000 noOutputMs=180000 phase=session.prompt-returned 16234ms ← normal completion STUCK phase= 10000ms (no completion signal ...) ← when watchdog fires Each phase has a soft 10s watchdog that emits a STUCK line if the await doesn't complete in time. The watchdog never aborts — just surfaces visibility. Existing timeoutMs / noOutputTimeoutMs handle actual termination. This is investigation infrastructure for the third prompt-never-sent seam (coding-agent/subagent-runner). The agent-runner.js seam (sf-mp8g4rcd-w01tkh) was fixed in commit 8ee4d8358 with bounded retries. This commit doesn't fix the underlying bug — it makes the bug self-reporting next time it fires so operator and autonomous loop both get actionable signal. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../coding-agent/src/core/subagent-runner.ts | 59 ++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/packages/coding-agent/src/core/subagent-runner.ts b/packages/coding-agent/src/core/subagent-runner.ts index 8ce6da0db..42a496346 100644 --- a/packages/coding-agent/src/core/subagent-runner.ts +++ b/packages/coding-agent/src/core/subagent-runner.ts @@ -130,6 +130,26 @@ export async function runSubagent( const timeoutMs = options?.timeoutMs ?? DEFAULT_SUBAGENT_TIMEOUT_MS; const noOutputTimeoutMs = options?.noOutputTimeoutMs ?? 0; + // Per-phase timing diagnostics for sf-mp8e02m1-zpk903: the prompt-never-sent + // bug surfaces here as "session initialized but no LLM call". Each phase + // logs start/end so when an iteration silently hangs, the log identifies + // which step is stuck. Also wraps each await in a soft watchdog that emits + // a "STUCK in phase X" warning after 10s — no abort, just visibility so + // the auto-supervisor can correlate with idle-halt errors. + const phaseStartedAt = (): number => Date.now(); + const logPhase = (phase: string, startedAt: number, extra?: string) => { + const ms = Date.now() - startedAt; + const tail = extra ? ` ${extra}` : ""; + process.stderr.write(`[subagent:${name}] phase=${phase} ${ms}ms${tail}\n`); + }; + const phaseWatchdog = (phase: string, startedAt: number): NodeJS.Timeout => + setTimeout(() => { + const ms = Date.now() - startedAt; + process.stderr.write( + `[subagent:${name}] STUCK phase=${phase} ${ms}ms (no completion signal — see sf-mp8e02m1-zpk903 for context)\n`, + ); + }, 10_000); + // Build an isolated resource loader with the caller's system prompt appended. const agentDir = getAgentDir(); const settingsManager = SettingsManager.create(cwd, agentDir); @@ -139,10 +159,16 @@ export async function runSubagent( settingsManager, appendSystemPrompt: config.systemPrompt, }); + let phaseAt = phaseStartedAt(); + let watchdog = phaseWatchdog("resourceLoader.reload", phaseAt); await resourceLoader.reload(); + clearTimeout(watchdog); + logPhase("resourceLoader.reload", phaseAt); // Create a fresh in-memory session so it never touches the user's session files. // Tool filtering is applied below via setActiveToolsByName after session creation. + phaseAt = phaseStartedAt(); + watchdog = phaseWatchdog("createAgentSession", phaseAt); const { session, modelFallbackMessage } = await createAgentSession({ cwd, agentDir, @@ -151,6 +177,8 @@ export async function runSubagent( settingsManager, persistModelChanges: false, }); + clearTimeout(watchdog); + logPhase("createAgentSession", phaseAt); if (modelFallbackMessage) { // Non-fatal: log but continue with whatever model was resolved. @@ -202,6 +230,8 @@ export async function runSubagent( if (config.permissionLevel) { process.env.SF_PERMISSION_LEVEL = config.permissionLevel; } + phaseAt = phaseStartedAt(); + watchdog = phaseWatchdog("bindExtensions", phaseAt); try { await session.bindExtensions({ uiContext: createSubagentUIContext(), @@ -212,6 +242,12 @@ export async function runSubagent( }, runLifecycle: Boolean(config.permissionLevel), }); + clearTimeout(watchdog); + logPhase( + "bindExtensions", + phaseAt, + `runLifecycle=${Boolean(config.permissionLevel)}`, + ); } finally { if (config.permissionLevel) { if (previousPermissionLevel === undefined) { @@ -281,6 +317,18 @@ export async function runSubagent( const promptTask = `Task: ${task}`; + // Final diagnostic point: about to call session.prompt. If this never logs + // "phase=session.prompt-returned" and the subagent hangs, the LLM dispatch + // path itself is the silent abort seam (likely a runExtensionHooks=false + // path A/D bypass via slash/sf-route prefix expansion, or a model-dispatch + // hang). Combined with the no-output watchdog below, operator gets a clear + // "which phase" diagnostic for any future hang report. + process.stderr.write( + `[subagent:${name}] phase=session.prompt-entered taskLen=${promptTask.length} timeoutMs=${timeoutMs} noOutputMs=${noOutputTimeoutMs}\n`, + ); + const promptEnteredAt = phaseStartedAt(); + const promptWatchdog = phaseWatchdog("session.prompt", promptEnteredAt); + try { if (timeoutMs === 0 && noOutputTimeoutMs === 0 && !options?.signal) { // Fast path: no watchdog, no cancellation. @@ -288,6 +336,8 @@ export async function runSubagent( runExtensionHooks: false, }); await promptPromise; + clearTimeout(promptWatchdog); + logPhase("session.prompt-returned", promptEnteredAt); cleanup(); return { ok: true, output: extractFinalOutput(), exitCode: 0 }; } @@ -364,11 +414,18 @@ export async function runSubagent( const promptPromise = session.prompt(promptTask, { runExtensionHooks: false, }); - competitors.unshift(promptPromise.then(() => ({}) as RaceResult)); + competitors.unshift( + promptPromise.then(() => { + clearTimeout(promptWatchdog); + logPhase("session.prompt-returned", promptEnteredAt); + return {} as RaceResult; + }), + ); const result = await Promise.race(competitors); cleanup(); + clearTimeout(promptWatchdog); if (result.timedOut) { return {