diag(subagent-runner): per-phase timing + stuck-watchdog for sf-mp8e02m1-zpk903
Some checks are pending
CI / detect-changes (push) Waiting to run
CI / docs-check (push) Blocked by required conditions
CI / lint (push) Blocked by required conditions
CI / build (push) Blocked by required conditions
CI / integration-tests (push) Blocked by required conditions
CI / windows-portability (push) Blocked by required conditions
CI / rtk-portability (linux, blacksmith-4vcpu-ubuntu-2404) (push) Blocked by required conditions
CI / rtk-portability (macos, macos-15) (push) Blocked by required conditions
CI / rtk-portability (windows, blacksmith-4vcpu-windows-2025) (push) Blocked by required conditions

Adds visible diagnostics to runSubagent so the next time the
"session initialized but no LLM call" bug fires, the log identifies
which setup phase hangs.

Phases instrumented:
  - resourceLoader.reload()
  - createAgentSession()
  - bindExtensions(runLifecycle=...)
  - session.prompt() entry → return

Output format (stderr, prefixed with [subagent:<name>]):
  phase=resourceLoader.reload 23ms
  phase=createAgentSession 142ms
  phase=bindExtensions 89ms runLifecycle=true
  phase=session.prompt-entered taskLen=8421 timeoutMs=480000 noOutputMs=180000
  phase=session.prompt-returned 16234ms          ← normal completion
  STUCK phase=<X> 10000ms (no completion signal ...)   ← when watchdog fires

Each phase has a soft 10s watchdog that emits a STUCK line if the
await doesn't complete in time. The watchdog never aborts — just
surfaces visibility. Existing timeoutMs / noOutputTimeoutMs handle
actual termination.

This is investigation infrastructure for the third prompt-never-sent
seam (coding-agent/subagent-runner). The agent-runner.js seam
(sf-mp8g4rcd-w01tkh) was fixed in commit 8ee4d8358 with bounded
retries. This commit doesn't fix the underlying bug — it makes the
bug self-reporting next time it fires so operator and autonomous
loop both get actionable signal.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-05-16 20:40:17 +02:00
parent 8ee4d83581
commit 67e5ac9db1

View file

@ -130,6 +130,26 @@ export async function runSubagent(
const timeoutMs = options?.timeoutMs ?? DEFAULT_SUBAGENT_TIMEOUT_MS;
const noOutputTimeoutMs = options?.noOutputTimeoutMs ?? 0;
// Per-phase timing diagnostics for sf-mp8e02m1-zpk903: the prompt-never-sent
// bug surfaces here as "session initialized but no LLM call". Each phase
// logs start/end so when an iteration silently hangs, the log identifies
// which step is stuck. Also wraps each await in a soft watchdog that emits
// a "STUCK in phase X" warning after 10s — no abort, just visibility so
// the auto-supervisor can correlate with idle-halt errors.
const phaseStartedAt = (): number => Date.now();
const logPhase = (phase: string, startedAt: number, extra?: string) => {
const ms = Date.now() - startedAt;
const tail = extra ? ` ${extra}` : "";
process.stderr.write(`[subagent:${name}] phase=${phase} ${ms}ms${tail}\n`);
};
const phaseWatchdog = (phase: string, startedAt: number): NodeJS.Timeout =>
setTimeout(() => {
const ms = Date.now() - startedAt;
process.stderr.write(
`[subagent:${name}] STUCK phase=${phase} ${ms}ms (no completion signal — see sf-mp8e02m1-zpk903 for context)\n`,
);
}, 10_000);
// Build an isolated resource loader with the caller's system prompt appended.
const agentDir = getAgentDir();
const settingsManager = SettingsManager.create(cwd, agentDir);
@ -139,10 +159,16 @@ export async function runSubagent(
settingsManager,
appendSystemPrompt: config.systemPrompt,
});
let phaseAt = phaseStartedAt();
let watchdog = phaseWatchdog("resourceLoader.reload", phaseAt);
await resourceLoader.reload();
clearTimeout(watchdog);
logPhase("resourceLoader.reload", phaseAt);
// Create a fresh in-memory session so it never touches the user's session files.
// Tool filtering is applied below via setActiveToolsByName after session creation.
phaseAt = phaseStartedAt();
watchdog = phaseWatchdog("createAgentSession", phaseAt);
const { session, modelFallbackMessage } = await createAgentSession({
cwd,
agentDir,
@ -151,6 +177,8 @@ export async function runSubagent(
settingsManager,
persistModelChanges: false,
});
clearTimeout(watchdog);
logPhase("createAgentSession", phaseAt);
if (modelFallbackMessage) {
// Non-fatal: log but continue with whatever model was resolved.
@ -202,6 +230,8 @@ export async function runSubagent(
if (config.permissionLevel) {
process.env.SF_PERMISSION_LEVEL = config.permissionLevel;
}
phaseAt = phaseStartedAt();
watchdog = phaseWatchdog("bindExtensions", phaseAt);
try {
await session.bindExtensions({
uiContext: createSubagentUIContext(),
@ -212,6 +242,12 @@ export async function runSubagent(
},
runLifecycle: Boolean(config.permissionLevel),
});
clearTimeout(watchdog);
logPhase(
"bindExtensions",
phaseAt,
`runLifecycle=${Boolean(config.permissionLevel)}`,
);
} finally {
if (config.permissionLevel) {
if (previousPermissionLevel === undefined) {
@ -281,6 +317,18 @@ export async function runSubagent(
const promptTask = `Task: ${task}`;
// Final diagnostic point: about to call session.prompt. If this never logs
// "phase=session.prompt-returned" and the subagent hangs, the LLM dispatch
// path itself is the silent abort seam (likely a runExtensionHooks=false
// path A/D bypass via slash/sf-route prefix expansion, or a model-dispatch
// hang). Combined with the no-output watchdog below, operator gets a clear
// "which phase" diagnostic for any future hang report.
process.stderr.write(
`[subagent:${name}] phase=session.prompt-entered taskLen=${promptTask.length} timeoutMs=${timeoutMs} noOutputMs=${noOutputTimeoutMs}\n`,
);
const promptEnteredAt = phaseStartedAt();
const promptWatchdog = phaseWatchdog("session.prompt", promptEnteredAt);
try {
if (timeoutMs === 0 && noOutputTimeoutMs === 0 && !options?.signal) {
// Fast path: no watchdog, no cancellation.
@ -288,6 +336,8 @@ export async function runSubagent(
runExtensionHooks: false,
});
await promptPromise;
clearTimeout(promptWatchdog);
logPhase("session.prompt-returned", promptEnteredAt);
cleanup();
return { ok: true, output: extractFinalOutput(), exitCode: 0 };
}
@ -364,11 +414,18 @@ export async function runSubagent(
const promptPromise = session.prompt(promptTask, {
runExtensionHooks: false,
});
competitors.unshift(promptPromise.then(() => ({}) as RaceResult));
competitors.unshift(
promptPromise.then(() => {
clearTimeout(promptWatchdog);
logPhase("session.prompt-returned", promptEnteredAt);
return {} as RaceResult;
}),
);
const result = await Promise.race(competitors);
cleanup();
clearTimeout(promptWatchdog);
if (result.timedOut) {
return {