fix(headless): split idle-timeout role from deadlock-backstop role

The single IDLE_TIMEOUT_MS constant was conflating two different jobs:
"are we done?" vs "is the agent stuck?". For multi-turn commands (auto,
next, discuss, plan), the first question is wrong — those signal
completion explicitly via "auto-mode stopped" terminal notifications,
and child-process exit catches crashes. The 120s I'd just bumped
multi-turn to was still in idle-detection mindset; that's not what we
need from this timer.

New semantics:
- IDLE_TIMEOUT_MS = 15s — quick commands (status, queue, …); idle
  really does mean done.
- NEW_MILESTONE_IDLE_TIMEOUT_MS = 120s — bounded creative task with
  pauses for thinking between bootstrap steps.
- MULTI_TURN_DEADLOCK_BACKSTOP_MS = 30 minutes — auto/next/discuss/plan.
  Not a "done" detector; a deadlock recovery bound. Long enough to
  never bother slow LLM reasoning or chained tool calls; short enough
  to recover from a true hang within a reasonable window. Real
  completion comes from terminal notifications + child-process exit,
  both already wired.

Code reads cleaner too: effectiveIdleTimeout selection now mirrors the
three-way conceptual split.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
This commit is contained in:
Mikael Hugo 2026-04-29 15:18:58 +02:00
parent c239ad6c9d
commit df614a3e47
2 changed files with 135 additions and 28 deletions

View file

@ -65,11 +65,37 @@ export function mapStatusToExitCode(status: string): number {
* Blocked detection is separate checked via isBlockedNotification.
*/
export const TERMINAL_PREFIXES = ["auto-mode stopped", "step-mode stopped"];
/**
* Idle timeout for short, single-shot commands (status, queue, history, etc.).
* For these, "no events for 15s after a tool call" really does mean done.
*/
export const IDLE_TIMEOUT_MS = 15_000;
// new-milestone is a long-running creative task where the LLM may pause
// between tool calls (e.g. after mkdir, before writing files). Use a
// longer idle timeout to avoid killing the session prematurely (#808).
/**
* Idle timeout for new-milestone bounded creative task where the LLM may
* pause between tool calls (e.g. after mkdir, before writing files). 120s is
* enough buffer for typical LLM thinking on a one-shot setup workflow (#808).
*/
export const NEW_MILESTONE_IDLE_TIMEOUT_MS = 120_000;
/**
* Deadlock backstop for long-running multi-turn commands (auto, next,
* discuss, plan). The role here is NOT idle-detection ("are we done?")
* those commands signal completion explicitly via "auto-mode stopped" /
* "step-mode stopped" terminal notifications, and the agent's child-process
* exit catches crashes. The only remaining failure mode is a truly hung
* process (deadlock, network stuck without retry, infinite reasoning loop
* outside the LLM's awareness). 30 minutes is long enough to never misfire
* on legitimate slow LLM thinking or chained tool calls, but short enough
* to recover from a real deadlock within a reasonable bound.
*
* Symptom from the old 15s timeout: sf headless auto would dispatch a task,
* the LLM would make 1-2 tool calls, pause to reason, exceed 15s of "no
* events", and headless would declare "Status: complete" exiting at ~35s
* with the task barely started.
*/
export const MULTI_TURN_DEADLOCK_BACKSTOP_MS = 1_800_000;
const INTERACTIVE_HEADLESS_TOOLS = new Set([
"ask_user_questions",
"secure_env_collect",

View file

@ -30,6 +30,7 @@ import {
EXIT_SUCCESS,
FIRE_AND_FORGET_METHODS,
IDLE_TIMEOUT_MS,
MULTI_TURN_DEADLOCK_BACKSTOP_MS,
isBlockedNotification,
isInteractiveHeadlessTool,
isMilestoneReadyNotification,
@ -116,6 +117,38 @@ interface TrackedEvent {
detail?: string;
}
interface HeadlessUnitNotification {
kind: "start" | "end";
unitType: string;
unitId: string;
verdict?: string;
}
export function parseHeadlessUnitNotification(
message: string,
): HeadlessUnitNotification | null {
const start = message.match(/\[unit\]\s+([\w-]+)\s+(\S+)\s+starting/);
if (start) {
return {
kind: "start",
unitType: start[1],
unitId: start[2],
};
}
const end = message.match(/\[unit\]\s+([\w-]+)\s+(\S+)\s+ended\s*->\s*(\w+)/);
if (end) {
return {
kind: "end",
unitType: end[1],
unitId: end[2],
verdict: end[3],
};
}
return null;
}
// ---------------------------------------------------------------------------
// Resume Session Resolution
// ---------------------------------------------------------------------------
@ -552,6 +585,9 @@ async function runHeadlessOnce(
let traceActive = false; // true once maybeStartTrace succeeded
// Current unit span — tool spans are children of this
let activeUnitSpan: ReturnType<typeof startUnitSpan> | null = null;
let activeHeadlessUnit:
| { unitType: string; unitId: string; startedMessage: string }
| null = null;
// Map from tool call ID to its tool span (for matching start/end)
const toolSpanByCallId = new Map<string, ReturnType<typeof startToolSpan>>();
// Tracks pending tool_execution_start for which we haven't seen toolName yet
@ -597,11 +633,12 @@ async function runHeadlessOnce(
* Matches: "[unit] milestone M001 starting" or "[unit] slice M001/S01 starting" etc.
*/
function handleUnitStart(message: string): void {
const match = message.match(/\[unit\]\s+(\w+)\s+(\S+)\s+starting/);
if (!match) return;
const unitType = match[1] as "milestone" | "slice" | "task";
const unitId = match[2];
activeUnitSpan = startUnitSpan(unitType, unitId);
const parsed = parseHeadlessUnitNotification(message);
if (!parsed || parsed.kind !== "start") return;
activeUnitSpan = startUnitSpan(
parsed.unitType as "milestone" | "slice" | "task",
parsed.unitId,
);
}
/**
@ -609,12 +646,10 @@ async function runHeadlessOnce(
* Matches: "[unit] milestone M001 ended -> ok" etc.
*/
function handleUnitEnd(message: string): void {
const match = message.match(
/\[unit\]\s+(\w+)\s+(\S+)\s+ended\s*->\s*(\w+)/,
);
if (!match) return;
const unitId = match[2];
const verdict = match[3];
const parsed = parseHeadlessUnitNotification(message);
if (!parsed || parsed.kind !== "end") return;
const unitId = parsed.unitId;
const verdict = parsed.verdict ?? "error";
// Find the unit span by ID (may not be the top-of-stack if nested)
if (!activeUnitSpan) return;
@ -642,6 +677,28 @@ async function runHeadlessOnce(
activeUnitSpan = null;
}
function trackHeadlessUnitNotification(message: string): void {
const parsed = parseHeadlessUnitNotification(message);
if (!parsed) return;
if (parsed.kind === "start") {
activeHeadlessUnit = {
unitType: parsed.unitType,
unitId: parsed.unitId,
startedMessage: message,
};
return;
}
if (
activeHeadlessUnit &&
activeHeadlessUnit.unitType === parsed.unitType &&
activeHeadlessUnit.unitId === parsed.unitId
) {
activeHeadlessUnit = null;
}
}
/**
* Handle tool_execution_start: create a tool span under the active unit (or root if no unit active).
*/
@ -753,16 +810,22 @@ async function runHeadlessOnce(
resolveCompletion = resolve;
});
// Idle timeout — fallback completion detection.
// Multi-turn commands (auto, next, discuss, plan) involve LLM thinking
// pauses between tool calls that can exceed 15s. Use the longer
// new-milestone timeout for those too — without this, headless killed
// auto-mode after 2 tool calls when the LLM paused to reason about the
// next step (~35s total run, exit "complete" while task barely started).
// Idle timeout — three roles depending on command type:
// - Quick commands (status, queue, …): genuine "are we done?" detector.
// 15s after a tool call without further events = done. (IDLE_TIMEOUT_MS)
// - new-milestone: bounded creative task; 120s buffer for LLM thinking
// between bootstrap steps. (NEW_MILESTONE_IDLE_TIMEOUT_MS)
// - Multi-turn (auto, next, discuss, plan): NOT a completion detector —
// those signal done via "auto-mode stopped" terminal notifications,
// and child-process exit catches crashes. The idle timer here is a
// deadlock BACKSTOP only: 30 minutes, long enough to never misfire on
// legitimate LLM reasoning, short enough to recover from a real hang.
// (MULTI_TURN_DEADLOCK_BACKSTOP_MS)
let idleTimer: ReturnType<typeof setTimeout> | null = null;
const effectiveIdleTimeout =
isNewMilestone || isMultiTurnCommand
? NEW_MILESTONE_IDLE_TIMEOUT_MS
const effectiveIdleTimeout = isNewMilestone
? NEW_MILESTONE_IDLE_TIMEOUT_MS
: isMultiTurnCommand
? MULTI_TURN_DEADLOCK_BACKSTOP_MS
: IDLE_TIMEOUT_MS;
function resetIdleTimer(): void {
@ -1040,12 +1103,15 @@ async function runHeadlessOnce(
}
// Structured trace: handle unit start/end notify messages
if (traceActive && eventObj.method === "notify") {
if (eventObj.method === "notify") {
const message = String(eventObj.message ?? "");
if (message.includes("[unit]") && message.includes("starting")) {
handleUnitStart(message);
} else if (message.includes("[unit]") && message.includes("ended")) {
handleUnitEnd(message);
trackHeadlessUnitNotification(message);
if (traceActive) {
if (message.includes("[unit]") && message.includes("starting")) {
handleUnitStart(message);
} else if (message.includes("[unit]") && message.includes("ended")) {
handleUnitEnd(message);
}
}
}
@ -1305,6 +1371,21 @@ async function runHeadlessOnce(
}
}
if (
isAutoMode &&
exitCode === EXIT_SUCCESS &&
!blocked &&
activeHeadlessUnit
) {
process.stderr.write(
`[headless] Error: Auto-mode ended while ${activeHeadlessUnit.unitType} ${activeHeadlessUnit.unitId} was still in progress.\n`,
);
process.stderr.write(
"[headless] Treating this as incomplete instead of complete; resume with `sf headless auto` after inspecting the worktree.\n",
);
exitCode = EXIT_ERROR;
}
// Cleanup
if (timeoutTimer) clearTimeout(timeoutTimer);
if (idleTimer) clearTimeout(idleTimer);