From ae4ae8e8d8a17379a85e780ee31605a2ae6d3a52 Mon Sep 17 00:00:00 2001 From: Jeremy McSpadden Date: Mon, 16 Mar 2026 18:04:27 -0500 Subject: [PATCH] fix(auto): add stalled-tool detection and background process prompt guidance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two additional layers to address #733 (background command hang): 1. Stalled-tool detection in idle watchdog (auto.ts) - Change inFlightTools from Set to Map to track per-tool start timestamps - Idle watchdog now compares the oldest in-flight tool's age to the idle timeout. Tools in-flight for < idleTimeoutMs continue to suppress recovery as before. Tools running >= idleTimeoutMs are treated as stuck and recovery proceeds — preventing infinite hang when the bash rewrite is bypassed or a tool hangs for other reasons. - Export getOldestInFlightToolAgeMs() for testability 2. Prompt guidance in execute-task.md - Add explicit "Background process rule" to step 5 explaining why bare `command &` hangs the Bash tool and showing the correct `command > /dev/null 2>&1 &` pattern - Recommends bg_shell tool as the preferred approach 3. Test updates (in-flight-tool-tracking.test.ts) - Import and verify getOldestInFlightToolAgeMs export - Update header comment to reflect Map-with-timestamps design --- src/resources/extensions/gsd/auto.ts | 48 +++++++++++++++---- .../extensions/gsd/prompts/execute-task.md | 5 ++ .../gsd/tests/in-flight-tool-tracking.test.ts | 23 ++++++--- 3 files changed, 60 insertions(+), 16 deletions(-) diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index 87ef155f4..4eb7cbc44 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -337,8 +337,12 @@ let lastBaselineCharCount: number | undefined; /** SIGTERM handler registered while auto-mode is active — cleared on stop/pause. */ let _sigtermHandler: (() => void) | null = null; -/** Tool calls currently being executed — prevents false idle detection during long-running tools. */ -const inFlightTools = new Set(); +/** + * Tool calls currently being executed — prevents false idle detection during long-running tools. + * Maps toolCallId → start timestamp (ms) so the idle watchdog can detect tools that have been + * running suspiciously long (e.g., a Bash command hung because `&` kept stdout open). + */ +const inFlightTools = new Map(); type BudgetAlertLevel = 0 | 75 | 90 | 100; @@ -417,11 +421,11 @@ export function isAutoPaused(): boolean { /** * Mark a tool execution as in-flight. Called from index.ts on tool_execution_start. - * Prevents the idle watchdog from declaring the agent idle while tools are executing. + * Records start time so the idle watchdog can detect tools hung longer than the idle timeout. */ export function markToolStart(toolCallId: string): void { if (!active) return; - inFlightTools.add(toolCallId); + inFlightTools.set(toolCallId, Date.now()); } /** @@ -431,6 +435,16 @@ export function markToolEnd(toolCallId: string): void { inFlightTools.delete(toolCallId); } +/** + * Returns the age (ms) of the oldest currently in-flight tool, or 0 if none. + * Exported for testing. + */ +export function getOldestInFlightToolAgeMs(): number { + if (inFlightTools.size === 0) return 0; + const oldestStart = Math.min(...inFlightTools.values()); + return Date.now() - oldestStart; +} + /** * Return the base path to use for the auto.lock file. * Always uses the original project root (not the worktree) so that @@ -2844,13 +2858,27 @@ async function dispatchNextUnit( if (Date.now() - runtime.lastProgressAt < idleTimeoutMs) return; // Agent has tool calls currently executing (await_job, long bash, etc.) — - // not idle, just waiting for tool completion. + // not idle, just waiting for tool completion. But only suppress recovery + // if the tool started recently. A tool in-flight for longer than the idle + // timeout is likely stuck — e.g., `python -m http.server 8080 &` keeps the + // shell's stdout/stderr open, causing the Bash tool to hang indefinitely. if (inFlightTools.size > 0) { - writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, { - lastProgressAt: Date.now(), - lastProgressKind: "tool-in-flight", - }); - return; + const oldestStart = Math.min(...inFlightTools.values()); + const toolAgeMs = Date.now() - oldestStart; + if (toolAgeMs < idleTimeoutMs) { + writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, { + lastProgressAt: Date.now(), + lastProgressKind: "tool-in-flight", + }); + return; + } + // Oldest tool has been running >= idleTimeoutMs — treat as a stuck/hung + // tool (e.g., background process holding stdout open). Fall through to + // idle recovery without resetting the progress clock. + ctx.ui.notify( + `Stalled tool detected: a tool has been in-flight for ${Math.round(toolAgeMs / 60000)}min. Treating as hung — attempting idle recovery.`, + "warning", + ); } // Before triggering recovery, check if the agent is actually producing diff --git a/src/resources/extensions/gsd/prompts/execute-task.md b/src/resources/extensions/gsd/prompts/execute-task.md index 5e40368bd..8f4690f74 100644 --- a/src/resources/extensions/gsd/prompts/execute-task.md +++ b/src/resources/extensions/gsd/prompts/execute-task.md @@ -31,6 +31,11 @@ Then: 3. Build the real thing. If the task plan says "create login endpoint", build an endpoint that actually authenticates against a real store, not one that returns a hardcoded success response. If the task plan says "create dashboard page", build a page that renders real data from the API, not a component with hardcoded props. Stubs and mocks are for tests, not for the shipped feature. 4. Write or update tests as part of execution — tests are verification, not an afterthought. If the slice plan defines test files in its Verification section and this is the first task, create them (they should initially fail). 5. When implementing non-trivial runtime behavior (async flows, API boundaries, background processes, error paths), add or preserve agent-usable observability. Skip this for simple changes where it doesn't apply. + + **Background process rule:** Never use bare `command &` to run background processes. The shell's `&` operator leaves stdout/stderr attached to the parent, which causes the Bash tool to hang indefinitely waiting for those streams to close. Always redirect output before backgrounding: + - Correct: `command > /dev/null 2>&1 &` or `nohup command > /dev/null 2>&1 &` + - Example: `python -m http.server 8080 > /dev/null 2>&1 &` (NOT `python -m http.server 8080 &`) + - Preferred: use the `bg_shell` tool if available — it manages process lifecycle correctly without stream-inheritance issues 6. Verify must-haves are met by running concrete checks (tests, commands, observable behaviors) 7. Run the slice-level verification checks defined in the slice plan's Verification section. Track which pass. On the final task of the slice, all must pass before marking done. On intermediate tasks, partial passes are expected — note which ones pass in the summary. 8. If the task touches UI, browser flows, DOM behavior, or user-visible web state: diff --git a/src/resources/extensions/gsd/tests/in-flight-tool-tracking.test.ts b/src/resources/extensions/gsd/tests/in-flight-tool-tracking.test.ts index 9e80f00c7..b39fbce93 100644 --- a/src/resources/extensions/gsd/tests/in-flight-tool-tracking.test.ts +++ b/src/resources/extensions/gsd/tests/in-flight-tool-tracking.test.ts @@ -1,6 +1,6 @@ /** * In-flight tool tracking tests — verifies that markToolStart/markToolEnd - * correctly manage the in-flight tools set used by the idle watchdog to + * correctly manage the in-flight tools map used by the idle watchdog to * distinguish "agent waiting on long-running tool" from "agent is idle". * * Background: The idle watchdog checks every 15s for agent progress. Without @@ -8,12 +8,15 @@ * can run 20+ minutes for evaluations, deployments, test suites) are falsely * declared idle and interrupted by recovery steering messages. * - * The fix hooks tool_execution_start/end events to track active tool calls. - * When tools are in-flight, the watchdog resets lastProgressAt instead of - * triggering idle recovery. + * The fix hooks tool_execution_start/end events to track active tool calls + * with start timestamps. When tools are in-flight and started recently + * (< idleTimeoutMs), the watchdog resets lastProgressAt instead of triggering + * idle recovery. When a tool has been in-flight for longer than idleTimeoutMs, + * it is treated as stuck (e.g., `command &` keeping stdout open) and recovery + * proceeds anyway. */ -import { markToolStart, markToolEnd, isAutoActive } from "../auto.ts"; +import { markToolStart, markToolEnd, isAutoActive, getOldestInFlightToolAgeMs } from "../auto.ts"; import { createTestContext } from './test-helpers.ts'; const { assertEq, assertTrue, report } = createTestContext(); @@ -49,9 +52,17 @@ const { assertEq, assertTrue, report } = createTestContext(); // ═══ Integration contract: expected exports from auto.ts ═════════════════════ { - console.log("\n=== auto.ts exports markToolStart and markToolEnd ==="); + console.log("\n=== auto.ts exports markToolStart, markToolEnd, and getOldestInFlightToolAgeMs ==="); assertEq(typeof markToolStart, "function", "markToolStart should be a function"); assertEq(typeof markToolEnd, "function", "markToolEnd should be a function"); + assertEq(typeof getOldestInFlightToolAgeMs, "function", "getOldestInFlightToolAgeMs should be a function"); +} + +{ + console.log("\n=== getOldestInFlightToolAgeMs: returns 0 when no tools in-flight ==="); + // When auto-mode is inactive, inFlightTools map is empty → age is 0 + const age = getOldestInFlightToolAgeMs(); + assertEq(age, 0, "should return 0 when no tools are in-flight"); } {