fix: idle watchdog stalled-tool detection overridden by filesystem activity (#2697)

Bug 1: When a tool stalls longer than idle_timeout, the watchdog notifies
but falls through to detectWorkingTreeActivity(), which resets
lastProgressAt when files were modified earlier in the task. Recovery is
never called — the session burns tokens indefinitely.

Fix: Add stalledToolDetected flag + clearInFlightTools() call. The
filesystem-activity check is guarded by !stalledToolDetected so it
cannot override the stall verdict.

Bug 2: After async recoverTimedOutUnit(), pauseAuto/stopAuto may set
s.currentUnit = null during the await, but the next line accesses
s.currentUnit.startedAt without a null guard — crash.

Fix: Add null guard for s.currentUnit after the recovery call.

Closes #2527
This commit is contained in:
mastertyko 2026-03-26 23:14:09 +01:00 committed by GitHub
parent f2113f1353
commit 11b38b8bb7
2 changed files with 140 additions and 1 deletions

View file

@ -15,6 +15,7 @@ import { computeBudgets, resolveExecutorContextWindow } from "./context-budget.j
import {
getInFlightToolCount,
getOldestInFlightToolStart,
clearInFlightTools,
} from "./auto-tool-tracking.js";
import { detectWorkingTreeActivity } from "./auto-supervisor.js";
import { closeoutUnit, type CloseoutOptions } from "./auto-unit-closeout.js";
@ -146,6 +147,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
// Agent has tool calls currently executing — not idle, just waiting.
// But only suppress recovery if the tool started recently.
let stalledToolDetected = false;
if (getInFlightToolCount() > 0) {
const oldestStart = getOldestInFlightToolStart()!;
const toolAgeMs = Date.now() - oldestStart;
@ -156,6 +158,12 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
});
return;
}
// Tool has been in-flight longer than idle timeout — treat as hung.
// Clear the stale entries so subsequent ticks don't re-detect them,
// and set the flag so the filesystem-activity check below does not
// override the stall verdict (#2527).
stalledToolDetected = true;
clearInFlightTools();
ctx.ui.notify(
`Stalled tool detected: a tool has been in-flight for ${Math.round(toolAgeMs / 60000)}min. Treating as hung — attempting idle recovery.`,
"warning",
@ -163,7 +171,9 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
}
// Check if the agent is producing work on disk.
if (detectWorkingTreeActivity(s.basePath)) {
// Skip this when a stalled tool was just detected — filesystem changes
// from earlier in the task should not override the stall verdict (#2527).
if (!stalledToolDetected && detectWorkingTreeActivity(s.basePath)) {
writeUnitRuntimeRecord(s.basePath, unitType, unitId, s.currentUnit.startedAt, {
lastProgressAt: Date.now(),
lastProgressKind: "filesystem-activity",
@ -180,6 +190,10 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
const recovery = await recoverTimedOutUnit(ctx, pi, unitType, unitId, "idle", buildRecoveryContext());
if (recovery === "recovered") return;
// Guard: recoverTimedOutUnit is async — pauseAuto/stopAuto may have
// set s.currentUnit = null during the await (#2527).
if (!s.currentUnit) return;
writeUnitRuntimeRecord(s.basePath, unitType, unitId, s.currentUnit.startedAt, {
phase: "paused",
});

View file

@ -0,0 +1,125 @@
/**
* Regression tests for #2527: idle watchdog stalled-tool detection.
*
* Bug 1: When a tool is stalled longer than idle_timeout, the watchdog
* notifies but falls through to detectWorkingTreeActivity(), which
* resets lastProgressAt if files were modified earlier. Recovery is
* never called the session burns tokens indefinitely.
*
* Bug 2: After async recoverTimedOutUnit(), pauseAuto/stopAuto may set
* s.currentUnit = null, but the next line accesses .startedAt crash.
*
* These tests verify the auto-timers.ts source contains the structural
* fixes: the stalledToolDetected flag, clearInFlightTools() call, the
* filesystem-check guard, and the null guard after recovery.
*/
import { readFileSync } from "node:fs";
import { join } from "node:path";
import { test, describe } from "node:test";
import assert from "node:assert/strict";
const TIMERS_SRC = readFileSync(
join(import.meta.dirname, "..", "auto-timers.ts"),
"utf-8",
);
// ═══ Bug 1: stalledToolDetected flag prevents filesystem-activity override ═══
describe("#2527 Bug 1: stalled tool should not be overridden by filesystem activity", () => {
test("auto-timers.ts imports clearInFlightTools", () => {
assert.ok(
TIMERS_SRC.includes("clearInFlightTools"),
"clearInFlightTools must be imported from auto-tool-tracking",
);
});
test("auto-timers.ts declares stalledToolDetected flag", () => {
assert.ok(
TIMERS_SRC.includes("stalledToolDetected"),
"stalledToolDetected flag must exist in idle watchdog",
);
});
test("stalled tool sets flag to true", () => {
// The flag must be set before the filesystem check
const flagSet = TIMERS_SRC.indexOf("stalledToolDetected = true");
assert.ok(flagSet > -1, "stalledToolDetected must be set to true when tool is stalled");
const notify = TIMERS_SRC.indexOf("Stalled tool detected:");
assert.ok(flagSet < notify, "flag must be set before the stall notification");
});
test("stalled tool calls clearInFlightTools", () => {
// clearInFlightTools() must be called when tool is stalled, so subsequent
// watchdog ticks don't re-detect the same stale entries
const clearCall = TIMERS_SRC.indexOf("clearInFlightTools()");
assert.ok(clearCall > -1, "clearInFlightTools() must be called when tool is stalled");
const flagSet = TIMERS_SRC.indexOf("stalledToolDetected = true");
assert.ok(
Math.abs(clearCall - flagSet) < 200,
"clearInFlightTools() should be near stalledToolDetected = true",
);
});
test("filesystem-activity check is guarded by stalledToolDetected", () => {
// The detectWorkingTreeActivity check must be skipped when stalledToolDetected is true
assert.ok(
TIMERS_SRC.includes("!stalledToolDetected && detectWorkingTreeActivity"),
"detectWorkingTreeActivity must be guarded by !stalledToolDetected",
);
});
test("control flow: stalled tool → skip filesystem check → reach recovery", () => {
// Verify the structural ordering: flag declaration → stall block → guarded fs check → recovery
const flagDecl = TIMERS_SRC.indexOf("let stalledToolDetected = false");
const stallBlock = TIMERS_SRC.indexOf("stalledToolDetected = true");
const fsGuard = TIMERS_SRC.indexOf("!stalledToolDetected && detectWorkingTreeActivity");
const recovery = TIMERS_SRC.indexOf("recoverTimedOutUnit(ctx, pi, unitType, unitId, \"idle\"");
assert.ok(flagDecl > -1, "flag declaration must exist");
assert.ok(flagDecl < stallBlock, "flag declared before stall block");
assert.ok(stallBlock < fsGuard, "stall block before filesystem guard");
assert.ok(fsGuard < recovery, "filesystem guard before recovery call");
});
});
// ═══ Bug 2: null guard after async recoverTimedOutUnit ═══════════════════════
describe("#2527 Bug 2: null guard after async recovery prevents crash", () => {
test("idle watchdog has null guard after recoverTimedOutUnit", () => {
// Find the idle recovery call
const idleRecovery = TIMERS_SRC.indexOf(
'recoverTimedOutUnit(ctx, pi, unitType, unitId, "idle"',
);
assert.ok(idleRecovery > -1, "idle recovery call must exist");
// The null guard must appear between the recovery call and the next
// writeUnitRuntimeRecord that accesses s.currentUnit.startedAt
const afterRecovery = TIMERS_SRC.slice(idleRecovery, idleRecovery + 400);
assert.ok(
afterRecovery.includes("if (!s.currentUnit) return"),
"null guard for s.currentUnit must exist after idle recoverTimedOutUnit",
);
});
test("null guard is between recovery and writeUnitRuntimeRecord", () => {
const idleRecovery = TIMERS_SRC.indexOf(
'recoverTimedOutUnit(ctx, pi, unitType, unitId, "idle"',
);
const afterRecovery = TIMERS_SRC.slice(idleRecovery);
const recoveredReturn = afterRecovery.indexOf('if (recovery === "recovered") return');
const nullGuard = afterRecovery.indexOf("if (!s.currentUnit) return");
const writeRecord = afterRecovery.indexOf("writeUnitRuntimeRecord(s.basePath");
assert.ok(recoveredReturn > -1, "recovered return must exist");
assert.ok(nullGuard > -1, "null guard must exist");
assert.ok(writeRecord > -1, "writeUnitRuntimeRecord must exist after recovery");
assert.ok(
recoveredReturn < nullGuard && nullGuard < writeRecord,
"order must be: recovered-return → null-guard → writeUnitRuntimeRecord",
);
});
});