fix: idle watchdog stalled-tool detection overridden by filesystem activity (#2697)
Bug 1: When a tool stalls longer than idle_timeout, the watchdog notifies but falls through to detectWorkingTreeActivity(), which resets lastProgressAt when files were modified earlier in the task. Recovery is never called — the session burns tokens indefinitely. Fix: Add stalledToolDetected flag + clearInFlightTools() call. The filesystem-activity check is guarded by !stalledToolDetected so it cannot override the stall verdict. Bug 2: After async recoverTimedOutUnit(), pauseAuto/stopAuto may set s.currentUnit = null during the await, but the next line accesses s.currentUnit.startedAt without a null guard — crash. Fix: Add null guard for s.currentUnit after the recovery call. Closes #2527
This commit is contained in:
parent
f2113f1353
commit
11b38b8bb7
2 changed files with 140 additions and 1 deletions
|
|
@ -15,6 +15,7 @@ import { computeBudgets, resolveExecutorContextWindow } from "./context-budget.j
|
|||
import {
|
||||
getInFlightToolCount,
|
||||
getOldestInFlightToolStart,
|
||||
clearInFlightTools,
|
||||
} from "./auto-tool-tracking.js";
|
||||
import { detectWorkingTreeActivity } from "./auto-supervisor.js";
|
||||
import { closeoutUnit, type CloseoutOptions } from "./auto-unit-closeout.js";
|
||||
|
|
@ -146,6 +147,7 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
|
|||
|
||||
// Agent has tool calls currently executing — not idle, just waiting.
|
||||
// But only suppress recovery if the tool started recently.
|
||||
let stalledToolDetected = false;
|
||||
if (getInFlightToolCount() > 0) {
|
||||
const oldestStart = getOldestInFlightToolStart()!;
|
||||
const toolAgeMs = Date.now() - oldestStart;
|
||||
|
|
@ -156,6 +158,12 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
|
|||
});
|
||||
return;
|
||||
}
|
||||
// Tool has been in-flight longer than idle timeout — treat as hung.
|
||||
// Clear the stale entries so subsequent ticks don't re-detect them,
|
||||
// and set the flag so the filesystem-activity check below does not
|
||||
// override the stall verdict (#2527).
|
||||
stalledToolDetected = true;
|
||||
clearInFlightTools();
|
||||
ctx.ui.notify(
|
||||
`Stalled tool detected: a tool has been in-flight for ${Math.round(toolAgeMs / 60000)}min. Treating as hung — attempting idle recovery.`,
|
||||
"warning",
|
||||
|
|
@ -163,7 +171,9 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
|
|||
}
|
||||
|
||||
// Check if the agent is producing work on disk.
|
||||
if (detectWorkingTreeActivity(s.basePath)) {
|
||||
// Skip this when a stalled tool was just detected — filesystem changes
|
||||
// from earlier in the task should not override the stall verdict (#2527).
|
||||
if (!stalledToolDetected && detectWorkingTreeActivity(s.basePath)) {
|
||||
writeUnitRuntimeRecord(s.basePath, unitType, unitId, s.currentUnit.startedAt, {
|
||||
lastProgressAt: Date.now(),
|
||||
lastProgressKind: "filesystem-activity",
|
||||
|
|
@ -180,6 +190,10 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
|
|||
const recovery = await recoverTimedOutUnit(ctx, pi, unitType, unitId, "idle", buildRecoveryContext());
|
||||
if (recovery === "recovered") return;
|
||||
|
||||
// Guard: recoverTimedOutUnit is async — pauseAuto/stopAuto may have
|
||||
// set s.currentUnit = null during the await (#2527).
|
||||
if (!s.currentUnit) return;
|
||||
|
||||
writeUnitRuntimeRecord(s.basePath, unitType, unitId, s.currentUnit.startedAt, {
|
||||
phase: "paused",
|
||||
});
|
||||
|
|
|
|||
|
|
@ -0,0 +1,125 @@
|
|||
/**
|
||||
* Regression tests for #2527: idle watchdog stalled-tool detection.
|
||||
*
|
||||
* Bug 1: When a tool is stalled longer than idle_timeout, the watchdog
|
||||
* notifies but falls through to detectWorkingTreeActivity(), which
|
||||
* resets lastProgressAt if files were modified earlier. Recovery is
|
||||
* never called — the session burns tokens indefinitely.
|
||||
*
|
||||
* Bug 2: After async recoverTimedOutUnit(), pauseAuto/stopAuto may set
|
||||
* s.currentUnit = null, but the next line accesses .startedAt — crash.
|
||||
*
|
||||
* These tests verify the auto-timers.ts source contains the structural
|
||||
* fixes: the stalledToolDetected flag, clearInFlightTools() call, the
|
||||
* filesystem-check guard, and the null guard after recovery.
|
||||
*/
|
||||
|
||||
import { readFileSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { test, describe } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
|
||||
const TIMERS_SRC = readFileSync(
|
||||
join(import.meta.dirname, "..", "auto-timers.ts"),
|
||||
"utf-8",
|
||||
);
|
||||
|
||||
// ═══ Bug 1: stalledToolDetected flag prevents filesystem-activity override ═══
|
||||
|
||||
describe("#2527 Bug 1: stalled tool should not be overridden by filesystem activity", () => {
|
||||
test("auto-timers.ts imports clearInFlightTools", () => {
|
||||
assert.ok(
|
||||
TIMERS_SRC.includes("clearInFlightTools"),
|
||||
"clearInFlightTools must be imported from auto-tool-tracking",
|
||||
);
|
||||
});
|
||||
|
||||
test("auto-timers.ts declares stalledToolDetected flag", () => {
|
||||
assert.ok(
|
||||
TIMERS_SRC.includes("stalledToolDetected"),
|
||||
"stalledToolDetected flag must exist in idle watchdog",
|
||||
);
|
||||
});
|
||||
|
||||
test("stalled tool sets flag to true", () => {
|
||||
// The flag must be set before the filesystem check
|
||||
const flagSet = TIMERS_SRC.indexOf("stalledToolDetected = true");
|
||||
assert.ok(flagSet > -1, "stalledToolDetected must be set to true when tool is stalled");
|
||||
|
||||
const notify = TIMERS_SRC.indexOf("Stalled tool detected:");
|
||||
assert.ok(flagSet < notify, "flag must be set before the stall notification");
|
||||
});
|
||||
|
||||
test("stalled tool calls clearInFlightTools", () => {
|
||||
// clearInFlightTools() must be called when tool is stalled, so subsequent
|
||||
// watchdog ticks don't re-detect the same stale entries
|
||||
const clearCall = TIMERS_SRC.indexOf("clearInFlightTools()");
|
||||
assert.ok(clearCall > -1, "clearInFlightTools() must be called when tool is stalled");
|
||||
|
||||
const flagSet = TIMERS_SRC.indexOf("stalledToolDetected = true");
|
||||
assert.ok(
|
||||
Math.abs(clearCall - flagSet) < 200,
|
||||
"clearInFlightTools() should be near stalledToolDetected = true",
|
||||
);
|
||||
});
|
||||
|
||||
test("filesystem-activity check is guarded by stalledToolDetected", () => {
|
||||
// The detectWorkingTreeActivity check must be skipped when stalledToolDetected is true
|
||||
assert.ok(
|
||||
TIMERS_SRC.includes("!stalledToolDetected && detectWorkingTreeActivity"),
|
||||
"detectWorkingTreeActivity must be guarded by !stalledToolDetected",
|
||||
);
|
||||
});
|
||||
|
||||
test("control flow: stalled tool → skip filesystem check → reach recovery", () => {
|
||||
// Verify the structural ordering: flag declaration → stall block → guarded fs check → recovery
|
||||
const flagDecl = TIMERS_SRC.indexOf("let stalledToolDetected = false");
|
||||
const stallBlock = TIMERS_SRC.indexOf("stalledToolDetected = true");
|
||||
const fsGuard = TIMERS_SRC.indexOf("!stalledToolDetected && detectWorkingTreeActivity");
|
||||
const recovery = TIMERS_SRC.indexOf("recoverTimedOutUnit(ctx, pi, unitType, unitId, \"idle\"");
|
||||
|
||||
assert.ok(flagDecl > -1, "flag declaration must exist");
|
||||
assert.ok(flagDecl < stallBlock, "flag declared before stall block");
|
||||
assert.ok(stallBlock < fsGuard, "stall block before filesystem guard");
|
||||
assert.ok(fsGuard < recovery, "filesystem guard before recovery call");
|
||||
});
|
||||
});
|
||||
|
||||
// ═══ Bug 2: null guard after async recoverTimedOutUnit ═══════════════════════
|
||||
|
||||
describe("#2527 Bug 2: null guard after async recovery prevents crash", () => {
|
||||
test("idle watchdog has null guard after recoverTimedOutUnit", () => {
|
||||
// Find the idle recovery call
|
||||
const idleRecovery = TIMERS_SRC.indexOf(
|
||||
'recoverTimedOutUnit(ctx, pi, unitType, unitId, "idle"',
|
||||
);
|
||||
assert.ok(idleRecovery > -1, "idle recovery call must exist");
|
||||
|
||||
// The null guard must appear between the recovery call and the next
|
||||
// writeUnitRuntimeRecord that accesses s.currentUnit.startedAt
|
||||
const afterRecovery = TIMERS_SRC.slice(idleRecovery, idleRecovery + 400);
|
||||
assert.ok(
|
||||
afterRecovery.includes("if (!s.currentUnit) return"),
|
||||
"null guard for s.currentUnit must exist after idle recoverTimedOutUnit",
|
||||
);
|
||||
});
|
||||
|
||||
test("null guard is between recovery and writeUnitRuntimeRecord", () => {
|
||||
const idleRecovery = TIMERS_SRC.indexOf(
|
||||
'recoverTimedOutUnit(ctx, pi, unitType, unitId, "idle"',
|
||||
);
|
||||
const afterRecovery = TIMERS_SRC.slice(idleRecovery);
|
||||
|
||||
const recoveredReturn = afterRecovery.indexOf('if (recovery === "recovered") return');
|
||||
const nullGuard = afterRecovery.indexOf("if (!s.currentUnit) return");
|
||||
const writeRecord = afterRecovery.indexOf("writeUnitRuntimeRecord(s.basePath");
|
||||
|
||||
assert.ok(recoveredReturn > -1, "recovered return must exist");
|
||||
assert.ok(nullGuard > -1, "null guard must exist");
|
||||
assert.ok(writeRecord > -1, "writeUnitRuntimeRecord must exist after recovery");
|
||||
assert.ok(
|
||||
recoveredReturn < nullGuard && nullGuard < writeRecord,
|
||||
"order must be: recovered-return → null-guard → writeUnitRuntimeRecord",
|
||||
);
|
||||
});
|
||||
});
|
||||
Loading…
Add table
Reference in a new issue