From 99032444eb94efeed89e22107e67ccf7d2743d7b Mon Sep 17 00:00:00 2001 From: Tom Boucher Date: Sat, 21 Mar 2026 17:24:37 -0400 Subject: [PATCH] fix: populate RecoveryContext in hook unit supervision to prevent crash on stalled tool recovery (#1867) The buildRecoveryContext callback in auto/phases.ts returned an empty object instead of a valid RecoveryContext. When the idle watchdog detected a stalled tool and called recoverTimedOutUnit, basePath was undefined, causing join(undefined, ".gsd") to throw "The path argument must be of type string. Received undefined". The error left the session permanently hung because the unit promise was never resolved. Fixes #1855 Co-authored-by: Claude Opus 4.6 (1M context) --- src/resources/extensions/gsd/auto/phases.ts | 7 +- .../gsd/tests/stalled-tool-recovery.test.ts | 102 ++++++++++++++++++ 2 files changed, 108 insertions(+), 1 deletion(-) create mode 100644 src/resources/extensions/gsd/tests/stalled-tool-recovery.test.ts diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index bda534c0b..b82f7e560 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -991,7 +991,12 @@ export async function runUnitPhase( unitId, prefs, buildSnapshotOpts: () => deps.buildSnapshotOpts(unitType, unitId), - buildRecoveryContext: () => ({}), + buildRecoveryContext: () => ({ + basePath: s.basePath, + verbose: s.verbose, + currentUnitStartedAt: s.currentUnit?.startedAt ?? Date.now(), + unitRecoveryCount: s.unitRecoveryCount, + }), pauseAuto: deps.pauseAuto, }); diff --git a/src/resources/extensions/gsd/tests/stalled-tool-recovery.test.ts b/src/resources/extensions/gsd/tests/stalled-tool-recovery.test.ts new file mode 100644 index 000000000..7d46c1128 --- /dev/null +++ b/src/resources/extensions/gsd/tests/stalled-tool-recovery.test.ts @@ -0,0 +1,102 @@ +/** + * Regression test for #1855: Stalled tool detection crashes with + * "The path argument must be of type string. Received undefined" + * + * When a tool stalls in-flight for 10+ minutes, the idle watchdog fires + * recoverTimedOutUnit(). In auto/phases.ts, buildRecoveryContext was + * returning an empty object `{}`, so basePath was undefined. The recovery + * code passed undefined to readUnitRuntimeRecord → runtimePath → join(), + * which throws a TypeError. The session is permanently frozen because the + * error propagates into the idle watchdog catch handler but the unit + * promise is never resolved. + * + * This test calls recoverTimedOutUnit with an empty RecoveryContext (the + * bug) and verifies it crashes, then calls it with a valid RecoveryContext + * (the fix) and verifies it does not crash. + */ + +import { mkdtempSync, mkdirSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { recoverTimedOutUnit, type RecoveryContext } from "../auto-timeout-recovery.ts"; +import { createTestContext } from './test-helpers.ts'; + +const { assertTrue, report } = createTestContext(); + +// Minimal mock for ExtensionContext — only the fields recoverTimedOutUnit touches. +function makeMockCtx() { + return { + ui: { + notify: () => {}, + }, + } as any; +} + +// Minimal mock for ExtensionAPI — only sendMessage is called during recovery. +function makeMockPi() { + return { + sendMessage: () => {}, + } as any; +} + +// ═══ #1855: empty RecoveryContext (basePath undefined) crashes ════════════════ + +{ + console.log("\n=== #1855: recoverTimedOutUnit crashes when basePath is undefined ==="); + const ctx = makeMockCtx(); + const pi = makeMockPi(); + + // Simulate the bug: buildRecoveryContext returns {} (empty object). + // basePath is undefined, which causes join(undefined, ".gsd") to throw. + const emptyRctx = {} as RecoveryContext; + + let crashed = false; + try { + await recoverTimedOutUnit(ctx, pi, "execute-task", "M001/S01/T01", "idle", emptyRctx); + } catch (err: any) { + crashed = true; + assertTrue( + err.message.includes("path") || err.message.includes("string") || err.code === "ERR_INVALID_ARG_TYPE", + `should crash with path/type error, got: ${err.message}`, + ); + } + assertTrue(crashed, "should crash when basePath is undefined (reproduces #1855)"); +} + +// ═══ #1855: valid RecoveryContext does not crash ═════════════════════════════ + +{ + console.log("\n=== #1855: recoverTimedOutUnit succeeds with valid RecoveryContext ==="); + const base = mkdtempSync(join(tmpdir(), "gsd-stalled-tool-test-")); + mkdirSync(join(base, ".gsd", "milestones", "M001", "slices", "S01", "tasks"), { recursive: true }); + mkdirSync(join(base, ".gsd", "runtime", "units"), { recursive: true }); + + try { + const ctx = makeMockCtx(); + const pi = makeMockPi(); + + const validRctx: RecoveryContext = { + basePath: base, + verbose: false, + currentUnitStartedAt: Date.now(), + unitRecoveryCount: new Map(), + }; + + let crashed = false; + let result: string | undefined; + try { + result = await recoverTimedOutUnit(ctx, pi, "execute-task", "M001/S01/T01", "idle", validRctx); + } catch (err: any) { + crashed = true; + console.error(` Unexpected crash: ${err.message}`); + } + assertTrue(!crashed, "should not crash with valid basePath"); + // With no runtime record on disk and recoveryAttempts=0, the function + // should attempt steering recovery (sendMessage) and return "recovered". + assertTrue(result === "recovered", `should return 'recovered', got '${result}'`); + } finally { + rmSync(base, { recursive: true, force: true }); + } +} + +report();