From 27916344dfea58778e80596904ab73369cc4d20b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=82CHES?= Date: Sat, 21 Mar 2026 09:46:22 -0600 Subject: [PATCH] fix: stop auto-mode immediately on infrastructure errors (ENOSPC, ENOMEM, etc.) (#1780) The blanket catch in auto/loop.ts treated all errors as transient and retried up to 3 times, burning ~$20 per retry on guaranteed failures like disk-full. Infrastructure errors (ENOSPC, ENOMEM, EROFS, EDQUOT, EMFILE, ENFILE) are now detected before the retry logic and trigger an immediate stop with a clear error message. Also adds a pre-dispatch disk space check to the health gate so low-disk conditions are caught before dispatching a unit. Closes #1694 Co-authored-by: Claude Opus 4.6 (1M context) --- src/resources/extensions/gsd/auto-loop.ts | 1 + .../extensions/gsd/auto/infra-errors.ts | 41 +++++++ src/resources/extensions/gsd/auto/loop.ts | 27 ++++- .../extensions/gsd/doctor-proactive.ts | 14 +++ .../extensions/gsd/tests/infra-error.test.ts | 101 ++++++++++++++++++ 5 files changed, 183 insertions(+), 1 deletion(-) create mode 100644 src/resources/extensions/gsd/auto/infra-errors.ts create mode 100644 src/resources/extensions/gsd/tests/infra-error.test.ts diff --git a/src/resources/extensions/gsd/auto-loop.ts b/src/resources/extensions/gsd/auto-loop.ts index 43f221ef5..74fcc8f16 100644 --- a/src/resources/extensions/gsd/auto-loop.ts +++ b/src/resources/extensions/gsd/auto-loop.ts @@ -8,6 +8,7 @@ */ export { autoLoop } from "./auto/loop.js"; +export { isInfrastructureError, INFRA_ERROR_CODES } from "./auto/infra-errors.js"; export { resolveAgentEnd, resolveAgentEndCancelled, isSessionSwitchInFlight, _resetPendingResolve, _setActiveSession } from "./auto/resolve.js"; export { detectStuck } from "./auto/detect-stuck.js"; export { runUnit } from "./auto/run-unit.js"; diff --git a/src/resources/extensions/gsd/auto/infra-errors.ts b/src/resources/extensions/gsd/auto/infra-errors.ts new file mode 100644 index 000000000..92edf26fc 
--- /dev/null +++ b/src/resources/extensions/gsd/auto/infra-errors.ts @@ -0,0 +1,41 @@ +/** + * auto/infra-errors.ts — Infrastructure error detection. + * + * Leaf module with zero transitive dependencies. Used by the auto-loop catch + * block to distinguish unrecoverable OS/filesystem errors from transient + * failures that merit retry. + */ + +/** + * Error codes indicating infrastructure failures that cannot be recovered by + * retrying. Each retry re-dispatches the unit at full LLM cost, so we bail + * immediately rather than burning budget on guaranteed failures. + */ +export const INFRA_ERROR_CODES: ReadonlySet<string> = new Set([ + "ENOSPC", // disk full + "ENOMEM", // out of memory + "EROFS", // read-only file system + "EDQUOT", // disk quota exceeded + "EMFILE", // too many open files (process) + "ENFILE", // too many open files (system) +]); + +/** + * Detect whether an error is an unrecoverable infrastructure failure. + * Checks the `code` property (Node system errors) and falls back to + * scanning the message string for known error code tokens. + * + * Returns the matched code string, or null if the error is not an + * infrastructure failure. + */ +export function isInfrastructureError(err: unknown): string | null { + if (err && typeof err === "object") { + const code = (err as Record<string, unknown>).code; + if (typeof code === "string" && INFRA_ERROR_CODES.has(code)) return code; + } + const msg = err instanceof Error ? err.message : String(err); + for (const code of INFRA_ERROR_CODES) { + if (msg.includes(code)) return code; + } + return null; +} diff --git a/src/resources/extensions/gsd/auto/loop.ts b/src/resources/extensions/gsd/auto/loop.ts index 8436587fa..c2e545851 100644 --- a/src/resources/extensions/gsd/auto/loop.ts +++ b/src/resources/extensions/gsd/auto/loop.ts @@ -26,6 +26,7 @@ import { runFinalize, } from "./phases.js"; import { debugLog } from "../debug-logger.js"; +import { isInfrastructureError } from "./infra-errors.js"; /** * Main auto-mode execution loop. 
Iterates: derive → dispatch → guards → @@ -155,8 +156,32 @@ export async function autoLoop( debugLog("autoLoop", { phase: "iteration-complete", iteration }); } catch (loopErr) { // ── Blanket catch: absorb unexpected exceptions, apply graduated recovery ── - consecutiveErrors++; const msg = loopErr instanceof Error ? loopErr.message : String(loopErr); + + // ── Infrastructure errors: immediate stop, no retry ── + // These are unrecoverable (disk full, OOM, etc.). Retrying just burns + // LLM budget on guaranteed failures. + const infraCode = isInfrastructureError(loopErr); + if (infraCode) { + debugLog("autoLoop", { + phase: "infrastructure-error", + iteration, + code: infraCode, + error: msg, + }); + ctx.ui.notify( + `Auto-mode stopped: infrastructure error ${infraCode} — ${msg}`, + "error", + ); + await deps.stopAuto( + ctx, + pi, + `Infrastructure error (${infraCode}): not recoverable by retry`, + ); + break; + } + + consecutiveErrors++; debugLog("autoLoop", { phase: "iteration-error", iteration, diff --git a/src/resources/extensions/gsd/doctor-proactive.ts b/src/resources/extensions/gsd/doctor-proactive.ts index 83e8fe431..0eb3b016f 100644 --- a/src/resources/extensions/gsd/doctor-proactive.ts +++ b/src/resources/extensions/gsd/doctor-proactive.ts @@ -24,6 +24,7 @@ import { deriveState } from "./state.js"; import { resolveMilestoneIntegrationBranch } from "./git-service.js"; import { nativeIsRepo } from "./native-git-bridge.js"; import { loadEffectiveGSDPreferences } from "./preferences.js"; +import { runEnvironmentChecks } from "./doctor-environment.js"; // ── Health Score Tracking ────────────────────────────────────────────────── @@ -294,6 +295,19 @@ export async function preDispatchHealthGate(basePath: string): Promise r.name === "disk_space" && r.status === "error"); + if (diskError) { + issues.push(`${diskError.message}${diskError.detail ? 
` — ${diskError.detail}` : ""}`); + } + } catch { + // Non-fatal — dispatch continues if env check fails + } + // If we had critical issues that couldn't be auto-healed, block dispatch if (issues.length > 0) { return { diff --git a/src/resources/extensions/gsd/tests/infra-error.test.ts b/src/resources/extensions/gsd/tests/infra-error.test.ts new file mode 100644 index 000000000..0eb379156 --- /dev/null +++ b/src/resources/extensions/gsd/tests/infra-error.test.ts @@ -0,0 +1,101 @@ +import test from "node:test"; +import assert from "node:assert/strict"; + +// Import directly from the leaf module — no transitive dependencies. +import { isInfrastructureError, INFRA_ERROR_CODES } from "../auto/infra-errors.js"; + +// ── INFRA_ERROR_CODES constant ─────────────────────────────────────────────── + +test("INFRA_ERROR_CODES contains the expected codes", () => { + for (const code of ["ENOSPC", "ENOMEM", "EROFS", "EDQUOT", "EMFILE", "ENFILE"]) { + assert.ok(INFRA_ERROR_CODES.has(code), `missing ${code}`); + } + assert.equal(INFRA_ERROR_CODES.size, 6, "unexpected extra codes"); +}); + +// ── isInfrastructureError: code property detection ─────────────────────────── + +test("detects ENOSPC via code property", () => { + const err = Object.assign(new Error("write ENOSPC"), { code: "ENOSPC" }); + assert.equal(isInfrastructureError(err), "ENOSPC"); +}); + +test("detects ENOMEM via code property", () => { + const err = Object.assign(new Error("Cannot allocate memory"), { code: "ENOMEM" }); + assert.equal(isInfrastructureError(err), "ENOMEM"); +}); + +test("detects EROFS via code property", () => { + const err = Object.assign(new Error("read-only filesystem"), { code: "EROFS" }); + assert.equal(isInfrastructureError(err), "EROFS"); +}); + +test("detects EDQUOT via code property", () => { + const err = Object.assign(new Error("quota exceeded"), { code: "EDQUOT" }); + assert.equal(isInfrastructureError(err), "EDQUOT"); +}); + +test("detects EMFILE via code property", () => { + const 
err = Object.assign(new Error("too many open files"), { code: "EMFILE" }); + assert.equal(isInfrastructureError(err), "EMFILE"); +}); + +test("detects ENFILE via code property", () => { + const err = Object.assign(new Error("file table overflow"), { code: "ENFILE" }); + assert.equal(isInfrastructureError(err), "ENFILE"); +}); + +// ── isInfrastructureError: message fallback ────────────────────────────────── + +test("falls back to message scanning when no code property", () => { + const err = new Error("pip install failed: ENOSPC: no space left on device"); + assert.equal(isInfrastructureError(err), "ENOSPC"); +}); + +test("detects code in stringified non-Error value", () => { + assert.equal(isInfrastructureError("ENOMEM: cannot allocate memory"), "ENOMEM"); +}); + +test("detects EDQUOT in nested error message", () => { + const err = new Error("write failed: EDQUOT disk quota exceeded on /dev/sda1"); + assert.equal(isInfrastructureError(err), "EDQUOT"); +}); + +// ── isInfrastructureError: negative cases ──────────────────────────────────── + +test("returns null for transient network errors", () => { + assert.equal(isInfrastructureError(new Error("ETIMEDOUT: connection timed out")), null); +}); + +test("returns null for generic errors", () => { + assert.equal(isInfrastructureError(new Error("Something went wrong")), null); +}); + +test("returns null for null input", () => { + assert.equal(isInfrastructureError(null), null); +}); + +test("returns null for undefined input", () => { + assert.equal(isInfrastructureError(undefined), null); +}); + +test("returns null for non-infra code property", () => { + const err = Object.assign(new Error("connection reset"), { code: "ECONNRESET" }); + assert.equal(isInfrastructureError(err), null); +}); + +// ── isInfrastructureError: edge cases ──────────────────────────────────────── + +test("message fallback still fires even if code property is non-infra", () => { + // code is ECONNRESET (not infra) but message contains ENOSPC + 
const err = Object.assign(new Error("something ENOSPC happened"), { code: "ECONNRESET" }); + assert.equal(isInfrastructureError(err), "ENOSPC"); +}); + +test("plain object with code property works", () => { + assert.equal(isInfrastructureError({ code: "ENOSPC", message: "disk full" }), "ENOSPC"); +}); + +test("numeric error input returns null", () => { + assert.equal(isInfrastructureError(42), null); +});