fix: stop auto-mode immediately on infrastructure errors (ENOSPC, ENOMEM, etc.) (#1780)

The blanket catch in auto/loop.ts treated all errors as transient and retried
up to 3 times, burning ~$20 per retry on guaranteed failures like disk-full.
Infrastructure errors (ENOSPC, ENOMEM, EROFS, EDQUOT, EMFILE, ENFILE) are now
detected before the retry logic and trigger an immediate stop with a clear
error message. Also adds a pre-dispatch disk space check to the health gate
so low-disk conditions are caught before dispatching a unit.

Closes #1694

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
TÂCHES 2026-03-21 09:46:22 -06:00 committed by GitHub
parent 33caef89d0
commit 27916344df
5 changed files with 183 additions and 1 deletions

View file

@ -8,6 +8,7 @@
*/
export { autoLoop } from "./auto/loop.js";
export { isInfrastructureError, INFRA_ERROR_CODES } from "./auto/infra-errors.js";
export { resolveAgentEnd, resolveAgentEndCancelled, isSessionSwitchInFlight, _resetPendingResolve, _setActiveSession } from "./auto/resolve.js";
export { detectStuck } from "./auto/detect-stuck.js";
export { runUnit } from "./auto/run-unit.js";

View file

@ -0,0 +1,41 @@
/**
* auto/infra-errors.ts — Infrastructure error detection.
*
* Leaf module with zero transitive dependencies. Used by the auto-loop catch
* block to distinguish unrecoverable OS/filesystem errors from transient
* failures that merit retry.
*/
/**
 * Error codes indicating infrastructure failures that cannot be recovered by
 * retrying. Each retry re-dispatches the unit at full LLM cost, so we bail
 * immediately rather than burning budget on guaranteed failures.
 */
export const INFRA_ERROR_CODES: ReadonlySet<string> = new Set([
  "ENOSPC", // disk full
  "ENOMEM", // out of memory
  "EROFS", // read-only file system
  "EDQUOT", // disk quota exceeded
  "EMFILE", // too many open files (process)
  "ENFILE", // too many open files (system)
]);

/** Matches a single ASCII letter or digit; used to find token boundaries. */
const ALPHANUMERIC = /[A-Za-z0-9]/;

/**
 * Report whether `token` occurs in `text` as a standalone token, i.e. not
 * directly preceded or followed by an ASCII letter or digit.
 *
 * A plain substring test is not enough here: "TOKENFILE" contains "ENFILE"
 * and "SYSTEMFILE" contains "EMFILE", and neither is an infrastructure error.
 */
function containsStandaloneToken(text: string, token: string): boolean {
  for (let at = text.indexOf(token); at !== -1; at = text.indexOf(token, at + 1)) {
    // charAt() yields "" for out-of-range indexes, which never tests as
    // alphanumeric, so start/end of string count as boundaries.
    const before = text.charAt(at - 1);
    const after = text.charAt(at + token.length);
    if (!ALPHANUMERIC.test(before) && !ALPHANUMERIC.test(after)) return true;
  }
  return false;
}

/**
 * Extract the text to scan for error-code tokens from an arbitrary thrown
 * value. Never throws.
 *
 * - `Error` instances contribute their `message`.
 * - Other objects contribute a string `message` property (if any) plus their
 *   string conversion.
 * - Primitives contribute their string conversion.
 */
function collectErrorText(err: unknown): string {
  if (err instanceof Error) return err.message;

  const parts: string[] = [];
  if (err !== null && typeof err === "object") {
    // Error-like plain objects (e.g. deserialized errors) stringify to
    // "[object Object]", so read their message explicitly.
    const message = (err as Record<string, unknown>).message;
    if (typeof message === "string") parts.push(message);
  }
  try {
    parts.push(String(err));
  } catch {
    // Values without a primitive conversion (e.g. Object.create(null)) make
    // String() throw; there is simply nothing more to scan in that case.
  }
  return parts.join("\n");
}

/**
 * Detect whether an error is an unrecoverable infrastructure failure.
 *
 * Checks the `code` property first (Node system errors) and falls back to
 * scanning the error text for a known error code appearing as a standalone
 * token (not embedded inside a longer alphanumeric word).
 *
 * @param err - Any thrown value: an `Error`, an error-like object, or a primitive.
 * @returns The matched code string, or null if the error is not an
 * infrastructure failure.
 */
export function isInfrastructureError(err: unknown): string | null {
  if (err !== null && typeof err === "object") {
    const code = (err as Record<string, unknown>).code;
    if (typeof code === "string" && INFRA_ERROR_CODES.has(code)) return code;
  }

  const text = collectErrorText(err);
  for (const code of INFRA_ERROR_CODES) {
    if (containsStandaloneToken(text, code)) return code;
  }
  return null;
}

View file

@ -26,6 +26,7 @@ import {
runFinalize,
} from "./phases.js";
import { debugLog } from "../debug-logger.js";
import { isInfrastructureError } from "./infra-errors.js";
/**
* Main auto-mode execution loop. Iterates: derive dispatch guards
@ -155,8 +156,32 @@ export async function autoLoop(
debugLog("autoLoop", { phase: "iteration-complete", iteration });
} catch (loopErr) {
// ── Blanket catch: absorb unexpected exceptions, apply graduated recovery ──
consecutiveErrors++;
const msg = loopErr instanceof Error ? loopErr.message : String(loopErr);
// ── Infrastructure errors: immediate stop, no retry ──
// These are unrecoverable (disk full, OOM, etc.). Retrying just burns
// LLM budget on guaranteed failures.
const infraCode = isInfrastructureError(loopErr);
if (infraCode) {
debugLog("autoLoop", {
phase: "infrastructure-error",
iteration,
code: infraCode,
error: msg,
});
ctx.ui.notify(
`Auto-mode stopped: infrastructure error ${infraCode}${msg}`,
"error",
);
await deps.stopAuto(
ctx,
pi,
`Infrastructure error (${infraCode}): not recoverable by retry`,
);
break;
}
consecutiveErrors++;
debugLog("autoLoop", {
phase: "iteration-error",
iteration,

View file

@ -24,6 +24,7 @@ import { deriveState } from "./state.js";
import { resolveMilestoneIntegrationBranch } from "./git-service.js";
import { nativeIsRepo } from "./native-git-bridge.js";
import { loadEffectiveGSDPreferences } from "./preferences.js";
import { runEnvironmentChecks } from "./doctor-environment.js";
// ── Health Score Tracking ──────────────────────────────────────────────────
@ -294,6 +295,19 @@ export async function preDispatchHealthGate(basePath: string): Promise<PreDispat
// Non-fatal — dispatch continues if state/branch check fails
}
// ── Disk space check ──
// Catches low-disk conditions before dispatch rather than letting the unit
// fail mid-execution with ENOSPC (which wastes a full LLM turn).
try {
const envResults = runEnvironmentChecks(basePath);
const diskError = envResults.find(r => r.name === "disk_space" && r.status === "error");
if (diskError) {
issues.push(`${diskError.message}${diskError.detail ? `${diskError.detail}` : ""}`);
}
} catch {
// Non-fatal — dispatch continues if env check fails
}
// If we had critical issues that couldn't be auto-healed, block dispatch
if (issues.length > 0) {
return {

View file

@ -0,0 +1,101 @@
import test from "node:test";
import assert from "node:assert/strict";
// Import directly from the leaf module — no transitive dependencies.
import { isInfrastructureError, INFRA_ERROR_CODES } from "../auto/infra-errors.js";
// ── INFRA_ERROR_CODES constant ───────────────────────────────────────────────
test("INFRA_ERROR_CODES contains the expected codes", () => {
  const required = ["ENOSPC", "ENOMEM", "EROFS", "EDQUOT", "EMFILE", "ENFILE"];
  required.forEach((code) => {
    assert.ok(INFRA_ERROR_CODES.has(code), `missing ${code}`);
  });
  assert.equal(INFRA_ERROR_CODES.size, 6, "unexpected extra codes");
});

/** Build an Error that carries a Node-style `code` property. */
const errorWithCode = (message: string, code: string): Error =>
  Object.assign(new Error(message), { code });

// ── isInfrastructureError: code property detection ───────────────────────────
// Each row is [code, message]; the message deliberately does not always
// contain the code, so detection must come from the `code` property.
const codePropertyCases: ReadonlyArray<readonly [string, string]> = [
  ["ENOSPC", "write ENOSPC"],
  ["ENOMEM", "Cannot allocate memory"],
  ["EROFS", "read-only filesystem"],
  ["EDQUOT", "quota exceeded"],
  ["EMFILE", "too many open files"],
  ["ENFILE", "file table overflow"],
];

for (const [code, message] of codePropertyCases) {
  test(`detects ${code} via code property`, () => {
    assert.equal(isInfrastructureError(errorWithCode(message, code)), code);
  });
}

/** One detection scenario: the input is built lazily inside the test body. */
interface DetectionCase {
  readonly title: string;
  readonly input: () => unknown;
  readonly expected: string | null;
}

const detectionCases: ReadonlyArray<DetectionCase> = [
  // ── isInfrastructureError: message fallback ────────────────────────────────
  {
    title: "falls back to message scanning when no code property",
    input: () => new Error("pip install failed: ENOSPC: no space left on device"),
    expected: "ENOSPC",
  },
  {
    title: "detects code in stringified non-Error value",
    input: () => "ENOMEM: cannot allocate memory",
    expected: "ENOMEM",
  },
  {
    title: "detects EDQUOT in nested error message",
    input: () => new Error("write failed: EDQUOT disk quota exceeded on /dev/sda1"),
    expected: "EDQUOT",
  },
  // ── isInfrastructureError: negative cases ──────────────────────────────────
  {
    title: "returns null for transient network errors",
    input: () => new Error("ETIMEDOUT: connection timed out"),
    expected: null,
  },
  {
    title: "returns null for generic errors",
    input: () => new Error("Something went wrong"),
    expected: null,
  },
  {
    title: "returns null for null input",
    input: () => null,
    expected: null,
  },
  {
    title: "returns null for undefined input",
    input: () => undefined,
    expected: null,
  },
  {
    title: "returns null for non-infra code property",
    input: () => errorWithCode("connection reset", "ECONNRESET"),
    expected: null,
  },
  // ── isInfrastructureError: edge cases ──────────────────────────────────────
  {
    // code is ECONNRESET (not infra) but message contains ENOSPC
    title: "message fallback still fires even if code property is non-infra",
    input: () => errorWithCode("something ENOSPC happened", "ECONNRESET"),
    expected: "ENOSPC",
  },
  {
    title: "plain object with code property works",
    input: () => ({ code: "ENOSPC", message: "disk full" }),
    expected: "ENOSPC",
  },
  {
    title: "numeric error input returns null",
    input: () => 42,
    expected: null,
  },
];

for (const { title, input, expected } of detectionCases) {
  test(title, () => {
    assert.equal(isInfrastructureError(input()), expected);
  });
}