fix: stop auto-mode immediately on infrastructure errors (ENOSPC, ENOMEM, etc.) (#1780)
The blanket catch in auto/loop.ts treated all errors as transient and retried up to 3 times, burning ~$20 per retry on guaranteed failures such as a full disk. Infrastructure errors (ENOSPC, ENOMEM, EROFS, EDQUOT, EMFILE, ENFILE) are now detected before the retry logic and trigger an immediate stop with a clear error message. This commit also adds a pre-dispatch disk space check to the health gate so that low-disk conditions are caught before a unit is dispatched.
Closes #1694
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
33caef89d0
commit
27916344df
5 changed files with 183 additions and 1 deletions
|
|
@ -8,6 +8,7 @@
|
|||
*/
|
||||
|
||||
export { autoLoop } from "./auto/loop.js";
|
||||
export { isInfrastructureError, INFRA_ERROR_CODES } from "./auto/infra-errors.js";
|
||||
export { resolveAgentEnd, resolveAgentEndCancelled, isSessionSwitchInFlight, _resetPendingResolve, _setActiveSession } from "./auto/resolve.js";
|
||||
export { detectStuck } from "./auto/detect-stuck.js";
|
||||
export { runUnit } from "./auto/run-unit.js";
|
||||
|
|
|
|||
41
src/resources/extensions/gsd/auto/infra-errors.ts
Normal file
41
src/resources/extensions/gsd/auto/infra-errors.ts
Normal file
|
|
@ -0,0 +1,41 @@
|
|||
/**
|
||||
* auto/infra-errors.ts — Infrastructure error detection.
|
||||
*
|
||||
* Leaf module with zero transitive dependencies. Used by the auto-loop catch
|
||||
* block to distinguish unrecoverable OS/filesystem errors from transient
|
||||
* failures that merit retry.
|
||||
*/
|
||||
|
||||
/**
 * Error codes indicating infrastructure failures that cannot be recovered by
 * retrying. Each retry re-dispatches the unit at full LLM cost, so we bail
 * immediately rather than burning budget on guaranteed failures.
 */
export const INFRA_ERROR_CODES: ReadonlySet<string> = new Set([
  "ENOSPC", // disk full
  "ENOMEM", // out of memory
  "EROFS", // read-only file system
  "EDQUOT", // disk quota exceeded
  "EMFILE", // too many open files (process)
  "ENFILE", // too many open files (system)
]);

/**
 * Upper bound on how many `cause` links are followed. Keeps the walk cheap
 * and guarantees termination on self-referential cause chains.
 */
const MAX_CAUSE_DEPTH = 8;

/**
 * Stringify an arbitrary thrown value without ever throwing.
 * `String(value)` raises a TypeError for objects that cannot be converted to
 * a primitive (e.g. `Object.create(null)`); such values yield "" instead.
 */
function describeThrownValue(value: unknown): string {
  try {
    return String(value);
  } catch {
    return "";
  }
}

/**
 * Inspect a single value (no `cause` traversal) for an infrastructure code.
 * Order of checks: exact `code` property first, then the text of the value.
 */
function matchInfraCode(value: unknown): string | null {
  const texts: string[] = [];

  if (value && typeof value === "object") {
    const fields = value as Record<string, unknown>;

    const code = fields.code;
    if (typeof code === "string" && INFRA_ERROR_CODES.has(code)) return code;

    // Error-like plain objects carry their text in `message` too, not only
    // real Error instances.
    if (typeof fields.message === "string") texts.push(fields.message);

    // For real Errors only the message is scanned; other objects are also
    // scanned via their string form.
    if (!(value instanceof Error)) texts.push(describeThrownValue(value));
  } else {
    texts.push(describeThrownValue(value));
  }

  for (const text of texts) {
    for (const code of INFRA_ERROR_CODES) {
      if (text.includes(code)) return code;
    }
  }
  return null;
}

/**
 * Detect whether an error is an unrecoverable infrastructure failure.
 *
 * For the error itself, and then for each error reachable through its
 * `cause` chain (at most MAX_CAUSE_DEPTH levels), this checks the `code`
 * property (Node system errors) and falls back to scanning the message
 * string for known error code tokens. Never throws, regardless of what
 * value was thrown.
 *
 * @param err - Whatever value was caught; may be an Error, a plain object,
 *   a string, or any other thrown value.
 * @returns The matched code string, or null if the error is not an
 *   infrastructure failure.
 */
export function isInfrastructureError(err: unknown): string | null {
  let current: unknown = err;

  for (let depth = 0; depth < MAX_CAUSE_DEPTH; depth++) {
    const matched = matchInfraCode(current);
    if (matched) return matched;

    // Only objects can carry a `cause`; stop at primitives and at the end
    // of the chain.
    if (!current || typeof current !== "object") break;
    const next = (current as Record<string, unknown>).cause;
    if (next === undefined || next === null) break;
    current = next;
  }

  return null;
}
|
||||
|
|
@ -26,6 +26,7 @@ import {
|
|||
runFinalize,
|
||||
} from "./phases.js";
|
||||
import { debugLog } from "../debug-logger.js";
|
||||
import { isInfrastructureError } from "./infra-errors.js";
|
||||
|
||||
/**
|
||||
* Main auto-mode execution loop. Iterates: derive → dispatch → guards →
|
||||
|
|
@ -155,8 +156,32 @@ export async function autoLoop(
|
|||
debugLog("autoLoop", { phase: "iteration-complete", iteration });
|
||||
} catch (loopErr) {
|
||||
// ── Blanket catch: absorb unexpected exceptions, apply graduated recovery ──
|
||||
consecutiveErrors++;
|
||||
const msg = loopErr instanceof Error ? loopErr.message : String(loopErr);
|
||||
|
||||
// ── Infrastructure errors: immediate stop, no retry ──
|
||||
// These are unrecoverable (disk full, OOM, etc.). Retrying just burns
|
||||
// LLM budget on guaranteed failures.
|
||||
const infraCode = isInfrastructureError(loopErr);
|
||||
if (infraCode) {
|
||||
debugLog("autoLoop", {
|
||||
phase: "infrastructure-error",
|
||||
iteration,
|
||||
code: infraCode,
|
||||
error: msg,
|
||||
});
|
||||
ctx.ui.notify(
|
||||
`Auto-mode stopped: infrastructure error ${infraCode} — ${msg}`,
|
||||
"error",
|
||||
);
|
||||
await deps.stopAuto(
|
||||
ctx,
|
||||
pi,
|
||||
`Infrastructure error (${infraCode}): not recoverable by retry`,
|
||||
);
|
||||
break;
|
||||
}
|
||||
|
||||
consecutiveErrors++;
|
||||
debugLog("autoLoop", {
|
||||
phase: "iteration-error",
|
||||
iteration,
|
||||
|
|
|
|||
|
|
@ -24,6 +24,7 @@ import { deriveState } from "./state.js";
|
|||
import { resolveMilestoneIntegrationBranch } from "./git-service.js";
|
||||
import { nativeIsRepo } from "./native-git-bridge.js";
|
||||
import { loadEffectiveGSDPreferences } from "./preferences.js";
|
||||
import { runEnvironmentChecks } from "./doctor-environment.js";
|
||||
|
||||
// ── Health Score Tracking ──────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -294,6 +295,19 @@ export async function preDispatchHealthGate(basePath: string): Promise<PreDispat
|
|||
// Non-fatal — dispatch continues if state/branch check fails
|
||||
}
|
||||
|
||||
// ── Disk space check ──
|
||||
// Catches low-disk conditions before dispatch rather than letting the unit
|
||||
// fail mid-execution with ENOSPC (which wastes a full LLM turn).
|
||||
try {
|
||||
const envResults = runEnvironmentChecks(basePath);
|
||||
const diskError = envResults.find(r => r.name === "disk_space" && r.status === "error");
|
||||
if (diskError) {
|
||||
issues.push(`${diskError.message}${diskError.detail ? ` — ${diskError.detail}` : ""}`);
|
||||
}
|
||||
} catch {
|
||||
// Non-fatal — dispatch continues if env check fails
|
||||
}
|
||||
|
||||
// If we had critical issues that couldn't be auto-healed, block dispatch
|
||||
if (issues.length > 0) {
|
||||
return {
|
||||
|
|
|
|||
101
src/resources/extensions/gsd/tests/infra-error.test.ts
Normal file
101
src/resources/extensions/gsd/tests/infra-error.test.ts
Normal file
|
|
@ -0,0 +1,101 @@
|
|||
import test from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
|
||||
// Import directly from the leaf module — no transitive dependencies.
|
||||
import { isInfrastructureError, INFRA_ERROR_CODES } from "../auto/infra-errors.js";
|
||||
|
||||
// ── INFRA_ERROR_CODES constant ───────────────────────────────────────────────

test("INFRA_ERROR_CODES contains the expected codes", () => {
  const expectedCodes = ["ENOSPC", "ENOMEM", "EROFS", "EDQUOT", "EMFILE", "ENFILE"];
  expectedCodes.forEach((code) => {
    assert.ok(INFRA_ERROR_CODES.has(code), `missing ${code}`);
  });
  assert.equal(INFRA_ERROR_CODES.size, 6, "unexpected extra codes");
});

// ── isInfrastructureError: code property detection ───────────────────────────

// One test per code: the Error carries the code in its `code` property and a
// message chosen per case (only the first message repeats the code).
const codePropertyCases: ReadonlyArray<readonly [string, string]> = [
  ["ENOSPC", "write ENOSPC"],
  ["ENOMEM", "Cannot allocate memory"],
  ["EROFS", "read-only filesystem"],
  ["EDQUOT", "quota exceeded"],
  ["EMFILE", "too many open files"],
  ["ENFILE", "file table overflow"],
];

for (const [code, message] of codePropertyCases) {
  test(`detects ${code} via code property`, () => {
    const err = Object.assign(new Error(message), { code });
    assert.equal(isInfrastructureError(err), code);
  });
}

// ── isInfrastructureError: fallback, negative and edge cases ─────────────────

/** A single detection scenario; the input is built lazily inside the test. */
interface DetectionCase {
  readonly name: string;
  readonly makeInput: () => unknown;
  readonly expected: string | null;
}

const detectionCases: ReadonlyArray<DetectionCase> = [
  // message fallback
  {
    name: "falls back to message scanning when no code property",
    makeInput: () => new Error("pip install failed: ENOSPC: no space left on device"),
    expected: "ENOSPC",
  },
  {
    name: "detects code in stringified non-Error value",
    makeInput: () => "ENOMEM: cannot allocate memory",
    expected: "ENOMEM",
  },
  {
    name: "detects EDQUOT in nested error message",
    makeInput: () => new Error("write failed: EDQUOT disk quota exceeded on /dev/sda1"),
    expected: "EDQUOT",
  },

  // negative cases
  {
    name: "returns null for transient network errors",
    makeInput: () => new Error("ETIMEDOUT: connection timed out"),
    expected: null,
  },
  {
    name: "returns null for generic errors",
    makeInput: () => new Error("Something went wrong"),
    expected: null,
  },
  {
    name: "returns null for null input",
    makeInput: () => null,
    expected: null,
  },
  {
    name: "returns null for undefined input",
    makeInput: () => undefined,
    expected: null,
  },
  {
    name: "returns null for non-infra code property",
    makeInput: () => Object.assign(new Error("connection reset"), { code: "ECONNRESET" }),
    expected: null,
  },

  // edge cases
  {
    // code is ECONNRESET (not infra) but message contains ENOSPC
    name: "message fallback still fires even if code property is non-infra",
    makeInput: () => Object.assign(new Error("something ENOSPC happened"), { code: "ECONNRESET" }),
    expected: "ENOSPC",
  },
  {
    name: "plain object with code property works",
    makeInput: () => ({ code: "ENOSPC", message: "disk full" }),
    expected: "ENOSPC",
  },
  {
    name: "numeric error input returns null",
    makeInput: () => 42,
    expected: null,
  },
];

for (const { name, makeInput, expected } of detectionCases) {
  test(name, () => {
    assert.equal(isInfrastructureError(makeInput()), expected);
  });
}
|
||||
Loading…
Add table
Reference in a new issue