fix: skip verification retry on spawn infra errors (ETIMEDOUT, ENOENT) (#1340)

2026-03-19 14:39:13 +01:00 · 2026-03-19 14:39:13 +01:00 · e6ceb8dfe8
commit e6ceb8dfe8
parent d7bf3d4e72
4 changed files with 112 additions and 4 deletions
--- a/src/resources/extensions/gsd/auto-verification.ts
+++ b/src/resources/extensions/gsd/auto-verification.ts
@ -175,7 +175,23 @@ export async function runPostUnitVerification(
      s.verificationRetryCount.delete(s.currentUnit.id);
      s.pendingVerificationRetry = null;
      return "continue";
-    } else if (result.discoverySource === "package-json") {
+    }
+
+    // Check if all failures are infra errors (ETIMEDOUT, ENOENT, etc.).
+    // Infra errors are transient OS-level problems the agent cannot fix —
+    // retrying the entire task is wasteful and creates phantom failures.
+    const failedChecks = result.checks.filter(c => c.exitCode !== 0);
+    const allInfraErrors = failedChecks.length > 0 && failedChecks.every(c => c.infraError === true);
+    if (allInfraErrors) {
+      const infraNames = failedChecks.map(f => f.command).join(", ");
+      ctx.ui.notify(`Verification gate: infra error (${infraNames}) — skipping retry, not a code issue`, "warning");
+      process.stderr.write(`verification-gate: all ${failedChecks.length} failure(s) are infra errors — treating as transient, no retry\n`);
+      s.verificationRetryCount.delete(s.currentUnit.id);
+      s.pendingVerificationRetry = null;
+      return "continue";
+    }
+
+    if (result.discoverySource === "package-json") {
      // Auto-discovered checks from package.json may fail on pre-existing errors
      // that the current task didn't introduce. Don't trigger the retry loop —
      // log a warning and let the task proceed (#1186).
@ -189,7 +205,9 @@ export async function runPostUnitVerification(
      s.verificationRetryCount.delete(s.currentUnit.id);
      s.pendingVerificationRetry = null;
      return "continue";
-    } else if (autoFixEnabled && attempt + 1 <= maxRetries) {
+    }
+
+    if (autoFixEnabled && attempt + 1 <= maxRetries) {
      const nextAttempt = attempt + 1;
      s.verificationRetryCount.set(s.currentUnit.id, nextAttempt);
      s.pendingVerificationRetry = {
--- a/src/resources/extensions/gsd/tests/verification-gate.test.ts
+++ b/src/resources/extensions/gsd/tests/verification-gate.test.ts
@ -261,6 +261,71 @@ test("verification-gate: each check has durationMs", () => {
  }
 });

+// ─── Infra Error Tagging Tests ───────────────────────────────────────────────
+
+test("verification-gate: spawnSync ETIMEDOUT → infraError: true on the check", () => {
+  const tmp = makeTempDir("vg-etimedout");
+  try {
+    // Use a short timeout against a long sleep to guarantee ETIMEDOUT
+    const result = runVerificationGate({
+      basePath: tmp,
+      unitId: "T01",
+      cwd: tmp,
+      preferenceCommands: ["sleep 60"],
+      commandTimeoutMs: 200,
+    });
+    assert.equal(result.passed, false);
+    assert.equal(result.checks.length, 1);
+    assert.ok(result.checks[0].exitCode !== 0, "should have non-zero exit code");
+    assert.equal(result.checks[0].infraError, true, "ETIMEDOUT should be tagged as infraError");
+  } finally {
+    rmSync(tmp, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
+  }
+});
+
+test("verification-gate: real command failure does NOT have infraError", () => {
+  const tmp = makeTempDir("vg-real-fail");
+  try {
+    const result = runVerificationGate({
+      basePath: tmp,
+      unitId: "T01",
+      cwd: tmp,
+      // Cross-platform: node with --eval flag and no shell-sensitive characters
+      preferenceCommands: ["node --eval \"process.exitCode=1\""],
+    });
+    assert.equal(result.passed, false);
+    assert.equal(result.checks.length, 1);
+    assert.equal(result.checks[0].exitCode, 1);
+    assert.equal(result.checks[0].infraError, undefined, "real failure should not be tagged as infraError");
+  } finally {
+    rmSync(tmp, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
+  }
+});
+
+test("verification-gate: mixed infra + real failure — only infra check is tagged", () => {
+  const tmp = makeTempDir("vg-mixed-infra");
+  try {
+    // Use a timeout that kills "sleep 60" but lets "node --eval" complete (~80ms).
+    // The gate applies the same timeout to each command sequentially.
+    const result = runVerificationGate({
+      basePath: tmp,
+      unitId: "T01",
+      cwd: tmp,
+      preferenceCommands: ["sleep 60", "node --eval \"process.exitCode=2\""],
+      commandTimeoutMs: 500,
+    });
+    assert.equal(result.passed, false);
+    assert.equal(result.checks.length, 2);
+    // First check: ETIMEDOUT → infraError
+    assert.equal(result.checks[0].infraError, true, "timed-out command should be infraError");
+    // Second check: real exit 2 → no infraError
+    assert.equal(result.checks[1].exitCode, 2);
+    assert.equal(result.checks[1].infraError, undefined, "real failure should not be infraError");
+  } finally {
+    rmSync(tmp, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
+  }
+});
+
 // ─── Preference Validation Tests ─────────────────────────────────────────────

 test("verification-gate: validatePreferences accepts valid verification keys", () => {
--- a/src/resources/extensions/gsd/types.ts
+++ b/src/resources/extensions/gsd/types.ts
@ -56,6 +56,10 @@ export interface VerificationCheck {
  stderr: string;
  durationMs: number;
  blocking: boolean;     // true for preference/task-plan sources, false for package-json (advisory only)
+  /** True when the failure was a spawn/infra error (ETIMEDOUT, ENOENT, ENOMEM)
+   *  rather than the command itself failing. Infra errors are transient and
+   *  should not trigger auto-fix retries — the agent cannot fix the OS. */
+  infraError?: boolean;
 }

 /** A runtime error captured from bg-shell processes or browser console */
--- a/src/resources/extensions/gsd/verification-gate.ts
+++ b/src/resources/extensions/gsd/verification-gate.ts
@ -232,13 +232,20 @@ export interface RunVerificationGateOptions {
  commandTimeoutMs?: number;
 }

+/** Error codes from spawnSync that indicate infrastructure/OS-level failures
+ *  rather than the command itself failing. These are transient — the agent
+ *  cannot fix them, so they should not trigger auto-fix retries. */
+const INFRA_ERROR_CODES = new Set(["ETIMEDOUT", "ENOENT", "ENOMEM", "EMFILE", "ENFILE", "EAGAIN"]);
+
 /**
 * Run the verification gate: discover commands, execute each via spawnSync,
 * and return a structured result.
 *
 * - All commands run sequentially regardless of individual pass/fail.
- * - `passed` is true when every command exits 0 (or no commands are discovered).
+ * - `passed` is true when every blocking command exits 0 (or no commands are discovered).
 * - stdout/stderr per command are truncated to 10 KB.
+ * - Spawn/infra errors (ETIMEDOUT, ENOENT, etc.) are tagged with `infraError: true`
+ *   so the retry logic can distinguish "the OS couldn't run this" from "the tests failed".
 */
 export function runVerificationGate(options: RunVerificationGateOptions): VerificationResult {
  const timestamp = Date.now();
@ -279,12 +286,26 @@ export function runVerificationGate(options: RunVerificationGateOptions): Verifi
    let stderr: string;

    if (result.error) {
-      // Command not found or spawn failure
+      // Spawn infrastructure failure — OS-level, not a test failure.
+      // Tag with infraError so the retry logic can skip auto-fix attempts.
+      const errCode = (result.error as NodeJS.ErrnoException).code;
+      const isInfra = !!errCode && INFRA_ERROR_CODES.has(errCode);
      exitCode = 127;
      stderr = truncate(
        (result.stderr || "") + "\n" + (result.error as Error).message,
        MAX_OUTPUT_BYTES,
      );
+
+      checks.push({
+        command,
+        exitCode,
+        stdout: truncate(result.stdout, MAX_OUTPUT_BYTES),
+        stderr,
+        durationMs,
+        blocking,
+        ...(isInfra ? { infraError: true } : {}),
+      });
+      continue;
    } else {
      // status is null when killed by signal — treat as failure
      exitCode = result.status ?? 1;