From e6ceb8dfe8fe363abdd4f01ede7ddc6e0967f334 Mon Sep 17 00:00:00 2001
From: deseltrus <101901449+deseltrus@users.noreply.github.com>
Date: Thu, 19 Mar 2026 14:39:13 +0100
Subject: [PATCH] fix: skip verification retry on spawn infra errors
 (ETIMEDOUT, ENOENT) (#1340)

---
 .../extensions/gsd/auto-verification.ts       | 22 ++++++-
 .../gsd/tests/verification-gate.test.ts       | 65 +++++++++++++++++++
 src/resources/extensions/gsd/types.ts         |  4 ++
 .../extensions/gsd/verification-gate.ts       | 25 ++++++-
 4 files changed, 112 insertions(+), 4 deletions(-)

diff --git a/src/resources/extensions/gsd/auto-verification.ts b/src/resources/extensions/gsd/auto-verification.ts
index e4dd26d85..6f794bc66 100644
--- a/src/resources/extensions/gsd/auto-verification.ts
+++ b/src/resources/extensions/gsd/auto-verification.ts
@@ -175,7 +175,23 @@ export async function runPostUnitVerification(
       s.verificationRetryCount.delete(s.currentUnit.id);
       s.pendingVerificationRetry = null;
       return "continue";
-    } else if (result.discoverySource === "package-json") {
+    }
+
+    // Check if all failures are infra errors (ETIMEDOUT, ENOENT, etc.).
+    // Infra errors are transient OS-level problems the agent cannot fix —
+    // retrying the entire task is wasteful and creates phantom failures.
+    const failedChecks = result.checks.filter(c => c.exitCode !== 0);
+    const allInfraErrors = failedChecks.length > 0 && failedChecks.every(c => c.infraError === true);
+    if (allInfraErrors) {
+      const infraNames = failedChecks.map(f => f.command).join(", ");
+      ctx.ui.notify(`Verification gate: infra error (${infraNames}) — skipping retry, not a code issue`, "warning");
+      process.stderr.write(`verification-gate: all ${failedChecks.length} failure(s) are infra errors — treating as transient, no retry\n`);
+      s.verificationRetryCount.delete(s.currentUnit.id);
+      s.pendingVerificationRetry = null;
+      return "continue";
+    }
+
+    if (result.discoverySource === "package-json") {
       // Auto-discovered checks from package.json may fail on pre-existing errors
       // that the current task didn't introduce. Don't trigger the retry loop —
       // log a warning and let the task proceed (#1186).
@@ -189,7 +205,9 @@ export async function runPostUnitVerification(
       s.verificationRetryCount.delete(s.currentUnit.id);
       s.pendingVerificationRetry = null;
       return "continue";
-    } else if (autoFixEnabled && attempt + 1 <= maxRetries) {
+    }
+
+    if (autoFixEnabled && attempt + 1 <= maxRetries) {
       const nextAttempt = attempt + 1;
       s.verificationRetryCount.set(s.currentUnit.id, nextAttempt);
       s.pendingVerificationRetry = {
diff --git a/src/resources/extensions/gsd/tests/verification-gate.test.ts b/src/resources/extensions/gsd/tests/verification-gate.test.ts
index 4bc4aa0a1..c1c89aa4e 100644
--- a/src/resources/extensions/gsd/tests/verification-gate.test.ts
+++ b/src/resources/extensions/gsd/tests/verification-gate.test.ts
@@ -261,6 +261,71 @@ test("verification-gate: each check has durationMs", () => {
   }
 });
 
+// ─── Infra Error Tagging Tests ───────────────────────────────────────────────
+
+test("verification-gate: spawnSync ETIMEDOUT → infraError: true on the check", () => {
+  const tmp = makeTempDir("vg-etimedout");
+  try {
+    // Use a short timeout against a long sleep to guarantee ETIMEDOUT
+    const result = runVerificationGate({
+      basePath: tmp,
+      unitId: "T01",
+      cwd: tmp,
+      preferenceCommands: ["sleep 60"],
+      commandTimeoutMs: 200,
+    });
+    assert.equal(result.passed, false);
+    assert.equal(result.checks.length, 1);
+    assert.ok(result.checks[0].exitCode !== 0, "should have non-zero exit code");
+    assert.equal(result.checks[0].infraError, true, "ETIMEDOUT should be tagged as infraError");
+  } finally {
+    rmSync(tmp, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
+  }
+});
+
+test("verification-gate: real command failure does NOT have infraError", () => {
+  const tmp = makeTempDir("vg-real-fail");
+  try {
+    const result = runVerificationGate({
+      basePath: tmp,
+      unitId: "T01",
+      cwd: tmp,
+      // Cross-platform: node with --eval flag and no shell-sensitive characters
+      preferenceCommands: ["node --eval \"process.exitCode=1\""],
+    });
+    assert.equal(result.passed, false);
+    assert.equal(result.checks.length, 1);
+    assert.equal(result.checks[0].exitCode, 1);
+    assert.equal(result.checks[0].infraError, undefined, "real failure should not be tagged as infraError");
+  } finally {
+    rmSync(tmp, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
+  }
+});
+
+test("verification-gate: mixed infra + real failure — only infra check is tagged", () => {
+  const tmp = makeTempDir("vg-mixed-infra");
+  try {
+    // Use a timeout that kills "sleep 60" but lets "node --eval" complete (~80ms).
+    // The gate applies the same timeout to each command sequentially.
+    const result = runVerificationGate({
+      basePath: tmp,
+      unitId: "T01",
+      cwd: tmp,
+      preferenceCommands: ["sleep 60", "node --eval \"process.exitCode=2\""],
+      commandTimeoutMs: 500,
+    });
+    assert.equal(result.passed, false);
+    assert.equal(result.checks.length, 2);
+    // First check: ETIMEDOUT → infraError
+    assert.equal(result.checks[0].infraError, true, "timed-out command should be infraError");
+    // Second check: real exit 2 → no infraError
+    assert.equal(result.checks[1].exitCode, 2);
+    assert.equal(result.checks[1].infraError, undefined, "real failure should not be infraError");
+  } finally {
+    rmSync(tmp, { recursive: true, force: true, maxRetries: 3, retryDelay: 100 });
+  }
+});
+
 // ─── Preference Validation Tests ─────────────────────────────────────────────
 
 test("verification-gate: validatePreferences accepts valid verification keys", () => {
diff --git a/src/resources/extensions/gsd/types.ts b/src/resources/extensions/gsd/types.ts
index d8105bd46..ce7fe9cf4 100644
--- a/src/resources/extensions/gsd/types.ts
+++ b/src/resources/extensions/gsd/types.ts
@@ -56,6 +56,10 @@ export interface VerificationCheck {
   stderr: string;
   durationMs: number;
   blocking: boolean;     // true for preference/task-plan sources, false for package-json (advisory only)
+  /** True when the failure was a spawn/infra error (ETIMEDOUT, ENOENT, ENOMEM)
+   *  rather than the command itself failing. Infra errors are transient and
+   *  should not trigger auto-fix retries — the agent cannot fix the OS. */
+  infraError?: boolean;
 }
 
 /** A runtime error captured from bg-shell processes or browser console */
diff --git a/src/resources/extensions/gsd/verification-gate.ts b/src/resources/extensions/gsd/verification-gate.ts
index 3a5ac64a7..04665d97b 100644
--- a/src/resources/extensions/gsd/verification-gate.ts
+++ b/src/resources/extensions/gsd/verification-gate.ts
@@ -232,13 +232,20 @@ export interface RunVerificationGateOptions {
   commandTimeoutMs?: number;
 }
 
+/** Error codes from spawnSync that indicate infrastructure/OS-level failures
+ *  rather than the command itself failing. These are transient — the agent
+ *  cannot fix them, so they should not trigger auto-fix retries. */
+const INFRA_ERROR_CODES = new Set(["ETIMEDOUT", "ENOENT", "ENOMEM", "EMFILE", "ENFILE", "EAGAIN"]);
+
 /**
  * Run the verification gate: discover commands, execute each via spawnSync,
  * and return a structured result.
  *
  * - All commands run sequentially regardless of individual pass/fail.
- * - `passed` is true when every command exits 0 (or no commands are discovered).
+ * - `passed` is true when every blocking command exits 0 (or no commands are discovered).
  * - stdout/stderr per command are truncated to 10 KB.
+ * - Spawn/infra errors (ETIMEDOUT, ENOENT, etc.) are tagged with `infraError: true`
+ *   so the retry logic can distinguish "the OS couldn't run this" from "the tests failed".
  */
 export function runVerificationGate(options: RunVerificationGateOptions): VerificationResult {
   const timestamp = Date.now();
@@ -279,12 +286,26 @@ export function runVerificationGate(options: RunVerificationGateOptions): Verifi
     let stderr: string;
 
     if (result.error) {
-      // Command not found or spawn failure
+      // Spawn infrastructure failure — OS-level, not a test failure.
+      // Tag with infraError so the retry logic can skip auto-fix attempts.
+      const errCode = (result.error as NodeJS.ErrnoException).code;
+      const isInfra = !!errCode && INFRA_ERROR_CODES.has(errCode);
       exitCode = 127;
       stderr = truncate(
         (result.stderr || "") + "\n" + (result.error as Error).message,
         MAX_OUTPUT_BYTES,
       );
+
+      checks.push({
+        command,
+        exitCode,
+        stdout: truncate(result.stdout, MAX_OUTPUT_BYTES),
+        stderr,
+        durationMs,
+        blocking,
+        ...(isInfra ? { infraError: true } : {}),
+      });
+      continue;
     } else {
       // status is null when killed by signal — treat as failure
       exitCode = result.status ?? 1;