From aa3ac89bf87ec5ac7d811161562a60d41a952bfb Mon Sep 17 00:00:00 2001 From: Tom Boucher Date: Wed, 25 Mar 2026 02:06:23 -0400 Subject: [PATCH] fix(auto): reset recoveryAttempts on unit re-dispatch (#2322) (#2424) The dispatch-time writeUnitRuntimeRecord call in runUnitPhase did not reset recoveryAttempts, so the counter from a prior execution's timeout carried over to subsequent dispatches. This caused re-dispatched units to be instantly skipped (recoveryAttempts >= maxRecoveryAttempts) with no steering message or second chance. Add `recoveryAttempts: 0` to the dispatch-time runtime record write so each execution starts with its full recovery budget. Fixes #2322 Co-authored-by: Claude Opus 4.6 (1M context) --- src/resources/extensions/gsd/auto/phases.ts | 1 + .../gsd/tests/recovery-attempts-reset.test.ts | 176 ++++++++++++++++++ 2 files changed, 177 insertions(+) create mode 100644 src/resources/extensions/gsd/tests/recovery-attempts-reset.test.ts diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index 0008db09b..1768a57dd 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -857,6 +857,7 @@ export async function runUnitPhase( lastProgressAt: s.currentUnit.startedAt, progressCount: 0, lastProgressKind: "dispatch", + recoveryAttempts: 0, // Reset so re-dispatched units get full recovery budget (#2322) }, ); diff --git a/src/resources/extensions/gsd/tests/recovery-attempts-reset.test.ts b/src/resources/extensions/gsd/tests/recovery-attempts-reset.test.ts new file mode 100644 index 000000000..0b540d3d3 --- /dev/null +++ b/src/resources/extensions/gsd/tests/recovery-attempts-reset.test.ts @@ -0,0 +1,176 @@ +/** + * Regression test for #2322: recoveryAttempts persists across re-dispatches, + * causing instant task skip. 
+ * + * When a unit hits recovery limits and is later re-dispatched, the + * recoveryAttempts counter from the prior execution carries over because + * the dispatch-time writeUnitRuntimeRecord call does not reset it. + * This causes the next execution to be instantly skipped with no steering + * message or second chance. + * + * The fix: include `recoveryAttempts: 0` in the dispatch-time runtime + * record write in runUnitPhase. + */ + +import { mkdtempSync, mkdirSync, rmSync } from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; +import { + writeUnitRuntimeRecord, + readUnitRuntimeRecord, +} from "../unit-runtime.ts"; +import { createTestContext } from "./test-helpers.ts"; + +const { assertEq, assertTrue, report } = createTestContext(); + +// ═══ Setup ════════════════════════════════════════════════════════════════════ + +const base = mkdtempSync(join(tmpdir(), "gsd-recovery-reset-test-")); +mkdirSync(join(base, ".gsd", "runtime", "units"), { recursive: true }); + +try { + // ═══ #2322: recoveryAttempts should reset on re-dispatch ═══════════════════ + + { + console.log("\n=== #2322: recoveryAttempts should reset on re-dispatch ==="); + + const unitType = "execute-task"; + const unitId = "M001/S01/T01"; + const startedAt1 = Date.now() - 10000; + + // Simulate first dispatch — clean state + writeUnitRuntimeRecord(base, unitType, unitId, startedAt1, { + phase: "dispatched", + wrapupWarningSent: false, + timeoutAt: null, + lastProgressAt: startedAt1, + progressCount: 0, + lastProgressKind: "dispatch", + }); + + // Simulate timeout recovery incrementing recoveryAttempts + writeUnitRuntimeRecord(base, unitType, unitId, startedAt1, { + phase: "recovered", + recoveryAttempts: 1, + lastRecoveryReason: "hard", + }); + + const afterRecovery = readUnitRuntimeRecord(base, unitType, unitId); + assertEq(afterRecovery?.recoveryAttempts, 1, "recoveryAttempts should be 1 after recovery"); + assertEq(afterRecovery?.lastRecoveryReason, "hard", 
"lastRecoveryReason should be 'hard'");

    // Simulate re-dispatch (second execution of same unit).
    // This is what runUnitPhase should do at dispatch time — explicitly reset
    // recoveryAttempts so the new execution gets its full recovery budget.
    const startedAt2 = Date.now();
    writeUnitRuntimeRecord(base, unitType, unitId, startedAt2, {
      phase: "dispatched",
      wrapupWarningSent: false,
      timeoutAt: null,
      lastProgressAt: startedAt2,
      progressCount: 0,
      lastProgressKind: "dispatch",
      recoveryAttempts: 0, // FIX: must be explicitly reset
    });

    const afterRedispatch = readUnitRuntimeRecord(base, unitType, unitId);
    assertEq(
      afterRedispatch?.recoveryAttempts,
      0,
      "recoveryAttempts should be 0 after re-dispatch (was carried over from prior execution)",
    );
  }

  // ═══ Verify the BUG scenario: omitting recoveryAttempts carries it over ═══

  {
    console.log("\n=== #2322: demonstrates bug — omitting recoveryAttempts carries it over ===");

    const unitType = "execute-task";
    const unitId = "M001/S01/T02";
    const startedAt1 = Date.now() - 10000;

    // First dispatch
    writeUnitRuntimeRecord(base, unitType, unitId, startedAt1, {
      phase: "dispatched",
    });

    // Timeout bumps recoveryAttempts to 1
    writeUnitRuntimeRecord(base, unitType, unitId, startedAt1, {
      recoveryAttempts: 1,
      lastRecoveryReason: "hard",
    });

    // Re-dispatch WITHOUT resetting recoveryAttempts (the bug)
    const startedAt2 = Date.now();
    writeUnitRuntimeRecord(base, unitType, unitId, startedAt2, {
      phase: "dispatched",
      wrapupWarningSent: false,
      timeoutAt: null,
      lastProgressAt: startedAt2,
      progressCount: 0,
      lastProgressKind: "dispatch",
      // recoveryAttempts: NOT included — this is the bug
    });

    const afterBuggyRedispatch = readUnitRuntimeRecord(base, unitType, unitId);
    // This DEMONSTRATES the bug: recoveryAttempts is still 1
    assertEq(
      afterBuggyRedispatch?.recoveryAttempts,
      1,
      "BUG DEMO: recoveryAttempts carries over when not explicitly reset",
    );
  }

  // ═══ Hard timeout maxRecoveryAttempts=1 — second dispatch must get full budget ═══

  {
    console.log("\n=== #2322: second dispatch gets full hard-timeout budget after reset ===");

    const unitType = "execute-task";
    const unitId = "M001/S01/T03";

    // First dispatch
    const start1 = Date.now() - 20000;
    writeUnitRuntimeRecord(base, unitType, unitId, start1, {
      phase: "dispatched",
      recoveryAttempts: 0,
    });

    // Hard timeout recovery — exhausts the budget (maxRecoveryAttempts=1 for hard)
    writeUnitRuntimeRecord(base, unitType, unitId, start1, {
      phase: "recovered",
      recoveryAttempts: 1,
      lastRecoveryReason: "hard",
    });

    const afterExhausted = readUnitRuntimeRecord(base, unitType, unitId);
    assertEq(afterExhausted?.recoveryAttempts, 1, "budget exhausted after hard recovery");

    // Second dispatch with fix: reset recoveryAttempts
    const start2 = Date.now();
    writeUnitRuntimeRecord(base, unitType, unitId, start2, {
      phase: "dispatched",
      wrapupWarningSent: false,
      timeoutAt: null,
      lastProgressAt: start2,
      progressCount: 0,
      lastProgressKind: "dispatch",
      recoveryAttempts: 0,
    });

    const afterReset = readUnitRuntimeRecord(base, unitType, unitId);
    assertEq(afterReset?.recoveryAttempts, 0, "second dispatch has full recovery budget");

    // Now a hard timeout should be recoverable (0 < 1)
    assertTrue(
      (afterReset?.recoveryAttempts ?? 0) < 1,
      "hard recovery should be allowed (recoveryAttempts < maxRecoveryAttempts)",
    );
  }

} finally {
  rmSync(base, { recursive: true, force: true });
}

report();