fix: resolve auto-mode infinite loop and closeout instability (#96, #109)

Three defects in auto.ts + worktree.ts combined to produce an infinite
alternating loop and unreliable unit closeout in GSD auto-mode.

**D1 (auto.ts)** — `state.phase === "summarizing"` is now the first
branch in the dispatch if-else chain, evaluated before `needsRunUat`
and `needsReassess`. Previously, if an execute-task agent wrote
slice-level artifacts early, `needsReassess` fired instead and
`mergeSliceToMain` was permanently skipped.

**D2 (worktree.ts)** — New slice branches are now created from the
current HEAD instead of `main`. When a prior slice merge was skipped,
the new branch would inherit a stale ROADMAP from main, creating
divergent state that drove the A→B→A→B alternation.

**D3 (auto.ts)** — Replaced `lastUnit`/`retryCount` consecutive-repeat
detection with a `unitDispatchCount` map that tracks total dispatches
per unit key. The old guard reset to 0 on every ID change; the map
catches alternating-loop patterns and stops after MAX_UNIT_DISPATCHES=3.

**Atomic closeout (auto.ts)** — `persistCompletedKey` writes the unit
key to `.gsd/completed-units.json` before any in-memory update. A crash
mid-closeout is now recoverable: on next start `loadPersistedKeys`
re-populates `completedKeySet` and the idempotency guard skips already-
completed units.

**Persistent idempotency (auto.ts)** — `completedKeySet` is loaded from
disk on `startAuto` and checked before every dispatch, preventing re-
dispatch of units completed in a prior session even after a restart.

**Startup self-heal (auto.ts + unit-runtime.ts)** — `selfHealRuntimeRecords`
runs on start and resume; it scans all on-disk runtime records, checks
whether each unit's expected artifact exists, and clears any orphaned
records. Added `listUnitRuntimeRecords` to unit-runtime.ts to support
this scan.

**Recovery backoff (auto.ts)** — `recoverTimedOutUnit` now tracks
cross-invocation recovery attempts per unit in `unitRecoveryCount` and
applies exponential backoff (1s→2s→4s…30s cap) between attempts.
Attempt number is included in all recovery notify messages for
traceability.

Closes #96
Closes #109

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
omarsharaf96 2026-03-12 10:27:18 -04:00
parent 9fb348b123
commit 62d4ca74a2
3 changed files with 188 additions and 56 deletions

View file

@ -69,6 +69,39 @@ import { truncateToWidth, visibleWidth } from "@mariozechner/pi-tui";
import { makeUI, GLYPH, INDENT } from "../shared/ui.js";
import { showNextAction } from "../shared/next-action-ui.js";
// ─── Disk-backed completed-unit helpers ───────────────────────────────────────
/** Absolute path of the persisted completed-unit keys file under `base`. */
function completedKeysPath(base: string): string {
  const gsdDir = join(base, ".gsd");
  return join(gsdDir, "completed-units.json");
}
/**
 * Record a completed unit key in `.gsd/completed-units.json` (set semantics:
 * duplicate keys are not written twice).
 *
 * NOTE(review): `writeFileSync` is not atomic — a crash mid-write can leave a
 * truncated file. The corrupt-file guard below makes that recoverable (we
 * start from an empty set), but a temp-file + rename scheme would be the
 * truly atomic fix — TODO confirm whether that guarantee is required here.
 *
 * @param base Project base path containing the `.gsd` directory.
 * @param key  Idempotency key (`${unitType}/${unitId}`) to persist.
 */
function persistCompletedKey(base: string, key: string): void {
  const file = completedKeysPath(base);
  let keys: string[] = [];
  try {
    if (existsSync(file)) {
      const parsed: unknown = JSON.parse(readFileSync(file, "utf-8"));
      // Validate the shape: a corrupt file holding non-array JSON previously
      // assigned a non-array to `keys`, and `keys.includes(key)` below (outside
      // this try/catch) then threw an uncaught TypeError. Start fresh instead.
      if (Array.isArray(parsed)) {
        keys = parsed.filter((k): k is string => typeof k === "string");
      }
    }
  } catch { /* corrupt file — start fresh */ }
  if (!keys.includes(key)) {
    keys.push(key);
    writeFileSync(file, JSON.stringify(keys), "utf-8");
  }
}
/**
 * Load all persisted completed-unit keys from disk into `target`.
 *
 * Any read/parse failure is swallowed (non-fatal): a missing or corrupt file
 * simply means no keys are restored. The parsed JSON is validated so that a
 * corrupt file holding non-array JSON, or an array containing non-string
 * members, cannot pollute the `Set<string>` with values that would never
 * match a `${unitType}/${unitId}` idempotency key.
 *
 * @param base   Project base path containing the `.gsd` directory.
 * @param target In-memory set to populate (typically `completedKeySet`).
 */
function loadPersistedKeys(base: string, target: Set<string>): void {
  const file = completedKeysPath(base);
  try {
    if (existsSync(file)) {
      const parsed: unknown = JSON.parse(readFileSync(file, "utf-8"));
      if (Array.isArray(parsed)) {
        for (const k of parsed) {
          if (typeof k === "string") target.add(k);
        }
      }
    }
  } catch { /* non-fatal */ }
}
// ─── State ────────────────────────────────────────────────────────────────────
let active = false;
@ -78,10 +111,15 @@ let verbose = false;
let cmdCtx: ExtensionCommandContext | null = null;
let basePath = "";
/** Track last dispatched unit to detect stuck loops */
let lastUnit: { type: string; id: string } | null = null;
let retryCount = 0;
const MAX_RETRIES = 1;
/** Track total dispatches per unit to detect stuck loops (catches A→B→A→B patterns) */
const unitDispatchCount = new Map<string, number>();
const MAX_UNIT_DISPATCHES = 3;
/** Tracks recovery attempt count per unit for backoff and diagnostics. */
const unitRecoveryCount = new Map<string, number>();
/** Persisted completed-unit keys — survives restarts. Loaded from .gsd/completed-units.json. */
const completedKeySet = new Set<string>();
/** Crash recovery prompt — set by startAuto, consumed by first dispatchNextUnit */
let pendingCrashRecovery: string | null = null;
@ -205,7 +243,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
active = false;
paused = false;
stepMode = false;
lastUnit = null;
unitDispatchCount.clear();
unitRecoveryCount.clear();
currentUnit = null;
currentMilestoneId = null;
cachedSliceProgress = null;
@ -235,7 +274,7 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
if (basePath) clearLock(basePath);
active = false;
paused = true;
// Preserve: lastUnit, currentUnit, basePath, verbose, cmdCtx,
// Preserve: unitDispatchCount, currentUnit, basePath, verbose, cmdCtx,
// completedUnits, autoStartTime, currentMilestoneId, originalModelId
// — all needed for resume and dashboard display
ctx?.ui.setStatus("gsd-auto", "paused");
@ -248,6 +287,33 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
);
}
/**
 * Self-heal: scan runtime records in .gsd/ and clear any where the expected
 * artifact already exists on disk. This repairs incomplete closeouts from
 * prior crashes, preventing spurious re-dispatch of already-completed units.
 */
async function selfHealRuntimeRecords(base: string, ctx: ExtensionContext): Promise<void> {
  try {
    const { listUnitRuntimeRecords } = await import("./unit-runtime.js");
    let healed = 0;
    for (const { unitType, unitId } of listUnitRuntimeRecords(base)) {
      const expectedArtifact = resolveExpectedArtifactPath(unitType, unitId, base);
      if (!expectedArtifact || !existsSync(expectedArtifact)) continue;
      // Artifact exists — unit completed but closeout didn't finish.
      clearUnitRuntimeRecord(base, unitType, unitId);
      healed++;
    }
    if (healed > 0) {
      ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s) with completed artifacts.`, "info");
    }
  } catch {
    // Non-fatal — self-heal should never block auto-mode start
  }
}
export async function startAuto(
ctx: ExtensionCommandContext,
pi: ExtensionAPI,
@ -280,6 +346,8 @@ export async function startAuto(
ctx.ui.notify(`Resume: applied ${report.fixesApplied.length} fix(es) to state.`, "info");
}
} catch { /* non-fatal */ }
// Self-heal: clear stale runtime records where artifacts already exist
await selfHealRuntimeRecords(base, ctx);
await dispatchNextUnit(ctx, pi);
return;
}
@ -358,8 +426,10 @@ export async function startAuto(
verbose = verboseMode;
cmdCtx = ctx;
basePath = base;
lastUnit = null;
retryCount = 0;
unitDispatchCount.clear();
unitRecoveryCount.clear();
completedKeySet.clear();
loadPersistedKeys(base, completedKeySet);
autoStartTime = Date.now();
completedUnits = [];
currentUnit = null;
@ -383,6 +453,9 @@ export async function startAuto(
: "Will loop until milestone complete.";
ctx.ui.notify(`${modeLabel} started. ${scopeMsg}`, "info");
// Self-heal: clear stale runtime records where artifacts already exist
await selfHealRuntimeRecords(base, ctx);
// Dispatch the first unit
await dispatchNextUnit(ctx, pi);
}
@ -876,8 +949,8 @@ async function dispatchNextUnit(
"info",
);
// Reset stuck detection for new milestone
lastUnit = null;
retryCount = 0;
unitDispatchCount.clear();
unitRecoveryCount.clear();
}
if (mid) currentMilestoneId = mid;
@ -953,6 +1026,12 @@ async function dispatchNextUnit(
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id);
}
// Clear completed-units.json for the finished milestone so it doesn't grow unbounded.
try {
const file = completedKeysPath(basePath);
if (existsSync(file)) writeFileSync(file, JSON.stringify([]), "utf-8");
completedKeySet.clear();
} catch { /* non-fatal */ }
await stopAuto(ctx, pi);
return;
}
@ -996,7 +1075,15 @@ async function dispatchNextUnit(
// After a slice completes, we reassess the roadmap before moving to the next slice.
// Skip reassessment for the final slice (milestone complete) or if already assessed.
const needsReassess = await checkNeedsReassessment(basePath, mid, state);
if (needsRunUat) {
if (state.phase === "summarizing") {
// complete-slice MUST run before reassessment to guarantee mergeSliceToMain
const sid = state.activeSlice!.id;
const sTitle = state.activeSlice!.title;
unitType = "complete-slice";
unitId = `${mid}/${sid}`;
prompt = await buildCompleteSlicePrompt(mid, midTitle!, sid, sTitle, basePath);
} else if (needsRunUat) {
const { sliceId, uatType } = needsRunUat;
unitType = "run-uat";
unitId = `${mid}/${sliceId}`;
@ -1075,14 +1162,6 @@ async function dispatchNextUnit(
unitId = `${mid}/${sid}/${tid}`;
prompt = await buildExecuteTaskPrompt(mid, sid, sTitle, tid, tTitle, basePath);
} else if (state.phase === "summarizing") {
// All tasks done — complete the slice
const sid = state.activeSlice!.id;
const sTitle = state.activeSlice!.title;
unitType = "complete-slice";
unitId = `${mid}/${sid}`;
prompt = await buildCompleteSlicePrompt(mid, midTitle!, sid, sTitle, basePath);
} else if (state.phase === "completing-milestone") {
// All slices done — complete the milestone
unitType = "complete-milestone";
@ -1102,34 +1181,44 @@ async function dispatchNextUnit(
await emitObservabilityWarnings(ctx, unitType, unitId);
// Stuck detection — same unit dispatched again means the LLM didn't produce
// the expected artifact. Retry once (the LLM may have hit an error or run out
// of context), then stop with a diagnostic.
if (lastUnit && lastUnit.type === unitType && lastUnit.id === unitId) {
retryCount++;
if (retryCount > MAX_RETRIES) {
if (currentUnit) {
const modelId = ctx.model?.id ?? "unknown";
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
}
saveActivityLog(ctx, basePath, lastUnit.type, lastUnit.id);
// Diagnostic: what file was expected?
const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
await stopAuto(ctx, pi);
ctx.ui.notify(
`Stuck: ${unitType} ${unitId} fired ${retryCount + 1} times. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check .gsd/ and activity logs.`,
"error",
);
return;
}
// Idempotency: skip units already completed in a prior session.
const idempotencyKey = `${unitType}/${unitId}`;
if (completedKeySet.has(idempotencyKey)) {
ctx.ui.notify(
`${unitType} ${unitId} didn't produce expected artifact. Retrying (${retryCount}/${MAX_RETRIES}).`,
`Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`,
"info",
);
// Don't increment dispatch count — just advance by calling dispatchNextUnit again.
// First, force state re-derive so the scheduler sees the completed artifact.
await dispatchNextUnit(ctx, pi);
return;
}
// Stuck detection — tracks total dispatches per unit (not just consecutive repeats).
// Pattern A→B→A→B would reset retryCount every time; this map catches it.
const dispatchKey = `${unitType}/${unitId}`;
const prevCount = unitDispatchCount.get(dispatchKey) ?? 0;
if (prevCount >= MAX_UNIT_DISPATCHES) {
if (currentUnit) {
const modelId = ctx.model?.id ?? "unknown";
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
}
saveActivityLog(ctx, basePath, unitType, unitId);
const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
await stopAuto(ctx, pi);
ctx.ui.notify(
`Loop detected: ${unitType} ${unitId} dispatched ${prevCount + 1} times total. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check branch state and .gsd/ artifacts.`,
"error",
);
return;
}
unitDispatchCount.set(dispatchKey, prevCount + 1);
if (prevCount > 0) {
ctx.ui.notify(
`${unitType} ${unitId} didn't produce expected artifact. Retrying (${prevCount + 1}/${MAX_UNIT_DISPATCHES}).`,
"warning",
);
} else {
retryCount = 0;
}
// Snapshot metrics + activity log for the PREVIOUS unit before we reassign.
// The session still holds the previous unit's data (newSession hasn't fired yet).
@ -1138,6 +1227,11 @@ async function dispatchNextUnit(
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id);
// Persist completion to disk BEFORE updating memory — so a crash here is recoverable.
const closeoutKey = `${currentUnit.type}/${currentUnit.id}`;
persistCompletedKey(basePath, closeoutKey);
completedKeySet.add(closeoutKey);
completedUnits.push({
type: currentUnit.type,
id: currentUnit.id,
@ -1145,9 +1239,9 @@ async function dispatchNextUnit(
finishedAt: Date.now(),
});
clearUnitRuntimeRecord(basePath, currentUnit.type, currentUnit.id);
unitDispatchCount.delete(`${currentUnit.type}/${currentUnit.id}`);
unitRecoveryCount.delete(`${currentUnit.type}/${currentUnit.id}`);
}
lastUnit = { type: unitType, id: unitId };
currentUnit = { type: unitType, id: unitId, startedAt: Date.now() };
writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, {
phase: "dispatched",
@ -1191,7 +1285,7 @@ async function dispatchNextUnit(
if (pendingCrashRecovery) {
finalPrompt = `${pendingCrashRecovery}\n\n---\n\n${finalPrompt}`;
pendingCrashRecovery = null;
} else if (retryCount > 0) {
} else if ((unitDispatchCount.get(`${unitType}/${unitId}`) ?? 0) > 1) {
const diagnostic = getDeepDiagnostic(basePath);
if (diagnostic) {
finalPrompt = `**RETRY — your previous attempt did not produce the required artifact.**\n\nDiagnostic from previous attempt:\n${diagnostic}\n\nFix whatever went wrong and make sure you write the required file this time.\n\n---\n\n${finalPrompt}`;
@ -2169,6 +2263,20 @@ async function recoverTimedOutUnit(
const recoveryAttempts = runtime?.recoveryAttempts ?? 0;
const maxRecoveryAttempts = reason === "idle" ? 2 : 1;
const recoveryKey = `${unitType}/${unitId}`;
const attemptNumber = (unitRecoveryCount.get(recoveryKey) ?? 0) + 1;
unitRecoveryCount.set(recoveryKey, attemptNumber);
if (attemptNumber > 1) {
// Exponential backoff: 2^(n-1) seconds, capped at 30s
const backoffMs = Math.min(1000 * Math.pow(2, attemptNumber - 2), 30000);
ctx.ui.notify(
`Recovery attempt ${attemptNumber} for ${unitType} ${unitId}. Waiting ${backoffMs / 1000}s before retry.`,
"info",
);
await new Promise(r => setTimeout(r, backoffMs));
}
if (unitType === "execute-task") {
const status = await inspectExecuteTaskDurability(basePath, unitId);
if (!status) return "paused";
@ -2184,9 +2292,10 @@ async function recoverTimedOutUnit(
recovery: status,
});
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode.`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode. (attempt ${attemptNumber})`,
"info",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2233,7 +2342,7 @@ async function recoverTimedOutUnit(
{ triggerTurn: true, deliverAs: "steer" },
);
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`,
"warning",
);
return "recovered";
@ -2254,9 +2363,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline.`,
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline. (attempt ${attemptNumber})`,
"warning",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2287,9 +2397,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing.`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing. (attempt ${attemptNumber})`,
"info",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2335,7 +2446,7 @@ async function recoverTimedOutUnit(
{ triggerTurn: true, deliverAs: "steer" },
);
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`,
"warning",
);
return "recovered";
@ -2355,9 +2466,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline.`,
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline. (attempt ${attemptNumber})`,
"warning",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}

View file

@ -1,4 +1,4 @@
import { existsSync, mkdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs";
import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs";
import { join } from "node:path";
import {
gsdRoot,
@ -99,6 +99,27 @@ export function clearUnitRuntimeRecord(basePath: string, unitType: string, unitI
if (existsSync(path)) unlinkSync(path);
}
/**
 * Return all runtime records currently on disk for `basePath`.
 * Returns an empty array if the runtime directory does not exist.
 * Malformed (unparseable) record files are silently skipped.
 */
export function listUnitRuntimeRecords(basePath: string): AutoUnitRuntimeRecord[] {
  const dir = runtimeDir(basePath);
  if (!existsSync(dir)) return [];
  const records: AutoUnitRuntimeRecord[] = [];
  const jsonFiles = readdirSync(dir).filter((name) => name.endsWith(".json"));
  for (const name of jsonFiles) {
    try {
      const parsed = JSON.parse(readFileSync(join(dir, name), "utf-8")) as AutoUnitRuntimeRecord;
      records.push(parsed);
    } catch {
      // Skip malformed files
    }
  }
  return records;
}
export async function inspectExecuteTaskDurability(
basePath: string,
unitId: string,

View file

@ -78,11 +78,10 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId
if (current === branch) return false;
const mainBranch = getMainBranch(basePath);
let created = false;
if (!branchExists(basePath, branch)) {
runGit(basePath, ["branch", branch, mainBranch]);
runGit(basePath, ["branch", branch]);
created = true;
}