fix: resolve auto-mode infinite loop and closeout instability (#96, #109)

- D1: Move complete-slice dispatch before needsReassess so mergeSliceToMain
  cannot be bypassed by early reassessment
- D2: Preserve main's slice-branch-chaining guard (branch from current HEAD
  when not on a slice branch, fall back to main otherwise)
- D3: Replace consecutive-repeat stuck detection with per-unit total dispatch
  counter that catches A→B→A→B alternating loops
- Atomic closeout: write unit completion to .gsd/completed-units.json before
  in-memory update for crash recovery
- Persistent idempotency: completedKeySet loaded from disk on start/resume
- Startup self-heal: scan runtime records and clear stale ones
- Recovery backoff: exponential backoff between cross-invocation recovery attempts

Closes #96
Closes #109

Co-Authored-By: omarsharaf96 <omarsharaf96@users.noreply.github.com>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Lex Christopherson 2026-03-12 09:01:53 -06:00
commit 1a308bd23d
3 changed files with 163 additions and 53 deletions

View file

@ -71,6 +71,39 @@ import { truncateToWidth, visibleWidth } from "@mariozechner/pi-tui";
import { makeUI, GLYPH, INDENT } from "../shared/ui.js";
import { showNextAction } from "../shared/next-action-ui.js";
// ─── Disk-backed completed-unit helpers ───────────────────────────────────────
/** Absolute path of the file that persists completed-unit keys for `base`. */
function completedKeysPath(base: string): string {
  const parts = [".gsd", "completed-units.json"] as const;
  return join(base, ...parts);
}
/**
 * Append a completed-unit key to the persisted key set on disk.
 *
 * Reads the existing list (tolerating a missing, unreadable, or corrupt
 * file by starting fresh) and rewrites the file only when `key` is not
 * already present.
 *
 * NOTE(review): the write is a plain `writeFileSync`, not a temp-file +
 * rename, so it is not crash-atomic — a crash mid-write can leave a
 * corrupt file, which the next read treats as empty.
 *
 * @param base - Project base path containing the `.gsd` directory.
 * @param key  - Unit key of the form `"<unitType>/<unitId>"`.
 */
function persistCompletedKey(base: string, key: string): void {
  const file = completedKeysPath(base);
  let keys: string[] = [];
  try {
    if (existsSync(file)) {
      const parsed: unknown = JSON.parse(readFileSync(file, "utf-8"));
      // Guard against corrupt-but-parseable content (e.g. `{}` or `"x"`):
      // only accept an array of strings. Anything else starts fresh instead
      // of throwing an uncaught TypeError at `keys.includes` below.
      if (Array.isArray(parsed)) {
        keys = parsed.filter((k): k is string => typeof k === "string");
      }
    }
  } catch { /* corrupt file — start fresh */ }
  if (!keys.includes(key)) {
    keys.push(key);
    writeFileSync(file, JSON.stringify(keys), "utf-8");
  }
}
/**
 * Load all persisted completed-unit keys from disk into `target`.
 *
 * Non-fatal on every failure mode (missing file, read error, corrupt
 * JSON): the set is simply left unchanged. Non-array JSON — e.g. a bare
 * string, which `for..of` would otherwise iterate character-by-character
 * and silently pollute the set with — is explicitly rejected, and any
 * non-string elements inside a valid array are skipped.
 *
 * @param base   - Project base path containing the `.gsd` directory.
 * @param target - In-memory set to populate (typically `completedKeySet`).
 */
function loadPersistedKeys(base: string, target: Set<string>): void {
  const file = completedKeysPath(base);
  try {
    if (!existsSync(file)) return;
    const parsed: unknown = JSON.parse(readFileSync(file, "utf-8"));
    if (!Array.isArray(parsed)) return;
    for (const k of parsed) {
      if (typeof k === "string") target.add(k);
    }
  } catch { /* non-fatal */ }
}
// ─── State ────────────────────────────────────────────────────────────────────
let active = false;
@ -80,13 +113,16 @@ let verbose = false;
let cmdCtx: ExtensionCommandContext | null = null;
let basePath = "";
/** Track dispatched units to detect stuck loops (including alternating patterns) */
let lastUnit: { type: string; id: string } | null = null;
let retryCount = 0;
const MAX_RETRIES = 1;
/** Track total dispatches per unit to detect stuck loops (catches A→B→A→B patterns) */
const unitDispatchCount = new Map<string, number>();
const MAX_UNIT_DISPATCHES = 3;
/** Tracks recovery attempt count per unit for backoff and diagnostics. */
const unitRecoveryCount = new Map<string, number>();
/** Persisted completed-unit keys — survives restarts. Loaded from .gsd/completed-units.json. */
const completedKeySet = new Set<string>();
/** Crash recovery prompt — set by startAuto, consumed by first dispatchNextUnit */
let pendingCrashRecovery: string | null = null;
@ -209,7 +245,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
active = false;
paused = false;
stepMode = false;
lastUnit = null;
unitDispatchCount.clear();
unitRecoveryCount.clear();
currentUnit = null;
currentMilestoneId = null;
cachedSliceProgress = null;
@ -239,7 +276,7 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
if (basePath) clearLock(basePath);
active = false;
paused = true;
// Preserve: lastUnit, currentUnit, basePath, verbose, cmdCtx,
// Preserve: unitDispatchCount, currentUnit, basePath, verbose, cmdCtx,
// completedUnits, autoStartTime, currentMilestoneId, originalModelId
// — all needed for resume and dashboard display
ctx?.ui.setStatus("gsd-auto", "paused");
@ -252,6 +289,33 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
);
}
/**
* Self-heal: scan runtime records in .gsd/ and clear any where the expected
* artifact already exists on disk. This repairs incomplete closeouts from
* prior crashes preventing spurious re-dispatch of already-completed units.
*/
async function selfHealRuntimeRecords(base: string, ctx: ExtensionContext): Promise<void> {
try {
const { listUnitRuntimeRecords } = await import("./unit-runtime.js");
const records = listUnitRuntimeRecords(base);
let healed = 0;
for (const record of records) {
const { unitType, unitId } = record;
const artifactPath = resolveExpectedArtifactPath(unitType, unitId, base);
if (artifactPath && existsSync(artifactPath)) {
// Artifact exists — unit completed but closeout didn't finish.
clearUnitRuntimeRecord(base, unitType, unitId);
healed++;
}
}
if (healed > 0) {
ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s) with completed artifacts.`, "info");
}
} catch {
// Non-fatal — self-heal should never block auto-mode start
}
}
export async function startAuto(
ctx: ExtensionCommandContext,
pi: ExtensionAPI,
@ -285,6 +349,8 @@ export async function startAuto(
ctx.ui.notify(`Resume: applied ${report.fixesApplied.length} fix(es) to state.`, "info");
}
} catch { /* non-fatal */ }
// Self-heal: clear stale runtime records where artifacts already exist
await selfHealRuntimeRecords(base, ctx);
await dispatchNextUnit(ctx, pi);
return;
}
@ -363,9 +429,10 @@ export async function startAuto(
verbose = verboseMode;
cmdCtx = ctx;
basePath = base;
lastUnit = null;
retryCount = 0;
unitDispatchCount.clear();
unitRecoveryCount.clear();
completedKeySet.clear();
loadPersistedKeys(base, completedKeySet);
autoStartTime = Date.now();
completedUnits = [];
currentUnit = null;
@ -389,6 +456,9 @@ export async function startAuto(
: "Will loop until milestone complete.";
ctx.ui.notify(`${modeLabel} started. ${scopeMsg}`, "info");
// Self-heal: clear stale runtime records where artifacts already exist
await selfHealRuntimeRecords(base, ctx);
// Dispatch the first unit
await dispatchNextUnit(ctx, pi);
}
@ -882,8 +952,8 @@ async function dispatchNextUnit(
"info",
);
// Reset stuck detection for new milestone
lastUnit = null;
retryCount = 0;
unitDispatchCount.clear();
unitRecoveryCount.clear();
}
if (mid) currentMilestoneId = mid;
@ -960,6 +1030,12 @@ async function dispatchNextUnit(
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id);
}
// Clear completed-units.json for the finished milestone so it doesn't grow unbounded.
try {
const file = completedKeysPath(basePath);
if (existsSync(file)) writeFileSync(file, JSON.stringify([]), "utf-8");
completedKeySet.clear();
} catch { /* non-fatal */ }
await stopAuto(ctx, pi);
return;
}
@ -1111,54 +1187,44 @@ async function dispatchNextUnit(
await emitObservabilityWarnings(ctx, unitType, unitId);
// Stuck detection — catches both consecutive repeats and alternating patterns.
// Consecutive: same unit dispatched back-to-back (LLM didn't produce artifact).
// Alternating: same unit dispatched N times total (A→B→A→B loop).
const dispatchKey = `${unitType}/${unitId}`;
const prevTotalCount = unitDispatchCount.get(dispatchKey) ?? 0;
// Idempotency: skip units already completed in a prior session.
const idempotencyKey = `${unitType}/${unitId}`;
if (completedKeySet.has(idempotencyKey)) {
ctx.ui.notify(
`Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`,
"info",
);
// Don't increment dispatch count — just advance by calling dispatchNextUnit again.
// First, force state re-derive so the scheduler sees the completed artifact.
await dispatchNextUnit(ctx, pi);
return;
}
if (prevTotalCount >= MAX_UNIT_DISPATCHES) {
// Stuck detection — tracks total dispatches per unit (not just consecutive repeats).
// Pattern A→B→A→B would reset retryCount every time; this map catches it.
const dispatchKey = `${unitType}/${unitId}`;
const prevCount = unitDispatchCount.get(dispatchKey) ?? 0;
if (prevCount >= MAX_UNIT_DISPATCHES) {
if (currentUnit) {
const modelId = ctx.model?.id ?? "unknown";
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
}
saveActivityLog(ctx, basePath, unitType, unitId);
const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
await stopAuto(ctx, pi);
ctx.ui.notify(
`Loop detected: ${unitType} ${unitId} dispatched ${prevTotalCount + 1} times. ` +
`Check branch state and .gsd/ artifacts.${expected ? `\n Expected: ${expected}` : ""}`,
`Loop detected: ${unitType} ${unitId} dispatched ${prevCount + 1} times total. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check branch state and .gsd/ artifacts.`,
"error",
);
return;
}
unitDispatchCount.set(dispatchKey, prevTotalCount + 1);
if (lastUnit && lastUnit.type === unitType && lastUnit.id === unitId) {
retryCount++;
if (retryCount > MAX_RETRIES) {
if (currentUnit) {
const modelId = ctx.model?.id ?? "unknown";
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
}
saveActivityLog(ctx, basePath, lastUnit.type, lastUnit.id);
// Diagnostic: what file was expected?
const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
await stopAuto(ctx, pi);
ctx.ui.notify(
`Stuck: ${unitType} ${unitId} fired ${retryCount + 1} times. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check .gsd/ and activity logs.`,
"error",
);
return;
}
unitDispatchCount.set(dispatchKey, prevCount + 1);
if (prevCount > 0) {
ctx.ui.notify(
`${unitType} ${unitId} didn't produce expected artifact. Retrying (${retryCount}/${MAX_RETRIES}).`,
`${unitType} ${unitId} didn't produce expected artifact. Retrying (${prevCount + 1}/${MAX_UNIT_DISPATCHES}).`,
"warning",
);
} else {
retryCount = 0;
}
// Snapshot metrics + activity log for the PREVIOUS unit before we reassign.
// The session still holds the previous unit's data (newSession hasn't fired yet).
@ -1167,6 +1233,11 @@ async function dispatchNextUnit(
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id);
// Persist completion to disk BEFORE updating memory — so a crash here is recoverable.
const closeoutKey = `${currentUnit.type}/${currentUnit.id}`;
persistCompletedKey(basePath, closeoutKey);
completedKeySet.add(closeoutKey);
completedUnits.push({
type: currentUnit.type,
id: currentUnit.id,
@ -1174,9 +1245,9 @@ async function dispatchNextUnit(
finishedAt: Date.now(),
});
clearUnitRuntimeRecord(basePath, currentUnit.type, currentUnit.id);
unitDispatchCount.delete(`${currentUnit.type}/${currentUnit.id}`);
unitRecoveryCount.delete(`${currentUnit.type}/${currentUnit.id}`);
}
lastUnit = { type: unitType, id: unitId };
currentUnit = { type: unitType, id: unitId, startedAt: Date.now() };
writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, {
phase: "dispatched",
@ -1220,7 +1291,7 @@ async function dispatchNextUnit(
if (pendingCrashRecovery) {
finalPrompt = `${pendingCrashRecovery}\n\n---\n\n${finalPrompt}`;
pendingCrashRecovery = null;
} else if (retryCount > 0) {
} else if ((unitDispatchCount.get(`${unitType}/${unitId}`) ?? 0) > 1) {
const diagnostic = getDeepDiagnostic(basePath);
if (diagnostic) {
finalPrompt = `**RETRY — your previous attempt did not produce the required artifact.**\n\nDiagnostic from previous attempt:\n${diagnostic}\n\nFix whatever went wrong and make sure you write the required file this time.\n\n---\n\n${finalPrompt}`;
@ -2198,6 +2269,20 @@ async function recoverTimedOutUnit(
const recoveryAttempts = runtime?.recoveryAttempts ?? 0;
const maxRecoveryAttempts = reason === "idle" ? 2 : 1;
const recoveryKey = `${unitType}/${unitId}`;
const attemptNumber = (unitRecoveryCount.get(recoveryKey) ?? 0) + 1;
unitRecoveryCount.set(recoveryKey, attemptNumber);
if (attemptNumber > 1) {
// Exponential backoff: 2^(n-1) seconds, capped at 30s
const backoffMs = Math.min(1000 * Math.pow(2, attemptNumber - 2), 30000);
ctx.ui.notify(
`Recovery attempt ${attemptNumber} for ${unitType} ${unitId}. Waiting ${backoffMs / 1000}s before retry.`,
"info",
);
await new Promise(r => setTimeout(r, backoffMs));
}
if (unitType === "execute-task") {
const status = await inspectExecuteTaskDurability(basePath, unitId);
if (!status) return "paused";
@ -2213,9 +2298,10 @@ async function recoverTimedOutUnit(
recovery: status,
});
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode.`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode. (attempt ${attemptNumber})`,
"info",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2262,7 +2348,7 @@ async function recoverTimedOutUnit(
{ triggerTurn: true, deliverAs: "steer" },
);
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`,
"warning",
);
return "recovered";
@ -2283,9 +2369,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline.`,
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline. (attempt ${attemptNumber})`,
"warning",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2316,9 +2403,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing.`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing. (attempt ${attemptNumber})`,
"info",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2364,7 +2452,7 @@ async function recoverTimedOutUnit(
{ triggerTurn: true, deliverAs: "steer" },
);
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`,
"warning",
);
return "recovered";
@ -2384,9 +2472,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline.`,
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline. (attempt ${attemptNumber})`,
"warning",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}

View file

@ -1,4 +1,4 @@
import { existsSync, mkdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs";
import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs";
import { join } from "node:path";
import {
gsdRoot,
@ -99,6 +99,27 @@ export function clearUnitRuntimeRecord(basePath: string, unitType: string, unitI
if (existsSync(path)) unlinkSync(path);
}
/**
 * Read every runtime record currently persisted on disk for `basePath`.
 * Files that are not `.json` or that fail to parse (e.g. a partially
 * written record) are skipped silently. Returns an empty array when the
 * runtime directory does not exist.
 */
export function listUnitRuntimeRecords(basePath: string): AutoUnitRuntimeRecord[] {
  const dir = runtimeDir(basePath);
  if (!existsSync(dir)) return [];
  const jsonFiles = readdirSync(dir).filter((name) => name.endsWith(".json"));
  const records: AutoUnitRuntimeRecord[] = [];
  for (const name of jsonFiles) {
    try {
      const raw = readFileSync(join(dir, name), "utf-8");
      records.push(JSON.parse(raw) as AutoUnitRuntimeRecord);
    } catch {
      // Skip malformed files
    }
  }
  return records;
}
export async function inspectExecuteTaskDurability(
basePath: string,
unitId: string,

View file

@ -156,7 +156,6 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId
if (current === branch) return false;
const mainBranch = getMainBranch(basePath);
let created = false;
if (!branchExists(basePath, branch)) {
@ -166,6 +165,7 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId
// branch and haven't been merged to main yet.
// If we're already on a slice branch (e.g. creating S02 while S01
// wasn't merged yet), fall back to main to avoid chaining slice branches.
const mainBranch = getMainBranch(basePath);
const base = SLICE_BRANCH_RE.test(current) ? mainBranch : current;
runGit(basePath, ["branch", branch, base]);
created = true;