fix: resolve auto-mode infinite loop and closeout instability (#96, #109)

- D1: Move complete-slice dispatch before needsReassess so mergeSliceToMain
  cannot be bypassed by early reassessment
- D2: Preserve main's slice-branch-chaining guard (branch from current HEAD
  when not on a slice branch, fall back to main otherwise)
- D3: Replace consecutive-repeat stuck detection with per-unit total dispatch
  counter that catches A→B→A→B alternating loops
- Atomic closeout: write unit completion to .gsd/completed-units.json before
  in-memory update for crash recovery
- Persistent idempotency: completedKeySet loaded from disk on start/resume
- Startup self-heal: scan runtime records and clear stale ones
- Recovery backoff: exponential backoff between cross-invocation recovery attempts

Closes #96
Closes #109

Co-Authored-By: omarsharaf96 <omarsharaf96@users.noreply.github.com>
Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Lex Christopherson 2026-03-12 09:01:53 -06:00
commit 1a308bd23d
3 changed files with 163 additions and 53 deletions

View file

@ -71,6 +71,39 @@ import { truncateToWidth, visibleWidth } from "@mariozechner/pi-tui";
import { makeUI, GLYPH, INDENT } from "../shared/ui.js";
import { showNextAction } from "../shared/next-action-ui.js";
// ─── Disk-backed completed-unit helpers ───────────────────────────────────────
/** Absolute path of the file that persists completed-unit keys for `base`. */
function completedKeysPath(base: string): string {
  const parts = [".gsd", "completed-units.json"] as const;
  return join(base, ...parts);
}
/**
 * Append a completed-unit key to the persisted key set on disk.
 *
 * Reads the existing list (tolerating a missing, unreadable, or corrupt
 * file by starting fresh) and rewrites the file only when `key` is not
 * already present.
 *
 * NOTE(review): the write is a plain `writeFileSync`, not a temp-file +
 * rename, so it is not crash-atomic — a crash mid-write can leave a
 * corrupt file, which the next read treats as empty.
 *
 * @param base - Project base path containing the `.gsd` directory.
 * @param key  - Unit key of the form `"<unitType>/<unitId>"`.
 */
function persistCompletedKey(base: string, key: string): void {
  const file = completedKeysPath(base);
  let keys: string[] = [];
  try {
    if (existsSync(file)) {
      const parsed: unknown = JSON.parse(readFileSync(file, "utf-8"));
      // Guard against corrupt-but-parseable content (e.g. `{}` or `"x"`):
      // only accept an array of strings. Anything else starts fresh instead
      // of throwing an uncaught TypeError at `keys.includes` below.
      if (Array.isArray(parsed)) {
        keys = parsed.filter((k): k is string => typeof k === "string");
      }
    }
  } catch { /* corrupt file — start fresh */ }
  if (!keys.includes(key)) {
    keys.push(key);
    writeFileSync(file, JSON.stringify(keys), "utf-8");
  }
}
/**
 * Load all persisted completed-unit keys from disk into `target`.
 *
 * Non-fatal on every failure mode (missing file, read error, corrupt
 * JSON): the set is simply left unchanged. Non-array JSON — e.g. a bare
 * string, which `for..of` would otherwise iterate character-by-character
 * and silently pollute the set with — is explicitly rejected, and any
 * non-string elements inside a valid array are skipped.
 *
 * @param base   - Project base path containing the `.gsd` directory.
 * @param target - In-memory set to populate (typically `completedKeySet`).
 */
function loadPersistedKeys(base: string, target: Set<string>): void {
  const file = completedKeysPath(base);
  try {
    if (!existsSync(file)) return;
    const parsed: unknown = JSON.parse(readFileSync(file, "utf-8"));
    if (!Array.isArray(parsed)) return;
    for (const k of parsed) {
      if (typeof k === "string") target.add(k);
    }
  } catch { /* non-fatal */ }
}
// ─── State ────────────────────────────────────────────────────────────────────
let active = false;
@ -80,13 +113,16 @@ let verbose = false;
let cmdCtx: ExtensionCommandContext | null = null;
let basePath = "";
/** Track dispatched units to detect stuck loops (including alternating patterns) */
let lastUnit: { type: string; id: string } | null = null;
let retryCount = 0;
const MAX_RETRIES = 1;
/** Track total dispatches per unit to detect stuck loops (catches A→B→A→B patterns) */
const unitDispatchCount = new Map<string, number>();
const MAX_UNIT_DISPATCHES = 3;
/** Tracks recovery attempt count per unit for backoff and diagnostics. */
const unitRecoveryCount = new Map<string, number>();
/** Persisted completed-unit keys — survives restarts. Loaded from .gsd/completed-units.json. */
const completedKeySet = new Set<string>();
/** Crash recovery prompt — set by startAuto, consumed by first dispatchNextUnit */
let pendingCrashRecovery: string | null = null;
@ -209,7 +245,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
active = false;
paused = false;
stepMode = false;
lastUnit = null;
unitDispatchCount.clear();
unitRecoveryCount.clear();
currentUnit = null;
currentMilestoneId = null;
cachedSliceProgress = null;
@ -239,7 +276,7 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
if (basePath) clearLock(basePath);
active = false;
paused = true;
// Preserve: lastUnit, currentUnit, basePath, verbose, cmdCtx,
// Preserve: unitDispatchCount, currentUnit, basePath, verbose, cmdCtx,
// completedUnits, autoStartTime, currentMilestoneId, originalModelId
// — all needed for resume and dashboard display
ctx?.ui.setStatus("gsd-auto", "paused");
@ -252,6 +289,33 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
);
}
/**
* Self-heal: scan runtime records in .gsd/ and clear any where the expected
* artifact already exists on disk. This repairs incomplete closeouts from
* prior crashes preventing spurious re-dispatch of already-completed units.
*/
async function selfHealRuntimeRecords(base: string, ctx: ExtensionContext): Promise<void> {
try {
const { listUnitRuntimeRecords } = await import("./unit-runtime.js");
const records = listUnitRuntimeRecords(base);
let healed = 0;
for (const record of records) {
const { unitType, unitId } = record;
const artifactPath = resolveExpectedArtifactPath(unitType, unitId, base);
if (artifactPath && existsSync(artifactPath)) {
// Artifact exists — unit completed but closeout didn't finish.
clearUnitRuntimeRecord(base, unitType, unitId);
healed++;
}
}
if (healed > 0) {
ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s) with completed artifacts.`, "info");
}
} catch {
// Non-fatal — self-heal should never block auto-mode start
}
}
export async function startAuto(
ctx: ExtensionCommandContext,
pi: ExtensionAPI,
@ -285,6 +349,8 @@ export async function startAuto(
ctx.ui.notify(`Resume: applied ${report.fixesApplied.length} fix(es) to state.`, "info");
}
} catch { /* non-fatal */ }
// Self-heal: clear stale runtime records where artifacts already exist
await selfHealRuntimeRecords(base, ctx);
await dispatchNextUnit(ctx, pi);
return;
}
@ -363,9 +429,10 @@ export async function startAuto(
verbose = verboseMode;
cmdCtx = ctx;
basePath = base;
lastUnit = null;
retryCount = 0;
unitDispatchCount.clear();
unitRecoveryCount.clear();
completedKeySet.clear();
loadPersistedKeys(base, completedKeySet);
autoStartTime = Date.now();
completedUnits = [];
currentUnit = null;
@ -389,6 +456,9 @@ export async function startAuto(
: "Will loop until milestone complete.";
ctx.ui.notify(`${modeLabel} started. ${scopeMsg}`, "info");
// Self-heal: clear stale runtime records where artifacts already exist
await selfHealRuntimeRecords(base, ctx);
// Dispatch the first unit
await dispatchNextUnit(ctx, pi);
}
@ -882,8 +952,8 @@ async function dispatchNextUnit(
"info",
);
// Reset stuck detection for new milestone
lastUnit = null;
retryCount = 0;
unitDispatchCount.clear();
unitRecoveryCount.clear();
}
if (mid) currentMilestoneId = mid;
@ -960,6 +1030,12 @@ async function dispatchNextUnit(
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id);
}
// Clear completed-units.json for the finished milestone so it doesn't grow unbounded.
try {
const file = completedKeysPath(basePath);
if (existsSync(file)) writeFileSync(file, JSON.stringify([]), "utf-8");
completedKeySet.clear();
} catch { /* non-fatal */ }
await stopAuto(ctx, pi);
return;
}
@ -1111,54 +1187,44 @@ async function dispatchNextUnit(
await emitObservabilityWarnings(ctx, unitType, unitId);
// Stuck detection — catches both consecutive repeats and alternating patterns.
// Consecutive: same unit dispatched back-to-back (LLM didn't produce artifact).
// Alternating: same unit dispatched N times total (A→B→A→B loop).
const dispatchKey = `${unitType}/${unitId}`;
const prevTotalCount = unitDispatchCount.get(dispatchKey) ?? 0;
// Idempotency: skip units already completed in a prior session.
const idempotencyKey = `${unitType}/${unitId}`;
if (completedKeySet.has(idempotencyKey)) {
ctx.ui.notify(
`Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`,
"info",
);
// Don't increment dispatch count — just advance by calling dispatchNextUnit again.
// First, force state re-derive so the scheduler sees the completed artifact.
await dispatchNextUnit(ctx, pi);
return;
}
if (prevTotalCount >= MAX_UNIT_DISPATCHES) {
// Stuck detection — tracks total dispatches per unit (not just consecutive repeats).
// Pattern A→B→A→B would reset retryCount every time; this map catches it.
const dispatchKey = `${unitType}/${unitId}`;
const prevCount = unitDispatchCount.get(dispatchKey) ?? 0;
if (prevCount >= MAX_UNIT_DISPATCHES) {
if (currentUnit) {
const modelId = ctx.model?.id ?? "unknown";
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
}
saveActivityLog(ctx, basePath, unitType, unitId);
const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
await stopAuto(ctx, pi);
ctx.ui.notify(
`Loop detected: ${unitType} ${unitId} dispatched ${prevTotalCount + 1} times. ` +
`Check branch state and .gsd/ artifacts.${expected ? `\n Expected: ${expected}` : ""}`,
`Loop detected: ${unitType} ${unitId} dispatched ${prevCount + 1} times total. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check branch state and .gsd/ artifacts.`,
"error",
);
return;
}
unitDispatchCount.set(dispatchKey, prevTotalCount + 1);
if (lastUnit && lastUnit.type === unitType && lastUnit.id === unitId) {
retryCount++;
if (retryCount > MAX_RETRIES) {
if (currentUnit) {
const modelId = ctx.model?.id ?? "unknown";
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
}
saveActivityLog(ctx, basePath, lastUnit.type, lastUnit.id);
// Diagnostic: what file was expected?
const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
await stopAuto(ctx, pi);
ctx.ui.notify(
`Stuck: ${unitType} ${unitId} fired ${retryCount + 1} times. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check .gsd/ and activity logs.`,
"error",
);
return;
}
unitDispatchCount.set(dispatchKey, prevCount + 1);
if (prevCount > 0) {
ctx.ui.notify(
`${unitType} ${unitId} didn't produce expected artifact. Retrying (${retryCount}/${MAX_RETRIES}).`,
`${unitType} ${unitId} didn't produce expected artifact. Retrying (${prevCount + 1}/${MAX_UNIT_DISPATCHES}).`,
"warning",
);
} else {
retryCount = 0;
}
// Snapshot metrics + activity log for the PREVIOUS unit before we reassign.
// The session still holds the previous unit's data (newSession hasn't fired yet).
@ -1167,6 +1233,11 @@ async function dispatchNextUnit(
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id);
// Persist completion to disk BEFORE updating memory — so a crash here is recoverable.
const closeoutKey = `${currentUnit.type}/${currentUnit.id}`;
persistCompletedKey(basePath, closeoutKey);
completedKeySet.add(closeoutKey);
completedUnits.push({
type: currentUnit.type,
id: currentUnit.id,
@ -1174,9 +1245,9 @@ async function dispatchNextUnit(
finishedAt: Date.now(),
});
clearUnitRuntimeRecord(basePath, currentUnit.type, currentUnit.id);
unitDispatchCount.delete(`${currentUnit.type}/${currentUnit.id}`);
unitRecoveryCount.delete(`${currentUnit.type}/${currentUnit.id}`);
}
lastUnit = { type: unitType, id: unitId };
currentUnit = { type: unitType, id: unitId, startedAt: Date.now() };
writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, {
phase: "dispatched",
@ -1220,7 +1291,7 @@ async function dispatchNextUnit(
if (pendingCrashRecovery) {
finalPrompt = `${pendingCrashRecovery}\n\n---\n\n${finalPrompt}`;
pendingCrashRecovery = null;
} else if (retryCount > 0) {
} else if ((unitDispatchCount.get(`${unitType}/${unitId}`) ?? 0) > 1) {
const diagnostic = getDeepDiagnostic(basePath);
if (diagnostic) {
finalPrompt = `**RETRY — your previous attempt did not produce the required artifact.**\n\nDiagnostic from previous attempt:\n${diagnostic}\n\nFix whatever went wrong and make sure you write the required file this time.\n\n---\n\n${finalPrompt}`;
@ -2198,6 +2269,20 @@ async function recoverTimedOutUnit(
const recoveryAttempts = runtime?.recoveryAttempts ?? 0;
const maxRecoveryAttempts = reason === "idle" ? 2 : 1;
const recoveryKey = `${unitType}/${unitId}`;
const attemptNumber = (unitRecoveryCount.get(recoveryKey) ?? 0) + 1;
unitRecoveryCount.set(recoveryKey, attemptNumber);
if (attemptNumber > 1) {
// Exponential backoff: 2^(n-1) seconds, capped at 30s
const backoffMs = Math.min(1000 * Math.pow(2, attemptNumber - 2), 30000);
ctx.ui.notify(
`Recovery attempt ${attemptNumber} for ${unitType} ${unitId}. Waiting ${backoffMs / 1000}s before retry.`,
"info",
);
await new Promise(r => setTimeout(r, backoffMs));
}
if (unitType === "execute-task") {
const status = await inspectExecuteTaskDurability(basePath, unitId);
if (!status) return "paused";
@ -2213,9 +2298,10 @@ async function recoverTimedOutUnit(
recovery: status,
});
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode.`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode. (attempt ${attemptNumber})`,
"info",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2262,7 +2348,7 @@ async function recoverTimedOutUnit(
{ triggerTurn: true, deliverAs: "steer" },
);
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`,
"warning",
);
return "recovered";
@ -2283,9 +2369,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline.`,
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline. (attempt ${attemptNumber})`,
"warning",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2316,9 +2403,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing.`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing. (attempt ${attemptNumber})`,
"info",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2364,7 +2452,7 @@ async function recoverTimedOutUnit(
{ triggerTurn: true, deliverAs: "steer" },
);
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`,
"warning",
);
return "recovered";
@ -2384,9 +2472,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline.`,
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline. (attempt ${attemptNumber})`,
"warning",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}

View file

@ -1,4 +1,4 @@
import { existsSync, mkdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs";
import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs";
import { join } from "node:path";
import {
gsdRoot,
@ -99,6 +99,27 @@ export function clearUnitRuntimeRecord(basePath: string, unitType: string, unitI
if (existsSync(path)) unlinkSync(path);
}
/**
 * Read every runtime record currently persisted on disk for `basePath`.
 * Files that are not `.json` or that fail to parse (e.g. a partially
 * written record) are skipped silently. Returns an empty array when the
 * runtime directory does not exist.
 */
export function listUnitRuntimeRecords(basePath: string): AutoUnitRuntimeRecord[] {
  const dir = runtimeDir(basePath);
  if (!existsSync(dir)) return [];
  const jsonFiles = readdirSync(dir).filter((name) => name.endsWith(".json"));
  const records: AutoUnitRuntimeRecord[] = [];
  for (const name of jsonFiles) {
    try {
      const raw = readFileSync(join(dir, name), "utf-8");
      records.push(JSON.parse(raw) as AutoUnitRuntimeRecord);
    } catch {
      // Skip malformed files
    }
  }
  return records;
}
export async function inspectExecuteTaskDurability(
basePath: string,
unitId: string,

View file

@ -156,7 +156,6 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId
if (current === branch) return false;
const mainBranch = getMainBranch(basePath);
let created = false;
if (!branchExists(basePath, branch)) {
@ -166,6 +165,7 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId
// branch and haven't been merged to main yet.
// If we're already on a slice branch (e.g. creating S02 while S01
// wasn't merged yet), fall back to main to avoid chaining slice branches.
const mainBranch = getMainBranch(basePath);
const base = SLICE_BRANCH_RE.test(current) ? mainBranch : current;
runGit(basePath, ["branch", branch, base]);
created = true;