fix: resolve auto-mode infinite loop and closeout instability (#96, #109)

Three defects in auto.ts + worktree.ts combined to produce an infinite
alternating loop and unreliable unit closeout in GSD auto-mode.

**D1 (auto.ts)** — `state.phase === "summarizing"` is now the first
branch in the dispatch if-else chain, evaluated before `needsRunUat`
and `needsReassess`. Previously, if an execute-task agent wrote
slice-level artifacts early, `needsReassess` fired instead and
`mergeSliceToMain` was permanently skipped.

**D2 (worktree.ts)** — New slice branches are now created from the
current HEAD instead of `main`. When a prior slice merge was skipped,
the new branch would inherit a stale ROADMAP from main, creating
divergent state that drove the A→B→A→B alternation.

**D3 (auto.ts)** — Replaced `lastUnit`/`retryCount` consecutive-repeat
detection with a `unitDispatchCount` map that tracks total dispatches
per unit key. The old guard reset to 0 on every ID change; the map
catches alternating-loop patterns and stops after MAX_UNIT_DISPATCHES=3.

**Atomic closeout (auto.ts)** — `persistCompletedKey` writes the unit
key to `.gsd/completed-units.json` before any in-memory update. A crash
mid-closeout is now recoverable: on next start `loadPersistedKeys`
re-populates `completedKeySet` and the idempotency guard skips already-
completed units.

**Persistent idempotency (auto.ts)** — `completedKeySet` is loaded from
disk on `startAuto` and checked before every dispatch, preventing re-
dispatch of units completed in a prior session even after a restart.

**Startup self-heal (auto.ts + unit-runtime.ts)** — `selfHealRuntimeRecords`
runs on start and resume; it scans all on-disk runtime records, checks
whether each unit's expected artifact exists, and clears any orphaned
records. Added `listUnitRuntimeRecords` to unit-runtime.ts to support
this scan.

**Recovery backoff (auto.ts)** — `recoverTimedOutUnit` now tracks
cross-invocation recovery attempts per unit in `unitRecoveryCount` and
applies exponential backoff (1s→2s→4s…30s cap) between attempts.
Attempt number is included in all recovery notify messages for
traceability.

Closes #96
Closes #109

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
omarsharaf96 2026-03-12 10:27:18 -04:00
parent 9fb348b123
commit 62d4ca74a2
3 changed files with 188 additions and 56 deletions

View file

@ -69,6 +69,39 @@ import { truncateToWidth, visibleWidth } from "@mariozechner/pi-tui";
import { makeUI, GLYPH, INDENT } from "../shared/ui.js";
import { showNextAction } from "../shared/next-action-ui.js";
// ─── Disk-backed completed-unit helpers ───────────────────────────────────────
/** Absolute path of the persisted completed-unit keys file under `base`. */
function completedKeysPath(base: string): string {
  const gsdDir = join(base, ".gsd");
  return join(gsdDir, "completed-units.json");
}
/**
 * Record a completed unit key in `.gsd/completed-units.json` (set semantics:
 * duplicate keys are not written twice).
 *
 * NOTE(review): `writeFileSync` is not atomic — a crash mid-write can leave a
 * truncated file. The corrupt-file guard below makes that recoverable (we
 * start from an empty set), but a temp-file + rename scheme would be the
 * truly atomic fix — TODO confirm whether that guarantee is required here.
 *
 * @param base Project base path containing the `.gsd` directory.
 * @param key  Idempotency key (`${unitType}/${unitId}`) to persist.
 */
function persistCompletedKey(base: string, key: string): void {
  const file = completedKeysPath(base);
  let keys: string[] = [];
  try {
    if (existsSync(file)) {
      const parsed: unknown = JSON.parse(readFileSync(file, "utf-8"));
      // Validate the shape: a corrupt file holding non-array JSON previously
      // assigned a non-array to `keys`, and `keys.includes(key)` below (outside
      // this try/catch) then threw an uncaught TypeError. Start fresh instead.
      if (Array.isArray(parsed)) {
        keys = parsed.filter((k): k is string => typeof k === "string");
      }
    }
  } catch { /* corrupt file — start fresh */ }
  if (!keys.includes(key)) {
    keys.push(key);
    writeFileSync(file, JSON.stringify(keys), "utf-8");
  }
}
/**
 * Load all persisted completed-unit keys from disk into `target`.
 *
 * Any read/parse failure is swallowed (non-fatal): a missing or corrupt file
 * simply means no keys are restored. The parsed JSON is validated so that a
 * corrupt file holding non-array JSON, or an array containing non-string
 * members, cannot pollute the `Set<string>` with values that would never
 * match a `${unitType}/${unitId}` idempotency key.
 *
 * @param base   Project base path containing the `.gsd` directory.
 * @param target In-memory set to populate (typically `completedKeySet`).
 */
function loadPersistedKeys(base: string, target: Set<string>): void {
  const file = completedKeysPath(base);
  try {
    if (existsSync(file)) {
      const parsed: unknown = JSON.parse(readFileSync(file, "utf-8"));
      if (Array.isArray(parsed)) {
        for (const k of parsed) {
          if (typeof k === "string") target.add(k);
        }
      }
    }
  } catch { /* non-fatal */ }
}
// ─── State ────────────────────────────────────────────────────────────────────
let active = false;
@ -78,10 +111,15 @@ let verbose = false;
let cmdCtx: ExtensionCommandContext | null = null;
let basePath = "";
/** Track last dispatched unit to detect stuck loops */
let lastUnit: { type: string; id: string } | null = null;
let retryCount = 0;
const MAX_RETRIES = 1;
/** Track total dispatches per unit to detect stuck loops (catches A→B→A→B patterns) */
const unitDispatchCount = new Map<string, number>();
const MAX_UNIT_DISPATCHES = 3;
/** Tracks recovery attempt count per unit for backoff and diagnostics. */
const unitRecoveryCount = new Map<string, number>();
/** Persisted completed-unit keys — survives restarts. Loaded from .gsd/completed-units.json. */
const completedKeySet = new Set<string>();
/** Crash recovery prompt — set by startAuto, consumed by first dispatchNextUnit */
let pendingCrashRecovery: string | null = null;
@ -205,7 +243,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
active = false;
paused = false;
stepMode = false;
lastUnit = null;
unitDispatchCount.clear();
unitRecoveryCount.clear();
currentUnit = null;
currentMilestoneId = null;
cachedSliceProgress = null;
@ -235,7 +274,7 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
if (basePath) clearLock(basePath);
active = false;
paused = true;
// Preserve: lastUnit, currentUnit, basePath, verbose, cmdCtx,
// Preserve: unitDispatchCount, currentUnit, basePath, verbose, cmdCtx,
// completedUnits, autoStartTime, currentMilestoneId, originalModelId
// — all needed for resume and dashboard display
ctx?.ui.setStatus("gsd-auto", "paused");
@ -248,6 +287,33 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
);
}
/**
 * Self-heal: scan runtime records in .gsd/ and clear any where the expected
 * artifact already exists on disk. This repairs incomplete closeouts from
 * prior crashes, preventing spurious re-dispatch of already-completed units.
 */
async function selfHealRuntimeRecords(base: string, ctx: ExtensionContext): Promise<void> {
  try {
    const { listUnitRuntimeRecords } = await import("./unit-runtime.js");
    let healed = 0;
    for (const { unitType, unitId } of listUnitRuntimeRecords(base)) {
      const expectedArtifact = resolveExpectedArtifactPath(unitType, unitId, base);
      if (!expectedArtifact || !existsSync(expectedArtifact)) continue;
      // Artifact exists — unit completed but closeout didn't finish.
      clearUnitRuntimeRecord(base, unitType, unitId);
      healed++;
    }
    if (healed > 0) {
      ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s) with completed artifacts.`, "info");
    }
  } catch {
    // Non-fatal — self-heal should never block auto-mode start
  }
}
export async function startAuto(
ctx: ExtensionCommandContext,
pi: ExtensionAPI,
@ -280,6 +346,8 @@ export async function startAuto(
ctx.ui.notify(`Resume: applied ${report.fixesApplied.length} fix(es) to state.`, "info");
}
} catch { /* non-fatal */ }
// Self-heal: clear stale runtime records where artifacts already exist
await selfHealRuntimeRecords(base, ctx);
await dispatchNextUnit(ctx, pi);
return;
}
@ -358,8 +426,10 @@ export async function startAuto(
verbose = verboseMode;
cmdCtx = ctx;
basePath = base;
lastUnit = null;
retryCount = 0;
unitDispatchCount.clear();
unitRecoveryCount.clear();
completedKeySet.clear();
loadPersistedKeys(base, completedKeySet);
autoStartTime = Date.now();
completedUnits = [];
currentUnit = null;
@ -383,6 +453,9 @@ export async function startAuto(
: "Will loop until milestone complete.";
ctx.ui.notify(`${modeLabel} started. ${scopeMsg}`, "info");
// Self-heal: clear stale runtime records where artifacts already exist
await selfHealRuntimeRecords(base, ctx);
// Dispatch the first unit
await dispatchNextUnit(ctx, pi);
}
@ -876,8 +949,8 @@ async function dispatchNextUnit(
"info",
);
// Reset stuck detection for new milestone
lastUnit = null;
retryCount = 0;
unitDispatchCount.clear();
unitRecoveryCount.clear();
}
if (mid) currentMilestoneId = mid;
@ -953,6 +1026,12 @@ async function dispatchNextUnit(
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id);
}
// Clear completed-units.json for the finished milestone so it doesn't grow unbounded.
try {
const file = completedKeysPath(basePath);
if (existsSync(file)) writeFileSync(file, JSON.stringify([]), "utf-8");
completedKeySet.clear();
} catch { /* non-fatal */ }
await stopAuto(ctx, pi);
return;
}
@ -996,7 +1075,15 @@ async function dispatchNextUnit(
// After a slice completes, we reassess the roadmap before moving to the next slice.
// Skip reassessment for the final slice (milestone complete) or if already assessed.
const needsReassess = await checkNeedsReassessment(basePath, mid, state);
if (needsRunUat) {
if (state.phase === "summarizing") {
// complete-slice MUST run before reassessment to guarantee mergeSliceToMain
const sid = state.activeSlice!.id;
const sTitle = state.activeSlice!.title;
unitType = "complete-slice";
unitId = `${mid}/${sid}`;
prompt = await buildCompleteSlicePrompt(mid, midTitle!, sid, sTitle, basePath);
} else if (needsRunUat) {
const { sliceId, uatType } = needsRunUat;
unitType = "run-uat";
unitId = `${mid}/${sliceId}`;
@ -1075,14 +1162,6 @@ async function dispatchNextUnit(
unitId = `${mid}/${sid}/${tid}`;
prompt = await buildExecuteTaskPrompt(mid, sid, sTitle, tid, tTitle, basePath);
} else if (state.phase === "summarizing") {
// All tasks done — complete the slice
const sid = state.activeSlice!.id;
const sTitle = state.activeSlice!.title;
unitType = "complete-slice";
unitId = `${mid}/${sid}`;
prompt = await buildCompleteSlicePrompt(mid, midTitle!, sid, sTitle, basePath);
} else if (state.phase === "completing-milestone") {
// All slices done — complete the milestone
unitType = "complete-milestone";
@ -1102,34 +1181,44 @@ async function dispatchNextUnit(
await emitObservabilityWarnings(ctx, unitType, unitId);
// Stuck detection — same unit dispatched again means the LLM didn't produce
// the expected artifact. Retry once (the LLM may have hit an error or run out
// of context), then stop with a diagnostic.
if (lastUnit && lastUnit.type === unitType && lastUnit.id === unitId) {
retryCount++;
if (retryCount > MAX_RETRIES) {
if (currentUnit) {
const modelId = ctx.model?.id ?? "unknown";
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
}
saveActivityLog(ctx, basePath, lastUnit.type, lastUnit.id);
// Diagnostic: what file was expected?
const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
await stopAuto(ctx, pi);
ctx.ui.notify(
`Stuck: ${unitType} ${unitId} fired ${retryCount + 1} times. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check .gsd/ and activity logs.`,
"error",
);
return;
}
// Idempotency: skip units already completed in a prior session.
const idempotencyKey = `${unitType}/${unitId}`;
if (completedKeySet.has(idempotencyKey)) {
ctx.ui.notify(
`${unitType} ${unitId} didn't produce expected artifact. Retrying (${retryCount}/${MAX_RETRIES}).`,
`Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`,
"info",
);
// Don't increment dispatch count — just advance by calling dispatchNextUnit again.
// First, force state re-derive so the scheduler sees the completed artifact.
await dispatchNextUnit(ctx, pi);
return;
}
// Stuck detection — tracks total dispatches per unit (not just consecutive repeats).
// Pattern A→B→A→B would reset retryCount every time; this map catches it.
const dispatchKey = `${unitType}/${unitId}`;
const prevCount = unitDispatchCount.get(dispatchKey) ?? 0;
if (prevCount >= MAX_UNIT_DISPATCHES) {
if (currentUnit) {
const modelId = ctx.model?.id ?? "unknown";
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
}
saveActivityLog(ctx, basePath, unitType, unitId);
const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
await stopAuto(ctx, pi);
ctx.ui.notify(
`Loop detected: ${unitType} ${unitId} dispatched ${prevCount + 1} times total. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check branch state and .gsd/ artifacts.`,
"error",
);
return;
}
unitDispatchCount.set(dispatchKey, prevCount + 1);
if (prevCount > 0) {
ctx.ui.notify(
`${unitType} ${unitId} didn't produce expected artifact. Retrying (${prevCount + 1}/${MAX_UNIT_DISPATCHES}).`,
"warning",
);
} else {
retryCount = 0;
}
// Snapshot metrics + activity log for the PREVIOUS unit before we reassign.
// The session still holds the previous unit's data (newSession hasn't fired yet).
@ -1138,6 +1227,11 @@ async function dispatchNextUnit(
snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId);
saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id);
// Persist completion to disk BEFORE updating memory — so a crash here is recoverable.
const closeoutKey = `${currentUnit.type}/${currentUnit.id}`;
persistCompletedKey(basePath, closeoutKey);
completedKeySet.add(closeoutKey);
completedUnits.push({
type: currentUnit.type,
id: currentUnit.id,
@ -1145,9 +1239,9 @@ async function dispatchNextUnit(
finishedAt: Date.now(),
});
clearUnitRuntimeRecord(basePath, currentUnit.type, currentUnit.id);
unitDispatchCount.delete(`${currentUnit.type}/${currentUnit.id}`);
unitRecoveryCount.delete(`${currentUnit.type}/${currentUnit.id}`);
}
lastUnit = { type: unitType, id: unitId };
currentUnit = { type: unitType, id: unitId, startedAt: Date.now() };
writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, {
phase: "dispatched",
@ -1191,7 +1285,7 @@ async function dispatchNextUnit(
if (pendingCrashRecovery) {
finalPrompt = `${pendingCrashRecovery}\n\n---\n\n${finalPrompt}`;
pendingCrashRecovery = null;
} else if (retryCount > 0) {
} else if ((unitDispatchCount.get(`${unitType}/${unitId}`) ?? 0) > 1) {
const diagnostic = getDeepDiagnostic(basePath);
if (diagnostic) {
finalPrompt = `**RETRY — your previous attempt did not produce the required artifact.**\n\nDiagnostic from previous attempt:\n${diagnostic}\n\nFix whatever went wrong and make sure you write the required file this time.\n\n---\n\n${finalPrompt}`;
@ -2169,6 +2263,20 @@ async function recoverTimedOutUnit(
const recoveryAttempts = runtime?.recoveryAttempts ?? 0;
const maxRecoveryAttempts = reason === "idle" ? 2 : 1;
const recoveryKey = `${unitType}/${unitId}`;
const attemptNumber = (unitRecoveryCount.get(recoveryKey) ?? 0) + 1;
unitRecoveryCount.set(recoveryKey, attemptNumber);
if (attemptNumber > 1) {
// Exponential backoff: 2^(n-1) seconds, capped at 30s
const backoffMs = Math.min(1000 * Math.pow(2, attemptNumber - 2), 30000);
ctx.ui.notify(
`Recovery attempt ${attemptNumber} for ${unitType} ${unitId}. Waiting ${backoffMs / 1000}s before retry.`,
"info",
);
await new Promise(r => setTimeout(r, backoffMs));
}
if (unitType === "execute-task") {
const status = await inspectExecuteTaskDurability(basePath, unitId);
if (!status) return "paused";
@ -2184,9 +2292,10 @@ async function recoverTimedOutUnit(
recovery: status,
});
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode.`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode. (attempt ${attemptNumber})`,
"info",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2233,7 +2342,7 @@ async function recoverTimedOutUnit(
{ triggerTurn: true, deliverAs: "steer" },
);
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`,
"warning",
);
return "recovered";
@ -2254,9 +2363,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline.`,
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline. (attempt ${attemptNumber})`,
"warning",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2287,9 +2397,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing.`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing. (attempt ${attemptNumber})`,
"info",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}
@ -2335,7 +2446,7 @@ async function recoverTimedOutUnit(
{ triggerTurn: true, deliverAs: "steer" },
);
ctx.ui.notify(
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`,
`${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`,
"warning",
);
return "recovered";
@ -2355,9 +2466,10 @@ async function recoverTimedOutUnit(
lastRecoveryReason: reason,
});
ctx.ui.notify(
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline.`,
`${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline. (attempt ${attemptNumber})`,
"warning",
);
unitRecoveryCount.delete(recoveryKey);
await dispatchNextUnit(ctx, pi);
return "recovered";
}

View file

@ -1,4 +1,4 @@
import { existsSync, mkdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs";
import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs";
import { join } from "node:path";
import {
gsdRoot,
@ -99,6 +99,27 @@ export function clearUnitRuntimeRecord(basePath: string, unitType: string, unitI
if (existsSync(path)) unlinkSync(path);
}
/**
 * Return all runtime records currently on disk for `basePath`.
 * Returns an empty array if the runtime directory does not exist.
 * Malformed (unparseable) record files are silently skipped.
 */
export function listUnitRuntimeRecords(basePath: string): AutoUnitRuntimeRecord[] {
  const dir = runtimeDir(basePath);
  if (!existsSync(dir)) return [];
  const records: AutoUnitRuntimeRecord[] = [];
  const jsonFiles = readdirSync(dir).filter((name) => name.endsWith(".json"));
  for (const name of jsonFiles) {
    try {
      const parsed = JSON.parse(readFileSync(join(dir, name), "utf-8")) as AutoUnitRuntimeRecord;
      records.push(parsed);
    } catch {
      // Skip malformed files
    }
  }
  return records;
}
export async function inspectExecuteTaskDurability(
basePath: string,
unitId: string,

View file

@ -78,11 +78,10 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId
if (current === branch) return false;
const mainBranch = getMainBranch(basePath);
let created = false;
if (!branchExists(basePath, branch)) {
runGit(basePath, ["branch", branch, mainBranch]);
runGit(basePath, ["branch", branch]);
created = true;
}