diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index 498b6ffa8..bbc061bea 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -71,6 +71,39 @@ import { truncateToWidth, visibleWidth } from "@mariozechner/pi-tui"; import { makeUI, GLYPH, INDENT } from "../shared/ui.js"; import { showNextAction } from "../shared/next-action-ui.js"; +// ─── Disk-backed completed-unit helpers ─────────────────────────────────────── + +/** Path to the persisted completed-unit keys file. */ +function completedKeysPath(base: string): string { + return join(base, ".gsd", "completed-units.json"); +} + +/** Write a completed unit key to disk atomically (append to set). */ +function persistCompletedKey(base: string, key: string): void { + const file = completedKeysPath(base); + let keys: string[] = []; + try { + if (existsSync(file)) { + keys = JSON.parse(readFileSync(file, "utf-8")); + } + } catch { /* corrupt file — start fresh */ } + if (!keys.includes(key)) { + keys.push(key); + writeFileSync(file, JSON.stringify(keys), "utf-8"); + } +} + +/** Load all completed unit keys from disk into the in-memory set. */ +function loadPersistedKeys(base: string, target: Set): void { + const file = completedKeysPath(base); + try { + if (existsSync(file)) { + const keys: string[] = JSON.parse(readFileSync(file, "utf-8")); + for (const k of keys) target.add(k); + } + } catch { /* non-fatal */ } +} + // ─── State ──────────────────────────────────────────────────────────────────── let active = false; @@ -80,13 +113,16 @@ let verbose = false; let cmdCtx: ExtensionCommandContext | null = null; let basePath = ""; -/** Track dispatched units to detect stuck loops (including alternating patterns) */ -let lastUnit: { type: string; id: string } | null = null; -let retryCount = 0; -const MAX_RETRIES = 1; +/** Track total dispatches per unit to detect stuck loops (catches A→B→A→B patterns) */ const unitDispatchCount = new Map(); const MAX_UNIT_DISPATCHES = 3; +/** Tracks recovery attempt count per unit for backoff and diagnostics. */ +const unitRecoveryCount = new Map(); + +/** Persisted completed-unit keys — survives restarts. Loaded from .gsd/completed-units.json. */ +const completedKeySet = new Set(); + /** Crash recovery prompt — set by startAuto, consumed by first dispatchNextUnit */ let pendingCrashRecovery: string | null = null; @@ -209,7 +245,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi active = false; paused = false; stepMode = false; - lastUnit = null; + unitDispatchCount.clear(); + unitRecoveryCount.clear(); currentUnit = null; currentMilestoneId = null; cachedSliceProgress = null; @@ -239,7 +276,7 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro if (basePath) clearLock(basePath); active = false; paused = true; - // Preserve: lastUnit, currentUnit, basePath, verbose, cmdCtx, + // Preserve: unitDispatchCount, currentUnit, basePath, verbose, cmdCtx, // completedUnits, autoStartTime, currentMilestoneId, originalModelId // — all needed for resume and dashboard display ctx?.ui.setStatus("gsd-auto", "paused"); @@ -252,6 +289,33 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro ); } +/** + * Self-heal: scan runtime records in .gsd/ and clear any where the expected + * artifact already exists on disk. This repairs incomplete closeouts from + * prior crashes — preventing spurious re-dispatch of already-completed units. + */ +async function selfHealRuntimeRecords(base: string, ctx: ExtensionContext): Promise { + try { + const { listUnitRuntimeRecords } = await import("./unit-runtime.js"); + const records = listUnitRuntimeRecords(base); + let healed = 0; + for (const record of records) { + const { unitType, unitId } = record; + const artifactPath = resolveExpectedArtifactPath(unitType, unitId, base); + if (artifactPath && existsSync(artifactPath)) { + // Artifact exists — unit completed but closeout didn't finish. + clearUnitRuntimeRecord(base, unitType, unitId); + healed++; + } + } + if (healed > 0) { + ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s) with completed artifacts.`, "info"); + } + } catch { + // Non-fatal — self-heal should never block auto-mode start + } +} + export async function startAuto( ctx: ExtensionCommandContext, pi: ExtensionAPI, @@ -285,6 +349,8 @@ export async function startAuto( ctx.ui.notify(`Resume: applied ${report.fixesApplied.length} fix(es) to state.`, "info"); } } catch { /* non-fatal */ } + // Self-heal: clear stale runtime records where artifacts already exist + await selfHealRuntimeRecords(base, ctx); await dispatchNextUnit(ctx, pi); return; } @@ -363,9 +429,10 @@ export async function startAuto( verbose = verboseMode; cmdCtx = ctx; basePath = base; - lastUnit = null; - retryCount = 0; unitDispatchCount.clear(); + unitRecoveryCount.clear(); + completedKeySet.clear(); + loadPersistedKeys(base, completedKeySet); autoStartTime = Date.now(); completedUnits = []; currentUnit = null; @@ -389,6 +456,9 @@ export async function startAuto( : "Will loop until milestone complete."; ctx.ui.notify(`${modeLabel} started. ${scopeMsg}`, "info"); + // Self-heal: clear stale runtime records where artifacts already exist + await selfHealRuntimeRecords(base, ctx); + // Dispatch the first unit await dispatchNextUnit(ctx, pi); } @@ -882,8 +952,8 @@ async function dispatchNextUnit( "info", ); // Reset stuck detection for new milestone - lastUnit = null; - retryCount = 0; + unitDispatchCount.clear(); + unitRecoveryCount.clear(); } if (mid) currentMilestoneId = mid; @@ -960,6 +1030,12 @@ async function dispatchNextUnit( snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); } + // Clear completed-units.json for the finished milestone so it doesn't grow unbounded. + try { + const file = completedKeysPath(basePath); + if (existsSync(file)) writeFileSync(file, JSON.stringify([]), "utf-8"); + completedKeySet.clear(); + } catch { /* non-fatal */ } await stopAuto(ctx, pi); return; } @@ -1111,54 +1187,44 @@ async function dispatchNextUnit( await emitObservabilityWarnings(ctx, unitType, unitId); - // Stuck detection — catches both consecutive repeats and alternating patterns. - // Consecutive: same unit dispatched back-to-back (LLM didn't produce artifact). - // Alternating: same unit dispatched N times total (A→B→A→B loop). - const dispatchKey = `${unitType}/${unitId}`; - const prevTotalCount = unitDispatchCount.get(dispatchKey) ?? 0; + // Idempotency: skip units already completed in a prior session. + const idempotencyKey = `${unitType}/${unitId}`; + if (completedKeySet.has(idempotencyKey)) { + ctx.ui.notify( + `Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`, + "info", + ); + // Don't increment dispatch count — just advance by calling dispatchNextUnit again. + // First, force state re-derive so the scheduler sees the completed artifact. + await dispatchNextUnit(ctx, pi); + return; + } - if (prevTotalCount >= MAX_UNIT_DISPATCHES) { + // Stuck detection — tracks total dispatches per unit (not just consecutive repeats). + // Pattern A→B→A→B would reset retryCount every time; this map catches it. + const dispatchKey = `${unitType}/${unitId}`; + const prevCount = unitDispatchCount.get(dispatchKey) ?? 0; + if (prevCount >= MAX_UNIT_DISPATCHES) { if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); } saveActivityLog(ctx, basePath, unitType, unitId); + const expected = diagnoseExpectedArtifact(unitType, unitId, basePath); await stopAuto(ctx, pi); ctx.ui.notify( - `Loop detected: ${unitType} ${unitId} dispatched ${prevTotalCount + 1} times. ` + - `Check branch state and .gsd/ artifacts.${expected ? `\n Expected: ${expected}` : ""}`, + `Loop detected: ${unitType} ${unitId} dispatched ${prevCount + 1} times total. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check branch state and .gsd/ artifacts.`, "error", ); return; } - unitDispatchCount.set(dispatchKey, prevTotalCount + 1); - - if (lastUnit && lastUnit.type === unitType && lastUnit.id === unitId) { - retryCount++; - - if (retryCount > MAX_RETRIES) { - if (currentUnit) { - const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); - } - saveActivityLog(ctx, basePath, lastUnit.type, lastUnit.id); - - // Diagnostic: what file was expected? - const expected = diagnoseExpectedArtifact(unitType, unitId, basePath); - await stopAuto(ctx, pi); - ctx.ui.notify( - `Stuck: ${unitType} ${unitId} fired ${retryCount + 1} times. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check .gsd/ and activity logs.`, - "error", - ); - return; - } + unitDispatchCount.set(dispatchKey, prevCount + 1); + if (prevCount > 0) { ctx.ui.notify( - `${unitType} ${unitId} didn't produce expected artifact. Retrying (${retryCount}/${MAX_RETRIES}).`, + `${unitType} ${unitId} didn't produce expected artifact. Retrying (${prevCount + 1}/${MAX_UNIT_DISPATCHES}).`, "warning", ); - } else { - retryCount = 0; } // Snapshot metrics + activity log for the PREVIOUS unit before we reassign. // The session still holds the previous unit's data (newSession hasn't fired yet). @@ -1167,6 +1233,11 @@ async function dispatchNextUnit( snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); + // Persist completion to disk BEFORE updating memory — so a crash here is recoverable. + const closeoutKey = `${currentUnit.type}/${currentUnit.id}`; + persistCompletedKey(basePath, closeoutKey); + completedKeySet.add(closeoutKey); + completedUnits.push({ type: currentUnit.type, id: currentUnit.id, @@ -1174,9 +1245,9 @@ async function dispatchNextUnit( finishedAt: Date.now(), }); clearUnitRuntimeRecord(basePath, currentUnit.type, currentUnit.id); + unitDispatchCount.delete(`${currentUnit.type}/${currentUnit.id}`); + unitRecoveryCount.delete(`${currentUnit.type}/${currentUnit.id}`); } - - lastUnit = { type: unitType, id: unitId }; currentUnit = { type: unitType, id: unitId, startedAt: Date.now() }; writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, { phase: "dispatched", @@ -1220,7 +1291,7 @@ async function dispatchNextUnit( if (pendingCrashRecovery) { finalPrompt = `${pendingCrashRecovery}\n\n---\n\n${finalPrompt}`; pendingCrashRecovery = null; - } else if (retryCount > 0) { + } else if ((unitDispatchCount.get(`${unitType}/${unitId}`) ?? 0) > 1) { const diagnostic = getDeepDiagnostic(basePath); if (diagnostic) { finalPrompt = `**RETRY — your previous attempt did not produce the required artifact.**\n\nDiagnostic from previous attempt:\n${diagnostic}\n\nFix whatever went wrong and make sure you write the required file this time.\n\n---\n\n${finalPrompt}`; @@ -2198,6 +2269,20 @@ async function recoverTimedOutUnit( const recoveryAttempts = runtime?.recoveryAttempts ?? 0; const maxRecoveryAttempts = reason === "idle" ? 2 : 1; + const recoveryKey = `${unitType}/${unitId}`; + const attemptNumber = (unitRecoveryCount.get(recoveryKey) ?? 0) + 1; + unitRecoveryCount.set(recoveryKey, attemptNumber); + + if (attemptNumber > 1) { + // Exponential backoff: 2^(n-1) seconds, capped at 30s + const backoffMs = Math.min(1000 * Math.pow(2, attemptNumber - 2), 30000); + ctx.ui.notify( + `Recovery attempt ${attemptNumber} for ${unitType} ${unitId}. Waiting ${backoffMs / 1000}s before retry.`, + "info", + ); + await new Promise(r => setTimeout(r, backoffMs)); + } + if (unitType === "execute-task") { const status = await inspectExecuteTaskDurability(basePath, unitId); if (!status) return "paused"; @@ -2213,9 +2298,10 @@ async function recoverTimedOutUnit( recovery: status, }); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode.`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode. (attempt ${attemptNumber})`, "info", ); + unitRecoveryCount.delete(recoveryKey); await dispatchNextUnit(ctx, pi); return "recovered"; } @@ -2262,7 +2348,7 @@ async function recoverTimedOutUnit( { triggerTurn: true, deliverAs: "steer" }, ); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`, "warning", ); return "recovered"; @@ -2283,9 +2369,10 @@ async function recoverTimedOutUnit( lastRecoveryReason: reason, }); ctx.ui.notify( - `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline.`, + `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline. (attempt ${attemptNumber})`, "warning", ); + unitRecoveryCount.delete(recoveryKey); await dispatchNextUnit(ctx, pi); return "recovered"; } @@ -2316,9 +2403,10 @@ async function recoverTimedOutUnit( lastRecoveryReason: reason, }); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing.`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing. (attempt ${attemptNumber})`, "info", ); + unitRecoveryCount.delete(recoveryKey); await dispatchNextUnit(ctx, pi); return "recovered"; } @@ -2364,7 +2452,7 @@ async function recoverTimedOutUnit( { triggerTurn: true, deliverAs: "steer" }, ); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`, "warning", ); return "recovered"; @@ -2384,9 +2472,10 @@ async function recoverTimedOutUnit( lastRecoveryReason: reason, }); ctx.ui.notify( - `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline.`, + `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline. (attempt ${attemptNumber})`, "warning", ); + unitRecoveryCount.delete(recoveryKey); await dispatchNextUnit(ctx, pi); return "recovered"; } diff --git a/src/resources/extensions/gsd/unit-runtime.ts b/src/resources/extensions/gsd/unit-runtime.ts index e1bdf7b01..6b17c5f35 100644 --- a/src/resources/extensions/gsd/unit-runtime.ts +++ b/src/resources/extensions/gsd/unit-runtime.ts @@ -1,4 +1,4 @@ -import { existsSync, mkdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs"; +import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs"; import { join } from "node:path"; import { gsdRoot, @@ -99,6 +99,27 @@ export function clearUnitRuntimeRecord(basePath: string, unitType: string, unitI if (existsSync(path)) unlinkSync(path); } +/** + * Return all runtime records currently on disk for `basePath`. + * Returns an empty array if the runtime directory does not exist. + */ +export function listUnitRuntimeRecords(basePath: string): AutoUnitRuntimeRecord[] { + const dir = runtimeDir(basePath); + if (!existsSync(dir)) return []; + const results: AutoUnitRuntimeRecord[] = []; + for (const file of readdirSync(dir)) { + if (!file.endsWith(".json")) continue; + try { + const raw = readFileSync(join(dir, file), "utf-8"); + const record = JSON.parse(raw) as AutoUnitRuntimeRecord; + results.push(record); + } catch { + // Skip malformed files + } + } + return results; +} + export async function inspectExecuteTaskDurability( basePath: string, unitId: string, diff --git a/src/resources/extensions/gsd/worktree.ts b/src/resources/extensions/gsd/worktree.ts index 67be797a8..31f6fe7d1 100644 --- a/src/resources/extensions/gsd/worktree.ts +++ b/src/resources/extensions/gsd/worktree.ts @@ -156,7 +156,6 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId if (current === branch) return false; - const mainBranch = getMainBranch(basePath); let created = false; if (!branchExists(basePath, branch)) { @@ -166,6 +165,7 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId // branch and haven't been merged to main yet. // If we're already on a slice branch (e.g. creating S02 while S01 // wasn't merged yet), fall back to main to avoid chaining slice branches. + const mainBranch = getMainBranch(basePath); const base = SLICE_BRANCH_RE.test(current) ? mainBranch : current; runGit(basePath, ["branch", branch, base]); created = true;