diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index 420aa0fa1..3e54b13e2 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -69,6 +69,39 @@ import { truncateToWidth, visibleWidth } from "@mariozechner/pi-tui"; import { makeUI, GLYPH, INDENT } from "../shared/ui.js"; import { showNextAction } from "../shared/next-action-ui.js"; +// ─── Disk-backed completed-unit helpers ─────────────────────────────────────── + +/** Path to the persisted completed-unit keys file. */ +function completedKeysPath(base: string): string { + return join(base, ".gsd", "completed-units.json"); +} + +/** Write a completed unit key to disk atomically (append to set). */ +function persistCompletedKey(base: string, key: string): void { + const file = completedKeysPath(base); + let keys: string[] = []; + try { + if (existsSync(file)) { + keys = JSON.parse(readFileSync(file, "utf-8")); + } + } catch { /* corrupt file — start fresh */ } + if (!keys.includes(key)) { + keys.push(key); + writeFileSync(file, JSON.stringify(keys), "utf-8"); + } +} + +/** Load all completed unit keys from disk into the in-memory set. */ +function loadPersistedKeys(base: string, target: Set): void { + const file = completedKeysPath(base); + try { + if (existsSync(file)) { + const keys: string[] = JSON.parse(readFileSync(file, "utf-8")); + for (const k of keys) target.add(k); + } + } catch { /* non-fatal */ } +} + // ─── State ──────────────────────────────────────────────────────────────────── let active = false; @@ -78,10 +111,15 @@ let verbose = false; let cmdCtx: ExtensionCommandContext | null = null; let basePath = ""; -/** Track last dispatched unit to detect stuck loops */ -let lastUnit: { type: string; id: string } | null = null; -let retryCount = 0; -const MAX_RETRIES = 1; +/** Track total dispatches per unit to detect stuck loops (catches A→B→A→B patterns) */ +const unitDispatchCount = new Map(); +const MAX_UNIT_DISPATCHES = 3; + +/** Tracks recovery attempt count per unit for backoff and diagnostics. */ +const unitRecoveryCount = new Map(); + +/** Persisted completed-unit keys — survives restarts. Loaded from .gsd/completed-units.json. */ +const completedKeySet = new Set(); /** Crash recovery prompt — set by startAuto, consumed by first dispatchNextUnit */ let pendingCrashRecovery: string | null = null; @@ -205,7 +243,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi active = false; paused = false; stepMode = false; - lastUnit = null; + unitDispatchCount.clear(); + unitRecoveryCount.clear(); currentUnit = null; currentMilestoneId = null; cachedSliceProgress = null; @@ -235,7 +274,7 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro if (basePath) clearLock(basePath); active = false; paused = true; - // Preserve: lastUnit, currentUnit, basePath, verbose, cmdCtx, + // Preserve: unitDispatchCount, currentUnit, basePath, verbose, cmdCtx, // completedUnits, autoStartTime, currentMilestoneId, originalModelId // — all needed for resume and dashboard display ctx?.ui.setStatus("gsd-auto", "paused"); @@ -248,6 +287,33 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro ); } +/** + * Self-heal: scan runtime records in .gsd/ and clear any where the expected + * artifact already exists on disk. This repairs incomplete closeouts from + * prior crashes — preventing spurious re-dispatch of already-completed units. + */ +async function selfHealRuntimeRecords(base: string, ctx: ExtensionContext): Promise { + try { + const { listUnitRuntimeRecords } = await import("./unit-runtime.js"); + const records = listUnitRuntimeRecords(base); + let healed = 0; + for (const record of records) { + const { unitType, unitId } = record; + const artifactPath = resolveExpectedArtifactPath(unitType, unitId, base); + if (artifactPath && existsSync(artifactPath)) { + // Artifact exists — unit completed but closeout didn't finish. + clearUnitRuntimeRecord(base, unitType, unitId); + healed++; + } + } + if (healed > 0) { + ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s) with completed artifacts.`, "info"); + } + } catch { + // Non-fatal — self-heal should never block auto-mode start + } +} + export async function startAuto( ctx: ExtensionCommandContext, pi: ExtensionAPI, @@ -280,6 +346,8 @@ export async function startAuto( ctx.ui.notify(`Resume: applied ${report.fixesApplied.length} fix(es) to state.`, "info"); } } catch { /* non-fatal */ } + // Self-heal: clear stale runtime records where artifacts already exist + await selfHealRuntimeRecords(base, ctx); await dispatchNextUnit(ctx, pi); return; } @@ -358,8 +426,10 @@ export async function startAuto( verbose = verboseMode; cmdCtx = ctx; basePath = base; - lastUnit = null; - retryCount = 0; + unitDispatchCount.clear(); + unitRecoveryCount.clear(); + completedKeySet.clear(); + loadPersistedKeys(base, completedKeySet); autoStartTime = Date.now(); completedUnits = []; currentUnit = null; @@ -383,6 +453,9 @@ export async function startAuto( : "Will loop until milestone complete."; ctx.ui.notify(`${modeLabel} started. ${scopeMsg}`, "info"); + // Self-heal: clear stale runtime records where artifacts already exist + await selfHealRuntimeRecords(base, ctx); + // Dispatch the first unit await dispatchNextUnit(ctx, pi); } @@ -876,8 +949,8 @@ async function dispatchNextUnit( "info", ); // Reset stuck detection for new milestone - lastUnit = null; - retryCount = 0; + unitDispatchCount.clear(); + unitRecoveryCount.clear(); } if (mid) currentMilestoneId = mid; @@ -953,6 +1026,12 @@ async function dispatchNextUnit( snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); } + // Clear completed-units.json for the finished milestone so it doesn't grow unbounded. + try { + const file = completedKeysPath(basePath); + if (existsSync(file)) writeFileSync(file, JSON.stringify([]), "utf-8"); + completedKeySet.clear(); + } catch { /* non-fatal */ } await stopAuto(ctx, pi); return; } @@ -996,7 +1075,15 @@ async function dispatchNextUnit( // After a slice completes, we reassess the roadmap before moving to the next slice. // Skip reassessment for the final slice (milestone complete) or if already assessed. const needsReassess = await checkNeedsReassessment(basePath, mid, state); - if (needsRunUat) { + if (state.phase === "summarizing") { + // complete-slice MUST run before reassessment to guarantee mergeSliceToMain + const sid = state.activeSlice!.id; + const sTitle = state.activeSlice!.title; + unitType = "complete-slice"; + unitId = `${mid}/${sid}`; + prompt = await buildCompleteSlicePrompt(mid, midTitle!, sid, sTitle, basePath); + + } else if (needsRunUat) { const { sliceId, uatType } = needsRunUat; unitType = "run-uat"; unitId = `${mid}/${sliceId}`; @@ -1075,14 +1162,6 @@ async function dispatchNextUnit( unitId = `${mid}/${sid}/${tid}`; prompt = await buildExecuteTaskPrompt(mid, sid, sTitle, tid, tTitle, basePath); - } else if (state.phase === "summarizing") { - // All tasks done — complete the slice - const sid = state.activeSlice!.id; - const sTitle = state.activeSlice!.title; - unitType = "complete-slice"; - unitId = `${mid}/${sid}`; - prompt = await buildCompleteSlicePrompt(mid, midTitle!, sid, sTitle, basePath); - } else if (state.phase === "completing-milestone") { // All slices done — complete the milestone unitType = "complete-milestone"; @@ -1102,34 +1181,44 @@ async function dispatchNextUnit( await emitObservabilityWarnings(ctx, unitType, unitId); - // Stuck detection — same unit dispatched again means the LLM didn't produce - // the expected artifact. Retry once (the LLM may have hit an error or run out - // of context), then stop with a diagnostic. - if (lastUnit && lastUnit.type === unitType && lastUnit.id === unitId) { - retryCount++; - - if (retryCount > MAX_RETRIES) { - if (currentUnit) { - const modelId = ctx.model?.id ?? "unknown"; - snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); - } - saveActivityLog(ctx, basePath, lastUnit.type, lastUnit.id); - - // Diagnostic: what file was expected? - const expected = diagnoseExpectedArtifact(unitType, unitId, basePath); - await stopAuto(ctx, pi); - ctx.ui.notify( - `Stuck: ${unitType} ${unitId} fired ${retryCount + 1} times. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check .gsd/ and activity logs.`, - "error", - ); - return; - } + // Idempotency: skip units already completed in a prior session. + const idempotencyKey = `${unitType}/${unitId}`; + if (completedKeySet.has(idempotencyKey)) { ctx.ui.notify( - `${unitType} ${unitId} didn't produce expected artifact. Retrying (${retryCount}/${MAX_RETRIES}).`, + `Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`, + "info", + ); + // Don't increment dispatch count — just advance by calling dispatchNextUnit again. + // First, force state re-derive so the scheduler sees the completed artifact. + await dispatchNextUnit(ctx, pi); + return; + } + + // Stuck detection — tracks total dispatches per unit (not just consecutive repeats). + // Pattern A→B→A→B would reset retryCount every time; this map catches it. + const dispatchKey = `${unitType}/${unitId}`; + const prevCount = unitDispatchCount.get(dispatchKey) ?? 0; + if (prevCount >= MAX_UNIT_DISPATCHES) { + if (currentUnit) { + const modelId = ctx.model?.id ?? "unknown"; + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + } + saveActivityLog(ctx, basePath, unitType, unitId); + + const expected = diagnoseExpectedArtifact(unitType, unitId, basePath); + await stopAuto(ctx, pi); + ctx.ui.notify( + `Loop detected: ${unitType} ${unitId} dispatched ${prevCount + 1} times total. Expected artifact not found.${expected ? `\n Expected: ${expected}` : ""}\n Check branch state and .gsd/ artifacts.`, + "error", + ); + return; + } + unitDispatchCount.set(dispatchKey, prevCount + 1); + if (prevCount > 0) { + ctx.ui.notify( + `${unitType} ${unitId} didn't produce expected artifact. Retrying (${prevCount + 1}/${MAX_UNIT_DISPATCHES}).`, "warning", ); - } else { - retryCount = 0; } // Snapshot metrics + activity log for the PREVIOUS unit before we reassign. // The session still holds the previous unit's data (newSession hasn't fired yet). @@ -1138,6 +1227,11 @@ async function dispatchNextUnit( snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); saveActivityLog(ctx, basePath, currentUnit.type, currentUnit.id); + // Persist completion to disk BEFORE updating memory — so a crash here is recoverable. + const closeoutKey = `${currentUnit.type}/${currentUnit.id}`; + persistCompletedKey(basePath, closeoutKey); + completedKeySet.add(closeoutKey); + completedUnits.push({ type: currentUnit.type, id: currentUnit.id, @@ -1145,9 +1239,9 @@ async function dispatchNextUnit( finishedAt: Date.now(), }); clearUnitRuntimeRecord(basePath, currentUnit.type, currentUnit.id); + unitDispatchCount.delete(`${currentUnit.type}/${currentUnit.id}`); + unitRecoveryCount.delete(`${currentUnit.type}/${currentUnit.id}`); } - - lastUnit = { type: unitType, id: unitId }; currentUnit = { type: unitType, id: unitId, startedAt: Date.now() }; writeUnitRuntimeRecord(basePath, unitType, unitId, currentUnit.startedAt, { phase: "dispatched", @@ -1191,7 +1285,7 @@ async function dispatchNextUnit( if (pendingCrashRecovery) { finalPrompt = `${pendingCrashRecovery}\n\n---\n\n${finalPrompt}`; pendingCrashRecovery = null; - } else if (retryCount > 0) { + } else if ((unitDispatchCount.get(`${unitType}/${unitId}`) ?? 0) > 1) { const diagnostic = getDeepDiagnostic(basePath); if (diagnostic) { finalPrompt = `**RETRY — your previous attempt did not produce the required artifact.**\n\nDiagnostic from previous attempt:\n${diagnostic}\n\nFix whatever went wrong and make sure you write the required file this time.\n\n---\n\n${finalPrompt}`; @@ -2169,6 +2263,20 @@ async function recoverTimedOutUnit( const recoveryAttempts = runtime?.recoveryAttempts ?? 0; const maxRecoveryAttempts = reason === "idle" ? 2 : 1; + const recoveryKey = `${unitType}/${unitId}`; + const attemptNumber = (unitRecoveryCount.get(recoveryKey) ?? 0) + 1; + unitRecoveryCount.set(recoveryKey, attemptNumber); + + if (attemptNumber > 1) { + // Exponential backoff: 2^(n-1) seconds, capped at 30s + const backoffMs = Math.min(1000 * Math.pow(2, attemptNumber - 2), 30000); + ctx.ui.notify( + `Recovery attempt ${attemptNumber} for ${unitType} ${unitId}. Waiting ${backoffMs / 1000}s before retry.`, + "info", + ); + await new Promise(r => setTimeout(r, backoffMs)); + } + if (unitType === "execute-task") { const status = await inspectExecuteTaskDurability(basePath, unitId); if (!status) return "paused"; @@ -2184,9 +2292,10 @@ async function recoverTimedOutUnit( recovery: status, }); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode.`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} already completed on disk. Continuing auto-mode. (attempt ${attemptNumber})`, "info", ); + unitRecoveryCount.delete(recoveryKey); await dispatchNextUnit(ctx, pi); return "recovered"; } @@ -2233,7 +2342,7 @@ async function recoverTimedOutUnit( { triggerTurn: true, deliverAs: "steer" }, ); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to finish durable output (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`, "warning", ); return "recovered"; @@ -2254,9 +2363,10 @@ async function recoverTimedOutUnit( lastRecoveryReason: reason, }); ctx.ui.notify( - `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline.`, + `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts (${diagnostic}). Blocker artifacts written. Advancing pipeline. (attempt ${attemptNumber})`, "warning", ); + unitRecoveryCount.delete(recoveryKey); await dispatchNextUnit(ctx, pi); return "recovered"; } @@ -2287,9 +2397,10 @@ async function recoverTimedOutUnit( lastRecoveryReason: reason, }); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing.`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: ${unitType} ${unitId} artifact already exists on disk. Advancing. (attempt ${attemptNumber})`, "info", ); + unitRecoveryCount.delete(recoveryKey); await dispatchNextUnit(ctx, pi); return "recovered"; } @@ -2335,7 +2446,7 @@ async function recoverTimedOutUnit( { triggerTurn: true, deliverAs: "steer" }, ); ctx.ui.notify( - `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}).`, + `${reason === "idle" ? "Idle" : "Timeout"} recovery: steering ${unitType} ${unitId} to produce ${expected} (attempt ${recoveryAttempts + 1}/${maxRecoveryAttempts}) (attempt ${attemptNumber}).`, "warning", ); return "recovered"; @@ -2355,9 +2466,10 @@ async function recoverTimedOutUnit( lastRecoveryReason: reason, }); ctx.ui.notify( - `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline.`, + `${unitType} ${unitId} skipped after ${maxRecoveryAttempts} recovery attempts. Blocker placeholder written to ${placeholder}. Advancing pipeline. (attempt ${attemptNumber})`, "warning", ); + unitRecoveryCount.delete(recoveryKey); await dispatchNextUnit(ctx, pi); return "recovered"; } diff --git a/src/resources/extensions/gsd/unit-runtime.ts b/src/resources/extensions/gsd/unit-runtime.ts index e1bdf7b01..6b17c5f35 100644 --- a/src/resources/extensions/gsd/unit-runtime.ts +++ b/src/resources/extensions/gsd/unit-runtime.ts @@ -1,4 +1,4 @@ -import { existsSync, mkdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs"; +import { existsSync, mkdirSync, readdirSync, readFileSync, writeFileSync, unlinkSync } from "node:fs"; import { join } from "node:path"; import { gsdRoot, @@ -99,6 +99,27 @@ export function clearUnitRuntimeRecord(basePath: string, unitType: string, unitI if (existsSync(path)) unlinkSync(path); } +/** + * Return all runtime records currently on disk for `basePath`. + * Returns an empty array if the runtime directory does not exist. + */ +export function listUnitRuntimeRecords(basePath: string): AutoUnitRuntimeRecord[] { + const dir = runtimeDir(basePath); + if (!existsSync(dir)) return []; + const results: AutoUnitRuntimeRecord[] = []; + for (const file of readdirSync(dir)) { + if (!file.endsWith(".json")) continue; + try { + const raw = readFileSync(join(dir, file), "utf-8"); + const record = JSON.parse(raw) as AutoUnitRuntimeRecord; + results.push(record); + } catch { + // Skip malformed files + } + } + return results; +} + export async function inspectExecuteTaskDurability( basePath: string, unitId: string, diff --git a/src/resources/extensions/gsd/worktree.ts b/src/resources/extensions/gsd/worktree.ts index 450ffc586..4f696be38 100644 --- a/src/resources/extensions/gsd/worktree.ts +++ b/src/resources/extensions/gsd/worktree.ts @@ -78,11 +78,10 @@ export function ensureSliceBranch(basePath: string, milestoneId: string, sliceId if (current === branch) return false; - const mainBranch = getMainBranch(basePath); let created = false; if (!branchExists(basePath, branch)) { - runGit(basePath, ["branch", branch, mainBranch]); + runGit(basePath, ["branch", branch]); created = true; }