From 88e6957f647ff500170f059fc915e7800cdbb195 Mon Sep 17 00:00:00 2001 From: 0xggoma Date: Sun, 15 Mar 2026 01:39:19 -0700 Subject: [PATCH 1/2] fix: persist completion key in loop-recovery/self-repair to prevent infinite dispatch loops MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When loop-recovery or self-repair reconciliation succeeds (artifacts exist on disk), the dispatch counter is reset but the unit is never marked complete in completed-units.json. If deriveState() continues returning the same unit, the cycle repeats indefinitely: 3 dispatches → stuck detection → reconciliation → counter reset → 3 more dispatches... This was observed in production burning $93.87 on 103 dispatches of a single already-completed task over 4.9 hours. Changes: 1. Persist completed key (persistCompletedKey + completedKeySet.add) in both the loop-recovery and self-repair success paths, so the idempotency check at the top of dispatchNextUnit prevents re-dispatch. 2. Add invalidateStateCache() after reconciliation writes to ensure the next deriveState() call sees fresh disk state. 3. Add a hard lifetime dispatch counter (unitLifetimeDispatches) that survives counter resets from reconciliation paths. Caps any single unit at 6 total dispatches across all reconciliation cycles. Fixes #462 --- src/resources/extensions/gsd/auto.ts | 41 ++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index 405944d7c..08a4c6f80 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -151,6 +151,12 @@ const unitDispatchCount = new Map(); const MAX_UNIT_DISPATCHES = 3; /** Retry index at which a stub summary placeholder is written when the summary is still absent. */ const STUB_RECOVERY_THRESHOLD = 2; +/** Hard cap on total dispatches per unit across ALL reconciliation cycles. 
+ * unitDispatchCount can be reset by loop-recovery/self-repair paths, but this + * counter is never reset by those paths (it is cleared only on start/stop or a new milestone) — it catches infinite reconciliation loops where + * artifacts exist but deriveState keeps returning the same unit. */ +const unitLifetimeDispatches = new Map(); +const MAX_LIFETIME_DISPATCHES = 6; /** Tracks recovery attempt count per unit for backoff and diagnostics. */ const unitRecoveryCount = new Map(); @@ -367,6 +373,7 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi stepMode = false; unitDispatchCount.clear(); unitRecoveryCount.clear(); + unitLifetimeDispatches.clear(); currentUnit = null; currentMilestoneId = null; cachedSliceProgress = null; @@ -568,6 +575,7 @@ export async function startAuto( cmdCtx = ctx; basePath = base; unitDispatchCount.clear(); + unitLifetimeDispatches.clear(); // Re-initialize metrics in case ledger was lost during pause if (!getLedger()) initMetrics(base); // Ensure milestone ID is set on git service for integration branch resolution @@ -687,6 +695,7 @@ export async function startAuto( basePath = base; unitDispatchCount.clear(); unitRecoveryCount.clear(); + unitLifetimeDispatches.clear(); completedKeySet.clear(); loadPersistedKeys(base, completedKeySet); resetHookState(); @@ -1396,6 +1405,7 @@ async function dispatchNextUnit( // Reset stuck detection for new milestone unitDispatchCount.clear(); unitRecoveryCount.clear(); + unitLifetimeDispatches.clear(); // Capture integration branch for the new milestone and update git service captureIntegrationBranch(basePath, mid); } @@ -1888,6 +1898,26 @@ async function dispatchNextUnit( // Pattern A→B→A→B would reset retryCount every time; this map catches it. const dispatchKey = `${unitType}/${unitId}`; const prevCount = unitDispatchCount.get(dispatchKey) ?? 0; + + // Hard lifetime cap — survives counter resets from loop-recovery/self-repair. 
+ // Catches the case where reconciliation "succeeds" (artifacts exist) but + // deriveState keeps returning the same unit, creating an infinite cycle. + const lifetimeCount = (unitLifetimeDispatches.get(dispatchKey) ?? 0) + 1; + unitLifetimeDispatches.set(dispatchKey, lifetimeCount); + if (lifetimeCount > MAX_LIFETIME_DISPATCHES) { + if (currentUnit) { + const modelId = ctx.model?.id ?? "unknown"; + snapshotUnitMetrics(ctx, currentUnit.type, currentUnit.id, currentUnit.startedAt, modelId); + } + saveActivityLog(ctx, basePath, unitType, unitId); + const expected = diagnoseExpectedArtifact(unitType, unitId, basePath); + await stopAuto(ctx, pi); + ctx.ui.notify( + `Hard loop detected: ${unitType} ${unitId} dispatched ${lifetimeCount} times total (across reconciliation cycles). Stopping.${expected ? `\n Expected artifact: ${expected}` : ""}\n This may indicate deriveState() keeps returning the same unit despite artifacts existing.\n Check .gsd/completed-units.json and the slice plan checkbox state.`, + "error", + ); + return; + } if (prevCount >= MAX_UNIT_DISPATCHES) { if (currentUnit) { const modelId = ctx.model?.id ?? "unknown"; @@ -1912,7 +1942,13 @@ async function dispatchNextUnit( `Loop recovery: ${unitId} reconciled after ${prevCount + 1} dispatches — blocker artifacts written, pipeline advancing.\n Review ${status.summaryPath} and replace the placeholder with real work.`, "warning", ); + // Persist completion so idempotency check prevents re-dispatch + // if deriveState keeps returning this unit (#462). + const reconciledKey = `${unitType}/${unitId}`; + persistCompletedKey(basePath, reconciledKey); + completedKeySet.add(reconciledKey); unitDispatchCount.delete(dispatchKey); + invalidateStateCache(); await new Promise(r => setImmediate(r)); await dispatchNextUnit(ctx, pi); return; @@ -1947,7 +1983,12 @@ async function dispatchNextUnit( `Self-repaired ${unitId}: summary existed but checkbox was unmarked. 
Marked [x] and advancing.`, "warning", ); + // Persist completion so idempotency check prevents re-dispatch (#462). + const repairedKey = `${unitType}/${unitId}`; + persistCompletedKey(basePath, repairedKey); + completedKeySet.add(repairedKey); unitDispatchCount.delete(dispatchKey); + invalidateStateCache(); await new Promise(r => setImmediate(r)); await dispatchNextUnit(ctx, pi); return; From 271ab395761559710351f070f9e693eaadaeb3fc Mon Sep 17 00:00:00 2001 From: deseltrus Date: Sun, 15 Mar 2026 10:41:05 +0100 Subject: [PATCH 2/2] fix: verify artifacts on disk before bailing on dispatch loop limit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The loop detection in dispatchNextUnit stops auto-mode when a unit has been dispatched MAX_UNIT_DISPATCHES (3) times. Previously, only execute-task had reconciliation logic to check whether the artifact actually exists on disk before bailing. All other unit types (complete-slice, plan-slice, research-slice, etc.) would immediately stop — even if the Nth attempt successfully produced the artifact. This is a race between the dispatch counter and disk verification: the counter increments at dispatch time, but artifact verification only runs during closeout of the NEXT unit. If the last allowed attempt succeeds, the counter is already at the limit when the next dispatch tries to run, and nobody checks disk state. Reproduction scenario: 1. complete-slice dispatched 3 times (LLM missed writing UAT on attempts 1-2, succeeded on attempt 3) 2. Attempt 3 produces both SUMMARY and UAT — auto-committed to disk 3. Dispatch 4 fires: prevCount (3) >= MAX_UNIT_DISPATCHES (3) 4. No disk check for complete-slice → pipeline stops with 'Expected artifact not found' despite artifacts existing Fix: add a general verifyExpectedArtifact() check after the execute-task-specific reconciliation and before the final bail-out. If artifacts exist on disk, clear the counter and advance. 
If not, same error as before — no behavior change for genuinely stuck units. --- src/resources/extensions/gsd/auto.ts | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/resources/extensions/gsd/auto.ts b/src/resources/extensions/gsd/auto.ts index 405944d7c..fdd4ee67b 100644 --- a/src/resources/extensions/gsd/auto.ts +++ b/src/resources/extensions/gsd/auto.ts @@ -1921,6 +1921,30 @@ async function dispatchNextUnit( } } + // General reconciliation: if the last attempt DID produce the expected + // artifact on disk, clear the counter and advance instead of stopping. + // The execute-task path above handles its special case (writing placeholder + // summaries). This catch-all covers complete-slice, plan-slice, + // research-slice, and all other unit types where the Nth attempt at the + // dispatch limit succeeded but the counter check fires before anyone + // verifies disk state. Without this, a successful final attempt is + // indistinguishable from a failed one. + if (verifyExpectedArtifact(unitType, unitId, basePath)) { + ctx.ui.notify( + `Loop recovery: ${unitType} ${unitId} — artifact verified after ${prevCount + 1} dispatches. Advancing.`, + "info", + ); + // Persist completion so the idempotency check prevents re-dispatch + // if deriveState keeps returning this unit (see #462). + persistCompletedKey(basePath, dispatchKey); + completedKeySet.add(dispatchKey); + unitDispatchCount.delete(dispatchKey); + invalidateStateCache(); + await new Promise(r => setImmediate(r)); + await dispatchNextUnit(ctx, pi); + return; + } + const expected = diagnoseExpectedArtifact(unitType, unitId, basePath); const remediation = buildLoopRemediationSteps(unitType, unitId, basePath); await stopAuto(ctx, pi);