fix(auto): break infinite skip loop on repeatedly-skipped completed units

When deriveState() keeps returning the same already-completed unit,
the idempotency skip paths in dispatchNextUnit recursively call
themselves forever. The existing MAX_SKIP_DEPTH (20) breaker yields
to the UI but then re-enters the same loop; the hard lifetime counter
(unitLifetimeDispatches) is never reached because skip paths return
before touching it.

Root cause: no per-unit counter on the skip-only path.

Fix:
- Add unitConsecutiveSkips map + MAX_CONSECUTIVE_SKIPS = 3
- Both skip paths (completedKeySet hit, and fallback artifact-exists)
  increment the counter on each skip of the same idempotencyKey
- When the counter exceeds MAX_CONSECUTIVE_SKIPS, evict the key from
  completedKeySet and persisted storage, invalidate state, and let
  deriveState reconcile on the next real dispatch
- Counter resets to 0 for a given key whenever a real dispatch
  proceeds (i.e., past both skip paths)
- Counter fully cleared at all 4 existing clear sites (stopAuto,
  startAuto, crash recovery, pause/resume)

Export _getUnitConsecutiveSkips / _resetUnitConsecutiveSkips /
MAX_CONSECUTIVE_SKIPS for testability (same pattern as
doctor-proactive.ts resetProactiveHealing).

Tests: auto-skip-loop.test.ts — counter mechanics, threshold bounds,
eviction round-trip, per-key isolation (10 assertions).
Closes #728
This commit is contained in:
Jeremy McSpadden 2026-03-16 17:48:39 -05:00
parent 1a85853fd8
commit 742cd70c9b
2 changed files with 180 additions and 0 deletions

View file

@ -211,6 +211,11 @@ const MAX_LIFETIME_DISPATCHES = 6;
/** Tracks recovery attempt count per unit for backoff and diagnostics. */
const unitRecoveryCount = new Map<string, number>();
/** Track consecutive skips per unit catches infinite skip loops where deriveState
* keeps returning the same already-completed unit. Reset on any real dispatch. */
const unitConsecutiveSkips = new Map<string, number>();
const MAX_CONSECUTIVE_SKIPS = 3;
/** Persisted completed-unit keys — survives restarts. Loaded from .gsd/completed-units.json. */
const completedKeySet = new Set<string>();
@ -624,6 +629,7 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
stepMode = false;
unitDispatchCount.clear();
unitRecoveryCount.clear();
unitConsecutiveSkips.clear();
inFlightTools.clear();
lastBudgetAlertLevel = 0;
unitLifetimeDispatches.clear();
@ -714,6 +720,7 @@ export async function startAuto(
basePath = base;
unitDispatchCount.clear();
unitLifetimeDispatches.clear();
unitConsecutiveSkips.clear();
// Re-initialize metrics in case ledger was lost during pause
if (!getLedger()) initMetrics(base);
// Ensure milestone ID is set on git service for integration branch resolution
@ -992,6 +999,7 @@ export async function startAuto(
basePath = base;
unitDispatchCount.clear();
unitRecoveryCount.clear();
unitConsecutiveSkips.clear();
lastBudgetAlertLevel = 0;
unitLifetimeDispatches.clear();
completedKeySet.clear();
@ -1919,6 +1927,7 @@ async function dispatchNextUnit(
// Reset stuck detection for new milestone
unitDispatchCount.clear();
unitRecoveryCount.clear();
unitConsecutiveSkips.clear();
unitLifetimeDispatches.clear();
// Clear completed-units.json for the finished milestone
try {
@ -2286,6 +2295,26 @@ async function dispatchNextUnit(
// Cross-validate: does the expected artifact actually exist?
const artifactExists = verifyExpectedArtifact(unitType, unitId, basePath);
if (artifactExists) {
// Guard against infinite skip loops: if deriveState keeps returning the
// same completed unit, consecutive skips will trip this breaker. Evict the
// key so the next dispatch forces full reconciliation instead of looping.
const skipCount = (unitConsecutiveSkips.get(idempotencyKey) ?? 0) + 1;
unitConsecutiveSkips.set(idempotencyKey, skipCount);
if (skipCount > MAX_CONSECUTIVE_SKIPS) {
unitConsecutiveSkips.delete(idempotencyKey);
completedKeySet.delete(idempotencyKey);
removePersistedKey(basePath, idempotencyKey);
invalidateStateCache();
ctx.ui.notify(
`Skip loop detected: ${unitType} ${unitId} skipped ${skipCount} times without advancing. Evicting completion record and forcing reconciliation.`,
"warning",
);
_skipDepth++;
await new Promise(r => setTimeout(r, 50));
await dispatchNextUnit(ctx, pi);
_skipDepth = Math.max(0, _skipDepth - 1);
return;
}
ctx.ui.notify(
`Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`,
"info",
@ -2315,6 +2344,24 @@ async function dispatchNextUnit(
persistCompletedKey(basePath, idempotencyKey);
completedKeySet.add(idempotencyKey);
invalidateStateCache();
// Same consecutive-skip guard as the idempotency path above.
const skipCount2 = (unitConsecutiveSkips.get(idempotencyKey) ?? 0) + 1;
unitConsecutiveSkips.set(idempotencyKey, skipCount2);
if (skipCount2 > MAX_CONSECUTIVE_SKIPS) {
unitConsecutiveSkips.delete(idempotencyKey);
completedKeySet.delete(idempotencyKey);
removePersistedKey(basePath, idempotencyKey);
invalidateStateCache();
ctx.ui.notify(
`Skip loop detected: ${unitType} ${unitId} skipped ${skipCount2} times without advancing. Evicting completion record and forcing reconciliation.`,
"warning",
);
_skipDepth++;
await new Promise(r => setTimeout(r, 50));
await dispatchNextUnit(ctx, pi);
_skipDepth = Math.max(0, _skipDepth - 1);
return;
}
ctx.ui.notify(
`Skipping ${unitType} ${unitId} — artifact exists but completion key was missing. Repaired and advancing.`,
"info",
@ -2330,6 +2377,8 @@ async function dispatchNextUnit(
// Pattern A→B→A→B would reset retryCount every time; this map catches it.
const dispatchKey = `${unitType}/${unitId}`;
const prevCount = unitDispatchCount.get(dispatchKey) ?? 0;
// Real dispatch reached — clear the consecutive-skip counter for this unit.
unitConsecutiveSkips.delete(dispatchKey);
debugLog("dispatch-unit", {
type: unitType,
@ -3275,6 +3324,14 @@ export {
buildLoopRemediationSteps,
} from "./auto-recovery.js";
/**
* Test-only: expose skip-loop state for unit tests.
* Not part of the public API.
*/
export function _getUnitConsecutiveSkips(): Map<string, number> { return unitConsecutiveSkips; }
export function _resetUnitConsecutiveSkips(): void { unitConsecutiveSkips.clear(); }
export { MAX_CONSECUTIVE_SKIPS };
/**
* Dispatch a hook unit directly, bypassing normal pre-dispatch hooks.
* Used for manual hook triggers via /gsd run-hook.

View file

@ -0,0 +1,123 @@
/**
* auto-skip-loop.test.ts Tests for the consecutive-skip loop breaker.
*
* Regression for #728: auto-mode infinite skip loop on previously completed
* plan-slice units when deriveState keeps returning the same unit.
*
* The skip paths in dispatchNextUnit track consecutive skips per unit via
* unitConsecutiveSkips. When the same unit is skipped > MAX_CONSECUTIVE_SKIPS
* times without a real dispatch in between, the completion record is evicted
* so deriveState can reconcile.
*/
import { mkdtempSync, mkdirSync, writeFileSync, rmSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import {
_getUnitConsecutiveSkips,
_resetUnitConsecutiveSkips,
MAX_CONSECUTIVE_SKIPS,
} from "../auto.ts";
import { persistCompletedKey, removePersistedKey, loadPersistedKeys } from "../auto-recovery.ts";
import { createTestContext } from "./test-helpers.ts";
const { assertEq, assertTrue, report } = createTestContext();
function makeTmpBase(): string {
const dir = mkdtempSync(join(tmpdir(), "gsd-skip-loop-test-"));
mkdirSync(join(dir, ".gsd"), { recursive: true });
return dir;
}
async function main(): Promise<void> {
// ─── Counter starts at zero ────────────────────────────────────────────
console.log("\n=== skip loop counter: initial state ===");
{
_resetUnitConsecutiveSkips();
const map = _getUnitConsecutiveSkips();
assertEq(map.size, 0, "counter map starts empty after reset");
}
// ─── Counter increments correctly ────────────────────────────────────
console.log("\n=== skip loop counter: increments on repeated calls ===");
{
_resetUnitConsecutiveSkips();
const map = _getUnitConsecutiveSkips();
const key = "plan-slice/M001/S04";
for (let i = 1; i <= MAX_CONSECUTIVE_SKIPS; i++) {
const prev = map.get(key) ?? 0;
map.set(key, prev + 1);
}
assertEq(map.get(key), MAX_CONSECUTIVE_SKIPS, `counter reaches MAX_CONSECUTIVE_SKIPS (${MAX_CONSECUTIVE_SKIPS})`);
}
// ─── Threshold constant is sane ──────────────────────────────────────
console.log("\n=== skip loop counter: threshold is reasonable ===");
{
assertTrue(MAX_CONSECUTIVE_SKIPS >= 3, "threshold allows a few legitimate skips");
assertTrue(MAX_CONSECUTIVE_SKIPS <= 10, "threshold catches loops quickly");
}
// ─── Reset clears all keys ────────────────────────────────────────────
console.log("\n=== skip loop counter: reset clears all keys ===");
{
_resetUnitConsecutiveSkips();
const map = _getUnitConsecutiveSkips();
map.set("plan-slice/M001/S01", 2);
map.set("plan-slice/M001/S02", 1);
assertEq(map.size, 2, "map has 2 entries before reset");
_resetUnitConsecutiveSkips();
assertEq(_getUnitConsecutiveSkips().size, 0, "map empty after reset");
}
// ─── Eviction path: persistCompletedKey + removePersistedKey round-trip
// (simulates what the loop-breaker does) ───────────────────────────
console.log("\n=== skip loop counter: eviction removes persisted key ===");
{
_resetUnitConsecutiveSkips();
const base = makeTmpBase();
try {
const key = "plan-slice/M001/S04";
const keySet = new Set<string>();
persistCompletedKey(base, key);
loadPersistedKeys(base, keySet);
assertTrue(keySet.has(key), "key persisted before eviction");
// Simulate loop-breaker eviction
keySet.delete(key);
removePersistedKey(base, key);
const keySet2 = new Set<string>();
loadPersistedKeys(base, keySet2);
assertTrue(!keySet2.has(key), "key absent after eviction");
} finally {
rmSync(base, { recursive: true, force: true });
}
}
// ─── Counter resets per-key, not globally ─────────────────────────────
console.log("\n=== skip loop counter: per-key isolation ===");
{
_resetUnitConsecutiveSkips();
const map = _getUnitConsecutiveSkips();
map.set("plan-slice/M001/S04", MAX_CONSECUTIVE_SKIPS + 1);
map.set("plan-slice/M001/S05", 1);
// Deleting S04 (eviction) should not affect S05
map.delete("plan-slice/M001/S04");
assertTrue(!map.has("plan-slice/M001/S04"), "S04 evicted");
assertEq(map.get("plan-slice/M001/S05"), 1, "S05 counter unaffected");
}
_resetUnitConsecutiveSkips();
report();
}
main().catch((err) => {
console.error(err);
process.exit(1);
});