fix: prevent phantom skip loop from stale crash recovery context (#790)
When a crash lock references a unit from a fully-completed milestone, crash recovery injects stale context that fights the skip-loop breaker, creating an infinite evict/repair cycle with selfHealRuntimeRecords. Fix 1: Validate recovered unit's milestone before synthesizing recovery context. If the milestone has a SUMMARY file (complete), discard the stale recovery context and clear the lock without injection. Fix 2: Skip-loop breaker cross-checks whether the evicted unit belongs to a completed milestone. If so, the eviction is counterproductive — clear the skip counter and re-dispatch from fresh state instead. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
ad4e68551d
commit
0184498a44
1 changed files with 67 additions and 13 deletions
|
|
@ -889,23 +889,37 @@ export async function startAuto(
|
|||
);
|
||||
return;
|
||||
}
|
||||
// Stale lock from a dead process — synthesize crash recovery context.
|
||||
const activityDir = join(gsdRoot(base), "activity");
|
||||
const recovery = synthesizeCrashRecovery(
|
||||
base, crashLock.unitType, crashLock.unitId,
|
||||
crashLock.sessionFile, activityDir,
|
||||
);
|
||||
if (recovery && recovery.trace.toolCallCount > 0) {
|
||||
pendingCrashRecovery = recovery.prompt;
|
||||
// Stale lock from a dead process — validate before synthesizing recovery context.
|
||||
// If the recovered unit belongs to a fully-completed milestone (SUMMARY exists),
|
||||
// discard recovery context to prevent phantom skip loops (#790).
|
||||
const recoveredMid = crashLock.unitId.split("/")[0];
|
||||
const milestoneAlreadyComplete = recoveredMid
|
||||
? !!resolveMilestoneFile(base, recoveredMid, "SUMMARY")
|
||||
: false;
|
||||
|
||||
if (milestoneAlreadyComplete) {
|
||||
ctx.ui.notify(
|
||||
`${formatCrashInfo(crashLock)}\nRecovered ${recovery.trace.toolCallCount} tool calls from crashed session. Resuming with full context.`,
|
||||
"warning",
|
||||
`Crash recovery: discarding stale context for ${crashLock.unitId} — milestone ${recoveredMid} is already complete.`,
|
||||
"info",
|
||||
);
|
||||
} else {
|
||||
ctx.ui.notify(
|
||||
`${formatCrashInfo(crashLock)}\nNo session data recovered. Resuming from disk state.`,
|
||||
"warning",
|
||||
const activityDir = join(gsdRoot(base), "activity");
|
||||
const recovery = synthesizeCrashRecovery(
|
||||
base, crashLock.unitType, crashLock.unitId,
|
||||
crashLock.sessionFile, activityDir,
|
||||
);
|
||||
if (recovery && recovery.trace.toolCallCount > 0) {
|
||||
pendingCrashRecovery = recovery.prompt;
|
||||
ctx.ui.notify(
|
||||
`${formatCrashInfo(crashLock)}\nRecovered ${recovery.trace.toolCallCount} tool calls from crashed session. Resuming with full context.`,
|
||||
"warning",
|
||||
);
|
||||
} else {
|
||||
ctx.ui.notify(
|
||||
`${formatCrashInfo(crashLock)}\nNo session data recovered. Resuming from disk state.`,
|
||||
"warning",
|
||||
);
|
||||
}
|
||||
}
|
||||
clearLock(base);
|
||||
}
|
||||
|
|
@ -2418,6 +2432,28 @@ async function dispatchNextUnit(
|
|||
const skipCount = (unitConsecutiveSkips.get(idempotencyKey) ?? 0) + 1;
|
||||
unitConsecutiveSkips.set(idempotencyKey, skipCount);
|
||||
if (skipCount > MAX_CONSECUTIVE_SKIPS) {
|
||||
// Cross-check: verify deriveState actually returns this unit (#790).
|
||||
// If the unit's milestone is already complete, this is a phantom skip
|
||||
// loop from stale crash recovery context — don't evict.
|
||||
const skippedMid = unitId.split("/")[0];
|
||||
const skippedMilestoneComplete = skippedMid
|
||||
? !!resolveMilestoneFile(basePath, skippedMid, "SUMMARY")
|
||||
: false;
|
||||
if (skippedMilestoneComplete) {
|
||||
// Milestone is complete — evicting this key would fight self-heal.
|
||||
// Clear skip counter and re-dispatch from fresh state.
|
||||
unitConsecutiveSkips.delete(idempotencyKey);
|
||||
invalidateStateCache();
|
||||
ctx.ui.notify(
|
||||
`Phantom skip loop cleared: ${unitType} ${unitId} belongs to completed milestone ${skippedMid}. Re-dispatching from fresh state.`,
|
||||
"info",
|
||||
);
|
||||
_skipDepth++;
|
||||
await new Promise(r => setTimeout(r, 50));
|
||||
await dispatchNextUnit(ctx, pi);
|
||||
_skipDepth = Math.max(0, _skipDepth - 1);
|
||||
return;
|
||||
}
|
||||
unitConsecutiveSkips.delete(idempotencyKey);
|
||||
completedKeySet.delete(idempotencyKey);
|
||||
removePersistedKey(basePath, idempotencyKey);
|
||||
|
|
@ -2465,6 +2501,24 @@ async function dispatchNextUnit(
|
|||
const skipCount2 = (unitConsecutiveSkips.get(idempotencyKey) ?? 0) + 1;
|
||||
unitConsecutiveSkips.set(idempotencyKey, skipCount2);
|
||||
if (skipCount2 > MAX_CONSECUTIVE_SKIPS) {
|
||||
// Cross-check: verify the unit's milestone is still active (#790).
|
||||
const skippedMid2 = unitId.split("/")[0];
|
||||
const skippedMilestoneComplete2 = skippedMid2
|
||||
? !!resolveMilestoneFile(basePath, skippedMid2, "SUMMARY")
|
||||
: false;
|
||||
if (skippedMilestoneComplete2) {
|
||||
unitConsecutiveSkips.delete(idempotencyKey);
|
||||
invalidateStateCache();
|
||||
ctx.ui.notify(
|
||||
`Phantom skip loop cleared: ${unitType} ${unitId} belongs to completed milestone ${skippedMid2}. Re-dispatching from fresh state.`,
|
||||
"info",
|
||||
);
|
||||
_skipDepth++;
|
||||
await new Promise(r => setTimeout(r, 50));
|
||||
await dispatchNextUnit(ctx, pi);
|
||||
_skipDepth = Math.max(0, _skipDepth - 1);
|
||||
return;
|
||||
}
|
||||
unitConsecutiveSkips.delete(idempotencyKey);
|
||||
completedKeySet.delete(idempotencyKey);
|
||||
removePersistedKey(basePath, idempotencyKey);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue