fix(auto): stop re-running finished tasks after session restart (#513)
* fix(auto): prevent infinite re-dispatch when completion key is missing

  Root cause: when a task completed successfully on the first attempt, the idempotency key was never persisted to completed-units.json. The persistence logic (persistCompletedKey) only triggered at the retry threshold (MAX_UNIT_DISPATCHES=3). After session restart, the key was missing and auto-mode re-dispatched the same task endlessly.

  Evidence: M008/S01/T01 was dispatched 15+ times over 3.5 hours. T01-SUMMARY.md existed, S01-PLAN.md marked T01 as [x], but completed-units.json had no execute-task/M008/S01/T01 entry.

  Fix: added a fallback artifact check before dispatch. If the expected artifact already exists on disk but the completion key is missing, the key is repaired (persisted + added to the in-memory set) and the unit is skipped. This closes the gap between the closeout-based persistence (which requires the NEXT dispatch to fire) and the retry-threshold persistence (which requires MAX attempts).

  Also fixes guided-flow-escape.test.ts: added missing cache invalidation after rmSync (clearPathCache + invalidateStateCache).

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(auto): prevent TUI freeze on cascading skip-dispatches

  When multiple completed tasks are skipped in sequence (T01 artifact fallback → T02 idempotency skip → T03 dispatch), the recursive dispatchNextUnit calls can freeze the TUI.

  Fix: invalidateStateCache() after key repair so deriveState returns the correct next task, and use setTimeout(50ms) instead of setImmediate to yield more generously to the event loop between cascading skips.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(auto): systematic hardening of dispatch recovery pipeline

  Five fixes addressing the 20 failure modes identified in the auto-mode dispatch loop audit:

  1. Stale runtime record cleanup: selfHealRuntimeRecords now clears records older than 1h with phase=dispatched (crash orphans), and also persists completion keys for records with existing artifacts.
  2. Recursion depth limit: the _skipDepth counter prevents TUI freeze when many completed units are skipped in cascade. After MAX_SKIP_DEPTH (20) skips, yields 200ms to the event loop before continuing.
  3. Atomic completed-units.json writes: persistCompletedKey now uses a tmp file + renameSync to prevent partial writes on crash.
  4. Skip depth tracking on both skip paths (idempotency check at L1815 and artifact fallback at L1844) with setTimeout(50ms) between skips.
  5. Self-heal now also repairs missing completion keys when the artifact exists, closing the gap where a crash between completion and closeout leaves the key unwritten.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(auto): add reentrancy guard to dispatchNextUnit itself

  The _handlingAgentEnd boolean only guards calls from agent_end hooks. Direct calls from watchdog timers, the step wizard, and crash recovery can still race with an in-progress dispatch. Added a _dispatching guard that blocks concurrent external calls while allowing recursive skip calls (_skipDepth > 0). Cleared on stopAuto.

  Audit confirmed: double watchdog (#11) is already prevented by the existing clearDispatchGapWatchdog in startDispatchGapWatchdog + catch/return. Counter cleanup (#16) is already handled by unitDispatchCount.clear() in startAuto before selfHealRuntimeRecords.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix(auto): final hardening for unattended multi-milestone runs

  Three fixes from a paranoid stress-test audit:

  1. Git index.lock cleanup: remove a stale .git/index.lock (>60s old) at auto-start. A crash during git commit/merge leaves this file behind, blocking ALL subsequent git operations with no recovery.
  2. Stub summary for complete-milestone: if the LLM fails to write a milestone SUMMARY after MAX_UNIT_DISPATCHES attempts, generate a stub summary to unblock the pipeline. Without this, auto-mode loops forever in the "completing-milestone" phase.
  3. Pre-flight queue validation: at auto-start with multiple milestones, scan for CONTEXT-DRAFT.md files (which will pause for discussion) and report the milestone count. Gives the user early visibility into what will happen during the run.

  Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: deseltrus <simulacraverse@protonmail.com>
Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
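A self-contained sketch (not project code) of the skip/repair/dispatch decision the first fix describes — the "repair-and-skip" row is the branch that was missing before this change:

```typescript
// Sketch of the decision the artifact fallback adds. Before this fix, a unit
// whose artifact existed but whose key was never persisted fell into "dispatch"
// forever (the M008/S01/T01 loop from the commit message).
type Decision = "skip" | "repair-and-skip" | "dispatch";

function decide(keyPersisted: boolean, artifactExists: boolean): Decision {
  if (keyPersisted && artifactExists) return "skip";             // normal idempotency skip
  if (!keyPersisted && artifactExists) return "repair-and-skip"; // new: persist key, then skip
  return "dispatch";                                             // nothing on disk — run it
}

console.log(decide(false, true));  // "repair-and-skip" (the previously looping case)
console.log(decide(true, true));   // "skip"
console.log(decide(false, false)); // "dispatch"
```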
This commit is contained in:
parent f59301e4ba
commit 59698978af
1 changed file with 136 additions and 7 deletions
@@ -70,7 +70,7 @@ import {
 } from "./metrics.js";
 import { dirname, join } from "node:path";
 import { sep as pathSep } from "node:path";
-import { readdirSync, readFileSync, existsSync, mkdirSync, writeFileSync, unlinkSync } from "node:fs";
+import { readdirSync, readFileSync, existsSync, mkdirSync, writeFileSync, unlinkSync, renameSync, statSync } from "node:fs";
 import { execSync, execFileSync } from "node:child_process";
 import {
   autoCommitCurrentBranch,
@@ -117,7 +117,10 @@ function persistCompletedKey(base: string, key: string): void {
   } catch { /* corrupt file — start fresh */ }
   if (!keys.includes(key)) {
     keys.push(key);
-    writeFileSync(file, JSON.stringify(keys), "utf-8");
+    // Atomic write: tmp file + rename prevents partial writes on crash
+    const tmpFile = file + ".tmp";
+    writeFileSync(tmpFile, JSON.stringify(keys), "utf-8");
+    renameSync(tmpFile, file);
   }
 }
 
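The read side of this file is not shown in the diff. A minimal sketch of how the persisted keys might be loaded back into the in-memory completedKeySet at auto-start, assuming completed-units.json is a flat JSON array of strings as the write path above implies (the loadCompletedKeys name and the .gsd location are guesses):

```typescript
import { readFileSync, existsSync } from "node:fs";
import { join } from "node:path";

// Hypothetical loader: rebuilds the in-memory idempotency set from disk.
// Assumes completed-units.json holds keys such as "execute-task/M008/S01/T01",
// matching what persistCompletedKey writes. The file's exact location is assumed.
function loadCompletedKeys(base: string): Set<string> {
  const file = join(base, ".gsd", "completed-units.json");
  if (!existsSync(file)) return new Set();
  try {
    const keys: unknown = JSON.parse(readFileSync(file, "utf-8"));
    return new Set(Array.isArray(keys) ? keys.filter((k): k is string => typeof k === "string") : []);
  } catch {
    return new Set(); // corrupt file — start fresh, mirroring the write path
  }
}
```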
@@ -355,6 +358,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
   clearUnitTimeout();
   if (basePath) clearLock(basePath);
   clearSkillSnapshot();
+  _dispatching = false;
+  _skipDepth = 0;
 
   // Remove SIGTERM handler registered at auto-mode start
   deregisterSigtermHandler();
@@ -463,17 +468,35 @@ async function selfHealRuntimeRecords(base: string, ctx: ExtensionContext): Prom
     const { listUnitRuntimeRecords } = await import("./unit-runtime.js");
     const records = listUnitRuntimeRecords(base);
     let healed = 0;
+    const STALE_THRESHOLD_MS = 60 * 60 * 1000; // 1 hour
+    const now = Date.now();
     for (const record of records) {
       const { unitType, unitId } = record;
       const artifactPath = resolveExpectedArtifactPath(unitType, unitId, base);
+
+      // Case 1: Artifact exists — unit completed but closeout didn't finish
       if (artifactPath && existsSync(artifactPath)) {
-        // Artifact exists — unit completed but closeout didn't finish.
         clearUnitRuntimeRecord(base, unitType, unitId);
+        // Also persist completion key if missing
+        const key = `${unitType}/${unitId}`;
+        if (!completedKeySet.has(key)) {
+          persistCompletedKey(base, key);
+          completedKeySet.add(key);
+        }
         healed++;
+        continue;
       }
+
+      // Case 2: No artifact but record is stale (dispatched > 1h ago, process crashed)
+      const age = now - (record.startedAt ?? 0);
+      if (record.phase === "dispatched" && age > STALE_THRESHOLD_MS) {
+        clearUnitRuntimeRecord(base, unitType, unitId);
+        healed++;
+        continue;
+      }
     }
     if (healed > 0) {
-      ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s) with completed artifacts.`, "info");
+      ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s).`, "info");
     }
   } catch {
     // Non-fatal — self-heal should never block auto-mode start
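./unit-runtime.js is outside this diff; the self-heal loop above only touches a handful of fields. A rough sketch of the record shape it assumes (field names beyond unitType, unitId, phase, and startedAt are not confirmed by the diff):

```typescript
// Sketch of the runtime record fields selfHealRuntimeRecords relies on.
// Everything else about ./unit-runtime.js is an assumption.
interface UnitRuntimeRecord {
  unitType: string;              // e.g. "execute-task"
  unitId: string;                // e.g. "M008/S01/T01"
  phase: "dispatched" | string;  // "dispatched" marks an in-flight unit
  startedAt?: number;            // epoch ms; missing → `startedAt ?? 0` makes the record read as ancient, i.e. stale
}

// Hypothetical signature matching the dynamic import in the hunk above.
declare function listUnitRuntimeRecords(base: string): UnitRuntimeRecord[];
```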
@@ -755,6 +778,43 @@ export async function startAuto(
   // Self-heal: clear stale runtime records where artifacts already exist
   await selfHealRuntimeRecords(base, ctx);
 
+  // Self-heal: remove stale .git/index.lock from prior crash.
+  // A stale lock file blocks all git operations (commit, merge, checkout).
+  // Only remove if older than 60 seconds (not from a concurrent process).
+  try {
+    const gitLockFile = join(base, ".git", "index.lock");
+    if (existsSync(gitLockFile)) {
+      const lockAge = Date.now() - statSync(gitLockFile).mtimeMs;
+      if (lockAge > 60_000) {
+        unlinkSync(gitLockFile);
+        ctx.ui.notify("Removed stale .git/index.lock from prior crash.", "info");
+      }
+    }
+  } catch { /* non-fatal */ }
+
+  // Pre-flight: validate milestone queue for multi-milestone runs.
+  // Warn about issues that will cause auto-mode to pause or block.
+  try {
+    const msDir = join(base, ".gsd", "milestones");
+    if (existsSync(msDir)) {
+      const milestoneIds = readdirSync(msDir, { withFileTypes: true })
+        .filter(d => d.isDirectory() && /^M\d{3}/.test(d.name))
+        .map(d => d.name.match(/^(M\d{3})/)?.[1] ?? d.name);
+      if (milestoneIds.length > 1) {
+        const issues: string[] = [];
+        for (const id of milestoneIds) {
+          const draft = resolveMilestoneFile(base, id, "CONTEXT-DRAFT");
+          if (draft) issues.push(`${id}: has CONTEXT-DRAFT.md (will pause for discussion)`);
+        }
+        if (issues.length > 0) {
+          ctx.ui.notify(`Pre-flight: ${milestoneIds.length} milestones queued.\n${issues.map(i => ` ⚠ ${i}`).join("\n")}`, "warning");
+        } else {
+          ctx.ui.notify(`Pre-flight: ${milestoneIds.length} milestones queued. All have full context.`, "info");
+        }
+      }
+    }
+  } catch { /* non-fatal — pre-flight should never block auto-mode */ }
+
   // Dispatch the first unit
   await dispatchNextUnit(ctx, pi);
 }
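resolveMilestoneFile is also outside this diff; the pre-flight check only needs a yes/no answer for CONTEXT-DRAFT.md. A hypothetical sketch under that assumption (directory layout, naming, and signature are guesses, not the project's actual helper):

```typescript
import { readdirSync, existsSync } from "node:fs";
import { join } from "node:path";

// Hypothetical helper: find a milestone file such as "CONTEXT-DRAFT" inside
// .gsd/milestones/<id>*/. The pre-flight loop above only cares whether a path
// comes back at all.
function resolveMilestoneFileSketch(base: string, milestoneId: string, stem: string): string | undefined {
  const msDir = join(base, ".gsd", "milestones");
  if (!existsSync(msDir)) return undefined;
  const dir = readdirSync(msDir, { withFileTypes: true })
    .find(d => d.isDirectory() && d.name.startsWith(milestoneId));
  if (!dir) return undefined;
  const candidate = join(msDir, dir.name, `${stem}.md`);
  return existsSync(candidate) ? candidate : undefined;
}
```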
@@ -1431,6 +1491,16 @@ function getRoadmapSlicesSync(): { done: number; total: number; activeSliceTasks
 
 // ─── Core Loop ────────────────────────────────────────────────────────────────
 
+/** Tracks recursive skip depth to prevent TUI freeze on cascading completed-unit skips */
+let _skipDepth = 0;
+const MAX_SKIP_DEPTH = 20;
+
+/** Reentrancy guard for dispatchNextUnit itself (not just handleAgentEnd).
+ * Prevents concurrent dispatch from watchdog timers, step wizard, and direct calls
+ * that bypass the _handlingAgentEnd guard. Recursive calls (from skip paths) are
+ * allowed via _skipDepth > 0. */
+let _dispatching = false;
+
 async function dispatchNextUnit(
   ctx: ExtensionContext,
   pi: ExtensionAPI,
@@ -1442,6 +1512,22 @@ async function dispatchNextUnit(
     return;
   }
 
+  // Reentrancy guard: allow recursive calls from skip paths (_skipDepth > 0)
+  // but block concurrent external calls (watchdog, step wizard, etc.)
+  if (_dispatching && _skipDepth === 0) {
+    return; // Another dispatch is in progress — bail silently
+  }
+  _dispatching = true;
+
+  // Recursion depth guard: when many units are skipped in sequence (e.g., after
+  // crash recovery with 10+ completed units), recursive dispatchNextUnit calls
+  // can freeze the TUI or overflow the stack. Yield generously after MAX_SKIP_DEPTH.
+  if (_skipDepth > MAX_SKIP_DEPTH) {
+    _skipDepth = 0;
+    ctx.ui.notify(`Skipped ${MAX_SKIP_DEPTH}+ completed units. Yielding to UI before continuing.`, "info");
+    await new Promise(r => setTimeout(r, 200));
+  }
+
   // Clear stale directory listing cache so deriveState sees fresh disk state (#431)
   clearPathCache();
   // Clear parsed roadmap/plan cache — doctor may have re-populated it with
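A stripped-down model of how the two new guards are meant to interact — illustrative only, not the real dispatcher:

```typescript
// Minimal model of the guard interplay. External re-entry is rejected while a
// dispatch is in flight; a skip path bumps skipDepth first, so its recursive
// call passes the guard. stopAuto clears both flags.
let dispatching = false;
let skipDepth = 0;

function tryDispatch(source: string): boolean {
  if (dispatching && skipDepth === 0) {
    console.log(`${source}: blocked — another dispatch is in progress`);
    return false;
  }
  dispatching = true;
  console.log(`${source}: dispatching`);
  return true;
}

tryDispatch("agent_end hook");   // dispatching
tryDispatch("gap watchdog");     // blocked — concurrent external call dropped
skipDepth++;                     // skip path deliberately re-enters
tryDispatch("skip re-dispatch"); // dispatching — allowed because skipDepth > 0
skipDepth = Math.max(0, skipDepth - 1);
dispatching = false;             // what stopAuto does to start the next run clean
skipDepth = 0;
```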
@@ -1821,10 +1907,10 @@ async function dispatchNextUnit(
         `Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`,
         "info",
       );
-      // Yield to the event loop before re-dispatching to avoid tight recursion
-      // when many units are already completed (e.g., after crash recovery).
-      await new Promise(r => setImmediate(r));
+      _skipDepth++;
+      await new Promise(r => setTimeout(r, 50));
       await dispatchNextUnit(ctx, pi);
+      _skipDepth = Math.max(0, _skipDepth - 1);
       return;
     } else {
       // Stale completion record — artifact missing. Remove and re-run.
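The setImmediate → setTimeout(50) swap is the substance of this hunk: an awaited setImmediate resumes on the next event-loop turn almost immediately, while a 50 ms timer forces a real pause between cascading skips so the TUI can render and process input. A small standalone comparison (not project code):

```typescript
// Contrasts the two yield strategies used before/after this change.
async function cascade(label: string, yieldFn: () => Promise<void>, steps = 5): Promise<void> {
  const start = Date.now();
  for (let i = 0; i < steps; i++) {
    await yieldFn(); // stands in for one skip → re-dispatch hop
  }
  console.log(`${label}: ${steps} skips took ${Date.now() - start} ms`);
}

const immediateYield = () => new Promise<void>(r => setImmediate(r));
const timerYield = () => new Promise<void>(r => setTimeout(r, 50));

async function main(): Promise<void> {
  await cascade("setImmediate", immediateYield);  // typically ~0–1 ms total
  await cascade("setTimeout(50)", timerYield);    // roughly 250 ms total
}
void main();
```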
@@ -1837,6 +1923,26 @@ async function dispatchNextUnit(
     }
   }
 
+  // Fallback: if the idempotency key is missing but the expected artifact already
+  // exists on disk, the task completed in a prior session without persisting the key.
+  // Persist it now and skip re-dispatch. This prevents infinite loops where a task
+  // completes successfully but the completion key was never written (e.g., completed
+  // on the first attempt before hitting the retry-threshold persistence logic).
+  if (verifyExpectedArtifact(unitType, unitId, basePath)) {
+    persistCompletedKey(basePath, idempotencyKey);
+    completedKeySet.add(idempotencyKey);
+    invalidateStateCache();
+    ctx.ui.notify(
+      `Skipping ${unitType} ${unitId} — artifact exists but completion key was missing. Repaired and advancing.`,
+      "info",
+    );
+    _skipDepth++;
+    await new Promise(r => setTimeout(r, 50));
+    await dispatchNextUnit(ctx, pi);
+    _skipDepth = Math.max(0, _skipDepth - 1);
+    return;
+  }
+
   // Stuck detection — tracks total dispatches per unit (not just consecutive repeats).
   // Pattern A→B→A→B would reset retryCount every time; this map catches it.
   const dispatchKey = `${unitType}/${unitId}`;
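verifyExpectedArtifact and resolveExpectedArtifactPath are referenced here and in the self-heal hunk but defined elsewhere in the file. A hypothetical sketch of what they plausibly do, with the path layout guessed from the M008/S01/T01 → T01-SUMMARY.md example in the commit message:

```typescript
import { existsSync } from "node:fs";
import { join } from "node:path";

// Hypothetical sketch only: the real helpers live outside this diff, and the
// ".gsd/milestones/<M>/<S>/<T>-SUMMARY.md" layout is an assumption.
function resolveExpectedArtifactPathSketch(unitType: string, unitId: string, base: string): string | null {
  if (unitType === "execute-task") {
    const [milestone, slice, task] = unitId.split("/");
    if (!milestone || !slice || !task) return null;
    return join(base, ".gsd", "milestones", milestone, slice, `${task}-SUMMARY.md`);
  }
  if (unitType === "complete-milestone") {
    return join(base, ".gsd", "milestones", unitId, `${unitId}-SUMMARY.md`);
  }
  return null; // other unit types omitted in this sketch
}

function verifyExpectedArtifactSketch(unitType: string, unitId: string, base: string): boolean {
  const path = resolveExpectedArtifactPathSketch(unitType, unitId, base);
  return path !== null && existsSync(path);
}
```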
@@ -1924,6 +2030,29 @@ async function dispatchNextUnit(
       return;
     }
 
+    // Last resort for complete-milestone: generate stub summary to unblock pipeline.
+    // All slices are done (otherwise we wouldn't be in completing-milestone phase),
+    // but the LLM failed to write the summary N times. A stub lets the pipeline advance.
+    if (unitType === "complete-milestone") {
+      try {
+        const mPath = resolveMilestonePath(basePath, unitId);
+        if (mPath) {
+          const stubPath = join(mPath, `${unitId}-SUMMARY.md`);
+          if (!existsSync(stubPath)) {
+            writeFileSync(stubPath, `# ${unitId} Summary\n\nAuto-generated stub — milestone tasks completed but summary generation failed after ${prevCount + 1} attempts.\nReview and replace this stub with a proper summary.\n`);
+            ctx.ui.notify(`Generated stub summary for ${unitId} to unblock pipeline. Review later.`, "warning");
+            persistCompletedKey(basePath, dispatchKey);
+            completedKeySet.add(dispatchKey);
+            unitDispatchCount.delete(dispatchKey);
+            invalidateStateCache();
+            await new Promise(r => setImmediate(r));
+            await dispatchNextUnit(ctx, pi);
+            return;
+          }
+        }
+      } catch { /* non-fatal — fall through to normal stop */ }
+    }
+
     const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
     const remediation = buildLoopRemediationSteps(unitType, unitId, basePath);
     await stopAuto(ctx, pi);