fix(auto): harden recovery — checkbox verification, atomic writes, roadmap checks (#547)

Four fixes to auto-recovery logic that caused silent failures or
inconsistent state:

1. skipExecuteTask: return false when checkbox regex doesn't match the
   plan format, so callers fall through to other recovery strategies
   instead of assuming success (lines 252-255)

2. verifyExpectedArtifact: fail verification on corrupt/unparseable
   roadmap instead of silently passing. Prevents advancing past an
   incomplete complete-slice when the roadmap file is malformed (line 152)

3. removePersistedKey: use atomic tmp+rename write (matching
   persistCompletedKey) to prevent completed-units.json corruption
   on crash mid-write (line 293)

4. selfHealRuntimeRecords: use verifyExpectedArtifact instead of bare
   existsSync for execute-task healing, so tasks with summary but
   unchecked plan checkbox aren't incorrectly marked complete (line 374)

Co-authored-by: TÂCHES <afromanguy@me.com>
This commit is contained in:
Flux Labs 2026-03-15 19:15:44 -05:00 committed by GitHub
parent 3101469b4d
commit ecef348a95

View file

@ -149,7 +149,12 @@ export function verifyExpectedArtifact(unitType: string, unitId: string, base: s
const roadmap = parseRoadmap(roadmapContent);
const slice = roadmap.slices.find(s => s.id === sid);
if (slice && !slice.done) return false;
} catch (e) { /* corrupt roadmap — be lenient and treat as verified */ void e; }
} catch {
// Corrupt/unparseable roadmap — fail verification so the unit
// re-runs and has a chance to fix the roadmap. Silently passing
// here could advance past an incomplete slice.
return false;
}
}
}
}
@ -251,6 +256,11 @@ export function skipExecuteTask(
const re = new RegExp(`^(- \\[) \\] (\\*\\*${escapedTid}:)`, "m");
if (re.test(planContent)) {
writeFileSync(planAbs, planContent.replace(re, "$1x] $2"), "utf-8");
} else {
// Regex didn't match — checkbox format differs from expected pattern.
// Return false so callers know the plan was NOT updated and can
// fall through to other recovery strategies instead of assuming success.
return false;
}
}
}
@ -290,7 +300,10 @@ export function removePersistedKey(base: string, key: string): void {
if (existsSync(file)) {
let keys: string[] = JSON.parse(readFileSync(file, "utf-8"));
keys = keys.filter(k => k !== key);
writeFileSync(file, JSON.stringify(keys), "utf-8");
// Atomic write: tmp file + rename prevents partial writes on crash
const tmpFile = file + ".tmp";
writeFileSync(tmpFile, JSON.stringify(keys), "utf-8");
renameSync(tmpFile, file);
}
} catch (e) { /* non-fatal: removePersistedKey failure */ void e; }
}
@ -412,8 +425,12 @@ export async function selfHealRuntimeRecords(
const { unitType, unitId } = record;
const artifactPath = resolveExpectedArtifactPath(unitType, unitId, base);
// Case 1: Artifact exists — unit completed but closeout didn't finish
if (artifactPath && existsSync(artifactPath)) {
// Case 1: Artifact exists — unit completed but closeout didn't finish.
// Use verifyExpectedArtifact (not just existsSync) so that execute-task
// also checks the plan checkbox is marked [x]. Without this, a task
// whose summary exists but checkbox is unchecked would be incorrectly
// marked as completed, causing deriveState to re-dispatch it endlessly.
if (artifactPath && existsSync(artifactPath) && verifyExpectedArtifact(unitType, unitId, base)) {
clearUnitRuntimeRecord(base, unitType, unitId);
// Also persist completion key if missing
const key = `${unitType}/${unitId}`;