fix(auto): harden recovery — checkbox verification, atomic writes, roadmap checks (#547)

Four fixes to auto-recovery logic that caused silent failures or inconsistent state: 1. skipExecuteTask: return false when checkbox regex doesn't match the plan format, so callers fall through to other recovery strategies instead of assuming success (lines 252-255) 2. verifyExpectedArtifact: fail verification on corrupt/unparseable roadmap instead of silently passing. Prevents advancing past an incomplete complete-slice when the roadmap file is malformed (line 152) 3. removePersistedKey: use atomic tmp+rename write (matching persistCompletedKey) to prevent completed-units.json corruption on crash mid-write (line 293) 4. selfHealRuntimeRecords: use verifyExpectedArtifact instead of bare existsSync for execute-task healing, so tasks with summary but unchecked plan checkbox aren't incorrectly marked complete (line 374) Co-authored-by: TÂCHES <afromanguy@me.com>
2026-03-15 19:15:44 -05:00 · 2026-03-15 19:15:44 -05:00 · ecef348a95
commit ecef348a95
parent 3101469b4d
1 changed files with 21 additions and 4 deletions
--- a/src/resources/extensions/gsd/auto-recovery.ts
+++ b/src/resources/extensions/gsd/auto-recovery.ts
@ -149,7 +149,12 @@ export function verifyExpectedArtifact(unitType: string, unitId: string, base: s
          const roadmap = parseRoadmap(roadmapContent);
          const slice = roadmap.slices.find(s => s.id === sid);
          if (slice && !slice.done) return false;
-        } catch (e) { /* corrupt roadmap — be lenient and treat as verified */ void e; }
+        } catch {
+          // Corrupt/unparseable roadmap — fail verification so the unit
+          // re-runs and has a chance to fix the roadmap. Silently passing
+          // here could advance past an incomplete slice.
+          return false;
+        }
      }
    }
  }
@ -251,6 +256,11 @@ export function skipExecuteTask(
      const re = new RegExp(`^(- \\[) \\] (\\*\\*${escapedTid}:)`, "m");
      if (re.test(planContent)) {
        writeFileSync(planAbs, planContent.replace(re, "$1x] $2"), "utf-8");
+      } else {
+        // Regex didn't match — checkbox format differs from expected pattern.
+        // Return false so callers know the plan was NOT updated and can
+        // fall through to other recovery strategies instead of assuming success.
+        return false;
      }
    }
  }
@ -290,7 +300,10 @@ export function removePersistedKey(base: string, key: string): void {
    if (existsSync(file)) {
      let keys: string[] = JSON.parse(readFileSync(file, "utf-8"));
      keys = keys.filter(k => k !== key);
-      writeFileSync(file, JSON.stringify(keys), "utf-8");
+      // Atomic write: tmp file + rename prevents partial writes on crash
+      const tmpFile = file + ".tmp";
+      writeFileSync(tmpFile, JSON.stringify(keys), "utf-8");
+      renameSync(tmpFile, file);
    }
  } catch (e) { /* non-fatal: removePersistedKey failure */ void e; }
 }
@ -412,8 +425,12 @@ export async function selfHealRuntimeRecords(
      const { unitType, unitId } = record;
      const artifactPath = resolveExpectedArtifactPath(unitType, unitId, base);

-      // Case 1: Artifact exists — unit completed but closeout didn't finish
-      if (artifactPath && existsSync(artifactPath)) {
+      // Case 1: Artifact exists — unit completed but closeout didn't finish.
+      // Use verifyExpectedArtifact (not just existsSync) so that execute-task
+      // also checks the plan checkbox is marked [x]. Without this, a task
+      // whose summary exists but checkbox is unchecked would be incorrectly
+      // marked as completed, causing deriveState to re-dispatch it endlessly.
+      if (artifactPath && existsSync(artifactPath) && verifyExpectedArtifact(unitType, unitId, base)) {
        clearUnitRuntimeRecord(base, unitType, unitId);
        // Also persist completion key if missing
        const key = `${unitType}/${unitId}`;