Merge branch 'main' into fix/tui-resource-leaks-and-quality

2026-03-15 15:09:38 -05:00 · 2026-03-15 15:09:38 -05:00 · d7aaa5c5d9
commit d7aaa5c5d9
parent f844635de0 59698978af
3 changed files with 176 additions and 10 deletions
--- a/src/resources/extensions/gsd/auto.ts
+++ b/src/resources/extensions/gsd/auto.ts
@ -69,11 +69,13 @@ import {
  getProjectTotals, formatCost, formatTokenCount,
 } from "./metrics.js";
 import { dirname, join } from "node:path";
-import { readdirSync, readFileSync, existsSync, mkdirSync, writeFileSync, unlinkSync } from "node:fs";
+import { sep as pathSep } from "node:path";
+import { readdirSync, readFileSync, existsSync, mkdirSync, writeFileSync, unlinkSync, renameSync, statSync } from "node:fs";
 import { execSync, execFileSync } from "node:child_process";
 import {
  autoCommitCurrentBranch,
  captureIntegrationBranch,
+  detectWorktreeName,
  getCurrentBranch,
  getMainBranch,
  MergeConflictError,
@ -115,7 +117,10 @@ function persistCompletedKey(base: string, key: string): void {
  } catch { /* corrupt file — start fresh */ }
  if (!keys.includes(key)) {
    keys.push(key);
-    writeFileSync(file, JSON.stringify(keys), "utf-8");
+    // Atomic write: tmp file + rename prevents partial writes on crash
+    const tmpFile = file + ".tmp";
+    writeFileSync(tmpFile, JSON.stringify(keys), "utf-8");
+    renameSync(tmpFile, file);
  }
 }

@ -353,6 +358,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
  clearUnitTimeout();
  if (basePath) clearLock(basePath);
  clearSkillSnapshot();
+  _dispatching = false;
+  _skipDepth = 0;

  // Remove SIGTERM handler registered at auto-mode start
  deregisterSigtermHandler();
@ -461,17 +468,35 @@ async function selfHealRuntimeRecords(base: string, ctx: ExtensionContext): Prom
    const { listUnitRuntimeRecords } = await import("./unit-runtime.js");
    const records = listUnitRuntimeRecords(base);
    let healed = 0;
+    const STALE_THRESHOLD_MS = 60 * 60 * 1000; // 1 hour
+    const now = Date.now();
    for (const record of records) {
      const { unitType, unitId } = record;
      const artifactPath = resolveExpectedArtifactPath(unitType, unitId, base);
+
+      // Case 1: Artifact exists — unit completed but closeout didn't finish
      if (artifactPath && existsSync(artifactPath)) {
-        // Artifact exists — unit completed but closeout didn't finish.
+        clearUnitRuntimeRecord(base, unitType, unitId);
+        // Also persist completion key if missing
+        const key = `${unitType}/${unitId}`;
+        if (!completedKeySet.has(key)) {
+          persistCompletedKey(base, key);
+          completedKeySet.add(key);
+        }
+        healed++;
+        continue;
+      }
+
+      // Case 2: No artifact but record is stale (dispatched > 1h ago, process crashed)
+      const age = now - (record.startedAt ?? 0);
+      if (record.phase === "dispatched" && age > STALE_THRESHOLD_MS) {
        clearUnitRuntimeRecord(base, unitType, unitId);
        healed++;
+        continue;
      }
    }
    if (healed > 0) {
-      ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s) with completed artifacts.`, "info");
+      ctx.ui.notify(`Self-heal: cleared ${healed} stale runtime record(s).`, "info");
    }
  } catch {
    // Non-fatal — self-heal should never block auto-mode start
@ -505,7 +530,8 @@ export async function startAuto(
    if (currentMilestoneId) setActiveMilestoneId(base, currentMilestoneId);

    // ── Auto-worktree: re-enter worktree on resume if not already inside ──
-    if (currentMilestoneId && originalBasePath && !isInAutoWorktree(basePath)) {
+    // Skip if already inside a worktree (manual /worktree) to prevent nesting.
+    if (currentMilestoneId && originalBasePath && !isInAutoWorktree(basePath) && !detectWorktreeName(basePath) && !detectWorktreeName(originalBasePath)) {
      try {
        const existingWtPath = getAutoWorktreePath(originalBasePath, currentMilestoneId);
        if (existingWtPath) {
@ -668,8 +694,22 @@ export async function startAuto(

  // ── Auto-worktree: create or enter worktree for the active milestone ──
  // Store the original project root before any chdir so we can restore on stop.
+  // Skip if already inside a worktree (manual /worktree or another auto-worktree)
+  // to prevent nested worktree creation.
  originalBasePath = base;
-  if (currentMilestoneId) {
+
+  const isUnderGsdWorktrees = (p: string): boolean => {
+    // Prevent creating nested auto-worktrees when running from within any
+    // `.gsd/worktrees/...` directory (including manual worktrees).
+    const marker = `${pathSep}.gsd${pathSep}worktrees${pathSep}`;
+    if (p.includes(marker)) {
+      return true;
+    }
+    const worktreesSuffix = `${pathSep}.gsd${pathSep}worktrees`;
+    return p.endsWith(worktreesSuffix);
+  };
+
+  if (currentMilestoneId && !detectWorktreeName(base) && !isUnderGsdWorktrees(base)) {
    try {
      const existingWtPath = getAutoWorktreePath(base, currentMilestoneId);
      if (existingWtPath) {
@ -738,6 +778,43 @@ export async function startAuto(
  // Self-heal: clear runtime records whose artifacts already exist, plus "dispatched" records older than 1 hour
  await selfHealRuntimeRecords(base, ctx);

+  // Self-heal: remove stale .git/index.lock from prior crash.
+  // A stale lock file blocks all git operations (commit, merge, checkout).
+  // Only remove it if it is older than 60 seconds — a heuristic to avoid deleting a lock a concurrent git process has just taken.
+  try {
+    const gitLockFile = join(base, ".git", "index.lock");
+    if (existsSync(gitLockFile)) {
+      const lockAge = Date.now() - statSync(gitLockFile).mtimeMs;
+      if (lockAge > 60_000) {
+        unlinkSync(gitLockFile);
+        ctx.ui.notify("Removed stale .git/index.lock from prior crash.", "info");
+      }
+    }
+  } catch { /* non-fatal */ }
+
+  // Pre-flight: validate milestone queue for multi-milestone runs.
+  // Warn about issues that will cause auto-mode to pause or block.
+  try {
+    const msDir = join(base, ".gsd", "milestones");
+    if (existsSync(msDir)) {
+      const milestoneIds = readdirSync(msDir, { withFileTypes: true })
+        .filter(d => d.isDirectory() && /^M\d{3}/.test(d.name))
+        .map(d => d.name.match(/^(M\d{3})/)?.[1] ?? d.name);
+      if (milestoneIds.length > 1) {
+        const issues: string[] = [];
+        for (const id of milestoneIds) {
+          const draft = resolveMilestoneFile(base, id, "CONTEXT-DRAFT");
+          if (draft) issues.push(`${id}: has CONTEXT-DRAFT.md (will pause for discussion)`);
+        }
+        if (issues.length > 0) {
+          ctx.ui.notify(`Pre-flight: ${milestoneIds.length} milestones queued.\n${issues.map(i => `  ⚠ ${i}`).join("\n")}`, "warning");
+        } else {
+          ctx.ui.notify(`Pre-flight: ${milestoneIds.length} milestones queued. All have full context.`, "info");
+        }
+      }
+    }
+  } catch { /* non-fatal — pre-flight should never block auto-mode */ }
+
  // Dispatch the first unit
  await dispatchNextUnit(ctx, pi);
 }
@ -1414,6 +1491,16 @@ function getRoadmapSlicesSync(): { done: number; total: number; activeSliceTasks

 // ─── Core Loop ────────────────────────────────────────────────────────────────

+/** Tracks recursive skip depth to prevent TUI freeze on cascading completed-unit skips */
+let _skipDepth = 0;
+const MAX_SKIP_DEPTH = 20;
+
+/** Reentrancy guard for dispatchNextUnit itself (not just handleAgentEnd).
+ *  Prevents concurrent dispatch from watchdog timers, step wizard, and direct calls
+ *  that bypass the _handlingAgentEnd guard. Recursive calls (from skip paths) are
+ *  allowed via _skipDepth > 0. */
+let _dispatching = false;
+
 async function dispatchNextUnit(
  ctx: ExtensionContext,
  pi: ExtensionAPI,
@ -1425,6 +1512,22 @@ async function dispatchNextUnit(
    return;
  }

+  // Reentrancy guard: allow recursive calls from skip paths (_skipDepth > 0)
+  // but block concurrent external calls (watchdog, step wizard, etc.)
+  if (_dispatching && _skipDepth === 0) {
+    return; // Another dispatch is in progress — bail silently
+  }
+  _dispatching = true;
+
+  // Recursion depth guard: when many units are skipped in sequence (e.g., after
+  // crash recovery with 10+ completed units), recursive dispatchNextUnit calls
+  // can freeze the TUI or overflow the stack. Yield generously after MAX_SKIP_DEPTH.
+  if (_skipDepth > MAX_SKIP_DEPTH) {
+    _skipDepth = 0;
+    ctx.ui.notify(`Skipped ${MAX_SKIP_DEPTH}+ completed units. Yielding to UI before continuing.`, "info");
+    await new Promise(r => setTimeout(r, 200));
+  }
+
  // Clear stale directory listing cache so deriveState sees fresh disk state (#431)
  clearPathCache();
  // Clear parsed roadmap/plan cache — doctor may have re-populated it with
@ -1804,10 +1907,10 @@ async function dispatchNextUnit(
        `Skipping ${unitType} ${unitId} — already completed in a prior session. Advancing.`,
        "info",
      );
-      // Yield to the event loop before re-dispatching to avoid tight recursion
-      // when many units are already completed (e.g., after crash recovery).
-      await new Promise(r => setImmediate(r));
+      _skipDepth++;
+      await new Promise(r => setTimeout(r, 50));
      await dispatchNextUnit(ctx, pi);
+      _skipDepth = Math.max(0, _skipDepth - 1);
      return;
    } else {
      // Stale completion record — artifact missing. Remove and re-run.
@ -1820,6 +1923,26 @@ async function dispatchNextUnit(
    }
  }

+  // Fallback: if the idempotency key is missing but the expected artifact already
+  // exists on disk, the task completed in a prior session without persisting the key.
+  // Persist it now and skip re-dispatch. This prevents infinite loops where a task
+  // completes successfully but the completion key was never written (e.g., completed
+  // on the first attempt before hitting the retry-threshold persistence logic).
+  if (verifyExpectedArtifact(unitType, unitId, basePath)) {
+    persistCompletedKey(basePath, idempotencyKey);
+    completedKeySet.add(idempotencyKey);
+    invalidateStateCache();
+    ctx.ui.notify(
+      `Skipping ${unitType} ${unitId} — artifact exists but completion key was missing. Repaired and advancing.`,
+      "info",
+    );
+    _skipDepth++;
+    await new Promise(r => setTimeout(r, 50));
+    await dispatchNextUnit(ctx, pi);
+    _skipDepth = Math.max(0, _skipDepth - 1);
+    return;
+  }
+
  // Stuck detection — tracks total dispatches per unit (not just consecutive repeats).
  // Pattern A→B→A→B would reset retryCount every time; this map catches it.
  const dispatchKey = `${unitType}/${unitId}`;
@ -1907,6 +2030,29 @@ async function dispatchNextUnit(
      return;
    }

+    // Last resort for complete-milestone: generate stub summary to unblock pipeline.
+    // All slices are done (otherwise we wouldn't be in completing-milestone phase),
+    // but the LLM failed to write the summary N times. A stub lets the pipeline advance.
+    if (unitType === "complete-milestone") {
+      try {
+        const mPath = resolveMilestonePath(basePath, unitId);
+        if (mPath) {
+          const stubPath = join(mPath, `${unitId}-SUMMARY.md`);
+          if (!existsSync(stubPath)) {
+            writeFileSync(stubPath, `# ${unitId} Summary\n\nAuto-generated stub — milestone tasks completed but summary generation failed after ${prevCount + 1} attempts.\nReview and replace this stub with a proper summary.\n`);
+            ctx.ui.notify(`Generated stub summary for ${unitId} to unblock pipeline. Review later.`, "warning");
+            persistCompletedKey(basePath, dispatchKey);
+            completedKeySet.add(dispatchKey);
+            unitDispatchCount.delete(dispatchKey);
+            invalidateStateCache();
+            await new Promise(r => setImmediate(r));
+            await dispatchNextUnit(ctx, pi);
+            return;
+          }
+        }
+      } catch { /* non-fatal — fall through to normal stop */ }
+    }
+
    const expected = diagnoseExpectedArtifact(unitType, unitId, basePath);
    const remediation = buildLoopRemediationSteps(unitType, unitId, basePath);
    await stopAuto(ctx, pi);
--- a/src/resources/extensions/gsd/prompts/discuss.md
+++ b/src/resources/extensions/gsd/prompts/discuss.md
@ -215,6 +215,20 @@ Once the user confirms the milestone split:
 5. Write a full `CONTEXT.md` for the primary milestone (the one discussed in depth).
 6. Write a `ROADMAP.md` for **only the primary milestone** — detail-planning later milestones now is waste because the codebase will change. Include requirement coverage and a milestone definition of done.

+#### MANDATORY: depends_on Frontmatter in CONTEXT.md
+
+Every CONTEXT.md for a milestone that depends on other milestones MUST have YAML frontmatter with `depends_on`. The auto-mode state machine reads this field to determine execution order — without it, milestones may execute out of order or in parallel when they shouldn't.
+
+```yaml
+---
+depends_on: [M001, M002]
+---
+
+# M003: Title
+```
+
+If a milestone has no dependencies, omit the frontmatter. For every milestone that does have dependencies, the dependency chain from the milestone confirmation gate MUST be reflected in that milestone's CONTEXT.md frontmatter. Do NOT rely on QUEUE.md or PROJECT.md for dependency tracking — the state machine only reads CONTEXT.md frontmatter.
+
 #### Phase 3: Sequential readiness gate for remaining milestones

 For each remaining milestone **one at a time, in sequence**, use `ask_user_questions` to assess readiness. Present three options:
--- a/src/resources/extensions/gsd/prompts/queue.md
+++ b/src/resources/extensions/gsd/prompts/queue.md
@ -82,7 +82,13 @@ Determine where the new milestones should go in the overall sequence. Consider d
 Once the user is satisfied, in a single pass for **each** new milestone (starting from {{nextId}}):

 1. `mkdir -p .gsd/milestones/<ID>/slices`
-2. Write `.gsd/milestones/<ID>/<ID>-CONTEXT.md` — use the **Context** output template below. Capture intent, scope, risks, constraints, integration points, and relevant requirements. Mark the status as "Queued — pending auto-mode execution."
+2. Write `.gsd/milestones/<ID>/<ID>-CONTEXT.md` — use the **Context** output template below. Capture intent, scope, risks, constraints, integration points, and relevant requirements. Mark the status as "Queued — pending auto-mode execution." **If this milestone depends on other milestones, add YAML frontmatter with `depends_on`:**
+   ```yaml
+   ---
+   depends_on: [M001, M002]
+   ---
+   ```
+   The auto-mode state machine reads this field to enforce execution order. Without it, milestones may execute out of order. List the exact milestone IDs (including any suffix like `-0zjrg0`) from the dependency chain discussed with the user.

 Then, after all milestone directories and context files are written: