From 77d88407ab48183df4612356e8f4b0316f79558f Mon Sep 17 00:00:00 2001
From: Tom Boucher <trekkie@nomorestars.com>
Date: Sat, 21 Mar 2026 15:23:35 -0400
Subject: [PATCH] fix(auto): reject execute-task with zero tool calls as
 hallucinated (#1838)

* fix(auto): reject execute-task with zero tool calls as hallucinated

Adds two safeguards against agents that complete with exit 0 but make
no tool calls, producing hallucinated summaries:

1. Zero tool-call guard: after closeoutUnit snapshots metrics for an
   execute-task, check the ledger for toolCalls === 0. If zero, log a
   warning and skip adding the unit to completedUnits so the task is
   retried on the next loop iteration instead of silently advancing.

2. Worktree health check: before dispatching an execute-task, verify
   the basePath has a .git marker and at least one of package.json or
   src/. A broken worktree causes agents to hallucinate since they
   cannot read or write files. Stops auto-mode immediately on failure.

Fixes #1833

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* fix: replace findLast with reverse().find() for ES2022 compat

findLast requires ES2023 lib target. The project uses ES2022.
Functionally identical: [...arr].reverse().find() with explicit type.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/resources/extensions/gsd/auto/phases.ts   |  53 ++++
 .../extensions/gsd/tests/auto-loop.test.ts    | 285 +++++++++++++++++-
 2 files changed, 337 insertions(+), 1 deletion(-)

diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts
index 5efff699d..7259ade02 100644
--- a/src/resources/extensions/gsd/auto/phases.ts
+++ b/src/resources/extensions/gsd/auto/phases.ts
@@ -809,6 +809,31 @@ export async function runUnitPhase(
     unitId,
   });
 
+  // ── Worktree health check (#1833) ───────────────────────────────────
+  // Verify the working directory is a valid git checkout with project
+  // files before dispatching work. A broken worktree causes agents to
+  // hallucinate summaries since they cannot read or write any files.
+  if (s.basePath && unitType === "execute-task") {
+    const gitMarker = join(s.basePath, ".git");
+    const hasGit = deps.existsSync(gitMarker);
+    const hasPackageJson = deps.existsSync(join(s.basePath, "package.json"));
+    const hasSrcDir = deps.existsSync(join(s.basePath, "src"));
+    if (!hasGit) {
+      const msg = `Worktree health check failed: ${s.basePath} has no .git — refusing to dispatch ${unitType} ${unitId}`;
+      debugLog("runUnitPhase", { phase: "worktree-health-fail", basePath: s.basePath, hasGit, hasPackageJson, hasSrcDir });
+      ctx.ui.notify(msg, "error");
+      await deps.stopAuto(ctx, pi, msg);
+      return { action: "break", reason: "worktree-invalid" };
+    }
+    if (!hasPackageJson && !hasSrcDir) {
+      const msg = `Worktree health check failed: ${s.basePath} has no package.json or src/ — refusing to dispatch ${unitType} ${unitId}`;
+      debugLog("runUnitPhase", { phase: "worktree-health-fail", basePath: s.basePath, hasGit, hasPackageJson, hasSrcDir });
+      ctx.ui.notify(msg, "error");
+      await deps.stopAuto(ctx, pi, msg);
+      return { action: "break", reason: "worktree-invalid" };
+    }
+  }
+
   // Detect retry and capture previous tier for escalation
   const isRetry = !!(
     s.currentUnit &&
@@ -1054,6 +1079,34 @@ export async function runUnitPhase(
     deps.buildSnapshotOpts(unitType, unitId),
   );
 
+  // ── Zero tool-call guard (#1833) ──────────────────────────────────
+  // An execute-task agent that completes with 0 tool calls made no
+  // real changes — its summary is hallucinated. Treat as failed so
+  // the task is retried instead of silently marked complete.
+  if (unitType === "execute-task") {
+    const currentLedger = deps.getLedger() as { units: Array<{ type: string; id: string; startedAt: number; toolCalls: number }> } | null;
+    if (currentLedger?.units) {
+      const lastUnit = [...currentLedger.units].reverse().find(
+        (u: { type: string; id: string; startedAt: number; toolCalls: number }) => u.type === unitType && u.id === unitId && u.startedAt === s.currentUnit!.startedAt,
+      );
+      if (lastUnit && lastUnit.toolCalls === 0) {
+        debugLog("runUnitPhase", {
+          phase: "zero-tool-calls",
+          unitType,
+          unitId,
+          warning: "Task completed with 0 tool calls — likely hallucinated, marking as failed",
+        });
+        ctx.ui.notify(
+          `${unitType} ${unitId} completed with 0 tool calls — hallucinated summary, will retry`,
+          "warning",
+        );
+        // Do NOT add to completedUnits — fall through to next iteration
+        // where dispatch will re-derive and re-dispatch this task.
+        return { action: "next", data: { unitStartedAt: s.currentUnit.startedAt } };
+      }
+    }
+  }
+
   if (s.currentUnitRouting) {
     deps.recordOutcome(
       unitType,
diff --git a/src/resources/extensions/gsd/tests/auto-loop.test.ts b/src/resources/extensions/gsd/tests/auto-loop.test.ts
index 60d22b7d1..de3d5d77d 100644
--- a/src/resources/extensions/gsd/tests/auto-loop.test.ts
+++ b/src/resources/extensions/gsd/tests/auto-loop.test.ts
@@ -382,7 +382,7 @@ function makeMockDeps(
     getDeepDiagnostic: () => null,
     isDbAvailable: () => false,
     reorderForCaching: (p: string) => p,
-    existsSync: () => false,
+    existsSync: (p: string) => p.endsWith(".git") || p.endsWith("package.json"),
     readFileSync: () => "",
     atomicWriteSync: () => {},
     GitServiceImpl: class {} as any,
@@ -1846,3 +1846,286 @@ test("resolveAgentEnd unblocks pending runUnit when called before session reset
   const result = await resultPromise;
   assert.equal(result.status, "completed", "runUnit should resolve, not hang");
 });
+
+// ─── Zero tool-call hallucination guard (#1833) ───────────────────────────
+
+test("autoLoop rejects execute-task with 0 tool calls as hallucinated (#1833)", async () => {
+  _resetPendingResolve();
+
+  const ctx = makeMockCtx();
+  ctx.ui.setStatus = () => {};
+  ctx.sessionManager = { getSessionFile: () => "/tmp/session.json" };
+  const pi = makeMockPi();
+
+  let iterationCount = 0;
+  const notifications: string[] = [];
+  ctx.ui.notify = (msg: string) => { notifications.push(msg); };
+
+  const s = makeLoopSession();
+
+  // Mock ledger: execute-task completed with 0 tool calls
+  const mockLedger = {
+    version: 1,
+    projectStartedAt: Date.now(),
+    units: [] as any[],
+  };
+
+  const deps = makeMockDeps({
+    deriveState: async () => {
+      deps.callLog.push("deriveState");
+      return {
+        phase: "executing",
+        activeMilestone: { id: "M001", title: "Test", status: "active" },
+        activeSlice: { id: "S01", title: "Slice 1" },
+        activeTask: { id: "T01" },
+        registry: [{ id: "M001", status: "active" }],
+        blockers: [],
+      } as any;
+    },
+    resolveDispatch: async () => {
+      deps.callLog.push("resolveDispatch");
+      return {
+        action: "dispatch" as const,
+        unitType: "execute-task",
+        unitId: "M001/S01/T01",
+        prompt: "implement the feature",
+      };
+    },
+    closeoutUnit: async () => {
+      // Simulate snapshotUnitMetrics adding a 0-toolCalls entry to ledger
+      mockLedger.units.push({
+        type: "execute-task",
+        id: "M001/S01/T01",
+        startedAt: s.currentUnit?.startedAt ?? Date.now(),
+        toolCalls: 0,
+        assistantMessages: 1,
+        tokens: { input: 100, output: 200, total: 300, cacheRead: 0, cacheWrite: 0 },
+        cost: 0.50,
+      });
+    },
+    getLedger: () => mockLedger,
+    postUnitPostVerification: async () => {
+      deps.callLog.push("postUnitPostVerification");
+      iterationCount++;
+      if (iterationCount >= 2) {
+        s.active = false;
+      }
+      return "continue" as const;
+    },
+  });
+
+  const loopPromise = autoLoop(ctx, pi, s, deps);
+
+  // First iteration: execute-task with 0 tool calls → rejected
+  await new Promise((r) => setTimeout(r, 50));
+  resolveAgentEnd(makeEvent());
+
+  // Second iteration: same task re-dispatched, this time with tool calls
+  await new Promise((r) => setTimeout(r, 50));
+  mockLedger.units.length = 0; // clear previous entry
+  (deps as any).closeoutUnit = async () => {
+    mockLedger.units.push({
+      type: "execute-task",
+      id: "M001/S01/T01",
+      startedAt: s.currentUnit?.startedAt ?? Date.now(),
+      toolCalls: 5,
+      assistantMessages: 3,
+      tokens: { input: 500, output: 800, total: 1300, cacheRead: 0, cacheWrite: 0 },
+      cost: 1.00,
+    });
+  };
+  resolveAgentEnd(makeEvent());
+
+  await loopPromise;
+
+  // The task should NOT have been added to completedUnits on the first iteration
+  // (0 tool calls), but SHOULD be added on the second iteration (5 tool calls)
+  const warningNotification = notifications.find(
+    (n) => n.includes("0 tool calls") && n.includes("hallucinated"),
+  );
+  assert.ok(
+    warningNotification,
+    "should notify about 0 tool calls hallucination",
+  );
+
+  // Verify deriveState was called at least twice (two iterations)
+  const deriveCount = deps.callLog.filter((c) => c === "deriveState").length;
+  assert.ok(
+    deriveCount >= 2,
+    `deriveState should be called at least 2 times for retry (got ${deriveCount})`,
+  );
+});
+
+test("autoLoop does NOT reject non-execute-task units with 0 tool calls (#1833)", async () => {
+  _resetPendingResolve();
+
+  const ctx = makeMockCtx();
+  ctx.ui.setStatus = () => {};
+  ctx.sessionManager = { getSessionFile: () => "/tmp/session.json" };
+  const pi = makeMockPi();
+
+  const notifications: string[] = [];
+  ctx.ui.notify = (msg: string) => { notifications.push(msg); };
+
+  const s = makeLoopSession();
+
+  const mockLedger = {
+    version: 1,
+    projectStartedAt: Date.now(),
+    units: [] as any[],
+  };
+
+  const deps = makeMockDeps({
+    deriveState: async () => {
+      deps.callLog.push("deriveState");
+      return {
+        phase: "executing",
+        activeMilestone: { id: "M001", title: "Test", status: "active" },
+        activeSlice: { id: "S01", title: "Slice 1" },
+        activeTask: { id: "T01" },
+        registry: [{ id: "M001", status: "active" }],
+        blockers: [],
+      } as any;
+    },
+    resolveDispatch: async () => {
+      deps.callLog.push("resolveDispatch");
+      return {
+        action: "dispatch" as const,
+        unitType: "complete-slice",
+        unitId: "M001/S01",
+        prompt: "complete the slice",
+      };
+    },
+    closeoutUnit: async () => {
+      // complete-slice with 0 tool calls is fine (e.g. it may just update status)
+      mockLedger.units.push({
+        type: "complete-slice",
+        id: "M001/S01",
+        startedAt: s.currentUnit?.startedAt ?? Date.now(),
+        toolCalls: 0,
+        assistantMessages: 1,
+        tokens: { input: 50, output: 100, total: 150, cacheRead: 0, cacheWrite: 0 },
+        cost: 0.10,
+      });
+    },
+    getLedger: () => mockLedger,
+    verifyExpectedArtifact: () => true,
+    postUnitPostVerification: async () => {
+      deps.callLog.push("postUnitPostVerification");
+      s.active = false;
+      return "continue" as const;
+    },
+  });
+
+  const loopPromise = autoLoop(ctx, pi, s, deps);
+
+  await new Promise((r) => setTimeout(r, 50));
+  resolveAgentEnd(makeEvent());
+
+  await loopPromise;
+
+  // Should NOT have a hallucination warning for non-execute-task units
+  const warningNotification = notifications.find(
+    (n) => n.includes("0 tool calls") && n.includes("hallucinated"),
+  );
+  assert.ok(
+    !warningNotification,
+    "should NOT flag non-execute-task units with 0 tool calls",
+  );
+
+  // The unit should have been added to completedUnits normally
+  assert.ok(
+    s.completedUnits.length >= 1,
+    "complete-slice with 0 tool calls should still be marked as completed",
+  );
+});
+
+// ─── Worktree health check (#1833) ────────────────────────────────────────
+
+test("autoLoop stops when worktree has no .git for execute-task (#1833)", async () => {
+  _resetPendingResolve();
+
+  const ctx = makeMockCtx();
+  ctx.ui.setStatus = () => {};
+  ctx.sessionManager = { getSessionFile: () => "/tmp/session.json" };
+  const pi = makeMockPi();
+
+  const notifications: string[] = [];
+  ctx.ui.notify = (msg: string) => { notifications.push(msg); };
+
+  const s = makeLoopSession({ basePath: "/tmp/broken-worktree" });
+
+  const deps = makeMockDeps({
+    deriveState: async () => {
+      deps.callLog.push("deriveState");
+      return {
+        phase: "executing",
+        activeMilestone: { id: "M001", title: "Test", status: "active" },
+        activeSlice: { id: "S01", title: "Slice 1" },
+        activeTask: { id: "T01" },
+        registry: [{ id: "M001", status: "active" }],
+        blockers: [],
+      } as any;
+    },
+    // .git does not exist in the broken worktree
+    existsSync: (p: string) => !p.endsWith(".git"),
+  });
+
+  await autoLoop(ctx, pi, s, deps);
+
+  assert.ok(
+    deps.callLog.includes("stopAuto"),
+    "should stop auto-mode when worktree is invalid",
+  );
+  const healthNotification = notifications.find(
+    (n) => n.includes("Worktree health check failed") && n.includes("no .git"),
+  );
+  assert.ok(
+    healthNotification,
+    "should notify about missing .git in worktree",
+  );
+});
+
+test("autoLoop stops when worktree has no project files for execute-task (#1833)", async () => {
+  _resetPendingResolve();
+
+  const ctx = makeMockCtx();
+  ctx.ui.setStatus = () => {};
+  ctx.sessionManager = { getSessionFile: () => "/tmp/session.json" };
+  const pi = makeMockPi();
+
+  const notifications: string[] = [];
+  ctx.ui.notify = (msg: string) => { notifications.push(msg); };
+
+  const s = makeLoopSession({ basePath: "/tmp/empty-worktree" });
+
+  const deps = makeMockDeps({
+    deriveState: async () => {
+      deps.callLog.push("deriveState");
+      return {
+        phase: "executing",
+        activeMilestone: { id: "M001", title: "Test", status: "active" },
+        activeSlice: { id: "S01", title: "Slice 1" },
+        activeTask: { id: "T01" },
+        registry: [{ id: "M001", status: "active" }],
+        blockers: [],
+      } as any;
+    },
+    // Has .git but no package.json or src/
+    existsSync: (p: string) => p.endsWith(".git"),
+  });
+
+  await autoLoop(ctx, pi, s, deps);
+
+  assert.ok(
+    deps.callLog.includes("stopAuto"),
+    "should stop auto-mode when worktree has no project files",
+  );
+  const healthNotification = notifications.find(
+    (n) => n.includes("Worktree health check failed") && n.includes("no package.json or src/"),
+  );
+  assert.ok(
+    healthNotification,
+    "should notify about missing project files in worktree",
+  );
+});