Wire workflow-logger through the state system

The workflow-logger per-unit buffer API (_resetLogs / drainAndSummarize / formatForNotification) had zero callers outside tests, so accumulated warnings never reached users as a consolidated post-unit alert and the buffer leaked across units in the same Node process. Several state-layer sites also silently swallowed errors that should have surfaced. - auto/phases.ts: reset logger in runUnitPhase, drain + ctx.ui.notify in runFinalize success path, drain in both finalize timeout branches so timed-out unit logs don't bleed into the next iteration - auto/detect-stuck.ts: enrich stuck reasons with summarizeLogs() so recovery has the diagnostic context (read-only peek, no drain) - auto.ts: call setLogBasePath(base) in startAuto to pin the audit log on /clear resume and hot-reload paths that bypass dynamic-tools bootstrap - workflow-manifest.ts: log snapshotState ROLLBACK failures (split-brain signal) instead of silently swallowing them - state.ts: log reconcileDiskToDb roadmap read failures instead of silent continue - workflow-projections.ts: log renderStateProjection DB handle probe failures instead of silent return New regression tests cover the phases.ts wiring (source-scan), setLogBasePath in startAuto, detect-stuck enrichment runtime behavior (including the read-only peek invariant), and the three silent-catch fixes.
2026-04-14 18:13:20 +00:00 · 2026-04-14 18:13:20 +00:00 · 74ca7cd8cd
commit 74ca7cd8cd
parent 0d66433b4f
7 changed files with 297 additions and 9 deletions
--- a/src/resources/extensions/gsd/auto.ts
+++ b/src/resources/extensions/gsd/auto.ts
@ -1400,6 +1400,11 @@ export async function startAuto(
    s.stepMode = requestedStepMode;
    s.cmdCtx = ctx;
    s.basePath = base;
+    // Ensure the workflow-logger audit log is pinned to the project root
+    // even when auto-mode is entered via a path that bypasses the
+    // bootstrap/dynamic-tools ensureDbOpen() → setLogBasePath() chain
+    // (e.g. /clear resume, hot-reload).
+    setLogBasePath(base);
    s.unitDispatchCount.clear();
    s.unitLifetimeDispatches.clear();
    if (!getLedger()) initMetrics(base);
--- a/src/resources/extensions/gsd/auto/detect-stuck.ts
+++ b/src/resources/extensions/gsd/auto/detect-stuck.ts
@ -5,6 +5,7 @@
 */

 import type { WindowEntry } from "./types.js";
+import { summarizeLogs } from "../workflow-logger.js";

 /**
 * Pattern matching ENOENT errors with a file path.
@ -28,6 +29,13 @@ export function detectStuck(
 ): { stuck: true; reason: string } | null {
  if (window.length < 2) return null;

+  // Peek (not drain) the workflow-logger buffer so stuck reasons can surface
+  // the underlying diagnostic context (projection failures, DB degradations,
+  // reconcile warnings) that usually explains *why* the loop is stuck. The
+  // auto-loop's finalize step owns the buffer lifecycle — this is read-only.
+  const loggerSummary = summarizeLogs();
+  const suffix = loggerSummary ? ` — ${loggerSummary}` : "";
+
  const last = window[window.length - 1];
  const prev = window[window.length - 2];

@ -35,7 +43,7 @@ export function detectStuck(
  if (last.error && prev.error && last.error === prev.error) {
    return {
      stuck: true,
-      reason: `Same error repeated: ${last.error.slice(0, 200)}`,
+      reason: `Same error repeated: ${last.error.slice(0, 200)}${suffix}`,
    };
  }

@ -45,7 +53,7 @@ export function detectStuck(
    if (lastThree.every((u) => u.key === last.key)) {
      return {
        stuck: true,
-        reason: `${last.key} derived 3 consecutive times without progress`,
+        reason: `${last.key} derived 3 consecutive times without progress${suffix}`,
      };
    }
  }
@ -60,7 +68,7 @@ export function detectStuck(
    ) {
      return {
        stuck: true,
-        reason: `Oscillation detected: ${w[0].key} ↔ ${w[1].key}`,
+        reason: `Oscillation detected: ${w[0].key} ↔ ${w[1].key}${suffix}`,
      };
    }
  }
@ -77,7 +85,7 @@ export function detectStuck(
    if (count >= 2) {
      return {
        stuck: true,
-        reason: `Missing file referenced twice: ${filePath} (ENOENT)`,
+        reason: `Missing file referenced twice: ${filePath} (ENOENT)${suffix}`,
      };
    }
    enoentPaths.set(filePath, count);
--- a/src/resources/extensions/gsd/auto/phases.ts
+++ b/src/resources/extensions/gsd/auto/phases.ts
@ -30,7 +30,15 @@ import { MergeConflictError } from "../git-service.js";
 import { setCurrentPhase, clearCurrentPhase } from "../../shared/gsd-phase-state.js";
 import { join, basename, dirname, parse as parsePath } from "node:path";
 import { existsSync, cpSync, readdirSync } from "node:fs";
-import { logWarning, logError } from "../workflow-logger.js";
+import {
+  logWarning,
+  logError,
+  _resetLogs,
+  drainLogs,
+  drainAndSummarize,
+  formatForNotification,
+  hasAnyIssues,
+} from "../workflow-logger.js";
 import { gsdRoot } from "../paths.js";
 import { atomicWriteSync } from "../atomic-write.js";
 import { verifyExpectedArtifact, diagnoseExpectedArtifact, buildLoopRemediationSteps } from "../auto-recovery.js";
@ -1081,6 +1089,10 @@ export async function runUnitPhase(
  );
  const previousTier = s.currentUnitRouting?.tier;

+  // Scope workflow-logger buffer to this unit so post-finalize drains are
+  // per-unit. Without this, the module-level _buffer accumulates across every
+  // unit in the same Node process (see workflow-logger.ts module header).
+  _resetLogs();
  s.currentUnit = { type: unitType, id: unitId, startedAt: Date.now() };
  setCurrentPhase(unitType);
  s.lastToolInvocationError = null; // #2883: clear stale error from previous unit
@ -1545,6 +1557,9 @@ export async function runFinalize(
    // cannot mutate state for the next unit (#3757).
    s.currentUnit = null;
    clearCurrentPhase();
+    // Drop any logger entries from the timed-out unit so they don't bleed
+    // into the next iteration's drain.
+    drainLogs();
    loopState.consecutiveFinalizeTimeouts++;
    debugLog("autoLoop", {
      phase: "pre-verification-timeout",
@ -1643,6 +1658,9 @@ export async function runFinalize(
    // cannot mutate state for the next unit (#3757).
    s.currentUnit = null;
    clearCurrentPhase();
+    // Drop any logger entries from the timed-out unit so they don't bleed
+    // into the next iteration's drain.
+    drainLogs();
    loopState.consecutiveFinalizeTimeouts++;
    debugLog("autoLoop", {
      phase: "post-verification-timeout",
@ -1687,5 +1705,16 @@ export async function runFinalize(
  // Both pre and post verification completed without timeout — reset counter
  loopState.consecutiveFinalizeTimeouts = 0;

+  // Surface accumulated workflow-logger issues for this unit to the user.
+  // Warnings/errors logged during the unit are buffered in the logger and
+  // drained here so the user sees a single consolidated post-unit alert.
+  if (hasAnyIssues()) {
+    const { logs } = drainAndSummarize();
+    if (logs.length > 0) {
+      const severity = logs.some((e) => e.severity === "error") ? "error" : "warning";
+      ctx.ui.notify(formatForNotification(logs), severity);
+    }
+  }
+
  return { action: "next", data: undefined as void };
 }
--- a/src/resources/extensions/gsd/gsd-db.ts
+++ b/src/resources/extensions/gsd/gsd-db.ts
@ -967,7 +967,16 @@ export function readTransaction<T>(fn: () => T): T {
    currentDb.exec("COMMIT");
    return result;
  } catch (err) {
-    try { currentDb.exec("ROLLBACK"); } catch { /* swallow */ }
+    try {
+      currentDb.exec("ROLLBACK");
+    } catch (rollbackErr) {
+      // A failed ROLLBACK after a failed read is a split-brain signal —
+      // the transaction is in an indeterminate state. Surface it via the
+      // logger instead of swallowing it.
+      logError("db", "snapshotState ROLLBACK failed", {
+        error: (rollbackErr as Error).message,
+      });
+    }
    throw err;
  } finally {
    _txDepth--;
--- a/src/resources/extensions/gsd/state.ts
+++ b/src/resources/extensions/gsd/state.ts
@ -345,8 +345,15 @@ function reconcileDiskToDb(basePath: string): MilestoneRow[] {
    const dbSliceIds = new Set(dbSlices.map(s => s.id));

    let roadmapContent: string;
-    try { roadmapContent = readFileSync(roadmapPath, "utf-8"); }
-    catch { continue; }
+    try {
+      roadmapContent = readFileSync(roadmapPath, "utf-8");
+    } catch (err) {
+      logWarning("state", "reconcileDiskToDb: roadmap read failed, skipping milestone", {
+        mid,
+        error: (err as Error).message,
+      });
+      continue;
+    }

    const parsed = parseRoadmap(roadmapContent);
    for (const s of parsed.slices) {
--- a/src/resources/extensions/gsd/tests/workflow-logger-wiring.test.ts
+++ b/src/resources/extensions/gsd/tests/workflow-logger-wiring.test.ts
@ -0,0 +1,223 @@
+// GSD Extension — workflow-logger wiring regression tests
+//
+// Verifies the plumbing between workflow-logger and the rest of the state
+// system (auto-loop phases, detect-stuck, notification store). Without this
+// wiring, warnings/errors logged during a unit leak across units, never
+// reach the user as a consolidated post-unit alert, and don't enrich
+// stuck-detection reasons.
+
+import test from "node:test";
+import assert from "node:assert/strict";
+import { readFileSync } from "node:fs";
+import { join } from "node:path";
+
+import {
+  logWarning,
+  logError,
+  peekLogs,
+  _resetLogs,
+  setStderrLoggingEnabled,
+} from "../workflow-logger.ts";
+import { detectStuck } from "../auto/detect-stuck.ts";
+
+const phasesSrc = readFileSync(
+  join(import.meta.dirname, "..", "auto", "phases.ts"),
+  "utf-8",
+);
+const autoSrc = readFileSync(
+  join(import.meta.dirname, "..", "auto.ts"),
+  "utf-8",
+);
+
+// ─── Source-scan: phases.ts calls the logger lifecycle API ─────────────────
+
+test("auto/phases.ts imports _resetLogs, drainAndSummarize, formatForNotification, hasAnyIssues", () => {
+  assert.match(
+    phasesSrc,
+    /from\s+"\.\.\/workflow-logger\.js"/,
+    "phases.ts imports from workflow-logger",
+  );
+  for (const name of [
+    "_resetLogs",
+    "drainLogs",
+    "drainAndSummarize",
+    "formatForNotification",
+    "hasAnyIssues",
+  ]) {
+    assert.ok(
+      phasesSrc.includes(name),
+      `phases.ts should reference ${name}`,
+    );
+  }
+});
+
+test("runUnitPhase calls _resetLogs() before assigning s.currentUnit", () => {
+  // Find the "s.currentUnit = { type: unitType" assignment line and check
+  // the preceding ~500 chars contain a _resetLogs() call.
+  const idx = phasesSrc.indexOf("s.currentUnit = { type: unitType");
+  assert.ok(idx > 0, "runUnitPhase should assign s.currentUnit");
+  const before = phasesSrc.slice(Math.max(0, idx - 500), idx);
+  assert.match(
+    before,
+    /_resetLogs\(\)/,
+    "_resetLogs() must be called immediately before s.currentUnit assignment",
+  );
+});
+
+test("runFinalize drains and surfaces logger buffer via ctx.ui.notify", () => {
+  // Locate the runFinalize success path and verify it calls drainAndSummarize
+  // and routes the result through ctx.ui.notify.
+  const runFinalizeIdx = phasesSrc.indexOf("export async function runFinalize");
+  assert.ok(runFinalizeIdx > 0, "runFinalize export should exist");
+  const finalizeBody = phasesSrc.slice(runFinalizeIdx);
+  assert.match(
+    finalizeBody,
+    /hasAnyIssues\(\)/,
+    "runFinalize should gate drain on hasAnyIssues",
+  );
+  assert.match(
+    finalizeBody,
+    /drainAndSummarize\(\)/,
+    "runFinalize should call drainAndSummarize on success",
+  );
+  assert.match(
+    finalizeBody,
+    /formatForNotification\(logs\)/,
+    "runFinalize should format drained logs for the notification",
+  );
+});
+
+test("runFinalize timeout branches drain the buffer to prevent bleed", () => {
+  // Both timeout branches null out s.currentUnit — they should also drain
+  // so accumulated logs for the timed-out unit don't leak into the next.
+  const runFinalizeIdx = phasesSrc.indexOf("export async function runFinalize");
+  const finalizeBody = phasesSrc.slice(runFinalizeIdx);
+  const drainCallCount =
+    (finalizeBody.match(/drainLogs\(\)/g) ?? []).length;
+  assert.ok(
+    drainCallCount >= 2,
+    `runFinalize timeout branches should each call drainLogs() (found ${drainCallCount}, expected >= 2)`,
+  );
+});
+
+// ─── Source-scan: auto.ts calls setLogBasePath in startAuto ────────────────
+
+test("startAuto calls setLogBasePath(base) so audit log is pinned on resume", () => {
+  const startAutoIdx = autoSrc.indexOf("export async function startAuto");
+  assert.ok(startAutoIdx > 0, "startAuto export should exist");
+  const body = autoSrc.slice(startAutoIdx);
+  assert.match(
+    body,
+    /setLogBasePath\(base\)/,
+    "startAuto must call setLogBasePath(base) to pin the audit log",
+  );
+});
+
+// ─── Runtime: detect-stuck enriches reason with summarizeLogs() ────────────
+
+test("detectStuck reason includes workflow-logger summary when logs present", () => {
+  setStderrLoggingEnabled(false);
+  try {
+    _resetLogs();
+    logWarning("projection", "STATE.md render failed");
+    logError("db", "WAL checkpoint failed");
+
+    const result = detectStuck([
+      { key: "execute-task/slice-A/task-1", error: "ENOENT: no such file" },
+      { key: "execute-task/slice-A/task-1", error: "ENOENT: no such file" },
+    ]);
+
+    assert.notEqual(result, null);
+    assert.equal(result!.stuck, true);
+    assert.match(
+      result!.reason,
+      /Same error repeated:/,
+      "reason should still start with the rule string",
+    );
+    assert.match(
+      result!.reason,
+      /STATE\.md render failed/,
+      "reason should include the accumulated logger warning",
+    );
+    assert.match(
+      result!.reason,
+      /WAL checkpoint failed/,
+      "reason should include the accumulated logger error",
+    );
+
+    // Critical: summarizeLogs must not drain — the auto-loop's finalize
+    // step owns the buffer lifecycle, detect-stuck is read-only.
+    assert.equal(
+      peekLogs().length,
+      2,
+      "detect-stuck must not drain the buffer",
+    );
+  } finally {
+    _resetLogs();
+    setStderrLoggingEnabled(true);
+  }
+});
+
+test("detectStuck reason unchanged when logger buffer is empty", () => {
+  setStderrLoggingEnabled(false);
+  try {
+    _resetLogs();
+    const result = detectStuck([
+      { key: "A", error: "boom" },
+      { key: "A", error: "boom" },
+    ]);
+    assert.notEqual(result, null);
+    // No trailing " — " suffix when there are no logs to summarize.
+    assert.doesNotMatch(
+      result!.reason,
+      / — \d+ (error|warning)/,
+      "reason should have no logger suffix when buffer is empty",
+    );
+  } finally {
+    setStderrLoggingEnabled(true);
+  }
+});
+
+// ─── Runtime: readTransaction rollback failure surfaces via logError ────────
+//
+// snapshotState now delegates its transaction to readTransaction() in
+// gsd-db.ts (single-writer refactor in #4198), so the split-brain
+// ROLLBACK-failure log lives there, not in workflow-manifest.ts.
+
+test("readTransaction logs ROLLBACK failures as split-brain signal", () => {
+  const dbSrc = readFileSync(
+    join(import.meta.dirname, "..", "gsd-db.ts"),
+    "utf-8",
+  );
+  assert.match(
+    dbSrc,
+    /logError\("db",\s*"snapshotState ROLLBACK failed"/,
+    "readTransaction ROLLBACK catch should call logError",
+  );
+});
+
+// ─── Runtime: state.ts and workflow-projections.ts log silent bailouts ─────
+
+test("state.ts logs roadmap read failures instead of silently continuing", () => {
+  const stateSrc = readFileSync(
+    join(import.meta.dirname, "..", "state.ts"),
+    "utf-8",
+  );
+  assert.match(
+    stateSrc,
+    /logWarning\("state",\s*"reconcileDiskToDb: roadmap read failed/,
+    "state.ts reconcileDiskToDb should log roadmap read failures",
+  );
+});
+
+test("workflow-projections.ts logs DB probe failures instead of silent return", () => {
+  const projectionsSrc = readFileSync(
+    join(import.meta.dirname, "..", "workflow-projections.ts"),
+    "utf-8",
+  );
+  assert.match(
+    projectionsSrc,
+    /logWarning\("projection",\s*"renderStateProjection: DB handle probe failed/,
+    "renderStateProjection DB probe should log on failure",
+  );
+});
--- a/src/resources/extensions/gsd/workflow-projections.ts
+++ b/src/resources/extensions/gsd/workflow-projections.ts
@ -350,7 +350,14 @@ export async function renderStateProjection(basePath: string): Promise<void> {
    // Probe DB handle — adapter may be set but underlying handle closed
    const adapter = _getAdapter();
    if (!adapter) return;
-    try { adapter.prepare("SELECT 1").get(); } catch { return; }
+    try {
+      adapter.prepare("SELECT 1").get();
+    } catch (err) {
+      logWarning("projection", "renderStateProjection: DB handle probe failed, skipping render", {
+        error: (err as Error).message,
+      });
+      return;
+    }
    const state = await deriveState(basePath);
    const content = renderStateContent(state);
    const dir = join(basePath, ".gsd");