fix: make journal scanning intelligent — limit parsed files, line-count older ones

scanJournalForForensics() previously called queryJournal(), which loaded
ALL journal entries from ALL daily files into memory. For long-running
projects, this could mean thousands of entries and megabytes of data.

Now:
- Only the last 3 daily files are fully JSON-parsed (event counts, flows)
- Older files are line-counted only (no JSON parsing) for totals
- Recent events use a rolling window of 20 (shift, not accumulate)
- Constants MAX_JOURNAL_RECENT_FILES and MAX_JOURNAL_RECENT_EVENTS
  make limits explicit and tunable

Activity log scanning was already intelligent (a sketch of the pattern
follows this list):
- nativeParseJsonlTail with 10MB byte cap
- Only last 5 files scanned
- extractTrace() distills raw JSONL into compact ExecutionTrace structs
- formatReportForPrompt has 30KB hard cap on total output
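
A minimal sketch of that byte-capped tail-read pattern, for reference.
The real nativeParseJsonlTail helper lives elsewhere in the codebase and
its exact signature is not shown in this diff, so the function below is an
assumption built from the description above; only the MAX_JSONL_BYTES name
and the 10MB cap come from the code and tests.

```ts
import { openSync, fstatSync, readSync, closeSync } from "node:fs";

// 10MB cap, mirroring the MAX_JSONL_BYTES constant the tests assert on.
const MAX_JSONL_BYTES = 10 * 1024 * 1024;

/** Read at most `maxBytes` from the end of a JSONL file and parse whole lines. */
function parseJsonlTail(path: string, maxBytes = MAX_JSONL_BYTES): unknown[] {
  const fd = openSync(path, "r");
  try {
    const size = fstatSync(fd).size;
    const start = Math.max(0, size - maxBytes);
    const buf = Buffer.alloc(size - start);
    readSync(fd, buf, 0, buf.length, start);
    let text = buf.toString("utf-8");
    // If the read started mid-line, drop the partial first line.
    if (start > 0) text = text.slice(text.indexOf("\n") + 1);
    const entries: unknown[] = [];
    for (const line of text.split("\n")) {
      if (!line.trim()) continue;
      try { entries.push(JSON.parse(line)); } catch { /* skip malformed lines */ }
    }
    return entries;
  } finally {
    closeSync(fd);
  }
}
```

The real scanner additionally limits itself to the newest five activity
files (files.slice(-5)) and runs each entry through extractTrace(), so only
compact ExecutionTrace structs ever reach the prompt.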

Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
Agent-Logs-Url: https://github.com/gsd-build/gsd-2/sessions/7e7f71ec-0d56-409b-930e-5dff1305ff2a
copilot-swe-agent[bot] 2026-03-25 21:53:37 +00:00
parent ce4720bad8
commit aee8973d81
2 changed files with 155 additions and 30 deletions


@@ -28,7 +28,6 @@ import { deriveState } from "./state.js";
import { isAutoActive } from "./auto.js";
import { loadPrompt } from "./prompt-loader.js";
import { gsdRoot } from "./paths.js";
import { queryJournal } from "./journal.js";
import { formatDuration } from "../shared/format-utils.js";
import { getAutoWorktreePath } from "./auto-worktree.js";
import { loadEffectiveGSDPreferences, loadGlobalGSDPreferences, getGlobalGSDPreferencesPath } from "./preferences.js";
@@ -63,13 +62,19 @@ interface ActivityLogMeta {
newestFile: string | null;
}
/** Summary of .gsd/journal/ data for forensic investigation. */
/**
* Summary of .gsd/journal/ data for forensic investigation.
*
* To avoid loading huge journal histories into memory, only the most recent
* daily files are fully parsed. Older files are line-counted for totals.
* Event counts and flow IDs reflect only recent files.
*/
interface JournalSummary {
/** Total journal entries scanned */
/** Total journal entries across all files (recent parsed + older line-counted) */
totalEntries: number;
/** Distinct flow IDs (each = one auto-mode iteration) */
/** Distinct flow IDs from recent files (each = one auto-mode iteration) */
flowCount: number;
/** Event counts by type */
/** Event counts by type (from recent files only) */
eventCounts: Record<string, number>;
/** Most recent journal entries (last 20) for context */
recentEvents: { ts: string; flowId: string; eventType: string; rule?: string; unitId?: string }[];
@@ -422,6 +427,24 @@ function resolveActivityDirs(basePath: string, activeMilestone?: string | null):
// ─── Journal Scanner ──────────────────────────────────────────────────────────
/**
* Max recent journal files to fully parse for event counts and recent events.
* Older files are line-counted only to avoid loading huge amounts of data.
*/
const MAX_JOURNAL_RECENT_FILES = 3;
/** Max recent events to extract for the forensic report timeline. */
const MAX_JOURNAL_RECENT_EVENTS = 20;
/**
* Intelligently scan journal files for forensic summary.
*
* Journal files can be huge (thousands of JSONL entries over weeks of auto-mode).
* Instead of loading all entries into memory:
* - Only fully parse the most recent N daily files (event counts, flow tracking)
* - Line-count older files for approximate totals (no JSON parsing)
* - Extract only the last 20 events for the timeline
*/
function scanJournalForForensics(basePath: string): JournalSummary | null {
try {
const journalDir = join(gsdRoot(basePath), "journal");
@@ -430,33 +453,80 @@ function scanJournalForForensics(basePath: string): JournalSummary | null {
const files = readdirSync(journalDir).filter(f => f.endsWith(".jsonl")).sort();
if (files.length === 0) return null;
const entries = queryJournal(basePath);
if (entries.length === 0) return null;
// Split into recent (fully parsed) and older (line-counted only)
const recentFiles = files.slice(-MAX_JOURNAL_RECENT_FILES);
const olderFiles = files.slice(0, -MAX_JOURNAL_RECENT_FILES);
// Count events by type
const eventCounts: Record<string, number> = {};
const flowIds = new Set<string>();
for (const e of entries) {
eventCounts[e.eventType] = (eventCounts[e.eventType] ?? 0) + 1;
flowIds.add(e.flowId);
// Line-count older files without parsing — avoids loading megabytes of JSON
let olderEntryCount = 0;
let oldestEntry: string | null = null;
for (const file of olderFiles) {
try {
const raw = readFileSync(join(journalDir, file), "utf-8");
const lines = raw.split("\n");
for (const line of lines) {
if (!line.trim()) continue;
olderEntryCount++;
// Extract only the timestamp from the first non-empty line of the oldest file
if (!oldestEntry) {
try {
const parsed = JSON.parse(line) as { ts?: string };
if (parsed.ts) oldestEntry = parsed.ts;
} catch { /* skip malformed */ }
}
}
} catch { /* skip unreadable files */ }
}
// Extract recent events (last 20) with key fields for the report
const recentEvents = entries.slice(-20).map(e => ({
ts: e.ts,
flowId: e.flowId,
eventType: e.eventType,
rule: e.rule,
unitId: e.data?.unitId as string | undefined,
}));
// Fully parse recent files for event counts and timeline
const eventCounts: Record<string, number> = {};
const flowIds = new Set<string>();
const recentParsedEntries: { ts: string; flowId: string; eventType: string; rule?: string; unitId?: string }[] = [];
let recentEntryCount = 0;
for (const file of recentFiles) {
try {
const raw = readFileSync(join(journalDir, file), "utf-8");
for (const line of raw.split("\n")) {
if (!line.trim()) continue;
try {
const entry = JSON.parse(line) as { ts: string; flowId: string; eventType: string; rule?: string; data?: Record<string, unknown> };
recentEntryCount++;
eventCounts[entry.eventType] = (eventCounts[entry.eventType] ?? 0) + 1;
flowIds.add(entry.flowId);
if (!oldestEntry) oldestEntry = entry.ts;
// Keep a rolling window of last N events — avoids accumulating unbounded arrays
recentParsedEntries.push({
ts: entry.ts,
flowId: entry.flowId,
eventType: entry.eventType,
rule: entry.rule,
unitId: entry.data?.unitId as string | undefined,
});
if (recentParsedEntries.length > MAX_JOURNAL_RECENT_EVENTS) {
recentParsedEntries.shift();
}
} catch { /* skip malformed lines */ }
}
} catch { /* skip unreadable files */ }
}
const totalEntries = olderEntryCount + recentEntryCount;
if (totalEntries === 0) return null;
const newestEntry = recentParsedEntries.length > 0
? recentParsedEntries[recentParsedEntries.length - 1]!.ts
: null;
return {
totalEntries: entries.length,
totalEntries,
flowCount: flowIds.size,
eventCounts,
recentEvents,
oldestEntry: entries[0]?.ts ?? null,
newestEntry: entries[entries.length - 1]?.ts ?? null,
recentEvents: recentParsedEntries,
oldestEntry,
newestEntry,
fileCount: files.length,
};
} catch {

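For orientation, a hypothetical sketch of how a caller might fold the
returned JournalSummary into the forensic report. The real report assembly
(formatReportForPrompt) is not part of this diff, so the function name and
line formatting below are assumptions; only the summary fields come from
the code above.

```ts
// Hypothetical consumer, assumed to sit next to the scanner in forensics.ts.
function summarizeJournalForReport(basePath: string): string[] {
  const summary = scanJournalForForensics(basePath);
  if (!summary) return [];
  return [
    `Journal: ${summary.totalEntries} entries across ${summary.fileCount} daily files`,
    `Span: ${summary.oldestEntry ?? "unknown"} -> ${summary.newestEntry ?? "unknown"}`,
    `Flows in recent files: ${summary.flowCount}`,
    ...Object.entries(summary.eventCounts).map(([type, n]) => `  ${type}: ${n}`),
    "Recent events:",
    ...summary.recentEvents.map(
      e => `  ${e.ts} ${e.eventType} flow=${e.flowId}${e.unitId ? ` unit=${e.unitId}` : ""}`,
    ),
  ];
}
```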

@@ -11,14 +11,34 @@ describe("forensics journal & activity log awareness", () => {
const forensicsSrc = readFileSync(join(gsdDir, "forensics.ts"), "utf-8");
const promptSrc = readFileSync(join(gsdDir, "prompts", "forensics.md"), "utf-8");
it("forensics.ts imports queryJournal from journal module", () => {
it("scanJournalForForensics reads journal files directly (no full queryJournal load)", () => {
// Must NOT use queryJournal which loads ALL entries into memory
assert.ok(
forensicsSrc.includes('from "./journal.js"') || forensicsSrc.includes("from './journal.js'"),
"forensics.ts must import from journal.js",
!forensicsSrc.includes('queryJournal('),
"forensics.ts must NOT call queryJournal() which loads all entries at once",
);
// Must have its own journal scanning with file-level limits
assert.ok(
forensicsSrc.includes("scanJournalForForensics"),
"forensics.ts must have scanJournalForForensics function",
);
});
it("journal scanning limits files parsed to avoid memory bloat", () => {
assert.ok(
forensicsSrc.includes("MAX_JOURNAL_RECENT_FILES"),
"must have MAX_JOURNAL_RECENT_FILES constant to limit parsed files",
);
assert.ok(
forensicsSrc.includes("queryJournal"),
"forensics.ts must reference queryJournal",
forensicsSrc.includes("MAX_JOURNAL_RECENT_EVENTS"),
"must have MAX_JOURNAL_RECENT_EVENTS constant to limit events extracted",
);
});
it("older journal files are line-counted without full JSON parse", () => {
assert.ok(
forensicsSrc.includes("olderEntryCount") || forensicsSrc.includes("olderFiles"),
"must handle older files separately from recent files",
);
});
@@ -76,6 +96,41 @@ describe("forensics journal & activity log awareness", () => {
);
});
it("activity log scanning uses tail-read with byte cap (not full file load)", () => {
// scanActivityLogs uses nativeParseJsonlTail + MAX_JSONL_BYTES for efficient reading
assert.ok(
forensicsSrc.includes("nativeParseJsonlTail"),
"activity log scanning must use nativeParseJsonlTail for tail-reading",
);
assert.ok(
forensicsSrc.includes("MAX_JSONL_BYTES"),
"activity log scanning must respect MAX_JSONL_BYTES cap",
);
// Only reads last 5 files
assert.ok(
forensicsSrc.includes("slice(-5)"),
"activity log scanning must limit to last 5 files",
);
});
it("activity log entries are distilled through extractTrace, not sent raw", () => {
assert.ok(
forensicsSrc.includes("extractTrace("),
"activity log entries must be distilled through extractTrace before reporting",
);
});
it("prompt output is hard-capped at 30KB", () => {
assert.ok(
forensicsSrc.includes("MAX_BYTES") && forensicsSrc.includes("30 * 1024"),
"formatReportForPrompt must have a 30KB hard cap",
);
assert.ok(
forensicsSrc.includes("truncated at 30KB"),
"prompt must show truncation message when capped",
);
});
it("forensics prompt documents journal format", () => {
assert.ok(
promptSrc.includes("### Journal Format"),