From aee8973d81aeb97297a62c3611966882cd3ef98f Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 25 Mar 2026 21:53:37 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20make=20journal=20scanning=20intelligent?=
 =?UTF-8?q?=20=E2=80=94=20limit=20parsed=20files,=20line-count=20older=20o?=
 =?UTF-8?q?nes?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

scanJournalForForensics() previously called queryJournal() which loaded
ALL journal entries from ALL daily files into memory. For long-running
projects this could be thousands of entries and megabytes of data.

Now:
- Only the last 3 daily files are fully JSON-parsed (event counts, flows)
- Older files are line-counted only for totals; the sole JSON parsing there
  recovers the oldest timestamp from the first line that carries a `ts`
- Recent events use a rolling window of 20 (shift, not accumulate)
- Constants MAX_JOURNAL_RECENT_FILES and MAX_JOURNAL_RECENT_EVENTS
  make limits explicit and tunable

Activity log scanning was already intelligent:
- nativeParseJsonlTail with 10MB byte cap
- Only last 5 files scanned
- extractTrace() distills raw JSONL into compact ExecutionTrace structs
- formatReportForPrompt has 30KB hard cap on total output

Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
Agent-Logs-Url: https://github.com/gsd-build/gsd-2/sessions/7e7f71ec-0d56-409b-930e-5dff1305ff2a
---
 src/resources/extensions/gsd/forensics.ts     | 120 ++++++++++++++----
 .../gsd/tests/forensics-journal.test.ts       |  65 +++++++++-
 2 files changed, 155 insertions(+), 30 deletions(-)

diff --git a/src/resources/extensions/gsd/forensics.ts b/src/resources/extensions/gsd/forensics.ts
index f6dd0b022..78c074202 100644
--- a/src/resources/extensions/gsd/forensics.ts
+++ b/src/resources/extensions/gsd/forensics.ts
@@ -28,7 +28,6 @@ import { deriveState } from "./state.js";
 import { isAutoActive } from "./auto.js";
 import { loadPrompt } from "./prompt-loader.js";
 import { gsdRoot } from "./paths.js";
-import { queryJournal } from "./journal.js";
 import { formatDuration } from "../shared/format-utils.js";
 import { getAutoWorktreePath } from "./auto-worktree.js";
 import { loadEffectiveGSDPreferences, loadGlobalGSDPreferences, getGlobalGSDPreferencesPath } from "./preferences.js";
@@ -63,13 +62,19 @@ interface ActivityLogMeta {
   newestFile: string | null;
 }
 
-/** Summary of .gsd/journal/ data for forensic investigation. */
+/**
+ * Summary of .gsd/journal/ data for forensic investigation.
+ *
+ * To avoid loading huge journal histories into memory, only the most recent
+ * daily files are fully parsed. Older files are line-counted for totals.
+ * Event counts and flow IDs reflect only recent files.
+ */
 interface JournalSummary {
-  /** Total journal entries scanned */
+  /** Total journal entries across all files (recent parsed + older line-counted) */
   totalEntries: number;
-  /** Distinct flow IDs (each = one auto-mode iteration) */
+  /** Distinct flow IDs from recent files (each = one auto-mode iteration) */
   flowCount: number;
-  /** Event counts by type */
+  /** Event counts by type (from recent files only) */
   eventCounts: Record<string, number>;
   /** Most recent journal entries (last 20) for context */
   recentEvents: { ts: string; flowId: string; eventType: string; rule?: string; unitId?: string }[];
@@ -422,6 +427,24 @@ function resolveActivityDirs(basePath: string, activeMilestone?: string | null):
 
 // ─── Journal Scanner ──────────────────────────────────────────────────────────
 
+/**
+ * Max recent journal files to fully parse for event counts and recent events.
+ * Older files are line-counted only to avoid loading huge amounts of data.
+ */
+const MAX_JOURNAL_RECENT_FILES = 3;
+
+/** Max recent events to extract for the forensic report timeline. */
+const MAX_JOURNAL_RECENT_EVENTS = 20;
+
+/**
+ * Intelligently scan journal files for forensic summary.
+ *
+ * Journal files can be huge (thousands of JSONL entries over weeks of auto-mode).
+ * Instead of loading all entries into memory:
+ * - Only fully parse the most recent N daily files (event counts, flow tracking)
+ * - Line-count older files for approximate totals (no JSON parsing)
+ * - Extract only the last MAX_JOURNAL_RECENT_EVENTS events for the timeline
+ */
 function scanJournalForForensics(basePath: string): JournalSummary | null {
   try {
     const journalDir = join(gsdRoot(basePath), "journal");
@@ -430,33 +453,80 @@ function scanJournalForForensics(basePath: string): JournalSummary | null {
     const files = readdirSync(journalDir).filter(f => f.endsWith(".jsonl")).sort();
     if (files.length === 0) return null;
 
-    const entries = queryJournal(basePath);
-    if (entries.length === 0) return null;
+    // Split into recent (fully parsed) and older (line-counted only)
+    const recentFiles = files.slice(-MAX_JOURNAL_RECENT_FILES);
+    const olderFiles = files.slice(0, -MAX_JOURNAL_RECENT_FILES);
 
-    // Count events by type
-    const eventCounts: Record<string, number> = {};
-    const flowIds = new Set<string>();
-    for (const e of entries) {
-      eventCounts[e.eventType] = (eventCounts[e.eventType] ?? 0) + 1;
-      flowIds.add(e.flowId);
+    // Line-count older files without JSON-parsing each entry (each file is still read whole, one at a time)
+    let olderEntryCount = 0;
+    let oldestEntry: string | null = null;
+    for (const file of olderFiles) {
+      try {
+        const raw = readFileSync(join(journalDir, file), "utf-8");
+        const lines = raw.split("\n");
+        for (const line of lines) {
+          if (!line.trim()) continue;
+          olderEntryCount++;
+          // Until a timestamp is found, parse lines to recover the oldest `ts` (normally the first line of the oldest file)
+          if (!oldestEntry) {
+            try {
+              const parsed = JSON.parse(line) as { ts?: string };
+              if (parsed.ts) oldestEntry = parsed.ts;
+            } catch { /* skip malformed */ }
+          }
+        }
+      } catch { /* skip unreadable files */ }
     }
 
-    // Extract recent events (last 20) with key fields for the report
-    const recentEvents = entries.slice(-20).map(e => ({
-      ts: e.ts,
-      flowId: e.flowId,
-      eventType: e.eventType,
-      rule: e.rule,
-      unitId: e.data?.unitId as string | undefined,
-    }));
+    // Fully parse recent files for event counts and timeline
+    const eventCounts: Record<string, number> = {};
+    const flowIds = new Set<string>();
+    const recentParsedEntries: { ts: string; flowId: string; eventType: string; rule?: string; unitId?: string }[] = [];
+    let recentEntryCount = 0;
+
+    for (const file of recentFiles) {
+      try {
+        const raw = readFileSync(join(journalDir, file), "utf-8");
+        for (const line of raw.split("\n")) {
+          if (!line.trim()) continue;
+          try {
+            const entry = JSON.parse(line) as { ts: string; flowId: string; eventType: string; rule?: string; data?: Record<string, unknown> };
+            recentEntryCount++;
+            eventCounts[entry.eventType] = (eventCounts[entry.eventType] ?? 0) + 1;
+            flowIds.add(entry.flowId);
+
+            if (!oldestEntry) oldestEntry = entry.ts;
+
+            // Keep a rolling window of last N events — avoids accumulating unbounded arrays
+            recentParsedEntries.push({
+              ts: entry.ts,
+              flowId: entry.flowId,
+              eventType: entry.eventType,
+              rule: entry.rule,
+              unitId: entry.data?.unitId as string | undefined,
+            });
+            if (recentParsedEntries.length > MAX_JOURNAL_RECENT_EVENTS) {
+              recentParsedEntries.shift();
+            }
+          } catch { /* skip malformed lines */ }
+        }
+      } catch { /* skip unreadable files */ }
+    }
+
+    const totalEntries = olderEntryCount + recentEntryCount;
+    if (totalEntries === 0) return null;
+
+    const newestEntry = recentParsedEntries.length > 0
+      ? recentParsedEntries[recentParsedEntries.length - 1]!.ts
+      : null;
 
     return {
-      totalEntries: entries.length,
+      totalEntries,
       flowCount: flowIds.size,
       eventCounts,
-      recentEvents,
-      oldestEntry: entries[0]?.ts ?? null,
-      newestEntry: entries[entries.length - 1]?.ts ?? null,
+      recentEvents: recentParsedEntries,
+      oldestEntry,
+      newestEntry,
       fileCount: files.length,
     };
   } catch {
diff --git a/src/resources/extensions/gsd/tests/forensics-journal.test.ts b/src/resources/extensions/gsd/tests/forensics-journal.test.ts
index f086e6f6f..ead29c00a 100644
--- a/src/resources/extensions/gsd/tests/forensics-journal.test.ts
+++ b/src/resources/extensions/gsd/tests/forensics-journal.test.ts
@@ -11,14 +11,34 @@ describe("forensics journal & activity log awareness", () => {
   const forensicsSrc = readFileSync(join(gsdDir, "forensics.ts"), "utf-8");
   const promptSrc = readFileSync(join(gsdDir, "prompts", "forensics.md"), "utf-8");
 
-  it("forensics.ts imports queryJournal from journal module", () => {
+  it("scanJournalForForensics reads journal files directly (no full queryJournal load)", () => {
+    // Must NOT use queryJournal which loads ALL entries into memory
     assert.ok(
-      forensicsSrc.includes('from "./journal.js"') || forensicsSrc.includes("from './journal.js'"),
-      "forensics.ts must import from journal.js",
+      !forensicsSrc.includes('queryJournal('),
+      "forensics.ts must NOT call queryJournal() which loads all entries at once",
+    );
+    // Must have its own journal scanning with file-level limits
+    assert.ok(
+      forensicsSrc.includes("scanJournalForForensics"),
+      "forensics.ts must have scanJournalForForensics function",
+    );
+  });
+
+  it("journal scanning limits files parsed to avoid memory bloat", () => {
+    assert.ok(
+      forensicsSrc.includes("MAX_JOURNAL_RECENT_FILES"),
+      "must have MAX_JOURNAL_RECENT_FILES constant to limit parsed files",
     );
     assert.ok(
-      forensicsSrc.includes("queryJournal"),
-      "forensics.ts must reference queryJournal",
+      forensicsSrc.includes("MAX_JOURNAL_RECENT_EVENTS"),
+      "must have MAX_JOURNAL_RECENT_EVENTS constant to limit events extracted",
+    );
+  });
+
+  it("older journal files are line-counted without full JSON parse", () => {
+    assert.ok(
+      forensicsSrc.includes("olderEntryCount") || forensicsSrc.includes("olderFiles"),
+      "must handle older files separately from recent files",
     );
   });
 
@@ -76,6 +96,41 @@ describe("forensics journal & activity log awareness", () => {
     );
   });
 
+  it("activity log scanning uses tail-read with byte cap (not full file load)", () => {
+    // scanActivityLogs uses nativeParseJsonlTail + MAX_JSONL_BYTES for efficient reading
+    assert.ok(
+      forensicsSrc.includes("nativeParseJsonlTail"),
+      "activity log scanning must use nativeParseJsonlTail for tail-reading",
+    );
+    assert.ok(
+      forensicsSrc.includes("MAX_JSONL_BYTES"),
+      "activity log scanning must respect MAX_JSONL_BYTES cap",
+    );
+    // Only reads last 5 files
+    assert.ok(
+      forensicsSrc.includes("slice(-5)"),
+      "activity log scanning must limit to last 5 files",
+    );
+  });
+
+  it("activity log entries are distilled through extractTrace, not sent raw", () => {
+    assert.ok(
+      forensicsSrc.includes("extractTrace("),
+      "activity log entries must be distilled through extractTrace before reporting",
+    );
+  });
+
+  it("prompt output is hard-capped at 30KB", () => {
+    assert.ok(
+      forensicsSrc.includes("MAX_BYTES") && forensicsSrc.includes("30 * 1024"),
+      "formatReportForPrompt must have a 30KB hard cap",
+    );
+    assert.ok(
+      forensicsSrc.includes("truncated at 30KB"),
+      "prompt must show truncation message when capped",
+    );
+  });
+
   it("forensics prompt documents journal format", () => {
     assert.ok(
       promptSrc.includes("### Journal Format"),