From aee8973d81aeb97297a62c3611966882cd3ef98f Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Wed, 25 Mar 2026 21:53:37 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20make=20journal=20scanning=20intelligent?= =?UTF-8?q?=20=E2=80=94=20limit=20parsed=20files,=20line-count=20older=20o?= =?UTF-8?q?nes?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scanJournalForForensics() previously called queryJournal() which loaded ALL journal entries from ALL daily files into memory. For long-running projects this could be thousands of entries and megabytes of data. Now: - Only the last 3 daily files are fully JSON-parsed (event counts, flows) - Older files are only line-counted for totals (no JSON parsing beyond reading the oldest entry's timestamp) - Recent events use a rolling window of 20 (shift, not accumulate) - Constants MAX_JOURNAL_RECENT_FILES and MAX_JOURNAL_RECENT_EVENTS make limits explicit and tunable Activity log scanning was already intelligent: - nativeParseJsonlTail with 10MB byte cap - Only last 5 files scanned - extractTrace() distills raw JSONL into compact ExecutionTrace structs - formatReportForPrompt has 30KB hard cap on total output Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com> Agent-Logs-Url: https://github.com/gsd-build/gsd-2/sessions/7e7f71ec-0d56-409b-930e-5dff1305ff2a --- src/resources/extensions/gsd/forensics.ts | 120 ++++++++++++++---- .../gsd/tests/forensics-journal.test.ts | 65 +++++++++- 2 files changed, 155 insertions(+), 30 deletions(-) diff --git a/src/resources/extensions/gsd/forensics.ts b/src/resources/extensions/gsd/forensics.ts index f6dd0b022..78c074202 100644 --- a/src/resources/extensions/gsd/forensics.ts +++ b/src/resources/extensions/gsd/forensics.ts @@ -28,7 +28,6 @@ import { deriveState } from "./state.js"; import { isAutoActive } from "./auto.js"; import { loadPrompt } from "./prompt-loader.js"; import { gsdRoot } from "./paths.js"; -import { 
queryJournal } from "./journal.js"; import { formatDuration } from "../shared/format-utils.js"; import { getAutoWorktreePath } from "./auto-worktree.js"; import { loadEffectiveGSDPreferences, loadGlobalGSDPreferences, getGlobalGSDPreferencesPath } from "./preferences.js"; @@ -63,13 +62,19 @@ interface ActivityLogMeta { newestFile: string | null; } -/** Summary of .gsd/journal/ data for forensic investigation. */ +/** + * Summary of .gsd/journal/ data for forensic investigation. + * + * To avoid loading huge journal histories into memory, only the most recent + * daily files are fully parsed. Older files are line-counted for totals. + * Event counts and flow IDs reflect only recent files. + */ interface JournalSummary { - /** Total journal entries scanned */ + /** Total journal entries across all files (recent parsed + older line-counted) */ totalEntries: number; - /** Distinct flow IDs (each = one auto-mode iteration) */ + /** Distinct flow IDs from recent files (each = one auto-mode iteration) */ flowCount: number; - /** Event counts by type */ + /** Event counts by type (from recent files only) */ eventCounts: Record; /** Most recent journal entries (last 20) for context */ recentEvents: { ts: string; flowId: string; eventType: string; rule?: string; unitId?: string }[]; @@ -422,6 +427,24 @@ function resolveActivityDirs(basePath: string, activeMilestone?: string | null): // ─── Journal Scanner ────────────────────────────────────────────────────────── +/** + * Max recent journal files to fully parse for event counts and recent events. + * Older files are line-counted only to avoid loading huge amounts of data. + */ +const MAX_JOURNAL_RECENT_FILES = 3; + +/** Max recent events to extract for the forensic report timeline. */ +const MAX_JOURNAL_RECENT_EVENTS = 20; + +/** + * Intelligently scan journal files for forensic summary. + * + * Journal files can be huge (thousands of JSONL entries over weeks of auto-mode). 
+ * Instead of loading all entries into memory: + * - Only fully parse the most recent N daily files (event counts, flow tracking) + * - Line-count older files for approximate totals (no JSON parsing beyond the oldest timestamp) + * - Extract only the last MAX_JOURNAL_RECENT_EVENTS events for the timeline + */ function scanJournalForForensics(basePath: string): JournalSummary | null { try { const journalDir = join(gsdRoot(basePath), "journal"); @@ -430,33 +453,80 @@ function scanJournalForForensics(basePath: string): JournalSummary | null { const files = readdirSync(journalDir).filter(f => f.endsWith(".jsonl")).sort(); if (files.length === 0) return null; - const entries = queryJournal(basePath); - if (entries.length === 0) return null; + // Split into recent (fully parsed) and older (line-counted only) + const recentFiles = files.slice(-MAX_JOURNAL_RECENT_FILES); + const olderFiles = files.slice(0, -MAX_JOURNAL_RECENT_FILES); - // Count events by type - const eventCounts: Record = {}; - const flowIds = new Set(); - for (const e of entries) { - eventCounts[e.eventType] = (eventCounts[e.eventType] ?? 
0) + 1; - flowIds.add(e.flowId); + // Line-count older files without parsing — avoids loading megabytes of JSON + let olderEntryCount = 0; + let oldestEntry: string | null = null; + for (const file of olderFiles) { + try { + const raw = readFileSync(join(journalDir, file), "utf-8"); + const lines = raw.split("\n"); + for (const line of lines) { + if (!line.trim()) continue; + olderEntryCount++; + // Extract only the timestamp from the first non-empty line of the oldest file + if (!oldestEntry) { + try { + const parsed = JSON.parse(line) as { ts?: string }; + if (parsed.ts) oldestEntry = parsed.ts; + } catch { /* skip malformed */ } + } + } + } catch { /* skip unreadable files */ } } - // Extract recent events (last 20) with key fields for the report - const recentEvents = entries.slice(-20).map(e => ({ - ts: e.ts, - flowId: e.flowId, - eventType: e.eventType, - rule: e.rule, - unitId: e.data?.unitId as string | undefined, - })); + // Fully parse recent files for event counts and timeline + const eventCounts: Record = {}; + const flowIds = new Set(); + const recentParsedEntries: { ts: string; flowId: string; eventType: string; rule?: string; unitId?: string }[] = []; + let recentEntryCount = 0; + + for (const file of recentFiles) { + try { + const raw = readFileSync(join(journalDir, file), "utf-8"); + for (const line of raw.split("\n")) { + if (!line.trim()) continue; + try { + const entry = JSON.parse(line) as { ts: string; flowId: string; eventType: string; rule?: string; data?: Record }; + recentEntryCount++; + eventCounts[entry.eventType] = (eventCounts[entry.eventType] ?? 
0) + 1; + flowIds.add(entry.flowId); + + if (!oldestEntry) oldestEntry = entry.ts; + + // Keep a rolling window of last N events — avoids accumulating unbounded arrays + recentParsedEntries.push({ + ts: entry.ts, + flowId: entry.flowId, + eventType: entry.eventType, + rule: entry.rule, + unitId: entry.data?.unitId as string | undefined, + }); + if (recentParsedEntries.length > MAX_JOURNAL_RECENT_EVENTS) { + recentParsedEntries.shift(); + } + } catch { /* skip malformed lines */ } + } + } catch { /* skip unreadable files */ } + } + + const totalEntries = olderEntryCount + recentEntryCount; + if (totalEntries === 0) return null; + + const newestEntry = recentParsedEntries.length > 0 + ? recentParsedEntries[recentParsedEntries.length - 1]!.ts + : null; return { - totalEntries: entries.length, + totalEntries, flowCount: flowIds.size, eventCounts, - recentEvents, - oldestEntry: entries[0]?.ts ?? null, - newestEntry: entries[entries.length - 1]?.ts ?? null, + recentEvents: recentParsedEntries, + oldestEntry, + newestEntry, fileCount: files.length, }; } catch { diff --git a/src/resources/extensions/gsd/tests/forensics-journal.test.ts b/src/resources/extensions/gsd/tests/forensics-journal.test.ts index f086e6f6f..ead29c00a 100644 --- a/src/resources/extensions/gsd/tests/forensics-journal.test.ts +++ b/src/resources/extensions/gsd/tests/forensics-journal.test.ts @@ -11,14 +11,34 @@ describe("forensics journal & activity log awareness", () => { const forensicsSrc = readFileSync(join(gsdDir, "forensics.ts"), "utf-8"); const promptSrc = readFileSync(join(gsdDir, "prompts", "forensics.md"), "utf-8"); - it("forensics.ts imports queryJournal from journal module", () => { + it("scanJournalForForensics reads journal files directly (no full queryJournal load)", () => { + // Must NOT use queryJournal which loads ALL entries into memory assert.ok( - forensicsSrc.includes('from "./journal.js"') || forensicsSrc.includes("from './journal.js'"), - "forensics.ts must import from 
journal.js", + !forensicsSrc.includes('queryJournal('), + "forensics.ts must NOT call queryJournal() which loads all entries at once", + ); + // Must have its own journal scanning with file-level limits + assert.ok( + forensicsSrc.includes("scanJournalForForensics"), + "forensics.ts must have scanJournalForForensics function", + ); + }); + + it("journal scanning limits files parsed to avoid memory bloat", () => { + assert.ok( + forensicsSrc.includes("MAX_JOURNAL_RECENT_FILES"), + "must have MAX_JOURNAL_RECENT_FILES constant to limit parsed files", ); assert.ok( - forensicsSrc.includes("queryJournal"), - "forensics.ts must reference queryJournal", + forensicsSrc.includes("MAX_JOURNAL_RECENT_EVENTS"), + "must have MAX_JOURNAL_RECENT_EVENTS constant to limit events extracted", + ); + }); + + it("older journal files are line-counted without full JSON parse", () => { + assert.ok( + forensicsSrc.includes("olderEntryCount") || forensicsSrc.includes("olderFiles"), + "must handle older files separately from recent files", ); }); @@ -76,6 +96,41 @@ describe("forensics journal & activity log awareness", () => { ); }); + it("activity log scanning uses tail-read with byte cap (not full file load)", () => { + // scanActivityLogs uses nativeParseJsonlTail + MAX_JSONL_BYTES for efficient reading + assert.ok( + forensicsSrc.includes("nativeParseJsonlTail"), + "activity log scanning must use nativeParseJsonlTail for tail-reading", + ); + assert.ok( + forensicsSrc.includes("MAX_JSONL_BYTES"), + "activity log scanning must respect MAX_JSONL_BYTES cap", + ); + // Only reads last 5 files + assert.ok( + forensicsSrc.includes("slice(-5)"), + "activity log scanning must limit to last 5 files", + ); + }); + + it("activity log entries are distilled through extractTrace, not sent raw", () => { + assert.ok( + forensicsSrc.includes("extractTrace("), + "activity log entries must be distilled through extractTrace before reporting", + ); + }); + + it("prompt output is hard-capped at 30KB", () => { 
+ assert.ok( + forensicsSrc.includes("MAX_BYTES") && forensicsSrc.includes("30 * 1024"), + "formatReportForPrompt must have a 30KB hard cap", + ); + assert.ok( + forensicsSrc.includes("truncated at 30KB"), + "prompt must show truncation message when capped", + ); + }); + it("forensics prompt documents journal format", () => { assert.ok( promptSrc.includes("### Journal Format"),