fix: make journal scanning intelligent — limit parsed files, line-count older ones

scanJournalForForensics() previously called queryJournal(), which loaded
ALL journal entries from ALL daily files into memory. For long-running
projects, this could mean thousands of entries and megabytes of data.

Now:
- Only the last 3 daily files are fully JSON-parsed (event counts, flows)
- Older files are line-counted only (no JSON parsing) for totals
- Recent events use a rolling window of 20 (shift, not accumulate)
- Constants MAX_JOURNAL_RECENT_FILES and MAX_JOURNAL_RECENT_EVENTS
  make limits explicit and tunable

Activity log scanning was already intelligent (a sketch of the pattern
follows this list):
- nativeParseJsonlTail with 10MB byte cap
- Only last 5 files scanned
- extractTrace() distills raw JSONL into compact ExecutionTrace structs
- formatReportForPrompt has 30KB hard cap on total output
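
A minimal sketch of that byte-capped tail-read pattern, for reference.
The real nativeParseJsonlTail helper lives elsewhere in the codebase and
its exact signature is not shown in this diff, so the function below is an
assumption built from the description above; only the MAX_JSONL_BYTES name
and the 10MB cap come from the code and tests.

```ts
import { openSync, fstatSync, readSync, closeSync } from "node:fs";

// 10MB cap, mirroring the MAX_JSONL_BYTES constant the tests assert on.
const MAX_JSONL_BYTES = 10 * 1024 * 1024;

/** Read at most `maxBytes` from the end of a JSONL file and parse whole lines. */
function parseJsonlTail(path: string, maxBytes = MAX_JSONL_BYTES): unknown[] {
  const fd = openSync(path, "r");
  try {
    const size = fstatSync(fd).size;
    const start = Math.max(0, size - maxBytes);
    const buf = Buffer.alloc(size - start);
    readSync(fd, buf, 0, buf.length, start);
    let text = buf.toString("utf-8");
    // If the read started mid-line, drop the partial first line.
    if (start > 0) text = text.slice(text.indexOf("\n") + 1);
    const entries: unknown[] = [];
    for (const line of text.split("\n")) {
      if (!line.trim()) continue;
      try { entries.push(JSON.parse(line)); } catch { /* skip malformed lines */ }
    }
    return entries;
  } finally {
    closeSync(fd);
  }
}
```

The real scanner additionally limits itself to the newest five activity
files (files.slice(-5)) and runs each entry through extractTrace(), so only
compact ExecutionTrace structs ever reach the prompt.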

Co-authored-by: glittercowboy <186001655+glittercowboy@users.noreply.github.com>
Agent-Logs-Url: https://github.com/gsd-build/gsd-2/sessions/7e7f71ec-0d56-409b-930e-5dff1305ff2a
copilot-swe-agent[bot] 2026-03-25 21:53:37 +00:00
parent ce4720bad8
commit aee8973d81
2 changed files with 155 additions and 30 deletions


@@ -28,7 +28,6 @@ import { deriveState } from "./state.js";
import { isAutoActive } from "./auto.js";
import { loadPrompt } from "./prompt-loader.js";
import { gsdRoot } from "./paths.js";
import { queryJournal } from "./journal.js";
import { formatDuration } from "../shared/format-utils.js";
import { getAutoWorktreePath } from "./auto-worktree.js";
import { loadEffectiveGSDPreferences, loadGlobalGSDPreferences, getGlobalGSDPreferencesPath } from "./preferences.js";
@@ -63,13 +62,19 @@ interface ActivityLogMeta {
newestFile: string | null;
}
/** Summary of .gsd/journal/ data for forensic investigation. */
/**
* Summary of .gsd/journal/ data for forensic investigation.
*
* To avoid loading huge journal histories into memory, only the most recent
* daily files are fully parsed. Older files are line-counted for totals.
* Event counts and flow IDs reflect only recent files.
*/
interface JournalSummary {
/** Total journal entries scanned */
/** Total journal entries across all files (recent parsed + older line-counted) */
totalEntries: number;
/** Distinct flow IDs (each = one auto-mode iteration) */
/** Distinct flow IDs from recent files (each = one auto-mode iteration) */
flowCount: number;
/** Event counts by type */
/** Event counts by type (from recent files only) */
eventCounts: Record<string, number>;
/** Most recent journal entries (last 20) for context */
recentEvents: { ts: string; flowId: string; eventType: string; rule?: string; unitId?: string }[];
@@ -422,6 +427,24 @@ function resolveActivityDirs(basePath: string, activeMilestone?: string | null):
// ─── Journal Scanner ──────────────────────────────────────────────────────────
/**
* Max recent journal files to fully parse for event counts and recent events.
* Older files are line-counted only to avoid loading huge amounts of data.
*/
const MAX_JOURNAL_RECENT_FILES = 3;
/** Max recent events to extract for the forensic report timeline. */
const MAX_JOURNAL_RECENT_EVENTS = 20;
/**
* Intelligently scan journal files for forensic summary.
*
* Journal files can be huge (thousands of JSONL entries over weeks of auto-mode).
* Instead of loading all entries into memory:
* - Only fully parse the most recent N daily files (event counts, flow tracking)
* - Line-count older files for approximate totals (no JSON parsing)
* - Extract only the last 20 events for the timeline
*/
function scanJournalForForensics(basePath: string): JournalSummary | null {
try {
const journalDir = join(gsdRoot(basePath), "journal");
@@ -430,33 +453,80 @@ function scanJournalForForensics(basePath: string): JournalSummary | null {
const files = readdirSync(journalDir).filter(f => f.endsWith(".jsonl")).sort();
if (files.length === 0) return null;
const entries = queryJournal(basePath);
if (entries.length === 0) return null;
// Split into recent (fully parsed) and older (line-counted only)
const recentFiles = files.slice(-MAX_JOURNAL_RECENT_FILES);
const olderFiles = files.slice(0, -MAX_JOURNAL_RECENT_FILES);
// Count events by type
const eventCounts: Record<string, number> = {};
const flowIds = new Set<string>();
for (const e of entries) {
eventCounts[e.eventType] = (eventCounts[e.eventType] ?? 0) + 1;
flowIds.add(e.flowId);
// Line-count older files without parsing — avoids loading megabytes of JSON
let olderEntryCount = 0;
let oldestEntry: string | null = null;
for (const file of olderFiles) {
try {
const raw = readFileSync(join(journalDir, file), "utf-8");
const lines = raw.split("\n");
for (const line of lines) {
if (!line.trim()) continue;
olderEntryCount++;
// Extract only the timestamp from the first non-empty line of the oldest file
if (!oldestEntry) {
try {
const parsed = JSON.parse(line) as { ts?: string };
if (parsed.ts) oldestEntry = parsed.ts;
} catch { /* skip malformed */ }
}
}
} catch { /* skip unreadable files */ }
}
// Extract recent events (last 20) with key fields for the report
const recentEvents = entries.slice(-20).map(e => ({
ts: e.ts,
flowId: e.flowId,
eventType: e.eventType,
rule: e.rule,
unitId: e.data?.unitId as string | undefined,
}));
// Fully parse recent files for event counts and timeline
const eventCounts: Record<string, number> = {};
const flowIds = new Set<string>();
const recentParsedEntries: { ts: string; flowId: string; eventType: string; rule?: string; unitId?: string }[] = [];
let recentEntryCount = 0;
for (const file of recentFiles) {
try {
const raw = readFileSync(join(journalDir, file), "utf-8");
for (const line of raw.split("\n")) {
if (!line.trim()) continue;
try {
const entry = JSON.parse(line) as { ts: string; flowId: string; eventType: string; rule?: string; data?: Record<string, unknown> };
recentEntryCount++;
eventCounts[entry.eventType] = (eventCounts[entry.eventType] ?? 0) + 1;
flowIds.add(entry.flowId);
if (!oldestEntry) oldestEntry = entry.ts;
// Keep a rolling window of last N events — avoids accumulating unbounded arrays
recentParsedEntries.push({
ts: entry.ts,
flowId: entry.flowId,
eventType: entry.eventType,
rule: entry.rule,
unitId: entry.data?.unitId as string | undefined,
});
if (recentParsedEntries.length > MAX_JOURNAL_RECENT_EVENTS) {
recentParsedEntries.shift();
}
} catch { /* skip malformed lines */ }
}
} catch { /* skip unreadable files */ }
}
const totalEntries = olderEntryCount + recentEntryCount;
if (totalEntries === 0) return null;
const newestEntry = recentParsedEntries.length > 0
? recentParsedEntries[recentParsedEntries.length - 1]!.ts
: null;
return {
totalEntries: entries.length,
totalEntries,
flowCount: flowIds.size,
eventCounts,
recentEvents,
oldestEntry: entries[0]?.ts ?? null,
newestEntry: entries[entries.length - 1]?.ts ?? null,
recentEvents: recentParsedEntries,
oldestEntry,
newestEntry,
fileCount: files.length,
};
} catch {

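For orientation, a hypothetical sketch of how a caller might fold the
returned JournalSummary into the forensic report. The real report assembly
(formatReportForPrompt) is not part of this diff, so the function name and
line formatting below are assumptions; only the summary fields come from
the code above.

```ts
// Hypothetical consumer, assumed to sit next to the scanner in forensics.ts.
function summarizeJournalForReport(basePath: string): string[] {
  const summary = scanJournalForForensics(basePath);
  if (!summary) return [];
  return [
    `Journal: ${summary.totalEntries} entries across ${summary.fileCount} daily files`,
    `Span: ${summary.oldestEntry ?? "unknown"} -> ${summary.newestEntry ?? "unknown"}`,
    `Flows in recent files: ${summary.flowCount}`,
    ...Object.entries(summary.eventCounts).map(([type, n]) => `  ${type}: ${n}`),
    "Recent events:",
    ...summary.recentEvents.map(
      e => `  ${e.ts} ${e.eventType} flow=${e.flowId}${e.unitId ? ` unit=${e.unitId}` : ""}`,
    ),
  ];
}
```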

@@ -11,14 +11,34 @@ describe("forensics journal & activity log awareness", () => {
const forensicsSrc = readFileSync(join(gsdDir, "forensics.ts"), "utf-8");
const promptSrc = readFileSync(join(gsdDir, "prompts", "forensics.md"), "utf-8");
it("forensics.ts imports queryJournal from journal module", () => {
it("scanJournalForForensics reads journal files directly (no full queryJournal load)", () => {
// Must NOT use queryJournal which loads ALL entries into memory
assert.ok(
forensicsSrc.includes('from "./journal.js"') || forensicsSrc.includes("from './journal.js'"),
"forensics.ts must import from journal.js",
!forensicsSrc.includes('queryJournal('),
"forensics.ts must NOT call queryJournal() which loads all entries at once",
);
// Must have its own journal scanning with file-level limits
assert.ok(
forensicsSrc.includes("scanJournalForForensics"),
"forensics.ts must have scanJournalForForensics function",
);
});
it("journal scanning limits files parsed to avoid memory bloat", () => {
assert.ok(
forensicsSrc.includes("MAX_JOURNAL_RECENT_FILES"),
"must have MAX_JOURNAL_RECENT_FILES constant to limit parsed files",
);
assert.ok(
forensicsSrc.includes("queryJournal"),
"forensics.ts must reference queryJournal",
forensicsSrc.includes("MAX_JOURNAL_RECENT_EVENTS"),
"must have MAX_JOURNAL_RECENT_EVENTS constant to limit events extracted",
);
});
it("older journal files are line-counted without full JSON parse", () => {
assert.ok(
forensicsSrc.includes("olderEntryCount") || forensicsSrc.includes("olderFiles"),
"must handle older files separately from recent files",
);
});
@@ -76,6 +96,41 @@ describe("forensics journal & activity log awareness", () => {
);
});
it("activity log scanning uses tail-read with byte cap (not full file load)", () => {
// scanActivityLogs uses nativeParseJsonlTail + MAX_JSONL_BYTES for efficient reading
assert.ok(
forensicsSrc.includes("nativeParseJsonlTail"),
"activity log scanning must use nativeParseJsonlTail for tail-reading",
);
assert.ok(
forensicsSrc.includes("MAX_JSONL_BYTES"),
"activity log scanning must respect MAX_JSONL_BYTES cap",
);
// Only reads last 5 files
assert.ok(
forensicsSrc.includes("slice(-5)"),
"activity log scanning must limit to last 5 files",
);
});
it("activity log entries are distilled through extractTrace, not sent raw", () => {
assert.ok(
forensicsSrc.includes("extractTrace("),
"activity log entries must be distilled through extractTrace before reporting",
);
});
it("prompt output is hard-capped at 30KB", () => {
assert.ok(
forensicsSrc.includes("MAX_BYTES") && forensicsSrc.includes("30 * 1024"),
"formatReportForPrompt must have a 30KB hard cap",
);
assert.ok(
forensicsSrc.includes("truncated at 30KB"),
"prompt must show truncation message when capped",
);
});
it("forensics prompt documents journal format", () => {
assert.ok(
promptSrc.includes("### Journal Format"),