fix: prevent heap OOM during long-running auto-mode sessions (#611) (#613)

Multiple sources of unbounded memory growth caused V8 to OOM after
~50 minutes of auto-mode operation:

1. activity-log.ts: saveActivityLog serialized ALL session entries
   into a single string for SHA1 dedup, allocating hundreds of MB
   per unit cycle. Now uses streaming writes (writeSync per entry)
   and a lightweight fingerprint (entry count + last 3 entries hash)
   instead of full-content hashing.

2. activity-log.ts: activityLogState Map was never cleared between
   sessions, accumulating lastSnapshotKeyByUnit entries indefinitely.
   Added clearActivityLogState() export, called from stopAuto().

3. auto.ts: completedUnits array grew unbounded for dashboard
   display. Now capped at 200 entries and cleared on stopAuto().

4. paths.ts: dirEntryCache and dirListCache Maps grew without bounds
   between clearPathCache() calls. Added DIR_CACHE_MAX (200) eviction
   — when a cache reaches the limit, it is cleared in full before the
   new entry is added.

Closes #611
This commit is contained in:
Flux Labs 2026-03-16 07:57:39 -05:00 committed by GitHub
parent 27cfababdb
commit b0f880689b
4 changed files with 138 additions and 8 deletions

View file

@ -8,7 +8,7 @@
* Diagnostic extraction is handled by session-forensics.ts.
*/
import { writeFileSync, mkdirSync, readdirSync, unlinkSync, statSync, openSync, closeSync, constants } from "node:fs";
import { writeFileSync, writeSync, mkdirSync, readdirSync, unlinkSync, statSync, openSync, closeSync, constants } from "node:fs";
import { createHash } from "node:crypto";
import { join } from "node:path";
@ -23,6 +23,15 @@ interface ActivityLogState {
const activityLogState = new Map<string, ActivityLogState>();
/**
* Clear accumulated activity log state (#611).
* Call when auto-mode stops to prevent unbounded memory growth
* from lastSnapshotKeyByUnit maps accumulating across units.
*/
export function clearActivityLogState(): void {
  // Drops every per-activity-dir state object (sequence counters plus the
  // lastSnapshotKeyByUnit dedup maps), so nothing survives into the next
  // auto-mode session. Safe to call repeatedly; clearing an empty Map is a no-op.
  activityLogState.clear();
}
function scanNextSequence(activityDir: string): number {
let maxSeq = 0;
try {
@ -46,9 +55,21 @@ function getActivityState(activityDir: string): ActivityLogState {
return state;
}
function snapshotKey(unitType: string, unitId: string, content: string): string {
const digest = createHash("sha1").update(content).digest("hex");
return `${unitType}\0${unitId}\0${digest}`;
/**
 * Build a lightweight dedup key from session entries without serializing
 * the entire content to a string (#611). Uses entry count + hash of
 * the last few entries as a fingerprint instead of hashing megabytes.
 *
 * @param unitType unit kind (e.g. "execute-task"); part of the key namespace
 * @param unitId   sanitized unit id; part of the key namespace
 * @param entries  full session entry list — only length and the tail are read
 * @returns hex SHA-1 fingerprint for dedup comparison
 */
function snapshotKey(unitType: string, unitId: string, entries: unknown[]): string {
  const hash = createHash("sha1");
  hash.update(`${unitType}\0${unitId}\0${entries.length}\0`);
  // Hash only the last 3 entries as a fingerprint — if the session grew,
  // the count change alone detects it; if content changed, the tail hash catches it.
  for (const entry of entries.slice(-3)) {
    // JSON.stringify returns undefined (not a string) for undefined / functions /
    // symbols; coerce so hash.update never throws on a non-string input.
    hash.update(JSON.stringify(entry) ?? "undefined");
    // Delimit entries: without a separator, adjacent serializations can
    // collide — e.g. [1, 23] and [12, 3] would both feed "123" to the hash.
    hash.update("\0");
  }
  return hash.digest("hex");
}
function nextActivityFilePath(
@ -91,14 +112,23 @@ export function saveActivityLog(
mkdirSync(activityDir, { recursive: true });
const safeUnitId = unitId.replace(/\//g, "-");
const content = `${entries.map(entry => JSON.stringify(entry)).join("\n")}\n`;
const state = getActivityState(activityDir);
const unitKey = `${unitType}\0${safeUnitId}`;
const key = snapshotKey(unitType, safeUnitId, content);
// Use lightweight fingerprint instead of serializing all entries (#611)
const key = snapshotKey(unitType, safeUnitId, entries);
if (state.lastSnapshotKeyByUnit.get(unitKey) === key) return;
const filePath = nextActivityFilePath(activityDir, state, unitType, safeUnitId);
writeFileSync(filePath, content, "utf-8");
// Stream entries to disk line-by-line instead of building one massive string (#611).
// For large sessions, the single-string approach allocated hundreds of MB.
const fd = openSync(filePath, "w");
try {
for (const entry of entries) {
writeSync(fd, JSON.stringify(entry) + "\n");
}
} finally {
closeSync(fd);
}
state.nextSeq += 1;
state.lastSnapshotKeyByUnit.set(unitKey, key);
} catch (e) {

View file

@ -29,7 +29,7 @@ import {
buildMilestoneFileName, buildSliceFileName, buildTaskFileName,
} from "./paths.js";
import { invalidateAllCaches } from "./cache.js";
import { saveActivityLog } from "./activity-log.js";
import { saveActivityLog, clearActivityLogState } from "./activity-log.js";
import { synthesizeCrashRecovery, getDeepDiagnostic } from "./session-forensics.js";
import { writeLock, clearLock, readCrashLock, formatCrashInfo, isLockProcessAlive } from "./crash-recovery.js";
import {
@ -485,7 +485,9 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
currentUnit = null;
currentMilestoneId = null;
originalBasePath = "";
completedUnits = [];
clearSliceProgressCache();
clearActivityLogState();
pendingCrashRecovery = null;
_handlingAgentEnd = false;
ctx?.ui.setStatus("gsd-auto", undefined);
@ -1784,6 +1786,10 @@ async function dispatchNextUnit(
startedAt: currentUnit.startedAt,
finishedAt: Date.now(),
});
// Cap to last 200 entries to prevent unbounded growth (#611)
if (completedUnits.length > 200) {
completedUnits = completedUnits.slice(-200);
}
clearUnitRuntimeRecord(basePath, currentUnit.type, currentUnit.id);
unitDispatchCount.delete(`${currentUnit.type}/${currentUnit.id}`);
unitRecoveryCount.delete(`${currentUnit.type}/${currentUnit.id}`);

View file

@ -15,6 +15,9 @@ import { nativeScanGsdTree, type GsdTreeEntry } from "./native-parser-bridge.js"
// ─── Directory Listing Cache ──────────────────────────────────────────────────
/** Max entries before eviction. Prevents unbounded growth in long sessions (#611). */
const DIR_CACHE_MAX = 200;
const dirEntryCache = new Map<string, Dirent[]>();
const dirListCache = new Map<string, string[]>();
@ -85,6 +88,7 @@ function cachedReaddirWithTypes(dirPath: string): Dirent[] {
d.isSocket = () => false;
return d;
});
if (dirEntryCache.size >= DIR_CACHE_MAX) dirEntryCache.clear();
dirEntryCache.set(dirPath, dirents);
return dirents;
}
@ -92,6 +96,7 @@ function cachedReaddirWithTypes(dirPath: string): Dirent[] {
}
const entries = readdirSync(dirPath, { withFileTypes: true });
if (dirEntryCache.size >= DIR_CACHE_MAX) dirEntryCache.clear();
dirEntryCache.set(dirPath, entries);
return entries;
}
@ -107,6 +112,7 @@ function cachedReaddir(dirPath: string): string[] {
const treeEntries = nativeTreeCache.get(key);
if (treeEntries) {
const names = treeEntries.map(e => e.name);
if (dirListCache.size >= DIR_CACHE_MAX) dirListCache.clear();
dirListCache.set(dirPath, names);
return names;
}
@ -114,6 +120,7 @@ function cachedReaddir(dirPath: string): string[] {
}
const entries = readdirSync(dirPath);
if (dirListCache.size >= DIR_CACHE_MAX) dirListCache.clear();
dirListCache.set(dirPath, entries);
return entries;
}

View file

@ -0,0 +1,87 @@
/**
* memory-leak-guards.test.ts Tests for #611 memory leak fixes.
*
* Verifies that module-level state accumulators are properly bounded
* and cleared to prevent OOM during long-running auto-mode sessions.
*/
import test from "node:test";
import assert from "node:assert/strict";
import { mkdtempSync, rmSync, existsSync, readdirSync, readFileSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import { saveActivityLog, clearActivityLogState } from "../activity-log.ts";
import { clearPathCache } from "../paths.ts";
import type { ExtensionContext } from "@gsd/pi-coding-agent";
/** Minimal ExtensionContext stub whose session manager yields the given entries. */
function createCtx(entries: unknown[]) {
  const sessionManager = { getEntries: () => entries };
  return { sessionManager } as unknown as ExtensionContext;
}
// ─── activity-log: clearActivityLogState ─────────────────────────────────────
test("clearActivityLogState resets dedup state so identical saves write again", () => {
  clearActivityLogState();
  const workDir = mkdtempSync(join(tmpdir(), "gsd-memleak-test-"));
  try {
    const ctx = createCtx([{ role: "assistant", content: "test entry" }]);
    const activityDir = join(workDir, ".gsd", "activity");

    // Initial save produces exactly one snapshot file.
    saveActivityLog(ctx, workDir, "execute-task", "M001/S01/T01");
    assert.equal(readdirSync(activityDir).length, 1, "first save creates one file");

    // Re-saving unchanged content for the same unit is deduped away.
    saveActivityLog(ctx, workDir, "execute-task", "M001/S01/T01");
    assert.equal(readdirSync(activityDir).length, 1, "dedup prevents duplicate write");

    // Once state is cleared, the very same content is written again.
    clearActivityLogState();
    saveActivityLog(ctx, workDir, "execute-task", "M001/S01/T01");
    assert.equal(readdirSync(activityDir).length, 2, "after clear, dedup state is reset");
  } finally {
    rmSync(workDir, { recursive: true, force: true });
  }
});
// ─── activity-log: streaming JSONL write ────────────────────────────────────
test("saveActivityLog writes valid JSONL via streaming", () => {
  clearActivityLogState();
  const workDir = mkdtempSync(join(tmpdir(), "gsd-memleak-jsonl-"));
  try {
    const ctx = createCtx([
      { type: "message", message: { role: "user", content: "hello" } },
      { type: "message", message: { role: "assistant", content: "world" } },
      { type: "message", message: { role: "user", content: "test" } },
    ]);
    saveActivityLog(ctx, workDir, "execute-task", "M002/S01/T01");

    const activityDir = join(workDir, ".gsd", "activity");
    const written = readdirSync(activityDir);
    assert.equal(written.length, 1, "one file written");

    // Every entry must land on its own line as independently parseable JSON.
    const lines = readFileSync(join(activityDir, written[0]), "utf-8").trim().split("\n");
    assert.equal(lines.length, 3, "three JSONL lines");
    for (const line of lines) {
      assert.doesNotThrow(() => JSON.parse(line), `line is valid JSON`);
    }
  } finally {
    rmSync(workDir, { recursive: true, force: true });
  }
});
// ─── paths.ts: directory cache bounds ───────────────────────────────────────
test("clearPathCache does not throw", () => {
  // Smoke test: clearing the directory caches must always be safe.
  const invoke = () => clearPathCache();
  assert.doesNotThrow(invoke, "clearPathCache should not throw");
});