Merge pull request #2599 from gsd-build/refine/dedupe-session-lock-handlers

refactor: deduplicate session-lock compromise handler
This commit is contained in:
TÂCHES 2026-03-25 22:46:33 -06:00 committed by GitHub
commit 10e907a519

View file

@ -167,6 +167,56 @@ function ensureExitHandler(_gsdDir: string): void {
});
}
// ─── Lock Acquisition Helpers ───────────────────────────────────────────────
/**
* Create the onCompromised callback for proper-lockfile.
*
* proper-lockfile fires onCompromised when it detects mtime drift (system sleep,
* event loop stall, etc.). The default handler throws inside setTimeout an
* uncaught exception that crashes or corrupts process state.
*
* False-positive suppression (#1362): If we're still within the stale window
* (30 min since acquisition), the mtime mismatch is from an event loop stall
* during a long LLM call not a real takeover. Log and continue.
*
* PID ownership check (#1578): Past the stale window, check if the lock file
* still contains our PID before declaring compromise. Retry reads tolerate
* transient filesystem hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
*/
function createLockCompromisedHandler(lockFilePath: string): () => void {
return () => {
const elapsed = Date.now() - _lockAcquiredAt;
if (elapsed < 1_800_000) {
process.stderr.write(
`[gsd] Lock heartbeat caught up after ${Math.round(elapsed / 1000)}s — long LLM call, no action needed.\n`,
);
return;
}
const existing = readExistingLockDataWithRetry(lockFilePath);
if (existing && existing.pid === process.pid) {
process.stderr.write(
`[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
);
return;
}
_lockCompromised = true;
_releaseFunction = null;
};
}
/**
* Assign module-level lock state after a successful lock acquisition.
*/
function assignLockState(basePath: string, release: () => void, lockFilePath: string): void {
_releaseFunction = release;
_lockedPath = basePath;
_lockPid = process.pid;
_lockCompromised = false;
_lockAcquiredAt = Date.now();
_snapshotLockPath = lockFilePath;
}
// ─── Public API ─────────────────────────────────────────────────────────────
/**
@ -226,43 +276,10 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
realpath: false,
stale: 1_800_000, // 30 minutes — safe for laptop sleep / long event loop stalls
update: 10_000, // Update lock mtime every 10s to prove liveness
onCompromised: () => {
// proper-lockfile detected mtime drift (system sleep, event loop stall, etc.).
// Default handler throws inside setTimeout — an uncaught exception that crashes
// or corrupts process state.
//
// False-positive suppression (#1362): If we're still within the stale window
// (30 min since acquisition), the mtime mismatch is from an event loop stall
// during a long LLM call — not a real takeover. Log and continue.
const elapsed = Date.now() - _lockAcquiredAt;
if (elapsed < 1_800_000) {
process.stderr.write(
`[gsd] Lock heartbeat caught up after ${Math.round(elapsed / 1000)}s — long LLM call, no action needed.\n`,
);
return; // Suppress false positive
}
// Past the stale window — check if the lock file still belongs to us before
// declaring compromise (#1578). Retry reads to tolerate transient filesystem
// hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
const existing = readExistingLockDataWithRetry(lp);
if (existing && existing.pid === process.pid) {
process.stderr.write(
`[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
);
return; // Our PID still owns the lock file — no real takeover
}
// Lock file is gone or owned by another PID after retries — real compromise
_lockCompromised = true;
_releaseFunction = null;
},
onCompromised: createLockCompromisedHandler(lp),
});
_releaseFunction = release;
_lockedPath = basePath;
_lockPid = process.pid;
_lockCompromised = false;
_lockAcquiredAt = Date.now();
_snapshotLockPath = lp; // Snapshot the resolved path for consistent access (#1363)
assignLockState(basePath, release, lp);
// Safety net: clean up lock dir on process exit if _releaseFunction
// wasn't called (e.g., normal exit after clean completion) (#1245).
@ -290,36 +307,9 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
realpath: false,
stale: 1_800_000, // 30 minutes — match primary lock settings
update: 10_000,
onCompromised: () => {
// Same false-positive suppression as the primary lock (#1512).
// Without this, the retry path fires _lockCompromised unconditionally
// on benign mtime drift (laptop sleep, heavy LLM event loop stalls).
const elapsed = Date.now() - _lockAcquiredAt;
if (elapsed < 1_800_000) {
process.stderr.write(
`[gsd] Lock heartbeat caught up after ${Math.round(elapsed / 1000)}s — long LLM call, no action needed.\n`,
);
return;
}
// Check PID ownership before declaring compromise (#1578).
// Retry reads to tolerate transient filesystem hiccups (#2324).
const existing = readExistingLockDataWithRetry(lp);
if (existing && existing.pid === process.pid) {
process.stderr.write(
`[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
);
return;
}
_lockCompromised = true;
_releaseFunction = null;
},
onCompromised: createLockCompromisedHandler(lp),
});
_releaseFunction = release;
_lockedPath = basePath;
_lockPid = process.pid;
_lockCompromised = false;
_lockAcquiredAt = Date.now();
_snapshotLockPath = lp; // Snapshot for retry path too (#1363)
assignLockState(basePath, release, lp);
// Safety net — uses centralized handler to avoid double-registration
ensureExitHandler(gsdDir);