Merge pull request #2599 from gsd-build/refine/dedupe-session-lock-handlers
refactor: deduplicate session-lock compromise handler
This commit is contained in:
commit
10e907a519
1 changed files with 54 additions and 64 deletions
|
|
@ -167,6 +167,56 @@ function ensureExitHandler(_gsdDir: string): void {
|
|||
});
|
||||
}
|
||||
|
||||
// ─── Lock Acquisition Helpers ───────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Create the onCompromised callback for proper-lockfile.
|
||||
*
|
||||
* proper-lockfile fires onCompromised when it detects mtime drift (system sleep,
|
||||
* event loop stall, etc.). The default handler throws inside setTimeout — an
|
||||
* uncaught exception that crashes or corrupts process state.
|
||||
*
|
||||
* False-positive suppression (#1362): If we're still within the stale window
|
||||
* (30 min since acquisition), the mtime mismatch is from an event loop stall
|
||||
* during a long LLM call — not a real takeover. Log and continue.
|
||||
*
|
||||
* PID ownership check (#1578): Past the stale window, check if the lock file
|
||||
* still contains our PID before declaring compromise. Retry reads tolerate
|
||||
* transient filesystem hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
|
||||
*/
|
||||
function createLockCompromisedHandler(lockFilePath: string): () => void {
|
||||
return () => {
|
||||
const elapsed = Date.now() - _lockAcquiredAt;
|
||||
if (elapsed < 1_800_000) {
|
||||
process.stderr.write(
|
||||
`[gsd] Lock heartbeat caught up after ${Math.round(elapsed / 1000)}s — long LLM call, no action needed.\n`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
const existing = readExistingLockDataWithRetry(lockFilePath);
|
||||
if (existing && existing.pid === process.pid) {
|
||||
process.stderr.write(
|
||||
`[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
_lockCompromised = true;
|
||||
_releaseFunction = null;
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Assign module-level lock state after a successful lock acquisition.
|
||||
*/
|
||||
function assignLockState(basePath: string, release: () => void, lockFilePath: string): void {
|
||||
_releaseFunction = release;
|
||||
_lockedPath = basePath;
|
||||
_lockPid = process.pid;
|
||||
_lockCompromised = false;
|
||||
_lockAcquiredAt = Date.now();
|
||||
_snapshotLockPath = lockFilePath;
|
||||
}
|
||||
|
||||
// ─── Public API ─────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
|
|
@ -226,43 +276,10 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
|
|||
realpath: false,
|
||||
stale: 1_800_000, // 30 minutes — safe for laptop sleep / long event loop stalls
|
||||
update: 10_000, // Update lock mtime every 10s to prove liveness
|
||||
onCompromised: () => {
|
||||
// proper-lockfile detected mtime drift (system sleep, event loop stall, etc.).
|
||||
// Default handler throws inside setTimeout — an uncaught exception that crashes
|
||||
// or corrupts process state.
|
||||
//
|
||||
// False-positive suppression (#1362): If we're still within the stale window
|
||||
// (30 min since acquisition), the mtime mismatch is from an event loop stall
|
||||
// during a long LLM call — not a real takeover. Log and continue.
|
||||
const elapsed = Date.now() - _lockAcquiredAt;
|
||||
if (elapsed < 1_800_000) {
|
||||
process.stderr.write(
|
||||
`[gsd] Lock heartbeat caught up after ${Math.round(elapsed / 1000)}s — long LLM call, no action needed.\n`,
|
||||
);
|
||||
return; // Suppress false positive
|
||||
}
|
||||
// Past the stale window — check if the lock file still belongs to us before
|
||||
// declaring compromise (#1578). Retry reads to tolerate transient filesystem
|
||||
// hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
|
||||
const existing = readExistingLockDataWithRetry(lp);
|
||||
if (existing && existing.pid === process.pid) {
|
||||
process.stderr.write(
|
||||
`[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
|
||||
);
|
||||
return; // Our PID still owns the lock file — no real takeover
|
||||
}
|
||||
// Lock file is gone or owned by another PID after retries — real compromise
|
||||
_lockCompromised = true;
|
||||
_releaseFunction = null;
|
||||
},
|
||||
onCompromised: createLockCompromisedHandler(lp),
|
||||
});
|
||||
|
||||
_releaseFunction = release;
|
||||
_lockedPath = basePath;
|
||||
_lockPid = process.pid;
|
||||
_lockCompromised = false;
|
||||
_lockAcquiredAt = Date.now();
|
||||
_snapshotLockPath = lp; // Snapshot the resolved path for consistent access (#1363)
|
||||
assignLockState(basePath, release, lp);
|
||||
|
||||
// Safety net: clean up lock dir on process exit if _releaseFunction
|
||||
// wasn't called (e.g., normal exit after clean completion) (#1245).
|
||||
|
|
@ -290,36 +307,9 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
|
|||
realpath: false,
|
||||
stale: 1_800_000, // 30 minutes — match primary lock settings
|
||||
update: 10_000,
|
||||
onCompromised: () => {
|
||||
// Same false-positive suppression as the primary lock (#1512).
|
||||
// Without this, the retry path fires _lockCompromised unconditionally
|
||||
// on benign mtime drift (laptop sleep, heavy LLM event loop stalls).
|
||||
const elapsed = Date.now() - _lockAcquiredAt;
|
||||
if (elapsed < 1_800_000) {
|
||||
process.stderr.write(
|
||||
`[gsd] Lock heartbeat caught up after ${Math.round(elapsed / 1000)}s — long LLM call, no action needed.\n`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
// Check PID ownership before declaring compromise (#1578).
|
||||
// Retry reads to tolerate transient filesystem hiccups (#2324).
|
||||
const existing = readExistingLockDataWithRetry(lp);
|
||||
if (existing && existing.pid === process.pid) {
|
||||
process.stderr.write(
|
||||
`[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
|
||||
);
|
||||
return;
|
||||
}
|
||||
_lockCompromised = true;
|
||||
_releaseFunction = null;
|
||||
},
|
||||
onCompromised: createLockCompromisedHandler(lp),
|
||||
});
|
||||
_releaseFunction = release;
|
||||
_lockedPath = basePath;
|
||||
_lockPid = process.pid;
|
||||
_lockCompromised = false;
|
||||
_lockAcquiredAt = Date.now();
|
||||
_snapshotLockPath = lp; // Snapshot for retry path too (#1363)
|
||||
assignLockState(basePath, release, lp);
|
||||
|
||||
// Safety net — uses centralized handler to avoid double-registration
|
||||
ensureExitHandler(gsdDir);
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue