singularity-forge/src/resources/extensions/sf/session-lock.js
2026-05-13 02:01:41 +02:00

698 lines
24 KiB
JavaScript

/**
* SF Session Lock — OS-level exclusive locking for autonomous mode sessions.
*
* Prevents multiple SF processes from running autonomous mode concurrently on
* the same project. Uses proper-lockfile for OS-level file locking (flock/
* lockfile) which eliminates the TOCTOU race condition that existed with
* the old advisory JSON lock approach.
*
* The lock file (.sf/auto.lock) contains JSON metadata (PID, start time,
* unit info) for diagnostics, but the actual exclusion is enforced by the
* OS-level lock held via proper-lockfile.
*
* Lifecycle:
* acquireSessionLock() — called at the START of bootstrapAutoSession
* validateSessionLock() — called periodically during dispatch to detect takeover
* releaseSessionLock() — called on clean stop/pause
*/
import {
existsSync,
mkdirSync,
readdirSync,
readFileSync,
rmSync,
statSync,
unlinkSync,
} from "node:fs";
import { createRequire } from "node:module";
import { basename, dirname, join } from "node:path";
import { atomicWriteSync } from "./atomic-write.js";
import { sfRoot } from "./paths.js";
const _require = createRequire(import.meta.url);
// ─── Module State ───────────────────────────────────────────────────────────
// NOTE: all ownership bookkeeping below is singular — this module tracks at
// most ONE held session lock per process, not one per project path.
/** Release function from proper-lockfile — calling it releases the OS lock. */
let _releaseFunction = null;
/** The path we currently hold a lock on. */
let _lockedPath = null;
/** Our PID at lock acquisition time. */
let _lockPid = 0;
/** Set to true when proper-lockfile fires onCompromised (mtime drift, sleep, etc.). */
let _lockCompromised = false;
/** Whether we've already registered a process.on('exit') handler. */
let _exitHandlerRegistered = false;
/** Registry of all sfDir paths where locks were created during this session.
 * The exit handler cleans ALL of these, not just the current sfRoot(). (#1578) */
const _lockDirRegistry = new Set();
/** Snapshotted lock file path — captured at acquireSessionLock time to avoid
 * sfRoot() resolving differently in worktree vs project root contexts (#1363). */
let _snapshotLockPath = null;
/** Timestamp when the session lock was acquired — used to detect false-positive
 * onCompromised events from event loop stalls within the stale window (#1362). */
let _lockAcquiredAt = 0;
/** Canonical lock file name; parallel workers derive per-milestone names from
 * it via effectiveLockFile() (#2184). */
const LOCK_FILE = "auto.lock";
/**
 * Resolve the lock file name for this process.
 *
 * Parallel workers (SF_PARALLEL_WORKER set together with SF_MILESTONE_LOCK)
 * get a per-milestone name (`auto-<milestoneId>.lock`) so they never contend
 * on the shared `.sf/auto.lock` (#2184); all other processes use the
 * canonical LOCK_FILE.
 *
 * @returns {string} The lock file name to use.
 */
export function effectiveLockFile() {
  const milestoneId = process.env.SF_PARALLEL_WORKER
    ? process.env.SF_MILESTONE_LOCK
    : null;
  if (milestoneId) return `auto-${milestoneId}.lock`;
  return LOCK_FILE;
}
/**
 * Resolve the directory that proper-lockfile should lock for this process.
 *
 * In parallel worker mode each worker locks `.sf/parallel/<milestoneId>/`
 * instead of `.sf/` itself, so workers don't contend on the same
 * proper-lockfile directory (#2184).
 *
 * @param {string} sfDir - The resolved .sf directory for the project.
 * @returns {string} The directory to pass to proper-lockfile.
 */
export function effectiveLockTarget(sfDir) {
  const milestoneId = process.env.SF_PARALLEL_WORKER
    ? process.env.SF_MILESTONE_LOCK
    : null;
  if (!milestoneId) return sfDir;
  return join(sfDir, "parallel", milestoneId);
}
/**
 * Resolve the absolute lock file path for a project base path.
 * Prefers the path snapshotted at acquisition time so the same physical file
 * keeps being used even when sfRoot() resolves differently later (#1363).
 */
function lockPath(basePath) {
  return _snapshotLockPath ?? join(sfRoot(basePath), effectiveLockFile());
}
// ─── Stray Lock Cleanup ─────────────────────────────────────────────────────
/**
 * Remove numbered lock file variants (e.g. "auto 2.lock", "auto (3).lock")
 * that accumulate from macOS file conflict resolution (iCloud/Dropbox/OneDrive)
 * or other filesystem-level copy-on-conflict behavior (#1315).
 *
 * Also removes stray proper-lockfile directories beyond the canonical `.sf.lock/`.
 *
 * All removals are best-effort: failures are swallowed so cleanup never blocks
 * lock acquisition or release.
 *
 * @param {string} basePath - Project base path whose .sf directory to clean.
 */
export function cleanupStrayLockFiles(basePath) {
  const sfDir = sfRoot(basePath);
  // Clean numbered auto lock files inside .sf/
  try {
    if (existsSync(sfDir)) {
      for (const entry of readdirSync(sfDir)) {
        // Match "auto <N>.lock" or "auto (<N>).lock" variants but NOT the canonical "auto.lock"
        if (entry !== LOCK_FILE && /^auto\s.+\.lock$/i.test(entry)) {
          try {
            unlinkSync(join(sfDir, entry));
          } catch {
            /* best-effort */
          }
        }
      }
    }
  } catch {
    /* non-fatal: directory read failure */
  }
  // Clean stray proper-lockfile directories (e.g. ".sf 2.lock/")
  // The canonical one is ".sf.lock/" — anything else is stray.
  try {
    const parentDir = dirname(sfDir);
    // Fix: use path.basename() to get the directory name — the previous
    // sfDir.split("/").pop() returned the entire path on Windows-style
    // separators, so no stray directory ever matched there.
    const sfDirName = basename(sfDir) || ".sf";
    if (existsSync(parentDir)) {
      for (const entry of readdirSync(parentDir)) {
        // Match ".sf <N>.lock" or ".sf (<N>).lock" directories but NOT ".sf.lock"
        if (
          entry !== `${sfDirName}.lock` &&
          entry.startsWith(sfDirName) &&
          entry.endsWith(".lock")
        ) {
          const fullPath = join(parentDir, entry);
          try {
            // Only remove directories — a stray *file* with a matching name
            // is left alone (matches the original behavior).
            const stat = statSync(fullPath);
            if (stat.isDirectory()) {
              rmSync(fullPath, { recursive: true, force: true });
            }
          } catch {
            /* best-effort */
          }
        }
      }
    }
  } catch {
    /* non-fatal */
  }
}
/**
 * Register a single process exit handler that cleans up lock state.
 * Uses module-level references so it always operates on current state.
 * Only registers once — subsequent calls are no-ops.
 *
 * @param {string} _sfDir - Lock target directory to add to the cleanup
 *   registry. Every call registers the directory; the handler itself is
 *   installed only once.
 */
function ensureExitHandler(_sfDir) {
  // Register the sfDir so exit cleanup covers it
  _lockDirRegistry.add(_sfDir);
  if (_exitHandlerRegistered) return;
  _exitHandlerRegistered = true;
  process.once("exit", () => {
    // Drop the OS-level lock first so proper-lockfile stops its mtime updater.
    try {
      if (_releaseFunction) {
        _releaseFunction();
        _releaseFunction = null;
      }
    } catch {
      /* best-effort */
    }
    // Clean ALL registered lock paths, not just the current one (#1578).
    // Lock files accumulate across main project .sf/, worktree .sf/,
    // and projects registry paths — cleanup must cover all of them.
    for (const dir of _lockDirRegistry) {
      try {
        // NOTE(review): only the canonical LOCK_FILE name is removed here.
        // In parallel worker mode the metadata file is `auto-<mid>.lock`
        // (see effectiveLockFile) and lives under .sf/, not in the
        // registered parallel lock target — confirm whether it should be
        // cleaned here as well.
        const lockFile = join(dir, LOCK_FILE);
        if (existsSync(lockFile)) unlinkSync(lockFile);
      } catch {
        /* best-effort */
      }
      try {
        // proper-lockfile's artifact is a sibling "<dir>.lock" directory.
        const lockDir = join(dir + ".lock");
        if (existsSync(lockDir))
          rmSync(lockDir, { recursive: true, force: true });
      } catch {
        /* best-effort */
      }
    }
  });
}
// ─── Lock Acquisition Helpers ───────────────────────────────────────────────
/**
 * Build the onCompromised callback handed to proper-lockfile.
 *
 * proper-lockfile fires onCompromised when it detects mtime drift (system
 * sleep, event loop stall, etc.). Its default handler throws inside a
 * setTimeout — an uncaught exception that crashes or corrupts process state —
 * so we install our own handler with two layers of false-positive
 * suppression:
 *
 * 1. Stale window (#1362): within 30 min of acquisition, the mtime mismatch
 *    is from an event loop stall during a long LLM call — log and continue.
 * 2. PID ownership (#1578): past the window, only declare compromise if the
 *    lock file no longer carries our PID. Reads are retried to tolerate
 *    transient filesystem hiccups (NFS/CIFS latency, APFS snapshots) (#2324).
 *
 * @param {string} lockFilePath - Path of the JSON lock metadata file.
 * @returns {() => void} The callback for proper-lockfile's onCompromised.
 */
function createLockCompromisedHandler(lockFilePath) {
  return () => {
    const elapsed = Date.now() - _lockAcquiredAt;
    const elapsedSec = Math.round(elapsed / 1000);
    // Layer 1: still inside the 30-minute stale window — benign stall.
    if (elapsed < 1_800_000) {
      process.stderr.write(
        `[forge] Lock heartbeat caught up after ${elapsedSec}s — long LLM call, no action needed.\n`,
      );
      return;
    }
    // Layer 2: past the window — check whether the metadata is still ours.
    const existing = readExistingLockDataWithRetry(lockFilePath);
    if (existing && existing.pid === process.pid) {
      process.stderr.write(
        `[forge] Lock heartbeat mismatch after ${elapsedSec}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
      );
      return;
    }
    // Genuine compromise: flag it and drop the (now invalid) release handle.
    _lockCompromised = true;
    _releaseFunction = null;
  };
}
/**
 * Record module-level ownership state after a successful lock acquisition.
 *
 * @param {string} ownedBasePath - The project base path now covered by the lock.
 * @param {Function} releaseFn - proper-lockfile release function.
 * @param {string} snapshotPath - Lock file path to snapshot for lockPath() (#1363).
 */
function assignLockState(ownedBasePath, releaseFn, snapshotPath) {
  _lockPid = process.pid;
  _lockAcquiredAt = Date.now();
  _lockCompromised = false;
  _lockedPath = ownedBasePath;
  _snapshotLockPath = snapshotPath;
  _releaseFunction = releaseFn;
}
// ─── Public API ─────────────────────────────────────────────────────────────
/**
 * Attempt to acquire an exclusive session lock for the given project.
 *
 * This uses proper-lockfile for OS-level file locking. If another process
 * already holds the lock, this returns { acquired: false } with details.
 *
 * The lock file also contains JSON metadata about the session for
 * diagnostic purposes (PID, unit info, etc.).
 *
 * @param {string} basePath - Project base path to lock.
 * @param {{sessionId?: string, sessionFile?: string}} [sessionInfo] - Optional
 *   correlation info written into the lock metadata. The recovery path in
 *   getSessionLockStatus calls this without sessionInfo.
 * @returns {{acquired: boolean, reason?: string, existingPid?: number}}
 */
export function acquireSessionLock(basePath, sessionInfo) {
  // NOTE(review): lockPath() prefers _snapshotLockPath when one is set, so if
  // a lock is currently held for a DIFFERENT basePath, `lp` resolves to the
  // previous session's path. Presumably intentional for the worktree-vs-root
  // case (#1363) — confirm before relying on cross-project acquires.
  const lp = lockPath(basePath);
  // Re-entrant acquire on the same path: release our current OS lock first so
  // proper-lockfile clears its update timer before we acquire a fresh lock.
  if (_releaseFunction && _lockedPath === basePath) {
    try {
      _releaseFunction();
    } catch {
      /* may already be released */
    }
    _releaseFunction = null;
    _lockedPath = null;
    _lockPid = 0;
    _lockCompromised = false;
  }
  // Ensure the directory exists
  mkdirSync(dirname(lp), { recursive: true });
  // Clean up numbered lock file variants from cloud sync conflicts (#1315)
  cleanupStrayLockFiles(basePath);
  // Write our lock data first (the content is informational; the OS lock is the real guard).
  // sessionId/sessionFile let observers correlate this autonomous mode session with the
  // .sf/sessions/<...>.jsonl event log (closes sf-moocx6lv-9grpvt).
  const lockData = {
    pid: process.pid,
    startedAt: new Date().toISOString(),
    unitType: "starting",
    unitId: "bootstrap",
    unitStartedAt: new Date().toISOString(),
    sessionId: sessionInfo?.sessionId,
    sessionFile: sessionInfo?.sessionFile,
  };
  let lockfile;
  try {
    lockfile = _require("proper-lockfile");
  } catch {
    // proper-lockfile not available — fall back to PID-based check
    return acquireFallbackLock(basePath, lp, lockData);
  }
  const sfDir = sfRoot(basePath);
  const lockTarget = effectiveLockTarget(sfDir);
  // #3218: Pre-flight stale lock cleanup — if the .lock/ directory exists but
  // no auto.lock metadata is present (or the PID is dead), remove the lock
  // directory before attempting acquisition. This prevents the 30-min stale
  // window from blocking /after crashes, SIGKILL, or laptop sleep.
  const lockDir = lockTarget + ".lock";
  if (existsSync(lockDir)) {
    const existingData = readExistingLockData(lp);
    // An orphan is: no metadata at all, or metadata naming a dead PID.
    // (isPidAlive reports our own PID as dead, so a leftover lock written by
    // this same PID is also reclaimed here.)
    const isOrphan =
      !existingData || (existingData.pid && !isPidAlive(existingData.pid));
    if (isOrphan) {
      try {
        rmSync(lockDir, { recursive: true, force: true });
      } catch {
        /* best-effort */
      }
      try {
        if (existsSync(lp)) unlinkSync(lp);
      } catch {
        /* best-effort */
      }
    }
  }
  try {
    // Try to acquire an exclusive OS-level lock on the lock target.
    // We lock a directory since proper-lockfile works best on directories,
    // and the lock file itself may not exist yet.
    // In parallel worker mode, lockTarget is .sf/parallel/<MID>/ (#2184).
    mkdirSync(lockTarget, { recursive: true });
    const release = lockfile.lockSync(lockTarget, {
      realpath: false,
      stale: 1_800_000, // 30 minutes — safe for laptop sleep / long event loop stalls
      update: 10_000, // Update lock mtime every 10s to prove liveness
      onCompromised: createLockCompromisedHandler(lp),
    });
    assignLockState(basePath, release, lp);
    // Safety net: clean up lock dir on process exit if _releaseFunction
    // wasn't called (e.g., normal exit after clean completion) (#1245).
    ensureExitHandler(lockTarget);
    // Write the informational lock data
    atomicWriteSync(lp, JSON.stringify(lockData, null, 2));
    return { acquired: true };
  } catch (_err) {
    // Lock is held by another process — or the .sf.lock/ directory is stranded.
    // Check: if auto.lock is gone and no process is alive, the lock dir is stale.
    const existingData = readExistingLockData(lp);
    const existingPid = existingData?.pid;
    // If no lock file or no alive process, try to clean up and re-acquire (#1245)
    if (!existingData || (existingPid && !isPidAlive(existingPid))) {
      try {
        const lockDir = join(lockTarget + ".lock");
        if (existsSync(lockDir))
          rmSync(lockDir, { recursive: true, force: true });
        if (existsSync(lp)) unlinkSync(lp);
        // Retry acquisition after cleanup
        const release = lockfile.lockSync(lockTarget, {
          realpath: false,
          stale: 1_800_000, // 30 minutes — match primary lock settings
          update: 10_000,
          onCompromised: createLockCompromisedHandler(lp),
        });
        assignLockState(basePath, release, lp);
        // Safety net — uses centralized handler to avoid double-registration
        ensureExitHandler(lockTarget);
        atomicWriteSync(lp, JSON.stringify(lockData, null, 2));
        return { acquired: true };
      } catch {
        // Retry also failed — fall through to the error path
      }
    }
    // #3218: Provide actionable workaround when lock recovery fails
    const lockDirPath = lockTarget + ".lock";
    const reason = existingPid
      ? `Another autonomous mode session (PID ${existingPid}) appears to be running.\nStop it with \`kill ${existingPid}\` before starting a new session.`
      : `Another autonomous mode session lock is stuck on this project.\nRun: rm -rf "${lockDirPath}" && rm -f "${lp}"`;
    return { acquired: false, reason, existingPid };
  }
}
/**
 * Fallback lock acquisition when proper-lockfile is not available.
 * Uses PID-based liveness checking (the old approach, but with the lock
 * written BEFORE initialization rather than after).
 *
 * @param {string} basePath - Project base path the lock covers.
 * @param {string} lp - Absolute path of the lock metadata file.
 * @param {object} lockData - JSON metadata to write into the lock file.
 * @returns {{acquired: boolean, reason?: string, existingPid?: number}}
 */
function acquireFallbackLock(basePath, lp, lockData) {
  // Check if an existing lock is held by a live process
  const existing = readExistingLockData(lp);
  if (existing && existing.pid !== process.pid) {
    if (isPidAlive(existing.pid)) {
      return {
        acquired: false,
        reason: `Another autonomous mode session (PID ${existing.pid}) is already running on this project.`,
        existingPid: existing.pid,
      };
    }
    // Stale lock from dead process — we can take over
  }
  // Write our lock data
  atomicWriteSync(lp, JSON.stringify(lockData, null, 2));
  _lockedPath = basePath;
  _lockPid = process.pid;
  // Fix: mirror assignLockState's bookkeeping. Previously this path left a
  // stale `_lockCompromised = true` (from an earlier proper-lockfile session)
  // untouched — making getSessionLockStatus wrongly report "compromised" —
  // and left _snapshotLockPath pointing at a previous session's file, so
  // lockPath() kept resolving to the wrong path.
  _lockCompromised = false;
  _lockAcquiredAt = Date.now();
  _snapshotLockPath = lp;
  return { acquired: true };
}
/**
 * Update the lock file metadata (called on each unit dispatch).
 * Does NOT re-acquire the OS lock — just updates the JSON content.
 * Write failures are swallowed: a missed metadata refresh is non-fatal.
 *
 * @param {string} basePath - Project base path.
 * @param {string} unitType - Type of the unit being dispatched.
 * @param {string} unitId - Identifier of the unit being dispatched.
 * @param {string} sessionFile - Session event log path for correlation.
 * @param {string} sessionId - Session identifier for correlation.
 */
export function updateSessionLock(
  basePath,
  unitType,
  unitId,
  sessionFile,
  sessionId,
) {
  // Skip only when we hold a lock on a DIFFERENT path. A null _lockedPath
  // still writes — presumably to cover early bootstrap before acquisition
  // completes; confirm before tightening this guard.
  const locksAnotherPath = _lockedPath !== null && _lockedPath !== basePath;
  if (locksAnotherPath) return;
  try {
    const nowIso = new Date().toISOString();
    const metadata = {
      pid: process.pid,
      startedAt: nowIso,
      unitType,
      unitId,
      unitStartedAt: nowIso,
      sessionFile,
      sessionId,
    };
    atomicWriteSync(lockPath(basePath), JSON.stringify(metadata, null, 2));
  } catch {
    // Non-fatal: lock update failure
  }
}
/**
 * Validate that we still own the session lock.
 *
 * Returns a status object — `{ valid: true }` (optionally with
 * `recovered: true`) while we still hold the lock, or `{ valid: false }`
 * with a `failureReason` of "compromised", "missing-metadata", or
 * "pid-mismatch" plus the relevant PIDs when ownership was lost.
 *
 * This is called periodically during the dispatch loop.
 *
 * @param {string} basePath - Project base path to validate against.
 * @returns {{valid: boolean, recovered?: boolean, failureReason?: string,
 *   existingPid?: number, expectedPid?: number}}
 */
export function getSessionLockStatus(basePath) {
  // Lock was compromised by proper-lockfile (mtime drift from sleep, stall, etc.)
  if (_lockCompromised) {
    // Recovery gate (#1512): Before declaring the lock lost, check if the lock
    // file still contains our PID. If it does, no other process took over — the
    // onCompromised fired from benign mtime drift (laptop sleep, event loop stall
    // beyond the stale window). Attempt re-acquisition instead of giving up.
    const lp = lockPath(basePath);
    // Retry reads to tolerate transient filesystem hiccups (#2324).
    const existing = readExistingLockDataWithRetry(lp);
    if (existing && existing.pid === process.pid) {
      // Lock file still ours — try to re-acquire the OS lock
      try {
        // NOTE(review): re-acquisition passes no sessionInfo, so the rewritten
        // lock metadata loses sessionId/sessionFile — confirm observers that
        // correlate sessions tolerate this.
        const result = acquireSessionLock(basePath);
        if (result.acquired) {
          process.stderr.write(
            `[forge] Lock recovered after onCompromised — lock file PID matched, re-acquired.\n`,
          );
          return { valid: true, recovered: true };
        }
      } catch {
        // Re-acquisition failed — fall through to return false
      }
    }
    return {
      valid: false,
      failureReason: "compromised",
      existingPid: existing?.pid,
      expectedPid: process.pid,
    };
  }
  // If we have an OS-level lock, we're still the owner
  if (_releaseFunction && _lockedPath === basePath) {
    return { valid: true };
  }
  // Fallback: check the lock file PID
  const lp = lockPath(basePath);
  const existing = readExistingLockData(lp);
  if (!existing) {
    // Lock file was deleted — we lost ownership
    return {
      valid: false,
      failureReason: "missing-metadata",
      expectedPid: process.pid,
    };
  }
  if (existing.pid !== process.pid) {
    // Another process rewrote the metadata — it has taken over the session.
    return {
      valid: false,
      failureReason: "pid-mismatch",
      existingPid: existing.pid,
      expectedPid: process.pid,
    };
  }
  return { valid: true };
}
/**
 * Boolean convenience wrapper around getSessionLockStatus().
 * @param {string} basePath - Project base path to validate against.
 * @returns {boolean} true while this process still owns the session lock.
 */
export function validateSessionLock(basePath) {
  const status = getSessionLockStatus(basePath);
  return status.valid;
}
/**
 * Release the session lock. Called on clean stop/pause.
 *
 * Best-effort throughout: every filesystem removal is wrapped so a failure
 * never blocks shutdown. Releases the OS lock, removes the current lock file
 * and proper-lockfile directory, sweeps every lock directory registered this
 * session (#1578), then resets all module-level ownership state.
 *
 * @param {string} basePath - Project base path whose lock to release.
 */
export function releaseSessionLock(basePath) {
  // Local best-effort removers — failures are deliberately swallowed.
  const removeFileQuietly = (filePath) => {
    try {
      if (existsSync(filePath)) unlinkSync(filePath);
    } catch {
      /* best-effort */
    }
  };
  const removeDirQuietly = (dirPath) => {
    try {
      if (existsSync(dirPath)) rmSync(dirPath, { recursive: true, force: true });
    } catch {
      /* best-effort */
    }
  };
  // Release the OS-level lock first.
  if (_releaseFunction) {
    try {
      _releaseFunction();
    } catch {
      // Lock may already be released
    }
    _releaseFunction = null;
  }
  // Remove the lock file at the current path (resolved before state reset so
  // the acquisition-time snapshot is still honored).
  removeFileQuietly(lockPath(basePath));
  // Remove the proper-lockfile directory for the current lock target.
  // In parallel worker mode, this is .sf/parallel/<MID>.lock/ (#2184).
  const sfDir = sfRoot(basePath);
  const lockTarget = effectiveLockTarget(sfDir);
  removeDirQuietly(join(lockTarget + ".lock"));
  // Also clean the per-milestone parallel directory itself if it exists
  if (lockTarget !== sfDir) {
    removeDirQuietly(lockTarget);
  }
  // Clean ALL registered lock paths (#1578) — lock files accumulate across
  // main project .sf/, worktree .sf/, and projects registry paths.
  for (const registeredDir of _lockDirRegistry) {
    removeFileQuietly(join(registeredDir, LOCK_FILE));
    removeDirQuietly(join(registeredDir + ".lock"));
  }
  _lockDirRegistry.clear();
  // Clean up numbered lock file variants from cloud sync conflicts (#1315)
  cleanupStrayLockFiles(basePath);
  // Reset ownership state last.
  _lockedPath = null;
  _lockPid = 0;
  _lockCompromised = false;
  _lockAcquiredAt = 0;
  _snapshotLockPath = null;
}
/**
 * Check if a session lock exists and return its data (for crash recovery).
 * Does NOT acquire the lock.
 *
 * @param {string} basePath - Project base path.
 * @returns {object|null} Parsed lock metadata, or null when absent/unreadable.
 */
export function readSessionLockData(basePath) {
  const lp = lockPath(basePath);
  return readExistingLockData(lp);
}
/**
 * Check if the process that wrote the lock is still alive.
 *
 * @param {object|null|undefined} data - Lock metadata, typically the return
 *   value of readSessionLockData — which can be null.
 * @returns {boolean} true if the recorded PID belongs to a live process.
 */
export function isSessionLockProcessAlive(data) {
  // Robustness: readSessionLockData() returns null when the lock file is
  // missing or unreadable; previously this threw a TypeError on null input
  // instead of reporting "not alive".
  return isPidAlive(data?.pid);
}
/**
 * Returns true if we currently hold a session lock for the given path.
 * @param {string} basePath - Project base path.
 * @returns {boolean}
 */
export function isSessionLockHeld(basePath) {
  if (_lockedPath !== basePath) return false;
  return _lockPid === process.pid;
}
/**
 * Returns a snapshot of the registered lock directory paths for diagnostics.
 * Exported for tests only.
 * @returns {string[]} Copy of the registry — safe to mutate.
 */
export function _getRegisteredLockDirs() {
  return Array.from(_lockDirRegistry);
}
// ─── Internal Helpers ───────────────────────────────────────────────────────
/**
 * Parse the JSON lock metadata at `lp`.
 * @param {string} lp - Lock file path.
 * @returns {object|null} Parsed content, or null when the file is missing,
 *   unreadable, or not valid JSON.
 */
function readExistingLockData(lp) {
  try {
    if (!existsSync(lp)) return null;
    return JSON.parse(readFileSync(lp, "utf-8"));
  } catch {
    return null;
  }
}
/**
 * Like readExistingLockData, but retries a few times with a short blocking
 * delay between attempts to tolerate transient filesystem hiccups
 * (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
 *
 * @param {string} lp - Lock file path.
 * @param {{maxAttempts?: number, delayMs?: number}} [options]
 * @returns {object|null} Parsed metadata, or null after all attempts fail.
 */
export function readExistingLockDataWithRetry(lp, options) {
  const maxAttempts = options?.maxAttempts ?? 3;
  const delayMs = options?.delayMs ?? 200;
  // Synchronous busy-wait — onCompromised runs in a sync callback context
  // and the delays are short (200ms default).
  const spinFor = (ms) => {
    const until = Date.now() + ms;
    while (Date.now() < until) {
      // busy-wait
    }
  };
  let remaining = maxAttempts;
  while (remaining > 0) {
    const data = readExistingLockData(lp);
    if (data !== null) return data;
    remaining -= 1;
    if (remaining > 0) spinFor(delayMs);
  }
  return null;
}
/**
 * Liveness probe via signal 0.
 *
 * Our own PID is deliberately reported as NOT alive, so stale-lock recovery
 * treats lock metadata carrying our PID as reclaimable. EPERM from
 * process.kill means the process exists but belongs to another user —
 * still alive.
 *
 * @param {number} pid - Candidate PID.
 * @returns {boolean}
 */
function isPidAlive(pid) {
  const probeable =
    Number.isInteger(pid) && pid > 0 && pid !== process.pid;
  if (!probeable) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    return err.code === "EPERM";
  }
}
/**
 * Public wrapper around isPidAlive for callers outside this module.
 *
 * Consumer: auto-start's prompt-to-kill flow needs to decide whether the
 * existingPid from acquireSessionLock's failure result is still alive before
 * offering to terminate it.
 *
 * @param {number|string} pid - PID; numeric strings are coerced.
 * @returns {boolean}
 */
export function isSessionPidAlive(pid) {
  const numericPid = Number(pid);
  return isPidAlive(numericPid);
}
/**
 * Terminate an existing SF auto session by PID.
 *
 * Why: when acquireSessionLock reports `{ acquired: false, existingPid }`
 * because another SF process is holding the lock, an interactive caller needs
 * a one-call helper to invoke after confirming with the user. It sends
 * SIGTERM, polls for the process to exit, escalates to SIGKILL after the
 * grace window, then waits a short tail for the kernel to reap the PID so a
 * subsequent acquireSessionLock retry sees a dead PID and proceeds down the
 * stale-lock recovery path.
 *
 * Returns `{ terminated, escalated, alreadyDead }`:
 * - `terminated` — true iff the PID is no longer alive on return.
 * - `escalated` — true iff SIGKILL was needed because SIGTERM did not
 *   produce an exit within `gracePeriodMs`.
 * - `alreadyDead` — true iff the PID was gone before we signaled it.
 *
 * Refuses to target our own PID (would deadlock the caller). EPERM on either
 * signal is surfaced as not-terminated rather than thrown.
 *
 * Consumer: auto-start's prompt-to-kill flow. Not part of the normal
 * autonomous loop — only invoked after explicit operator consent.
 *
 * @param {number} pid - The PID to terminate.
 * @param {object} [options]
 * @param {number} [options.gracePeriodMs=5000] - Wait between SIGTERM and SIGKILL.
 * @param {number} [options.reapWaitMs=1000] - Wait after the final signal for reaping.
 * @param {number} [options.pollIntervalMs=100] - Poll interval (floored at 50ms).
 */
export async function terminateExistingSession(pid, options = {}) {
  const targetPid = Number(pid);
  if (!Number.isInteger(targetPid) || targetPid <= 0) {
    return { terminated: false, escalated: false, alreadyDead: true };
  }
  if (targetPid === process.pid) {
    // Refuse to terminate ourselves — would deadlock the caller.
    return { terminated: false, escalated: false, alreadyDead: false };
  }
  if (!isPidAlive(targetPid)) {
    return { terminated: true, escalated: false, alreadyDead: true };
  }
  const gracePeriodMs = Number(options.gracePeriodMs ?? 5000);
  const reapWaitMs = Number(options.reapWaitMs ?? 1000);
  const pollIntervalMs = Math.max(50, Number(options.pollIntervalMs ?? 100));
  const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));
  // Poll until the PID dies or the deadline passes; true iff it died.
  const waitForExit = async (deadline) => {
    while (Date.now() < deadline) {
      if (!isPidAlive(targetPid)) return true;
      await sleep(pollIntervalMs);
    }
    return false;
  };
  // Send a signal, translating the two expected failure modes:
  // ESRCH — process vanished between checks; EPERM — not ours to kill.
  const sendSignal = (signal) => {
    try {
      process.kill(targetPid, signal);
      return "sent";
    } catch (err) {
      if (err?.code === "ESRCH") return "gone";
      if (err?.code === "EPERM") return "denied";
      throw err;
    }
  };
  const termOutcome = sendSignal("SIGTERM");
  if (termOutcome === "gone") {
    return { terminated: true, escalated: false, alreadyDead: true };
  }
  if (termOutcome === "denied") {
    return { terminated: false, escalated: false, alreadyDead: false };
  }
  if (await waitForExit(Date.now() + gracePeriodMs)) {
    return { terminated: true, escalated: false, alreadyDead: false };
  }
  // Grace expired — escalate to SIGKILL.
  const killOutcome = sendSignal("SIGKILL");
  if (killOutcome === "gone") {
    return { terminated: true, escalated: true, alreadyDead: false };
  }
  if (killOutcome === "denied") {
    return { terminated: false, escalated: true, alreadyDead: false };
  }
  if (await waitForExit(Date.now() + reapWaitMs)) {
    return { terminated: true, escalated: true, alreadyDead: false };
  }
  return {
    terminated: !isPidAlive(targetPid),
    escalated: true,
    alreadyDead: false,
  };
}