/**
 * SF Session Lock — OS-level exclusive locking for autonomous mode sessions.
 *
 * Prevents multiple SF processes from running autonomous mode concurrently on
 * the same project. Uses proper-lockfile for OS-level file locking (flock/
 * lockfile) which eliminates the TOCTOU race condition that existed with
 * the old advisory JSON lock approach.
 *
 * The lock file (.sf/auto.lock) contains JSON metadata (PID, start time,
 * unit info) for diagnostics, but the actual exclusion is enforced by the
 * OS-level lock held via proper-lockfile.
 *
 * Lifecycle:
 *   acquireSessionLock()  — called at the START of bootstrapAutoSession
 *   validateSessionLock() — called periodically during dispatch to detect takeover
 *   releaseSessionLock()  — called on clean stop/pause
 */
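
// Usage sketch (illustrative only, not executed): roughly how a caller such as
// bootstrapAutoSession and its dispatch loop would drive the lifecycle above.
// The surrounding control flow (projectRoot, units, dispatch) is assumed; only
// the exported function names and result shapes come from this module.
//
//   const lock = acquireSessionLock(projectRoot, { sessionId, sessionFile });
//   if (!lock.acquired) throw new Error(lock.reason);
//   try {
//     for (const unit of units) {
//       if (!validateSessionLock(projectRoot)) break; // another process took over
//       updateSessionLock(projectRoot, unit.type, unit.id, sessionFile, sessionId);
//       await dispatch(unit);
//     }
//   } finally {
//     releaseSessionLock(projectRoot);
//   }
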
import {
  existsSync,
  mkdirSync,
  readdirSync,
  readFileSync,
  rmSync,
  statSync,
  unlinkSync,
} from "node:fs";
import { createRequire } from "node:module";
import { dirname, join } from "node:path";
import { atomicWriteSync } from "./atomic-write.js";
import { sfRoot } from "./paths.js";

const _require = createRequire(import.meta.url);

// ─── Module State ───────────────────────────────────────────────────────────

/** Release function from proper-lockfile — calling it releases the OS lock. */
let _releaseFunction = null;

/** The path we currently hold a lock on. */
let _lockedPath = null;

/** Our PID at lock acquisition time. */
let _lockPid = 0;

/** Set to true when proper-lockfile fires onCompromised (mtime drift, sleep, etc.). */
let _lockCompromised = false;

/** Whether we've already registered a process.on('exit') handler. */
let _exitHandlerRegistered = false;

/** Registry of all sfDir paths where locks were created during this session.
 * The exit handler cleans ALL of these, not just the current sfRoot(). (#1578) */
const _lockDirRegistry = new Set();

/** Snapshotted lock file path — captured at acquireSessionLock time to avoid
 * sfRoot() resolving differently in worktree vs project root contexts (#1363). */
let _snapshotLockPath = null;

/** Timestamp when the session lock was acquired — used to detect false-positive
 * onCompromised events from event loop stalls within the stale window (#1362). */
let _lockAcquiredAt = 0;

const LOCK_FILE = "auto.lock";

/**
 * Derive the effective lock file name for the current process.
 * In parallel worker mode (SF_PARALLEL_WORKER + SF_MILESTONE_LOCK),
 * each worker uses a per-milestone lock file (`auto-<milestoneId>.lock`)
 * to avoid contending on the shared `.sf/auto.lock` (#2184).
 */
export function effectiveLockFile() {
  const mid = process.env.SF_PARALLEL_WORKER
    ? process.env.SF_MILESTONE_LOCK
    : null;
  return mid ? `auto-${mid}.lock` : LOCK_FILE;
}

/**
 * Derive the OS-level lock target directory for the current process.
 * In parallel worker mode, uses `.sf/parallel/<milestoneId>/` instead of
 * `.sf/` so workers don't contend on the same proper-lockfile directory (#2184).
 */
export function effectiveLockTarget(sfDir) {
  const mid = process.env.SF_PARALLEL_WORKER
    ? process.env.SF_MILESTONE_LOCK
    : null;
  return mid ? join(sfDir, "parallel", mid) : sfDir;
}
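
// Illustration of the parallel-worker derivation above (env values are made up):
// with SF_PARALLEL_WORKER set and SF_MILESTONE_LOCK="m42",
//   effectiveLockFile()            → "auto-m42.lock"
//   effectiveLockTarget("/p/.sf")  → "/p/.sf/parallel/m42"
// Without those variables, they fall back to "auto.lock" and the .sf dir itself.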

function lockPath(basePath) {
  // If we have a snapshotted path from acquisition, use it for consistency
  if (_snapshotLockPath) return _snapshotLockPath;
  return join(sfRoot(basePath), effectiveLockFile());
}

// ─── Stray Lock Cleanup ─────────────────────────────────────────────────────

/**
 * Remove numbered lock file variants (e.g. "auto 2.lock", "auto 3.lock")
 * that accumulate from macOS file conflict resolution (iCloud/Dropbox/OneDrive)
 * or other filesystem-level copy-on-conflict behavior (#1315).
 *
 * Also removes stray proper-lockfile directories beyond the canonical `.sf.lock/`.
 */
export function cleanupStrayLockFiles(basePath) {
  const sfDir = sfRoot(basePath);
  // Clean numbered auto lock files inside .sf/
  try {
    if (existsSync(sfDir)) {
      for (const entry of readdirSync(sfDir)) {
        // Match "auto <N>.lock" or "auto (<N>).lock" variants but NOT the canonical "auto.lock"
        if (entry !== LOCK_FILE && /^auto\s.+\.lock$/i.test(entry)) {
          try {
            unlinkSync(join(sfDir, entry));
          } catch {
            /* best-effort */
          }
        }
      }
    }
  } catch {
    /* non-fatal: directory read failure */
  }
  // Clean stray proper-lockfile directories (e.g. ".sf 2.lock/")
  // The canonical one is ".sf.lock/" — anything else is stray.
  try {
    const parentDir = dirname(sfDir);
    const sfDirName = sfDir.split("/").pop() || ".sf";
    if (existsSync(parentDir)) {
      for (const entry of readdirSync(parentDir)) {
        // Match ".sf <N>.lock" or ".sf (<N>).lock" directories but NOT ".sf.lock"
        if (
          entry !== `${sfDirName}.lock` &&
          entry.startsWith(sfDirName) &&
          entry.endsWith(".lock")
        ) {
          const fullPath = join(parentDir, entry);
          try {
            const stat = statSync(fullPath);
            if (stat.isDirectory()) {
              rmSync(fullPath, { recursive: true, force: true });
            }
          } catch {
            /* best-effort */
          }
        }
      }
    }
  } catch {
    /* non-fatal */
  }
}
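
// Examples of what the cleanup above targets (file names are illustrative):
//   .sf/auto 2.lock     → removed (matches /^auto\s.+\.lock$/i)
//   .sf/auto (3).lock   → removed
//   .sf/auto.lock       → kept (canonical metadata file)
//   ".sf 2.lock/" dir   → removed (stray proper-lockfile directory)
//   ".sf.lock/" dir     → kept (canonical proper-lockfile directory)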

/**
 * Register a single process exit handler that cleans up lock state.
 * Uses module-level references so it always operates on current state.
 * Only registers once — subsequent calls are no-ops.
 */
function ensureExitHandler(_sfDir) {
  // Register the sfDir so exit cleanup covers it
  _lockDirRegistry.add(_sfDir);
  if (_exitHandlerRegistered) return;
  _exitHandlerRegistered = true;
  process.once("exit", () => {
    try {
      if (_releaseFunction) {
        _releaseFunction();
        _releaseFunction = null;
      }
    } catch {
      /* best-effort */
    }
    // Clean ALL registered lock paths, not just the current one (#1578).
    // Lock files accumulate across main project .sf/, worktree .sf/,
    // and projects registry paths — cleanup must cover all of them.
    for (const dir of _lockDirRegistry) {
      try {
        const lockFile = join(dir, LOCK_FILE);
        if (existsSync(lockFile)) unlinkSync(lockFile);
      } catch {
        /* best-effort */
      }
      try {
        const lockDir = join(dir + ".lock");
        if (existsSync(lockDir))
          rmSync(lockDir, { recursive: true, force: true });
      } catch {
        /* best-effort */
      }
    }
  });
}

// ─── Lock Acquisition Helpers ───────────────────────────────────────────────

/**
 * Create the onCompromised callback for proper-lockfile.
 *
 * proper-lockfile fires onCompromised when it detects mtime drift (system sleep,
 * event loop stall, etc.). The default handler throws inside setTimeout — an
 * uncaught exception that crashes or corrupts process state.
 *
 * False-positive suppression (#1362): If we're still within the stale window
 * (30 min since acquisition), the mtime mismatch is from an event loop stall
 * during a long LLM call — not a real takeover. Log and continue.
 *
 * PID ownership check (#1578): Past the stale window, check if the lock file
 * still contains our PID before declaring compromise. Retry reads tolerate
 * transient filesystem hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
 */
function createLockCompromisedHandler(lockFilePath) {
  return () => {
    const elapsed = Date.now() - _lockAcquiredAt;
    if (elapsed < 1_800_000) {
      process.stderr.write(
        `[forge] Lock heartbeat caught up after ${Math.round(elapsed / 1000)}s — long LLM call, no action needed.\n`,
      );
      return;
    }
    const existing = readExistingLockDataWithRetry(lockFilePath);
    if (existing && existing.pid === process.pid) {
      process.stderr.write(
        `[forge] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
      );
      return;
    }
    _lockCompromised = true;
    _releaseFunction = null;
  };
}

/**
 * Assign module-level lock state after a successful lock acquisition.
 */
function assignLockState(basePath, release, lockFilePath) {
  _releaseFunction = release;
  _lockedPath = basePath;
  _lockPid = process.pid;
  _lockCompromised = false;
  _lockAcquiredAt = Date.now();
  _snapshotLockPath = lockFilePath;
}

// ─── Public API ─────────────────────────────────────────────────────────────

/**
 * Attempt to acquire an exclusive session lock for the given project.
 *
 * This uses proper-lockfile for OS-level file locking. If another process
 * already holds the lock, this returns { acquired: false } with details.
 *
 * The lock file also contains JSON metadata about the session for
 * diagnostic purposes (PID, unit info, etc.).
 */
export function acquireSessionLock(basePath, sessionInfo) {
  const lp = lockPath(basePath);
  // Re-entrant acquire on the same path: release our current OS lock first so
  // proper-lockfile clears its update timer before we acquire a fresh lock.
  if (_releaseFunction && _lockedPath === basePath) {
    try {
      _releaseFunction();
    } catch {
      /* may already be released */
    }
    _releaseFunction = null;
    _lockedPath = null;
    _lockPid = 0;
    _lockCompromised = false;
  }
  // Ensure the directory exists
  mkdirSync(dirname(lp), { recursive: true });
  // Clean up numbered lock file variants from cloud sync conflicts (#1315)
  cleanupStrayLockFiles(basePath);
  // Write our lock data first (the content is informational; the OS lock is the real guard).
  // sessionId/sessionFile let observers correlate this autonomous mode session with the
  // .sf/sessions/<...>.jsonl event log (closes sf-moocx6lv-9grpvt).
  const lockData = {
    pid: process.pid,
    startedAt: new Date().toISOString(),
    unitType: "starting",
    unitId: "bootstrap",
    unitStartedAt: new Date().toISOString(),
    sessionId: sessionInfo?.sessionId,
    sessionFile: sessionInfo?.sessionFile,
  };
  let lockfile;
  try {
    lockfile = _require("proper-lockfile");
  } catch {
    // proper-lockfile not available — fall back to PID-based check
    return acquireFallbackLock(basePath, lp, lockData);
  }
  const sfDir = sfRoot(basePath);
  const lockTarget = effectiveLockTarget(sfDir);
  // #3218: Pre-flight stale lock cleanup — if the .lock/ directory exists but
  // no auto.lock metadata is present (or the PID is dead), remove the lock
  // directory before attempting acquisition. This prevents the 30-min stale
  // window from blocking re-acquisition after crashes, SIGKILL, or laptop sleep.
  const lockDir = lockTarget + ".lock";
  if (existsSync(lockDir)) {
    const existingData = readExistingLockData(lp);
    const isOrphan =
      !existingData || (existingData.pid && !isPidAlive(existingData.pid));
    if (isOrphan) {
      try {
        rmSync(lockDir, { recursive: true, force: true });
      } catch {
        /* best-effort */
      }
      try {
        if (existsSync(lp)) unlinkSync(lp);
      } catch {
        /* best-effort */
      }
    }
  }
  try {
    // Try to acquire an exclusive OS-level lock on the lock target.
    // We lock a directory since proper-lockfile works best on directories,
    // and the lock file itself may not exist yet.
    // In parallel worker mode, lockTarget is .sf/parallel/<MID>/ (#2184).
    mkdirSync(lockTarget, { recursive: true });
    const release = lockfile.lockSync(lockTarget, {
      realpath: false,
      stale: 1_800_000, // 30 minutes — safe for laptop sleep / long event loop stalls
      update: 10_000, // Update lock mtime every 10s to prove liveness
      onCompromised: createLockCompromisedHandler(lp),
    });
    assignLockState(basePath, release, lp);
    // Safety net: clean up lock dir on process exit if _releaseFunction
    // wasn't called (e.g., normal exit after clean completion) (#1245).
    ensureExitHandler(lockTarget);
    // Write the informational lock data
    atomicWriteSync(lp, JSON.stringify(lockData, null, 2));
    return { acquired: true };
  } catch (_err) {
    // Lock is held by another process — or the .sf.lock/ directory is stranded.
    // Check: if auto.lock is gone and no process is alive, the lock dir is stale.
    const existingData = readExistingLockData(lp);
    const existingPid = existingData?.pid;
    // If no lock file or no alive process, try to clean up and re-acquire (#1245)
    if (!existingData || (existingPid && !isPidAlive(existingPid))) {
      try {
        const lockDir = join(lockTarget + ".lock");
        if (existsSync(lockDir))
          rmSync(lockDir, { recursive: true, force: true });
        if (existsSync(lp)) unlinkSync(lp);
        // Retry acquisition after cleanup
        const release = lockfile.lockSync(lockTarget, {
          realpath: false,
          stale: 1_800_000, // 30 minutes — match primary lock settings
          update: 10_000,
          onCompromised: createLockCompromisedHandler(lp),
        });
        assignLockState(basePath, release, lp);
        // Safety net — uses centralized handler to avoid double-registration
        ensureExitHandler(lockTarget);
        atomicWriteSync(lp, JSON.stringify(lockData, null, 2));
        return { acquired: true };
      } catch {
        // Retry also failed — fall through to the error path
      }
    }
    // #3218: Provide actionable workaround when lock recovery fails
    const lockDirPath = lockTarget + ".lock";
    const reason = existingPid
      ? `Another autonomous mode session (PID ${existingPid}) appears to be running.\nStop it with \`kill ${existingPid}\` before starting a new session.`
      : `Another autonomous mode session lock is stuck on this project.\nRun: rm -rf "${lockDirPath}" && rm -f "${lp}"`;
    return { acquired: false, reason, existingPid };
  }
}
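
// Sketch of how a caller might handle the failure shape returned above. The
// caller-side flow is an assumption; only the result fields ({ acquired,
// reason, existingPid }) come from this module.
//
//   const result = acquireSessionLock(basePath, { sessionId, sessionFile });
//   if (!result.acquired) {
//     console.error(result.reason); // actionable message built above
//     if (result.existingPid && isSessionPidAlive(result.existingPid)) {
//       // optionally confirm with the user, then terminateExistingSession(...)
//     }
//   }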

/**
 * Fallback lock acquisition when proper-lockfile is not available.
 * Uses PID-based liveness checking (the old approach, but with the lock
 * written BEFORE initialization rather than after).
 */
function acquireFallbackLock(basePath, lp, lockData) {
  // Check if an existing lock is held by a live process
  const existing = readExistingLockData(lp);
  if (existing && existing.pid !== process.pid) {
    if (isPidAlive(existing.pid)) {
      return {
        acquired: false,
        reason: `Another autonomous mode session (PID ${existing.pid}) is already running on this project.`,
        existingPid: existing.pid,
      };
    }
    // Stale lock from dead process — we can take over
  }
  // Write our lock data
  atomicWriteSync(lp, JSON.stringify(lockData, null, 2));
  _lockedPath = basePath;
  _lockPid = process.pid;
  return { acquired: true };
}

/**
 * Update the lock file metadata (called on each unit dispatch).
 * Does NOT re-acquire the OS lock — just updates the JSON content.
 */
export function updateSessionLock(
  basePath,
  unitType,
  unitId,
  sessionFile,
  sessionId,
) {
  if (_lockedPath !== basePath && _lockedPath !== null) return;
  const lp = lockPath(basePath);
  try {
    const data = {
      pid: process.pid,
      startedAt: new Date().toISOString(),
      unitType,
      unitId,
      unitStartedAt: new Date().toISOString(),
      sessionFile,
      sessionId,
    };
    atomicWriteSync(lp, JSON.stringify(data, null, 2));
  } catch {
    // Non-fatal: lock update failure
  }
}
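
// Example call as a unit is dispatched (the argument values are illustrative):
//
//   updateSessionLock(basePath, "task", "sf-abc123", sessionFile, sessionId);
//
// Anything tailing .sf/auto.lock can then see which unit the session is on and
// which .sf/sessions/*.jsonl event log to correlate it with.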

/**
 * Check whether we still own the session lock, and report why if we don't.
 *
 * Returns a status object: `{ valid: true }` (with `recovered: true` when the
 * OS lock was re-acquired after a benign onCompromised event), or
 * `{ valid: false, failureReason, expectedPid }` plus `existingPid` when known,
 * indicating another process has taken over or the lock metadata is gone.
 *
 * This is called periodically during the dispatch loop.
 */
export function getSessionLockStatus(basePath) {
  // Lock was compromised by proper-lockfile (mtime drift from sleep, stall, etc.)
  if (_lockCompromised) {
    // Recovery gate (#1512): Before declaring the lock lost, check if the lock
    // file still contains our PID. If it does, no other process took over — the
    // onCompromised fired from benign mtime drift (laptop sleep, event loop stall
    // beyond the stale window). Attempt re-acquisition instead of giving up.
    const lp = lockPath(basePath);
    // Retry reads to tolerate transient filesystem hiccups (#2324).
    const existing = readExistingLockDataWithRetry(lp);
    if (existing && existing.pid === process.pid) {
      // Lock file still ours — try to re-acquire the OS lock
      try {
        const result = acquireSessionLock(basePath);
        if (result.acquired) {
          process.stderr.write(
            `[forge] Lock recovered after onCompromised — lock file PID matched, re-acquired.\n`,
          );
          return { valid: true, recovered: true };
        }
      } catch {
        // Re-acquisition failed — fall through to the compromised result
      }
    }
    return {
      valid: false,
      failureReason: "compromised",
      existingPid: existing?.pid,
      expectedPid: process.pid,
    };
  }
  // If we have an OS-level lock, we're still the owner
  if (_releaseFunction && _lockedPath === basePath) {
    return { valid: true };
  }
  // Fallback: check the lock file PID
  const lp = lockPath(basePath);
  const existing = readExistingLockData(lp);
  if (!existing) {
    // Lock file was deleted — we lost ownership
    return {
      valid: false,
      failureReason: "missing-metadata",
      expectedPid: process.pid,
    };
  }
  if (existing.pid !== process.pid) {
    return {
      valid: false,
      failureReason: "pid-mismatch",
      existingPid: existing.pid,
      expectedPid: process.pid,
    };
  }
  return { valid: true };
}

/**
 * Returns true if we still hold the lock, false if another process has taken
 * over (indicating we should gracefully stop).
 */
export function validateSessionLock(basePath) {
  return getSessionLockStatus(basePath).valid;
}
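
// Sketch of the periodic ownership check inside a dispatch loop. The loop
// structure, log() and gracefulStop() are assumptions; the status fields are
// the ones returned above.
//
//   const status = getSessionLockStatus(basePath);
//   if (!status.valid) {
//     // failureReason is "compromised", "missing-metadata", or "pid-mismatch"
//     log(`lock lost (${status.failureReason}), holder PID: ${status.existingPid}`);
//     return gracefulStop();
//   }
//   if (status.recovered) log("lock re-acquired after a benign onCompromised event");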

/**
 * Release the session lock. Called on clean stop/pause.
 */
export function releaseSessionLock(basePath) {
  // Release the OS-level lock
  if (_releaseFunction) {
    try {
      _releaseFunction();
    } catch {
      // Lock may already be released
    }
    _releaseFunction = null;
  }
  // Remove the lock file at the current path
  const lp = lockPath(basePath);
  try {
    if (existsSync(lp)) unlinkSync(lp);
  } catch {
    // Non-fatal
  }
  // Remove the proper-lockfile directory for the current lock target.
  // In parallel worker mode, this is .sf/parallel/<MID>.lock/ (#2184).
  const sfDir = sfRoot(basePath);
  const lockTarget = effectiveLockTarget(sfDir);
  try {
    const lockDir = join(lockTarget + ".lock");
    if (existsSync(lockDir)) rmSync(lockDir, { recursive: true, force: true });
  } catch {
    // Non-fatal
  }
  // Also clean the per-milestone parallel directory itself if it exists
  if (lockTarget !== sfDir) {
    try {
      if (existsSync(lockTarget))
        rmSync(lockTarget, { recursive: true, force: true });
    } catch {
      // Non-fatal
    }
  }
  // Clean ALL registered lock paths (#1578) — lock files accumulate across
  // main project .sf/, worktree .sf/, and projects registry paths.
  for (const dir of _lockDirRegistry) {
    try {
      const lockFile = join(dir, LOCK_FILE);
      if (existsSync(lockFile)) unlinkSync(lockFile);
    } catch {
      /* best-effort */
    }
    try {
      const lockDir = join(dir + ".lock");
      if (existsSync(lockDir))
        rmSync(lockDir, { recursive: true, force: true });
    } catch {
      /* best-effort */
    }
  }
  _lockDirRegistry.clear();
  // Clean up numbered lock file variants from cloud sync conflicts (#1315)
  cleanupStrayLockFiles(basePath);
  _lockedPath = null;
  _lockPid = 0;
  _lockCompromised = false;
  _lockAcquiredAt = 0;
  _snapshotLockPath = null;
}

/**
 * Check if a session lock exists and return its data (for crash recovery).
 * Does NOT acquire the lock.
 */
export function readSessionLockData(basePath) {
  return readExistingLockData(lockPath(basePath));
}

/**
 * Check if the process that wrote the lock is still alive.
 */
export function isSessionLockProcessAlive(data) {
  return isPidAlive(data.pid);
}
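
// Crash-recovery sketch using the two read-only helpers above. The recovery
// action itself is an assumption; this module only exposes the checks.
//
//   const data = readSessionLockData(basePath);
//   if (data && !isSessionLockProcessAlive(data)) {
//     // the previous session died mid-run; data.unitType / data.unitId record
//     // the unit it was on when the lock metadata was last written
//   }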

/**
 * Returns true if we currently hold a session lock for the given path.
 */
export function isSessionLockHeld(basePath) {
  return _lockedPath === basePath && _lockPid === process.pid;
}

/**
 * Returns a snapshot of the registered lock directory paths for diagnostics.
 * Exported for tests only.
 */
export function _getRegisteredLockDirs() {
  return [..._lockDirRegistry];
}

// ─── Internal Helpers ───────────────────────────────────────────────────────

/** Read and parse the lock file's JSON metadata, or null if missing/unreadable. */
function readExistingLockData(lp) {
  try {
    if (!existsSync(lp)) return null;
    const raw = readFileSync(lp, "utf-8");
    return JSON.parse(raw);
  } catch {
    return null;
  }
}

/**
 * Read lock metadata with a few retries, to tolerate transient filesystem
 * hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
 */
export function readExistingLockDataWithRetry(lp, options) {
  const maxAttempts = options?.maxAttempts ?? 3;
  const delayMs = options?.delayMs ?? 200;
  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    const data = readExistingLockData(lp);
    if (data !== null) return data;
    if (attempt < maxAttempts) {
      // Synchronous busy-wait — onCompromised runs in a sync callback context
      // and the delays are short (200ms default).
      const start = Date.now();
      while (Date.now() - start < delayMs) {
        // busy-wait
      }
    }
  }
  return null;
}
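
// With the defaults above, a missing or unparsable file is retried twice more,
// roughly 200ms apart (about 400ms of blocking in the worst case). Callers can
// tune that, for example (values illustrative):
//
//   readExistingLockDataWithRetry(lp, { maxAttempts: 5, delayMs: 100 });
//
// The blocking wait is a deliberate trade-off: it keeps the helper usable from
// proper-lockfile's synchronous onCompromised callback.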

/**
 * Check whether a PID belongs to a live process. Note: our own PID deliberately
 * returns false, so lock metadata written by this same process is never treated
 * as a competing live session.
 */
function isPidAlive(pid) {
  if (!Number.isInteger(pid) || pid <= 0) return false;
  if (pid === process.pid) return false;
  try {
    process.kill(pid, 0);
    return true;
  } catch (err) {
    // EPERM means the process exists but we lack permission to signal it.
    if (err.code === "EPERM") return true;
    return false;
  }
}

/**
 * Public wrapper around isPidAlive for callers outside this module.
 *
 * Consumer: auto-start's prompt-to-kill flow needs to decide whether the
 * existingPid from acquireSessionLock's failure result is still alive before
 * offering to terminate it.
 */
export function isSessionPidAlive(pid) {
  return isPidAlive(Number(pid));
}

/**
 * Terminate an existing SF auto session by PID.
 *
 * Why: when acquireSessionLock reports `{ acquired: false, existingPid }`
 * because another SF process is holding the lock, we want a one-call helper
 * that an interactive caller can invoke after confirming with the user. The
 * helper sends SIGTERM, polls for the process to exit, escalates to SIGKILL
 * after the grace window, and waits a short tail for the kernel to reap the
 * PID so a subsequent acquireSessionLock retry sees a dead PID and proceeds
 * down the stale-lock recovery path.
 *
 * Returns `{ terminated: boolean, escalated: boolean, alreadyDead: boolean }`.
 * `terminated` is true iff the PID is no longer alive when the call returns.
 * `escalated` is true iff SIGKILL was needed because SIGTERM did not produce
 * an exit within `gracePeriodMs`.
 *
 * Consumer: auto-start's prompt-to-kill flow. Not part of the normal
 * autonomous loop — only invoked after explicit operator consent.
 *
 * @param {number} pid - The PID to terminate.
 * @param {object} [options]
 * @param {number} [options.gracePeriodMs=5000] - How long to wait between
 *   SIGTERM and SIGKILL.
 * @param {number} [options.reapWaitMs=1000] - How long to wait after the
 *   final kill signal for the kernel to reap.
 * @param {number} [options.pollIntervalMs=100] - Poll interval used while
 *   waiting for exit.
 */
export async function terminateExistingSession(pid, options = {}) {
  const numericPid = Number(pid);
  if (!Number.isInteger(numericPid) || numericPid <= 0) {
    return { terminated: false, escalated: false, alreadyDead: true };
  }
  if (numericPid === process.pid) {
    // Refuse to terminate ourselves — would deadlock the caller.
    return { terminated: false, escalated: false, alreadyDead: false };
  }
  if (!isPidAlive(numericPid)) {
    return { terminated: true, escalated: false, alreadyDead: true };
  }
  const gracePeriodMs = Number(options.gracePeriodMs ?? 5000);
  const reapWaitMs = Number(options.reapWaitMs ?? 1000);
  const pollIntervalMs = Math.max(50, Number(options.pollIntervalMs ?? 100));
  try {
    process.kill(numericPid, "SIGTERM");
  } catch (err) {
    // ESRCH: process already gone between the alive check and the kill.
    // EPERM: not ours to kill — surface as not-terminated.
    if (err?.code === "ESRCH") {
      return { terminated: true, escalated: false, alreadyDead: true };
    }
    if (err?.code === "EPERM") {
      return { terminated: false, escalated: false, alreadyDead: false };
    }
    throw err;
  }
  const deadline = Date.now() + gracePeriodMs;
  while (Date.now() < deadline) {
    if (!isPidAlive(numericPid)) {
      return { terminated: true, escalated: false, alreadyDead: false };
    }
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  // Grace expired — escalate to SIGKILL.
  try {
    process.kill(numericPid, "SIGKILL");
  } catch (err) {
    if (err?.code === "ESRCH") {
      return { terminated: true, escalated: true, alreadyDead: false };
    }
    if (err?.code === "EPERM") {
      return { terminated: false, escalated: true, alreadyDead: false };
    }
    throw err;
  }
  const reapDeadline = Date.now() + reapWaitMs;
  while (Date.now() < reapDeadline) {
    if (!isPidAlive(numericPid)) {
      return { terminated: true, escalated: true, alreadyDead: false };
    }
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  return {
    terminated: !isPidAlive(numericPid),
    escalated: true,
    alreadyDead: false,
  };
}
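
// Prompt-to-kill sketch. The confirmation step (confirmWithUser) is an
// assumption; the helpers and result fields come from this module.
//
//   const first = acquireSessionLock(basePath, sessionInfo);
//   if (!first.acquired && first.existingPid) {
//     const ok = await confirmWithUser(first.reason); // hypothetical prompt
//     if (ok) {
//       const kill = await terminateExistingSession(first.existingPid);
//       if (kill.terminated) {
//         const second = acquireSessionLock(basePath, sessionInfo);
//         // second.acquired should now succeed via the stale-lock recovery path
//       }
//     }
//   }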