Merge pull request #2591 from gsd-build/fix/lock-heartbeat-retry-v2

fix(session-lock): retry lock file reads before declaring compromise
This commit is contained in:
TÂCHES 2026-03-25 22:01:55 -06:00 committed by GitHub
commit 2a39adf5b4
2 changed files with 268 additions and 7 deletions

View file

@ -242,16 +242,16 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
return; // Suppress false positive
}
// Past the stale window — check if the lock file still belongs to us before
// declaring compromise (#1578). If our PID still owns the metadata, this is
// a false positive from a very long event loop stall (e.g. subagent execution).
const existing = readExistingLockData(lp);
// declaring compromise (#1578). Retry reads to tolerate transient filesystem
// hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
const existing = readExistingLockDataWithRetry(lp);
if (existing && existing.pid === process.pid) {
process.stderr.write(
`[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
);
return; // Our PID still owns the lock file — no real takeover
}
// Lock file is gone or owned by another PID — real compromise
// Lock file is gone or owned by another PID after retries — real compromise
_lockCompromised = true;
_releaseFunction = null;
},
@ -301,8 +301,9 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
);
return;
}
// Check PID ownership before declaring compromise (#1578)
const existing = readExistingLockData(lp);
// Check PID ownership before declaring compromise (#1578).
// Retry reads to tolerate transient filesystem hiccups (#2324).
const existing = readExistingLockDataWithRetry(lp);
if (existing && existing.pid === process.pid) {
process.stderr.write(
`[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
@ -413,7 +414,8 @@ export function getSessionLockStatus(basePath: string): SessionLockStatus {
// onCompromised fired from benign mtime drift (laptop sleep, event loop stall
// beyond the stale window). Attempt re-acquisition instead of giving up.
const lp = lockPath(basePath);
const existing = readExistingLockData(lp);
// Retry reads to tolerate transient filesystem hiccups (#2324).
const existing = readExistingLockDataWithRetry(lp);
if (existing && existing.pid === process.pid) {
// Lock file still ours — try to re-acquire the OS lock
try {
@ -565,6 +567,42 @@ function readExistingLockData(lp: string): SessionLockData | null {
}
}
/**
* Retry-tolerant variant of readExistingLockData for use in onCompromised and
* other paths where a transient filesystem hiccup (NFS/CIFS latency, macOS APFS
* snapshot, concurrent process briefly holding the file) should NOT be treated
* as "lock file gone" (#2324).
*
* Retries up to `maxAttempts` times with `delayMs` between each attempt.
* Only returns null when ALL retries fail to read valid data.
*/
export interface RetryOptions {
maxAttempts?: number;
delayMs?: number;
}
export function readExistingLockDataWithRetry(
lp: string,
options?: RetryOptions,
): SessionLockData | null {
const maxAttempts = options?.maxAttempts ?? 3;
const delayMs = options?.delayMs ?? 200;
for (let attempt = 1; attempt <= maxAttempts; attempt++) {
const data = readExistingLockData(lp);
if (data !== null) return data;
if (attempt < maxAttempts) {
// Synchronous busy-wait — onCompromised runs in a sync callback context
// and the delays are short (200ms default).
const start = Date.now();
while (Date.now() - start < delayMs) {
// busy-wait
}
}
}
return null;
}
function isPidAlive(pid: number): boolean {
if (!Number.isInteger(pid) || pid <= 0) return false;
if (pid === process.pid) return false;

View file

@ -0,0 +1,223 @@
/**
* session-lock-transient-read.test.ts Tests for transient lock file unreadability (#2324).
*
* Regression coverage for:
* #2324 onCompromised declares lock lost when the lock file is temporarily
* unreadable (NFS/CIFS latency, macOS APFS snapshot, concurrent process
* briefly holding the file).
*
* Tests:
* - readExistingLockDataWithRetry retries on transient read failure
* - readExistingLockDataWithRetry returns data when file becomes readable after retries
* - readExistingLockDataWithRetry returns null only when ALL retries exhausted
* - onCompromised does not declare compromise when lock file is transiently unreadable
*/
import { mkdtempSync, mkdirSync, writeFileSync, rmSync, existsSync, renameSync, unlinkSync, chmodSync } from 'node:fs';
import { join } from 'node:path';
import { tmpdir } from 'node:os';
import { execSync, spawn } from 'node:child_process';
import {
acquireSessionLock,
getSessionLockStatus,
releaseSessionLock,
readExistingLockDataWithRetry,
type SessionLockData,
} from '../session-lock.ts';
import { gsdRoot } from '../paths.ts';
import { createTestContext } from './test-helpers.ts';
const { assertEq, assertTrue, report } = createTestContext();
async function main(): Promise<void> {
// ─── 1. readExistingLockDataWithRetry succeeds on first read when file is fine ─
console.log('\n=== 1. readExistingLockDataWithRetry reads file normally ===');
{
const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
mkdirSync(join(base, '.gsd'), { recursive: true });
try {
const lockFile = join(gsdRoot(base), 'auto.lock');
const lockData: SessionLockData = {
pid: process.pid,
startedAt: new Date().toISOString(),
unitType: 'execute-task',
unitId: 'M001/S01/T01',
unitStartedAt: new Date().toISOString(),
sessionFile: 'test-session.json',
};
writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
const result = readExistingLockDataWithRetry(lockFile);
assertTrue(result !== null, 'data returned for readable file');
assertEq(result!.pid, process.pid, 'correct PID read');
assertEq(result!.sessionFile, 'test-session.json', 'correct sessionFile read');
} finally {
rmSync(base, { recursive: true, force: true });
}
}
// ─── 2. readExistingLockDataWithRetry returns null for truly missing file ──
console.log('\n=== 2. readExistingLockDataWithRetry returns null for missing file ===');
{
const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
mkdirSync(join(base, '.gsd'), { recursive: true });
try {
const lockFile = join(gsdRoot(base), 'auto.lock');
// File doesn't exist
const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 2, delayMs: 10 });
assertEq(result, null, 'null for truly missing file after retries');
} finally {
rmSync(base, { recursive: true, force: true });
}
}
// ─── 3. readExistingLockDataWithRetry recovers after transient rename ──────
console.log('\n=== 3. readExistingLockDataWithRetry recovers after transient unavailability ===');
{
const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
mkdirSync(join(base, '.gsd'), { recursive: true });
try {
const lockFile = join(gsdRoot(base), 'auto.lock');
const tmpFile = lockFile + '.hidden';
const lockData: SessionLockData = {
pid: process.pid,
startedAt: new Date().toISOString(),
unitType: 'execute-task',
unitId: 'M001/S01/T01',
unitStartedAt: new Date().toISOString(),
sessionFile: 'recovery-session.json',
};
writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
// Simulate transient unavailability: move file away, spawn a child process
// to restore it after 100ms. The child runs outside our event loop so it
// fires even during busy-wait retries.
renameSync(lockFile, tmpFile);
spawn('bash', ['-c', `sleep 0.1 && mv "${tmpFile}" "${lockFile}"`], { stdio: 'ignore', detached: true }).unref();
// With retries (3 attempts, 200ms delay), it should recover on 2nd or 3rd attempt
const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 3, delayMs: 200 });
assertTrue(result !== null, 'data recovered after transient unavailability');
if (result) {
assertEq(result.pid, process.pid, 'correct PID after recovery');
assertEq(result.sessionFile, 'recovery-session.json', 'correct sessionFile after recovery');
}
} finally {
rmSync(base, { recursive: true, force: true });
}
}
// ─── 4. readExistingLockDataWithRetry recovers from transient permission error ─
console.log('\n=== 4. readExistingLockDataWithRetry recovers from transient permission error ===');
{
const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
mkdirSync(join(base, '.gsd'), { recursive: true });
try {
const lockFile = join(gsdRoot(base), 'auto.lock');
const lockData: SessionLockData = {
pid: process.pid,
startedAt: new Date().toISOString(),
unitType: 'execute-task',
unitId: 'M001/S01/T01',
unitStartedAt: new Date().toISOString(),
sessionFile: 'perm-session.json',
};
writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
// Remove read permission to simulate NFS/CIFS latency, then spawn a child
// to restore permissions after 100ms (runs outside our event loop).
chmodSync(lockFile, 0o000);
spawn('bash', ['-c', `sleep 0.1 && chmod 644 "${lockFile}"`], { stdio: 'ignore', detached: true }).unref();
const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 3, delayMs: 200 });
assertTrue(result !== null, 'data recovered after transient permission error');
if (result) {
assertEq(result.pid, process.pid, 'correct PID after permission recovery');
}
// Ensure permissions restored for cleanup
try { chmodSync(lockFile, 0o644); } catch { /* best-effort */ }
} finally {
rmSync(base, { recursive: true, force: true });
}
}
// ─── 5. getSessionLockStatus does not false-positive on transient read failure ─
console.log('\n=== 5. getSessionLockStatus tolerates transient lock file unavailability ===');
{
const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
mkdirSync(join(base, '.gsd'), { recursive: true });
try {
const result = acquireSessionLock(base);
assertTrue(result.acquired, 'lock acquired');
// Validate works initially
const status1 = getSessionLockStatus(base);
assertTrue(status1.valid, 'lock valid before transient failure');
// Temporarily hide the lock file
const lockFile = join(gsdRoot(base), 'auto.lock');
const tmpFile = lockFile + '.hidden';
renameSync(lockFile, tmpFile);
// Schedule restoration
setTimeout(() => {
try { renameSync(tmpFile, lockFile); } catch { /* best-effort */ }
}, 30);
// Small delay to ensure restoration runs, then check — with the OS lock
// still held, getSessionLockStatus should return valid=true even if the
// lock file was briefly missing (it checks _releaseFunction first).
await new Promise(r => setTimeout(r, 60));
const status2 = getSessionLockStatus(base);
assertTrue(status2.valid, 'lock still valid after transient file disappearance (OS lock held)');
// Restore if not yet restored
try { renameSync(tmpFile, lockFile); } catch { /* already restored */ }
releaseSessionLock(base);
} finally {
rmSync(base, { recursive: true, force: true });
}
}
// ─── 6. Retry defaults: 3 attempts with 200ms delay ────────────────────────
console.log('\n=== 6. Default retry params: function works with defaults ===');
{
const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
mkdirSync(join(base, '.gsd'), { recursive: true });
try {
const lockFile = join(gsdRoot(base), 'auto.lock');
const lockData: SessionLockData = {
pid: process.pid,
startedAt: new Date().toISOString(),
unitType: 'execute-task',
unitId: 'M001/S01/T01',
unitStartedAt: new Date().toISOString(),
sessionFile: 'status-session.json',
};
writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
// Call with no options — uses defaults (3 attempts, 200ms)
const result = readExistingLockDataWithRetry(lockFile);
assertTrue(result !== null, 'default params work for readable file');
} finally {
rmSync(base, { recursive: true, force: true });
}
}
report();
}
main().catch((error) => {
console.error(error);
process.exit(1);
});