From c09c256f285074e3cab971e5d33123825f99f535 Mon Sep 17 00:00:00 2001 From: Tom Boucher Date: Mon, 23 Mar 2026 22:35:43 -0400 Subject: [PATCH 1/2] fix(session-lock): retry lock file reads before declaring compromise onCompromised was declaring lock lost when the lock file was temporarily unreadable (NFS/CIFS latency, macOS APFS snapshot, or concurrent process briefly holding the file). Add readExistingLockDataWithRetry (3 attempts, 200ms delay) so transient filesystem hiccups do not trigger false-positive compromise events. Fixes #2324 Co-Authored-By: Claude Opus 4.6 (1M context) --- src/resources/extensions/gsd/session-lock.ts | 52 +++- .../tests/session-lock-transient-read.test.ts | 223 ++++++++++++++++++ 2 files changed, 268 insertions(+), 7 deletions(-) create mode 100644 src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts diff --git a/src/resources/extensions/gsd/session-lock.ts b/src/resources/extensions/gsd/session-lock.ts index e77c8bd7a..7c0a0d6ce 100644 --- a/src/resources/extensions/gsd/session-lock.ts +++ b/src/resources/extensions/gsd/session-lock.ts @@ -242,16 +242,16 @@ export function acquireSessionLock(basePath: string): SessionLockResult { return; // Suppress false positive } // Past the stale window — check if the lock file still belongs to us before - // declaring compromise (#1578). If our PID still owns the metadata, this is - // a false positive from a very long event loop stall (e.g. subagent execution). - const existing = readExistingLockData(lp); + // declaring compromise (#1578). Retry reads to tolerate transient filesystem + // hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324). + const existing = readExistingLockDataWithRetry(lp); if (existing && existing.pid === process.pid) { process.stderr.write( `[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`, ); return; // Our PID still owns the lock file — no real takeover } - // Lock file is gone or owned by another PID — real compromise + // Lock file is gone or owned by another PID after retries — real compromise _lockCompromised = true; _releaseFunction = null; }, @@ -301,8 +301,9 @@ export function acquireSessionLock(basePath: string): SessionLockResult { ); return; } - // Check PID ownership before declaring compromise (#1578) - const existing = readExistingLockData(lp); + // Check PID ownership before declaring compromise (#1578). + // Retry reads to tolerate transient filesystem hiccups (#2324). + const existing = readExistingLockDataWithRetry(lp); if (existing && existing.pid === process.pid) { process.stderr.write( `[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`, @@ -413,7 +414,8 @@ export function getSessionLockStatus(basePath: string): SessionLockStatus { // onCompromised fired from benign mtime drift (laptop sleep, event loop stall // beyond the stale window). Attempt re-acquisition instead of giving up. const lp = lockPath(basePath); - const existing = readExistingLockData(lp); + // Retry reads to tolerate transient filesystem hiccups (#2324). + const existing = readExistingLockDataWithRetry(lp); if (existing && existing.pid === process.pid) { // Lock file still ours — try to re-acquire the OS lock try { @@ -565,6 +567,42 @@ function readExistingLockData(lp: string): SessionLockData | null { } } +/** + * Retry-tolerant variant of readExistingLockData for use in onCompromised and + * other paths where a transient filesystem hiccup (NFS/CIFS latency, macOS APFS + * snapshot, concurrent process briefly holding the file) should NOT be treated + * as "lock file gone" (#2324). + * + * Retries up to `maxAttempts` times with `delayMs` between each attempt. + * Only returns null when ALL retries fail to read valid data. + */ +export interface RetryOptions { + maxAttempts?: number; + delayMs?: number; +} + +export function readExistingLockDataWithRetry( + lp: string, + options?: RetryOptions, +): SessionLockData | null { + const maxAttempts = options?.maxAttempts ?? 3; + const delayMs = options?.delayMs ?? 200; + + for (let attempt = 1; attempt <= maxAttempts; attempt++) { + const data = readExistingLockData(lp); + if (data !== null) return data; + if (attempt < maxAttempts) { + // Synchronous busy-wait — onCompromised runs in a sync callback context + // and the delays are short (200ms default). + const start = Date.now(); + while (Date.now() - start < delayMs) { + // busy-wait + } + } + } + return null; +} + function isPidAlive(pid: number): boolean { if (!Number.isInteger(pid) || pid <= 0) return false; if (pid === process.pid) return false; diff --git a/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts b/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts new file mode 100644 index 000000000..33b3d0f21 --- /dev/null +++ b/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts @@ -0,0 +1,223 @@ +/** + * session-lock-transient-read.test.ts — Tests for transient lock file unreadability (#2324). + * + * Regression coverage for: + * #2324 onCompromised declares lock lost when the lock file is temporarily + * unreadable (NFS/CIFS latency, macOS APFS snapshot, concurrent process + * briefly holding the file). + * + * Tests: + * - readExistingLockDataWithRetry retries on transient read failure + * - readExistingLockDataWithRetry returns data when file becomes readable after retries + * - readExistingLockDataWithRetry returns null only when ALL retries exhausted + * - onCompromised does not declare compromise when lock file is transiently unreadable + */ + +import { mkdtempSync, mkdirSync, writeFileSync, rmSync, existsSync, renameSync, unlinkSync, chmodSync } from 'node:fs'; +import { join } from 'node:path'; +import { tmpdir } from 'node:os'; +import { execSync, spawn } from 'node:child_process'; + +import { + acquireSessionLock, + getSessionLockStatus, + releaseSessionLock, + readExistingLockDataWithRetry, + type SessionLockData, +} from '../session-lock.ts'; +import { gsdRoot } from '../paths.ts'; +import { createTestContext } from './test-helpers.ts'; + +const { assertEq, assertTrue, report } = createTestContext(); + +async function main(): Promise { + + // ─── 1. readExistingLockDataWithRetry succeeds on first read when file is fine ─ + console.log('\n=== 1. readExistingLockDataWithRetry reads file normally ==='); + { + const base = mkdtempSync(join(tmpdir(), 'gsd-transient-')); + mkdirSync(join(base, '.gsd'), { recursive: true }); + + try { + const lockFile = join(gsdRoot(base), 'auto.lock'); + const lockData: SessionLockData = { + pid: process.pid, + startedAt: new Date().toISOString(), + unitType: 'execute-task', + unitId: 'M001/S01/T01', + unitStartedAt: new Date().toISOString(), + completedUnits: 3, + }; + writeFileSync(lockFile, JSON.stringify(lockData, null, 2)); + + const result = readExistingLockDataWithRetry(lockFile); + assertTrue(result !== null, 'data returned for readable file'); + assertEq(result!.pid, process.pid, 'correct PID read'); + assertEq(result!.completedUnits, 3, 'correct completedUnits read'); + } finally { + rmSync(base, { recursive: true, force: true }); + } + } + + // ─── 2. readExistingLockDataWithRetry returns null for truly missing file ── + console.log('\n=== 2. readExistingLockDataWithRetry returns null for missing file ==='); + { + const base = mkdtempSync(join(tmpdir(), 'gsd-transient-')); + mkdirSync(join(base, '.gsd'), { recursive: true }); + + try { + const lockFile = join(gsdRoot(base), 'auto.lock'); + // File doesn't exist + const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 2, delayMs: 10 }); + assertEq(result, null, 'null for truly missing file after retries'); + } finally { + rmSync(base, { recursive: true, force: true }); + } + } + + // ─── 3. readExistingLockDataWithRetry recovers after transient rename ────── + console.log('\n=== 3. readExistingLockDataWithRetry recovers after transient unavailability ==='); + { + const base = mkdtempSync(join(tmpdir(), 'gsd-transient-')); + mkdirSync(join(base, '.gsd'), { recursive: true }); + + try { + const lockFile = join(gsdRoot(base), 'auto.lock'); + const tmpFile = lockFile + '.hidden'; + const lockData: SessionLockData = { + pid: process.pid, + startedAt: new Date().toISOString(), + unitType: 'execute-task', + unitId: 'M001/S01/T01', + unitStartedAt: new Date().toISOString(), + completedUnits: 7, + }; + writeFileSync(lockFile, JSON.stringify(lockData, null, 2)); + + // Simulate transient unavailability: move file away, spawn a child process + // to restore it after 100ms. The child runs outside our event loop so it + // fires even during busy-wait retries. + renameSync(lockFile, tmpFile); + spawn('bash', ['-c', `sleep 0.1 && mv "${tmpFile}" "${lockFile}"`], { stdio: 'ignore', detached: true }).unref(); + + // With retries (3 attempts, 200ms delay), it should recover on 2nd or 3rd attempt + const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 3, delayMs: 200 }); + assertTrue(result !== null, 'data recovered after transient unavailability'); + if (result) { + assertEq(result.pid, process.pid, 'correct PID after recovery'); + assertEq(result.completedUnits, 7, 'correct completedUnits after recovery'); + } + } finally { + rmSync(base, { recursive: true, force: true }); + } + } + + // ─── 4. readExistingLockDataWithRetry recovers from transient permission error ─ + console.log('\n=== 4. readExistingLockDataWithRetry recovers from transient permission error ==='); + { + const base = mkdtempSync(join(tmpdir(), 'gsd-transient-')); + mkdirSync(join(base, '.gsd'), { recursive: true }); + + try { + const lockFile = join(gsdRoot(base), 'auto.lock'); + const lockData: SessionLockData = { + pid: process.pid, + startedAt: new Date().toISOString(), + unitType: 'execute-task', + unitId: 'M001/S01/T01', + unitStartedAt: new Date().toISOString(), + completedUnits: 5, + }; + writeFileSync(lockFile, JSON.stringify(lockData, null, 2)); + + // Remove read permission to simulate NFS/CIFS latency, then spawn a child + // to restore permissions after 100ms (runs outside our event loop). + chmodSync(lockFile, 0o000); + spawn('bash', ['-c', `sleep 0.1 && chmod 644 "${lockFile}"`], { stdio: 'ignore', detached: true }).unref(); + + const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 3, delayMs: 200 }); + assertTrue(result !== null, 'data recovered after transient permission error'); + if (result) { + assertEq(result.pid, process.pid, 'correct PID after permission recovery'); + } + + // Ensure permissions restored for cleanup + try { chmodSync(lockFile, 0o644); } catch { /* best-effort */ } + } finally { + rmSync(base, { recursive: true, force: true }); + } + } + + // ─── 5. getSessionLockStatus does not false-positive on transient read failure ─ + console.log('\n=== 5. getSessionLockStatus tolerates transient lock file unavailability ==='); + { + const base = mkdtempSync(join(tmpdir(), 'gsd-transient-')); + mkdirSync(join(base, '.gsd'), { recursive: true }); + + try { + const result = acquireSessionLock(base); + assertTrue(result.acquired, 'lock acquired'); + + // Validate works initially + const status1 = getSessionLockStatus(base); + assertTrue(status1.valid, 'lock valid before transient failure'); + + // Temporarily hide the lock file + const lockFile = join(gsdRoot(base), 'auto.lock'); + const tmpFile = lockFile + '.hidden'; + renameSync(lockFile, tmpFile); + + // Schedule restoration + setTimeout(() => { + try { renameSync(tmpFile, lockFile); } catch { /* best-effort */ } + }, 30); + + // Small delay to ensure restoration runs, then check — with the OS lock + // still held, getSessionLockStatus should return valid=true even if the + // lock file was briefly missing (it checks _releaseFunction first). + await new Promise(r => setTimeout(r, 60)); + const status2 = getSessionLockStatus(base); + assertTrue(status2.valid, 'lock still valid after transient file disappearance (OS lock held)'); + + // Restore if not yet restored + try { renameSync(tmpFile, lockFile); } catch { /* already restored */ } + + releaseSessionLock(base); + } finally { + rmSync(base, { recursive: true, force: true }); + } + } + + // ─── 6. Retry defaults: 3 attempts with 200ms delay ──────────────────────── + console.log('\n=== 6. Default retry params: function works with defaults ==='); + { + const base = mkdtempSync(join(tmpdir(), 'gsd-transient-')); + mkdirSync(join(base, '.gsd'), { recursive: true }); + + try { + const lockFile = join(gsdRoot(base), 'auto.lock'); + const lockData: SessionLockData = { + pid: process.pid, + startedAt: new Date().toISOString(), + unitType: 'execute-task', + unitId: 'M001/S01/T01', + unitStartedAt: new Date().toISOString(), + completedUnits: 0, + }; + writeFileSync(lockFile, JSON.stringify(lockData, null, 2)); + + // Call with no options — uses defaults (3 attempts, 200ms) + const result = readExistingLockDataWithRetry(lockFile); + assertTrue(result !== null, 'default params work for readable file'); + } finally { + rmSync(base, { recursive: true, force: true }); + } + } + + report(); +} + +main().catch((error) => { + console.error(error); + process.exit(1); +}); From b4405cbb3579d39a83f10496ae7a84ace848f78a Mon Sep 17 00:00:00 2001 From: Lex Christopherson Date: Wed, 25 Mar 2026 21:56:08 -0600 Subject: [PATCH 2/2] fix(test): replace stale completedUnits with sessionFile in session-lock test SessionLockData no longer has a completedUnits field. Use sessionFile (an actual optional field) for the same assertion coverage. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../gsd/tests/session-lock-transient-read.test.ts | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts b/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts index 33b3d0f21..85d0b93f4 100644 --- a/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts +++ b/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts @@ -46,14 +46,14 @@ async function main(): Promise { unitType: 'execute-task', unitId: 'M001/S01/T01', unitStartedAt: new Date().toISOString(), - completedUnits: 3, + sessionFile: 'test-session.json', }; writeFileSync(lockFile, JSON.stringify(lockData, null, 2)); const result = readExistingLockDataWithRetry(lockFile); assertTrue(result !== null, 'data returned for readable file'); assertEq(result!.pid, process.pid, 'correct PID read'); - assertEq(result!.completedUnits, 3, 'correct completedUnits read'); + assertEq(result!.sessionFile, 'test-session.json', 'correct sessionFile read'); } finally { rmSync(base, { recursive: true, force: true }); } @@ -90,7 +90,7 @@ async function main(): Promise { unitType: 'execute-task', unitId: 'M001/S01/T01', unitStartedAt: new Date().toISOString(), - completedUnits: 7, + sessionFile: 'recovery-session.json', }; writeFileSync(lockFile, JSON.stringify(lockData, null, 2)); @@ -105,7 +105,7 @@ async function main(): Promise { assertTrue(result !== null, 'data recovered after transient unavailability'); if (result) { assertEq(result.pid, process.pid, 'correct PID after recovery'); - assertEq(result.completedUnits, 7, 'correct completedUnits after recovery'); + assertEq(result.sessionFile, 'recovery-session.json', 'correct sessionFile after recovery'); } } finally { rmSync(base, { recursive: true, force: true }); @@ -126,7 +126,7 @@ async function main(): Promise { unitType: 'execute-task', unitId: 'M001/S01/T01', unitStartedAt: new Date().toISOString(), - completedUnits: 5, + sessionFile: 'perm-session.json', }; writeFileSync(lockFile, JSON.stringify(lockData, null, 2)); @@ -202,7 +202,7 @@ async function main(): Promise { unitType: 'execute-task', unitId: 'M001/S01/T01', unitStartedAt: new Date().toISOString(), - completedUnits: 0, + sessionFile: 'status-session.json', }; writeFileSync(lockFile, JSON.stringify(lockData, null, 2));