From c09c256f285074e3cab971e5d33123825f99f535 Mon Sep 17 00:00:00 2001
From: Tom Boucher <trekkie@nomorestars.com>
Date: Mon, 23 Mar 2026 22:35:43 -0400
Subject: [PATCH 1/2] fix(session-lock): retry lock file reads before declaring
 compromise

onCompromised was declaring lock lost when the lock file was temporarily
unreadable (NFS/CIFS latency, macOS APFS snapshot, or concurrent process
briefly holding the file). Add readExistingLockDataWithRetry (3 attempts,
200ms delay) so transient filesystem hiccups do not trigger false-positive
compromise events.

Fixes #2324

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 src/resources/extensions/gsd/session-lock.ts  |  52 +++-
 .../tests/session-lock-transient-read.test.ts | 223 ++++++++++++++++++
 2 files changed, 268 insertions(+), 7 deletions(-)
 create mode 100644 src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts

diff --git a/src/resources/extensions/gsd/session-lock.ts b/src/resources/extensions/gsd/session-lock.ts
index e77c8bd7a..7c0a0d6ce 100644
--- a/src/resources/extensions/gsd/session-lock.ts
+++ b/src/resources/extensions/gsd/session-lock.ts
@@ -242,16 +242,16 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
           return; // Suppress false positive
         }
         // Past the stale window — check if the lock file still belongs to us before
-        // declaring compromise (#1578). If our PID still owns the metadata, this is
-        // a false positive from a very long event loop stall (e.g. subagent execution).
-        const existing = readExistingLockData(lp);
+        // declaring compromise (#1578). Retry reads to tolerate transient filesystem
+        // hiccups (NFS/CIFS latency, APFS snapshots, etc.) (#2324).
+        const existing = readExistingLockDataWithRetry(lp);
         if (existing && existing.pid === process.pid) {
           process.stderr.write(
             `[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
           );
           return; // Our PID still owns the lock file — no real takeover
         }
-        // Lock file is gone or owned by another PID — real compromise
+        // Lock file is gone or owned by another PID after retries — real compromise
         _lockCompromised = true;
         _releaseFunction = null;
       },
@@ -301,8 +301,9 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
               );
               return;
             }
-            // Check PID ownership before declaring compromise (#1578)
-            const existing = readExistingLockData(lp);
+            // Check PID ownership before declaring compromise (#1578).
+            // Retry reads to tolerate transient filesystem hiccups (#2324).
+            const existing = readExistingLockDataWithRetry(lp);
             if (existing && existing.pid === process.pid) {
               process.stderr.write(
                 `[gsd] Lock heartbeat mismatch after ${Math.round(elapsed / 1000)}s — lock file still owned by PID ${process.pid}, treating as false positive.\n`,
@@ -413,7 +414,8 @@ export function getSessionLockStatus(basePath: string): SessionLockStatus {
     // onCompromised fired from benign mtime drift (laptop sleep, event loop stall
     // beyond the stale window). Attempt re-acquisition instead of giving up.
     const lp = lockPath(basePath);
-    const existing = readExistingLockData(lp);
+    // Retry reads to tolerate transient filesystem hiccups (#2324).
+    const existing = readExistingLockDataWithRetry(lp);
     if (existing && existing.pid === process.pid) {
       // Lock file still ours — try to re-acquire the OS lock
       try {
@@ -565,6 +567,42 @@ function readExistingLockData(lp: string): SessionLockData | null {
   }
 }
 
+/**
+ * Retry-tolerant variant of readExistingLockData for use in onCompromised and
+ * other paths where a transient filesystem hiccup (NFS/CIFS latency, macOS APFS
+ * snapshot, concurrent process briefly holding the file) should NOT be treated
+ * as "lock file gone" (#2324).
+ *
+ * Retries up to `maxAttempts` times with `delayMs` between each attempt.
+ * Only returns null when ALL retries fail to read valid data.
+ */
+export interface RetryOptions {
+  maxAttempts?: number;
+  delayMs?: number;
+}
+
+export function readExistingLockDataWithRetry(
+  lp: string,
+  options?: RetryOptions,
+): SessionLockData | null {
+  const maxAttempts = options?.maxAttempts ?? 3;
+  const delayMs = options?.delayMs ?? 200;
+
+  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
+    const data = readExistingLockData(lp);
+    if (data !== null) return data;
+    if (attempt < maxAttempts) {
+      // Synchronous busy-wait — onCompromised runs in a sync callback context
+      // and the delays are short (200ms default).
+      const start = Date.now();
+      while (Date.now() - start < delayMs) {
+        // busy-wait
+      }
+    }
+  }
+  return null;
+}
+
 function isPidAlive(pid: number): boolean {
   if (!Number.isInteger(pid) || pid <= 0) return false;
   if (pid === process.pid) return false;
diff --git a/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts b/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts
new file mode 100644
index 000000000..33b3d0f21
--- /dev/null
+++ b/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts
@@ -0,0 +1,223 @@
+/**
+ * session-lock-transient-read.test.ts — Tests for transient lock file unreadability (#2324).
+ *
+ * Regression coverage for:
+ *   #2324  onCompromised declares lock lost when the lock file is temporarily
+ *          unreadable (NFS/CIFS latency, macOS APFS snapshot, concurrent process
+ *          briefly holding the file).
+ *
+ * Tests:
+ *   - readExistingLockDataWithRetry retries on transient read failure
+ *   - readExistingLockDataWithRetry returns data when file becomes readable after retries
+ *   - readExistingLockDataWithRetry returns null only when ALL retries exhausted
+ *   - onCompromised does not declare compromise when lock file is transiently unreadable
+ */
+
+import { mkdtempSync, mkdirSync, writeFileSync, rmSync, existsSync, renameSync, unlinkSync, chmodSync } from 'node:fs';
+import { join } from 'node:path';
+import { tmpdir } from 'node:os';
+import { execSync, spawn } from 'node:child_process';
+
+import {
+  acquireSessionLock,
+  getSessionLockStatus,
+  releaseSessionLock,
+  readExistingLockDataWithRetry,
+  type SessionLockData,
+} from '../session-lock.ts';
+import { gsdRoot } from '../paths.ts';
+import { createTestContext } from './test-helpers.ts';
+
+const { assertEq, assertTrue, report } = createTestContext();
+
+async function main(): Promise<void> {
+
+  // ─── 1. readExistingLockDataWithRetry succeeds on first read when file is fine ─
+  console.log('\n=== 1. readExistingLockDataWithRetry reads file normally ===');
+  {
+    const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
+    mkdirSync(join(base, '.gsd'), { recursive: true });
+
+    try {
+      const lockFile = join(gsdRoot(base), 'auto.lock');
+      const lockData: SessionLockData = {
+        pid: process.pid,
+        startedAt: new Date().toISOString(),
+        unitType: 'execute-task',
+        unitId: 'M001/S01/T01',
+        unitStartedAt: new Date().toISOString(),
+        completedUnits: 3,
+      };
+      writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
+
+      const result = readExistingLockDataWithRetry(lockFile);
+      assertTrue(result !== null, 'data returned for readable file');
+      assertEq(result!.pid, process.pid, 'correct PID read');
+      assertEq(result!.completedUnits, 3, 'correct completedUnits read');
+    } finally {
+      rmSync(base, { recursive: true, force: true });
+    }
+  }
+
+  // ─── 2. readExistingLockDataWithRetry returns null for truly missing file ──
+  console.log('\n=== 2. readExistingLockDataWithRetry returns null for missing file ===');
+  {
+    const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
+    mkdirSync(join(base, '.gsd'), { recursive: true });
+
+    try {
+      const lockFile = join(gsdRoot(base), 'auto.lock');
+      // File doesn't exist
+      const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 2, delayMs: 10 });
+      assertEq(result, null, 'null for truly missing file after retries');
+    } finally {
+      rmSync(base, { recursive: true, force: true });
+    }
+  }
+
+  // ─── 3. readExistingLockDataWithRetry recovers after transient rename ──────
+  console.log('\n=== 3. readExistingLockDataWithRetry recovers after transient unavailability ===');
+  {
+    const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
+    mkdirSync(join(base, '.gsd'), { recursive: true });
+
+    try {
+      const lockFile = join(gsdRoot(base), 'auto.lock');
+      const tmpFile = lockFile + '.hidden';
+      const lockData: SessionLockData = {
+        pid: process.pid,
+        startedAt: new Date().toISOString(),
+        unitType: 'execute-task',
+        unitId: 'M001/S01/T01',
+        unitStartedAt: new Date().toISOString(),
+        completedUnits: 7,
+      };
+      writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
+
+      // Simulate transient unavailability: move file away, spawn a child process
+      // to restore it after 100ms. The child runs outside our event loop so it
+      // fires even during busy-wait retries.
+      renameSync(lockFile, tmpFile);
+      spawn('bash', ['-c', `sleep 0.1 && mv "${tmpFile}" "${lockFile}"`], { stdio: 'ignore', detached: true }).unref();
+
+      // With retries (3 attempts, 200ms delay), it should recover on 2nd or 3rd attempt
+      const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 3, delayMs: 200 });
+      assertTrue(result !== null, 'data recovered after transient unavailability');
+      if (result) {
+        assertEq(result.pid, process.pid, 'correct PID after recovery');
+        assertEq(result.completedUnits, 7, 'correct completedUnits after recovery');
+      }
+    } finally {
+      rmSync(base, { recursive: true, force: true });
+    }
+  }
+
+  // ─── 4. readExistingLockDataWithRetry recovers from transient permission error ─
+  console.log('\n=== 4. readExistingLockDataWithRetry recovers from transient permission error ===');
+  {
+    const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
+    mkdirSync(join(base, '.gsd'), { recursive: true });
+
+    try {
+      const lockFile = join(gsdRoot(base), 'auto.lock');
+      const lockData: SessionLockData = {
+        pid: process.pid,
+        startedAt: new Date().toISOString(),
+        unitType: 'execute-task',
+        unitId: 'M001/S01/T01',
+        unitStartedAt: new Date().toISOString(),
+        completedUnits: 5,
+      };
+      writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
+
+      // Remove read permission to simulate NFS/CIFS latency, then spawn a child
+      // to restore permissions after 100ms (runs outside our event loop).
+      chmodSync(lockFile, 0o000);
+      spawn('bash', ['-c', `sleep 0.1 && chmod 644 "${lockFile}"`], { stdio: 'ignore', detached: true }).unref();
+
+      const result = readExistingLockDataWithRetry(lockFile, { maxAttempts: 3, delayMs: 200 });
+      assertTrue(result !== null, 'data recovered after transient permission error');
+      if (result) {
+        assertEq(result.pid, process.pid, 'correct PID after permission recovery');
+      }
+
+      // Ensure permissions restored for cleanup
+      try { chmodSync(lockFile, 0o644); } catch { /* best-effort */ }
+    } finally {
+      rmSync(base, { recursive: true, force: true });
+    }
+  }
+
+  // ─── 5. getSessionLockStatus does not false-positive on transient read failure ─
+  console.log('\n=== 5. getSessionLockStatus tolerates transient lock file unavailability ===');
+  {
+    const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
+    mkdirSync(join(base, '.gsd'), { recursive: true });
+
+    try {
+      const result = acquireSessionLock(base);
+      assertTrue(result.acquired, 'lock acquired');
+
+      // Validate works initially
+      const status1 = getSessionLockStatus(base);
+      assertTrue(status1.valid, 'lock valid before transient failure');
+
+      // Temporarily hide the lock file
+      const lockFile = join(gsdRoot(base), 'auto.lock');
+      const tmpFile = lockFile + '.hidden';
+      renameSync(lockFile, tmpFile);
+
+      // Schedule restoration
+      setTimeout(() => {
+        try { renameSync(tmpFile, lockFile); } catch { /* best-effort */ }
+      }, 30);
+
+      // Small delay to ensure restoration runs, then check — with the OS lock
+      // still held, getSessionLockStatus should return valid=true even if the
+      // lock file was briefly missing (it checks _releaseFunction first).
+      await new Promise(r => setTimeout(r, 60));
+      const status2 = getSessionLockStatus(base);
+      assertTrue(status2.valid, 'lock still valid after transient file disappearance (OS lock held)');
+
+      // Restore if not yet restored
+      try { renameSync(tmpFile, lockFile); } catch { /* already restored */ }
+
+      releaseSessionLock(base);
+    } finally {
+      rmSync(base, { recursive: true, force: true });
+    }
+  }
+
+  // ─── 6. Retry defaults: 3 attempts with 200ms delay ────────────────────────
+  console.log('\n=== 6. Default retry params: function works with defaults ===');
+  {
+    const base = mkdtempSync(join(tmpdir(), 'gsd-transient-'));
+    mkdirSync(join(base, '.gsd'), { recursive: true });
+
+    try {
+      const lockFile = join(gsdRoot(base), 'auto.lock');
+      const lockData: SessionLockData = {
+        pid: process.pid,
+        startedAt: new Date().toISOString(),
+        unitType: 'execute-task',
+        unitId: 'M001/S01/T01',
+        unitStartedAt: new Date().toISOString(),
+        completedUnits: 0,
+      };
+      writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
+
+      // Call with no options — uses defaults (3 attempts, 200ms)
+      const result = readExistingLockDataWithRetry(lockFile);
+      assertTrue(result !== null, 'default params work for readable file');
+    } finally {
+      rmSync(base, { recursive: true, force: true });
+    }
+  }
+
+  report();
+}
+
+main().catch((error) => {
+  console.error(error);
+  process.exit(1);
+});

From b4405cbb3579d39a83f10496ae7a84ace848f78a Mon Sep 17 00:00:00 2001
From: Lex Christopherson <lex@glittercowboy.com>
Date: Wed, 25 Mar 2026 21:56:08 -0600
Subject: [PATCH 2/2] fix(test): replace stale completedUnits with sessionFile
 in session-lock test

SessionLockData no longer has a completedUnits field. Use sessionFile
(an actual optional field) for the same assertion coverage.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../gsd/tests/session-lock-transient-read.test.ts    | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts b/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts
index 33b3d0f21..85d0b93f4 100644
--- a/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts
+++ b/src/resources/extensions/gsd/tests/session-lock-transient-read.test.ts
@@ -46,14 +46,14 @@ async function main(): Promise<void> {
         unitType: 'execute-task',
         unitId: 'M001/S01/T01',
         unitStartedAt: new Date().toISOString(),
-        completedUnits: 3,
+        sessionFile: 'test-session.json',
       };
       writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
 
       const result = readExistingLockDataWithRetry(lockFile);
       assertTrue(result !== null, 'data returned for readable file');
       assertEq(result!.pid, process.pid, 'correct PID read');
-      assertEq(result!.completedUnits, 3, 'correct completedUnits read');
+      assertEq(result!.sessionFile, 'test-session.json', 'correct sessionFile read');
     } finally {
       rmSync(base, { recursive: true, force: true });
     }
@@ -90,7 +90,7 @@ async function main(): Promise<void> {
         unitType: 'execute-task',
         unitId: 'M001/S01/T01',
         unitStartedAt: new Date().toISOString(),
-        completedUnits: 7,
+        sessionFile: 'recovery-session.json',
       };
       writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
 
@@ -105,7 +105,7 @@ async function main(): Promise<void> {
       assertTrue(result !== null, 'data recovered after transient unavailability');
       if (result) {
         assertEq(result.pid, process.pid, 'correct PID after recovery');
-        assertEq(result.completedUnits, 7, 'correct completedUnits after recovery');
+        assertEq(result.sessionFile, 'recovery-session.json', 'correct sessionFile after recovery');
       }
     } finally {
       rmSync(base, { recursive: true, force: true });
@@ -126,7 +126,7 @@ async function main(): Promise<void> {
         unitType: 'execute-task',
         unitId: 'M001/S01/T01',
         unitStartedAt: new Date().toISOString(),
-        completedUnits: 5,
+        sessionFile: 'perm-session.json',
       };
       writeFileSync(lockFile, JSON.stringify(lockData, null, 2));
 
@@ -202,7 +202,7 @@ async function main(): Promise<void> {
         unitType: 'execute-task',
         unitId: 'M001/S01/T01',
         unitStartedAt: new Date().toISOString(),
-        completedUnits: 0,
+        sessionFile: 'status-session.json',
       };
       writeFileSync(lockFile, JSON.stringify(lockData, null, 2));