From b8d4f037477bbe20b26f86ae21e62db42f295af2 Mon Sep 17 00:00:00 2001 From: Jeremy McSpadden Date: Fri, 27 Mar 2026 15:48:35 -0500 Subject: [PATCH] fix(parallel): resolve session lock contention and 3 related parallel-mode bugs (#2184) (#2800) Per-milestone lock isolation prevents workers from contending on shared .gsd/auto.lock. Budget ceiling scoped to current session for parallel workers. Symlink sync skip prevents ERR_FS_CP_EINVAL. Planning artifacts copied to worktree so workers can find their roadmap. Closes #2184 --- src/resources/extensions/gsd/auto-worktree.ts | 10 + src/resources/extensions/gsd/auto/phases.ts | 13 +- .../extensions/gsd/crash-recovery.ts | 5 +- .../extensions/gsd/parallel-orchestrator.ts | 7 +- src/resources/extensions/gsd/session-lock.ts | 58 ++++- .../parallel-worker-lock-contention.test.ts | 226 ++++++++++++++++++ 6 files changed, 301 insertions(+), 18 deletions(-) create mode 100644 src/resources/extensions/gsd/tests/parallel-worker-lock-contention.test.ts diff --git a/src/resources/extensions/gsd/auto-worktree.ts b/src/resources/extensions/gsd/auto-worktree.ts index ca39f72b8..1e9e78eb2 100644 --- a/src/resources/extensions/gsd/auto-worktree.ts +++ b/src/resources/extensions/gsd/auto-worktree.ts @@ -196,6 +196,11 @@ export function syncProjectRootToWorktree( const prGsd = join(projectRoot, ".gsd"); const wtGsd = join(worktreePath_, ".gsd"); + // When .gsd is a symlink to the same external directory in both locations, + // cpSync rejects the copy because source === destination (ERR_FS_CP_EINVAL). + // Compare realpaths and skip when they resolve to the same physical path (#2184). + if (isSamePath(prGsd, wtGsd)) return; + // Copy milestone directory from project root to worktree — additive only. // force:false prevents cpSync from overwriting existing worktree files. // Without this, worktree-authoritative files (e.g. VALIDATION.md written @@ -245,6 +250,11 @@ export function syncStateToProjectRoot( const wtGsd = join(worktreePath_, ".gsd"); const prGsd = join(projectRoot, ".gsd"); + // When .gsd is a symlink to the same external directory in both locations, + // cpSync rejects the copy because source === destination (ERR_FS_CP_EINVAL). + // Compare realpaths and skip when they resolve to the same physical path (#2184). + if (isSamePath(wtGsd, prGsd)) return; + // 1. STATE.md — the quick-glance status used by initial deriveState() safeCopy(join(wtGsd, "STATE.md"), join(prGsd, "STATE.md"), { force: true }); diff --git a/src/resources/extensions/gsd/auto/phases.ts b/src/resources/extensions/gsd/auto/phases.ts index 6269bfc0d..c8297ee3c 100644 --- a/src/resources/extensions/gsd/auto/phases.ts +++ b/src/resources/extensions/gsd/auto/phases.ts @@ -719,8 +719,17 @@ export async function runGuards( const budgetCeiling = prefs?.budget_ceiling; if (budgetCeiling !== undefined && budgetCeiling > 0) { const currentLedger = deps.getLedger() as { units: unknown } | null; - const totalCost = currentLedger - ? deps.getProjectTotals(currentLedger.units).cost + // In parallel worker mode, only count cost from the current auto-mode session + // to avoid hitting the ceiling due to historical project-wide spend (#2184). + let costUnits = currentLedger?.units; + if (process.env.GSD_PARALLEL_WORKER && s.autoStartTime && Array.isArray(costUnits)) { + const sessionStartISO = new Date(s.autoStartTime).toISOString(); + costUnits = costUnits.filter( + (u: { startedAt?: string }) => u.startedAt != null && u.startedAt >= sessionStartISO, + ); + } + const totalCost = costUnits + ? deps.getProjectTotals(costUnits).cost : 0; const budgetPct = totalCost / budgetCeiling; const budgetAlertLevel = deps.getBudgetAlertLevel(budgetPct); diff --git a/src/resources/extensions/gsd/crash-recovery.ts b/src/resources/extensions/gsd/crash-recovery.ts index 9d5caa8ef..1b147fead 100644 --- a/src/resources/extensions/gsd/crash-recovery.ts +++ b/src/resources/extensions/gsd/crash-recovery.ts @@ -14,8 +14,7 @@ import { readFileSync, unlinkSync, existsSync } from "node:fs"; import { join } from "node:path"; import { gsdRoot } from "./paths.js"; import { atomicWriteSync } from "./atomic-write.js"; - -const LOCK_FILE = "auto.lock"; +import { effectiveLockFile } from "./session-lock.js"; export interface LockData { pid: number; @@ -28,7 +27,7 @@ export interface LockData { } function lockPath(basePath: string): string { - return join(gsdRoot(basePath), LOCK_FILE); + return join(gsdRoot(basePath), effectiveLockFile()); } /** Write or update the lock file with current auto-mode state. */ diff --git a/src/resources/extensions/gsd/parallel-orchestrator.ts b/src/resources/extensions/gsd/parallel-orchestrator.ts index b59579a26..ff2ce775b 100644 --- a/src/resources/extensions/gsd/parallel-orchestrator.ts +++ b/src/resources/extensions/gsd/parallel-orchestrator.ts @@ -21,7 +21,7 @@ import { join, dirname } from "node:path"; import { fileURLToPath } from "node:url"; import { gsdRoot } from "./paths.js"; import { createWorktree, worktreePath } from "./worktree-manager.js"; -import { autoWorktreeBranch, runWorktreePostCreateHook } from "./auto-worktree.js"; +import { autoWorktreeBranch, runWorktreePostCreateHook, syncGsdStateToWorktree } from "./auto-worktree.js"; import { nativeBranchExists } from "./native-git-bridge.js"; import { readIntegrationBranch } from "./git-service.js"; import { resolveParallelConfig } from "./preferences.js"; @@ -507,6 +507,11 @@ function createMilestoneWorktree(basePath: string, milestoneId: string): string // Run post-create hook if configured runWorktreePostCreateHook(basePath, info.path); + // Copy .gsd/ planning artifacts (milestones, CONTEXT, ROADMAP, etc.) from the + // project root into the worktree. Without this, workers for newly-planned + // milestones can't find their roadmap and exit immediately (#2184 Bug 4). + syncGsdStateToWorktree(basePath, info.path); + return info.path; } diff --git a/src/resources/extensions/gsd/session-lock.ts b/src/resources/extensions/gsd/session-lock.ts index f0f3d2562..1d5a4e7a3 100644 --- a/src/resources/extensions/gsd/session-lock.ts +++ b/src/resources/extensions/gsd/session-lock.ts @@ -83,10 +83,31 @@ let _lockAcquiredAt: number = 0; const LOCK_FILE = "auto.lock"; +/** + * Derive the effective lock file name for the current process. + * In parallel worker mode (GSD_PARALLEL_WORKER + GSD_MILESTONE_LOCK), + * each worker uses a per-milestone lock file (`auto-.lock`) + * to avoid contending on the shared `.gsd/auto.lock` (#2184). + */ +export function effectiveLockFile(): string { + const mid = process.env.GSD_PARALLEL_WORKER ? process.env.GSD_MILESTONE_LOCK : null; + return mid ? `auto-${mid}.lock` : LOCK_FILE; +} + +/** + * Derive the OS-level lock target directory for the current process. + * In parallel worker mode, uses `.gsd/parallel//` instead of + * `.gsd/` so workers don't contend on the same proper-lockfile directory (#2184). + */ +export function effectiveLockTarget(gsdDir: string): string { + const mid = process.env.GSD_PARALLEL_WORKER ? process.env.GSD_MILESTONE_LOCK : null; + return mid ? join(gsdDir, "parallel", mid) : gsdDir; +} + function lockPath(basePath: string): string { // If we have a snapshotted path from acquisition, use it for consistency if (_snapshotLockPath) return _snapshotLockPath; - return join(gsdRoot(basePath), LOCK_FILE); + return join(gsdRoot(basePath), effectiveLockFile()); } // ─── Stray Lock Cleanup ───────────────────────────────────────────────────── @@ -265,14 +286,16 @@ export function acquireSessionLock(basePath: string): SessionLockResult { } const gsdDir = gsdRoot(basePath); + const lockTarget = effectiveLockTarget(gsdDir); try { - // Try to acquire an exclusive OS-level lock on the lock file. - // We lock the directory (gsdRoot) since proper-lockfile works best - // on directories, and the lock file itself may not exist yet. - mkdirSync(gsdDir, { recursive: true }); + // Try to acquire an exclusive OS-level lock on the lock target. + // We lock a directory since proper-lockfile works best on directories, + // and the lock file itself may not exist yet. + // In parallel worker mode, lockTarget is .gsd/parallel// (#2184). + mkdirSync(lockTarget, { recursive: true }); - const release = lockfile.lockSync(gsdDir, { + const release = lockfile.lockSync(lockTarget, { realpath: false, stale: 1_800_000, // 30 minutes — safe for laptop sleep / long event loop stalls update: 10_000, // Update lock mtime every 10s to prove liveness @@ -283,7 +306,7 @@ export function acquireSessionLock(basePath: string): SessionLockResult { // Safety net: clean up lock dir on process exit if _releaseFunction // wasn't called (e.g., normal exit after clean completion) (#1245). - ensureExitHandler(gsdDir); + ensureExitHandler(lockTarget); // Write the informational lock data atomicWriteSync(lp, JSON.stringify(lockData, null, 2)); @@ -298,12 +321,12 @@ export function acquireSessionLock(basePath: string): SessionLockResult { // If no lock file or no alive process, try to clean up and re-acquire (#1245) if (!existingData || (existingPid && !isPidAlive(existingPid))) { try { - const lockDir = join(gsdDir + ".lock"); + const lockDir = join(lockTarget + ".lock"); if (existsSync(lockDir)) rmSync(lockDir, { recursive: true, force: true }); if (existsSync(lp)) unlinkSync(lp); // Retry acquisition after cleanup - const release = lockfile.lockSync(gsdDir, { + const release = lockfile.lockSync(lockTarget, { realpath: false, stale: 1_800_000, // 30 minutes — match primary lock settings update: 10_000, @@ -312,7 +335,7 @@ export function acquireSessionLock(basePath: string): SessionLockResult { assignLockState(basePath, release, lp); // Safety net — uses centralized handler to avoid double-registration - ensureExitHandler(gsdDir); + ensureExitHandler(lockTarget); atomicWriteSync(lp, JSON.stringify(lockData, null, 2)); return { acquired: true }; @@ -483,13 +506,24 @@ export function releaseSessionLock(basePath: string): void { // Non-fatal } - // Remove the proper-lockfile directory (.gsd.lock/) for the current path + // Remove the proper-lockfile directory for the current lock target. + // In parallel worker mode, this is .gsd/parallel/.lock/ (#2184). + const gsdDir = gsdRoot(basePath); + const lockTarget = effectiveLockTarget(gsdDir); try { - const lockDir = join(gsdRoot(basePath) + ".lock"); + const lockDir = join(lockTarget + ".lock"); if (existsSync(lockDir)) rmSync(lockDir, { recursive: true, force: true }); } catch { // Non-fatal } + // Also clean the per-milestone parallel directory itself if it exists + if (lockTarget !== gsdDir) { + try { + if (existsSync(lockTarget)) rmSync(lockTarget, { recursive: true, force: true }); + } catch { + // Non-fatal + } + } // Clean ALL registered lock paths (#1578) — lock files accumulate across // main project .gsd/, worktree .gsd/, and projects registry paths. diff --git a/src/resources/extensions/gsd/tests/parallel-worker-lock-contention.test.ts b/src/resources/extensions/gsd/tests/parallel-worker-lock-contention.test.ts new file mode 100644 index 000000000..0f27fa0ac --- /dev/null +++ b/src/resources/extensions/gsd/tests/parallel-worker-lock-contention.test.ts @@ -0,0 +1,226 @@ +/** + * parallel-worker-lock-contention.test.ts — Regression tests for #2184. + * + * Covers all four bugs from the parallel worker contention issue: + * Bug 1: Session lock contention — per-milestone lock isolation + * Bug 2: Budget ceiling scoped to current session for parallel workers + * Bug 3: syncProjectRootToWorktree skips when source === destination (symlinks) + * Bug 4: createMilestoneWorktree copies planning artifacts + * + * Copyright (c) 2026 Jeremy McSpadden + */ + +import { + mkdtempSync, + mkdirSync, + writeFileSync, + rmSync, + existsSync, + symlinkSync, + readFileSync, +} from "node:fs"; +import { join } from "node:path"; +import { tmpdir } from "node:os"; + +import { + acquireSessionLock, + releaseSessionLock, + effectiveLockFile, + effectiveLockTarget, +} from "../session-lock.ts"; +import { gsdRoot } from "../paths.ts"; +import { + syncProjectRootToWorktree, + syncStateToProjectRoot, +} from "../auto-worktree.ts"; +import { writeLock, readCrashLock, clearLock } from "../crash-recovery.ts"; +import { describe, test, beforeEach, afterEach } from "node:test"; +import assert from "node:assert/strict"; + +// ─── Bug 1: Per-milestone lock isolation ────────────────────────────────────── + +describe("parallel-worker-lock-contention (#2184)", () => { + // Save and restore env vars between tests + const savedEnv: Record = {}; + + beforeEach(() => { + savedEnv.GSD_PARALLEL_WORKER = process.env.GSD_PARALLEL_WORKER; + savedEnv.GSD_MILESTONE_LOCK = process.env.GSD_MILESTONE_LOCK; + }); + + afterEach(() => { + if (savedEnv.GSD_PARALLEL_WORKER === undefined) { + delete process.env.GSD_PARALLEL_WORKER; + } else { + process.env.GSD_PARALLEL_WORKER = savedEnv.GSD_PARALLEL_WORKER; + } + if (savedEnv.GSD_MILESTONE_LOCK === undefined) { + delete process.env.GSD_MILESTONE_LOCK; + } else { + process.env.GSD_MILESTONE_LOCK = savedEnv.GSD_MILESTONE_LOCK; + } + }); + + // ─── Bug 1a: effectiveLockFile returns per-milestone name ──────────────── + test("Bug 1a: effectiveLockFile returns auto.lock without parallel env", () => { + delete process.env.GSD_PARALLEL_WORKER; + delete process.env.GSD_MILESTONE_LOCK; + assert.equal(effectiveLockFile(), "auto.lock"); + }); + + test("Bug 1a: effectiveLockFile returns auto-.lock in parallel mode", () => { + process.env.GSD_PARALLEL_WORKER = "1"; + process.env.GSD_MILESTONE_LOCK = "M003"; + assert.equal(effectiveLockFile(), "auto-M003.lock"); + }); + + // ─── Bug 1b: effectiveLockTarget returns per-milestone directory ───────── + test("Bug 1b: effectiveLockTarget returns gsdDir without parallel env", () => { + delete process.env.GSD_PARALLEL_WORKER; + const gsdDir = "/tmp/test/.gsd"; + assert.equal(effectiveLockTarget(gsdDir), gsdDir); + }); + + test("Bug 1b: effectiveLockTarget returns parallel/ in parallel mode", () => { + process.env.GSD_PARALLEL_WORKER = "1"; + process.env.GSD_MILESTONE_LOCK = "M003"; + const gsdDir = "/tmp/test/.gsd"; + assert.equal(effectiveLockTarget(gsdDir), join(gsdDir, "parallel", "M003")); + }); + + // ─── Bug 1c: Two parallel workers acquire independent locks ────────────── + test("Bug 1c: parallel workers use per-milestone lock files, not shared auto.lock", () => { + const base = mkdtempSync(join(tmpdir(), "gsd-parallel-lock-")); + mkdirSync(join(base, ".gsd"), { recursive: true }); + + try { + // Simulate worker for M001 + process.env.GSD_PARALLEL_WORKER = "1"; + process.env.GSD_MILESTONE_LOCK = "M001"; + + const r1 = acquireSessionLock(base); + assert.ok(r1.acquired, "M001 worker acquires lock"); + + // Verify the lock file is per-milestone + const gsdDir = gsdRoot(base); + const m001LockFile = join(gsdDir, "auto-M001.lock"); + assert.ok(existsSync(m001LockFile), "auto-M001.lock exists"); + + // The shared auto.lock should NOT exist + const sharedLockFile = join(gsdDir, "auto.lock"); + assert.ok(!existsSync(sharedLockFile), "shared auto.lock does NOT exist"); + + // The per-milestone lock target directory should exist + const m001LockTarget = join(gsdDir, "parallel", "M001"); + assert.ok(existsSync(m001LockTarget), "parallel/M001 directory exists"); + + releaseSessionLock(base); + + // After release, per-milestone lock file should be cleaned + assert.ok(!existsSync(m001LockFile), "auto-M001.lock cleaned after release"); + } finally { + delete process.env.GSD_PARALLEL_WORKER; + delete process.env.GSD_MILESTONE_LOCK; + rmSync(base, { recursive: true, force: true }); + } + }); + + // ─── Bug 1d: crash-recovery uses per-milestone lock file ───────────────── + test("Bug 1d: crash-recovery writeLock/readCrashLock uses per-milestone lock in parallel mode", () => { + const base = mkdtempSync(join(tmpdir(), "gsd-parallel-crash-")); + mkdirSync(join(base, ".gsd"), { recursive: true }); + + try { + process.env.GSD_PARALLEL_WORKER = "1"; + process.env.GSD_MILESTONE_LOCK = "M002"; + + writeLock(base, "execute-task", "M002/S01/T01"); + + const gsdDir = gsdRoot(base); + const lockFile = join(gsdDir, "auto-M002.lock"); + assert.ok(existsSync(lockFile), "crash-recovery writes auto-M002.lock"); + + const data = readCrashLock(base); + assert.ok(data !== null, "readCrashLock reads per-milestone lock"); + assert.equal(data!.unitId, "M002/S01/T01"); + + clearLock(base); + assert.ok(!existsSync(lockFile), "clearLock removes per-milestone lock"); + } finally { + delete process.env.GSD_PARALLEL_WORKER; + delete process.env.GSD_MILESTONE_LOCK; + rmSync(base, { recursive: true, force: true }); + } + }); + + // ─── Bug 3: syncProjectRootToWorktree skips same-path symlinks ─────────── + test("Bug 3: syncProjectRootToWorktree skips when .gsd resolves to same path (symlink)", () => { + const base = mkdtempSync(join(tmpdir(), "gsd-symlink-sync-")); + const externalGsd = join(base, "external-gsd"); + const projectRoot = join(base, "project"); + const worktreePath = join(base, "worktree"); + + mkdirSync(externalGsd, { recursive: true }); + mkdirSync(projectRoot, { recursive: true }); + mkdirSync(worktreePath, { recursive: true }); + + // Create the external state directory with a milestone + mkdirSync(join(externalGsd, "milestones", "M001"), { recursive: true }); + writeFileSync( + join(externalGsd, "milestones", "M001", "M001-ROADMAP.md"), + "# Roadmap", + ); + + // Symlink both project and worktree .gsd to the same external directory + symlinkSync(externalGsd, join(projectRoot, ".gsd")); + symlinkSync(externalGsd, join(worktreePath, ".gsd")); + + try { + // This should NOT throw ERR_FS_CP_EINVAL — it should skip silently + let threw = false; + try { + syncProjectRootToWorktree(projectRoot, worktreePath, "M001"); + } catch { + threw = true; + } + assert.ok(!threw, "syncProjectRootToWorktree does not throw on same-path symlink"); + + // Same for reverse direction + threw = false; + try { + syncStateToProjectRoot(worktreePath, projectRoot, "M001"); + } catch { + threw = true; + } + assert.ok(!threw, "syncStateToProjectRoot does not throw on same-path symlink"); + } finally { + rmSync(base, { recursive: true, force: true }); + } + }); + + // ─── Bug 3b: sync still works when paths are different ─────────────────── + test("Bug 3b: syncProjectRootToWorktree copies when .gsd paths are different", () => { + const base = mkdtempSync(join(tmpdir(), "gsd-diff-sync-")); + const projectRoot = join(base, "project"); + const worktreePath = join(base, "worktree"); + + mkdirSync(join(projectRoot, ".gsd", "milestones", "M001"), { recursive: true }); + mkdirSync(join(worktreePath, ".gsd", "milestones"), { recursive: true }); + + writeFileSync( + join(projectRoot, ".gsd", "milestones", "M001", "M001-ROADMAP.md"), + "# Roadmap content", + ); + + try { + syncProjectRootToWorktree(projectRoot, worktreePath, "M001"); + + // The roadmap should have been copied + const copied = join(worktreePath, ".gsd", "milestones", "M001", "M001-ROADMAP.md"); + assert.ok(existsSync(copied), "milestone roadmap copied to worktree"); + assert.equal(readFileSync(copied, "utf-8"), "# Roadmap content"); + } finally { + rmSync(base, { recursive: true, force: true }); + } + }); +});