fix(parallel): resolve session lock contention and 3 related parallel-mode bugs (#2184) (#2800)

Per-milestone lock isolation prevents workers from contending on shared
.gsd/auto.lock. Budget ceiling scoped to current session for parallel
workers. Symlink sync skip prevents ERR_FS_CP_EINVAL. Planning artifacts
copied to worktree so workers can find their roadmap.

Closes #2184
This commit is contained in:
Jeremy McSpadden 2026-03-27 15:48:35 -05:00 committed by GitHub
parent cedf6a558d
commit b8d4f03747
6 changed files with 301 additions and 18 deletions

View file

@ -196,6 +196,11 @@ export function syncProjectRootToWorktree(
const prGsd = join(projectRoot, ".gsd");
const wtGsd = join(worktreePath_, ".gsd");
// When .gsd is a symlink to the same external directory in both locations,
// cpSync rejects the copy because source === destination (ERR_FS_CP_EINVAL).
// Compare realpaths and skip when they resolve to the same physical path (#2184).
if (isSamePath(prGsd, wtGsd)) return;
// Copy milestone directory from project root to worktree — additive only.
// force:false prevents cpSync from overwriting existing worktree files.
// Without this, worktree-authoritative files (e.g. VALIDATION.md written
@ -245,6 +250,11 @@ export function syncStateToProjectRoot(
const wtGsd = join(worktreePath_, ".gsd");
const prGsd = join(projectRoot, ".gsd");
// When .gsd is a symlink to the same external directory in both locations,
// cpSync rejects the copy because source === destination (ERR_FS_CP_EINVAL).
// Compare realpaths and skip when they resolve to the same physical path (#2184).
if (isSamePath(wtGsd, prGsd)) return;
// 1. STATE.md — the quick-glance status used by initial deriveState()
safeCopy(join(wtGsd, "STATE.md"), join(prGsd, "STATE.md"), { force: true });

View file

@ -719,8 +719,17 @@ export async function runGuards(
const budgetCeiling = prefs?.budget_ceiling;
if (budgetCeiling !== undefined && budgetCeiling > 0) {
const currentLedger = deps.getLedger() as { units: unknown } | null;
const totalCost = currentLedger
? deps.getProjectTotals(currentLedger.units).cost
// In parallel worker mode, only count cost from the current auto-mode session
// to avoid hitting the ceiling due to historical project-wide spend (#2184).
let costUnits = currentLedger?.units;
if (process.env.GSD_PARALLEL_WORKER && s.autoStartTime && Array.isArray(costUnits)) {
const sessionStartISO = new Date(s.autoStartTime).toISOString();
costUnits = costUnits.filter(
(u: { startedAt?: string }) => u.startedAt != null && u.startedAt >= sessionStartISO,
);
}
const totalCost = costUnits
? deps.getProjectTotals(costUnits).cost
: 0;
const budgetPct = totalCost / budgetCeiling;
const budgetAlertLevel = deps.getBudgetAlertLevel(budgetPct);

View file

@ -14,8 +14,7 @@ import { readFileSync, unlinkSync, existsSync } from "node:fs";
import { join } from "node:path";
import { gsdRoot } from "./paths.js";
import { atomicWriteSync } from "./atomic-write.js";
const LOCK_FILE = "auto.lock";
import { effectiveLockFile } from "./session-lock.js";
export interface LockData {
pid: number;
@ -28,7 +27,7 @@ export interface LockData {
}
function lockPath(basePath: string): string {
return join(gsdRoot(basePath), LOCK_FILE);
return join(gsdRoot(basePath), effectiveLockFile());
}
/** Write or update the lock file with current auto-mode state. */

View file

@ -21,7 +21,7 @@ import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { gsdRoot } from "./paths.js";
import { createWorktree, worktreePath } from "./worktree-manager.js";
import { autoWorktreeBranch, runWorktreePostCreateHook } from "./auto-worktree.js";
import { autoWorktreeBranch, runWorktreePostCreateHook, syncGsdStateToWorktree } from "./auto-worktree.js";
import { nativeBranchExists } from "./native-git-bridge.js";
import { readIntegrationBranch } from "./git-service.js";
import { resolveParallelConfig } from "./preferences.js";
@ -507,6 +507,11 @@ function createMilestoneWorktree(basePath: string, milestoneId: string): string
// Run post-create hook if configured
runWorktreePostCreateHook(basePath, info.path);
// Copy .gsd/ planning artifacts (milestones, CONTEXT, ROADMAP, etc.) from the
// project root into the worktree. Without this, workers for newly-planned
// milestones can't find their roadmap and exit immediately (#2184 Bug 4).
syncGsdStateToWorktree(basePath, info.path);
return info.path;
}

View file

@ -83,10 +83,31 @@ let _lockAcquiredAt: number = 0;
const LOCK_FILE = "auto.lock";
/**
* Derive the effective lock file name for the current process.
* In parallel worker mode (GSD_PARALLEL_WORKER + GSD_MILESTONE_LOCK),
* each worker uses a per-milestone lock file (`auto-<milestoneId>.lock`)
* to avoid contending on the shared `.gsd/auto.lock` (#2184).
*/
export function effectiveLockFile(): string {
const mid = process.env.GSD_PARALLEL_WORKER ? process.env.GSD_MILESTONE_LOCK : null;
return mid ? `auto-${mid}.lock` : LOCK_FILE;
}
/**
* Derive the OS-level lock target directory for the current process.
* In parallel worker mode, uses `.gsd/parallel/<milestoneId>/` instead of
* `.gsd/` so workers don't contend on the same proper-lockfile directory (#2184).
*/
export function effectiveLockTarget(gsdDir: string): string {
const mid = process.env.GSD_PARALLEL_WORKER ? process.env.GSD_MILESTONE_LOCK : null;
return mid ? join(gsdDir, "parallel", mid) : gsdDir;
}
function lockPath(basePath: string): string {
// If we have a snapshotted path from acquisition, use it for consistency
if (_snapshotLockPath) return _snapshotLockPath;
return join(gsdRoot(basePath), LOCK_FILE);
return join(gsdRoot(basePath), effectiveLockFile());
}
// ─── Stray Lock Cleanup ─────────────────────────────────────────────────────
@ -265,14 +286,16 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
}
const gsdDir = gsdRoot(basePath);
const lockTarget = effectiveLockTarget(gsdDir);
try {
// Try to acquire an exclusive OS-level lock on the lock file.
// We lock the directory (gsdRoot) since proper-lockfile works best
// on directories, and the lock file itself may not exist yet.
mkdirSync(gsdDir, { recursive: true });
// Try to acquire an exclusive OS-level lock on the lock target.
// We lock a directory since proper-lockfile works best on directories,
// and the lock file itself may not exist yet.
// In parallel worker mode, lockTarget is .gsd/parallel/<MID>/ (#2184).
mkdirSync(lockTarget, { recursive: true });
const release = lockfile.lockSync(gsdDir, {
const release = lockfile.lockSync(lockTarget, {
realpath: false,
stale: 1_800_000, // 30 minutes — safe for laptop sleep / long event loop stalls
update: 10_000, // Update lock mtime every 10s to prove liveness
@ -283,7 +306,7 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
// Safety net: clean up lock dir on process exit if _releaseFunction
// wasn't called (e.g., normal exit after clean completion) (#1245).
ensureExitHandler(gsdDir);
ensureExitHandler(lockTarget);
// Write the informational lock data
atomicWriteSync(lp, JSON.stringify(lockData, null, 2));
@ -298,12 +321,12 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
// If no lock file or no alive process, try to clean up and re-acquire (#1245)
if (!existingData || (existingPid && !isPidAlive(existingPid))) {
try {
const lockDir = join(gsdDir + ".lock");
const lockDir = join(lockTarget + ".lock");
if (existsSync(lockDir)) rmSync(lockDir, { recursive: true, force: true });
if (existsSync(lp)) unlinkSync(lp);
// Retry acquisition after cleanup
const release = lockfile.lockSync(gsdDir, {
const release = lockfile.lockSync(lockTarget, {
realpath: false,
stale: 1_800_000, // 30 minutes — match primary lock settings
update: 10_000,
@ -312,7 +335,7 @@ export function acquireSessionLock(basePath: string): SessionLockResult {
assignLockState(basePath, release, lp);
// Safety net — uses centralized handler to avoid double-registration
ensureExitHandler(gsdDir);
ensureExitHandler(lockTarget);
atomicWriteSync(lp, JSON.stringify(lockData, null, 2));
return { acquired: true };
@ -483,13 +506,24 @@ export function releaseSessionLock(basePath: string): void {
// Non-fatal
}
// Remove the proper-lockfile directory (.gsd.lock/) for the current path
// Remove the proper-lockfile directory for the current lock target.
// In parallel worker mode, this is .gsd/parallel/<MID>.lock/ (#2184).
const gsdDir = gsdRoot(basePath);
const lockTarget = effectiveLockTarget(gsdDir);
try {
const lockDir = join(gsdRoot(basePath) + ".lock");
const lockDir = join(lockTarget + ".lock");
if (existsSync(lockDir)) rmSync(lockDir, { recursive: true, force: true });
} catch {
// Non-fatal
}
// Also clean the per-milestone parallel directory itself if it exists
if (lockTarget !== gsdDir) {
try {
if (existsSync(lockTarget)) rmSync(lockTarget, { recursive: true, force: true });
} catch {
// Non-fatal
}
}
// Clean ALL registered lock paths (#1578) — lock files accumulate across
// main project .gsd/, worktree .gsd/, and projects registry paths.

View file

@ -0,0 +1,226 @@
/**
* parallel-worker-lock-contention.test.ts Regression tests for #2184.
*
* Covers all four bugs from the parallel worker contention issue:
* Bug 1: Session lock contention per-milestone lock isolation
* Bug 2: Budget ceiling scoped to current session for parallel workers
* Bug 3: syncProjectRootToWorktree skips when source === destination (symlinks)
* Bug 4: createMilestoneWorktree copies planning artifacts
*
* Copyright (c) 2026 Jeremy McSpadden <jeremy@fluxlabs.net>
*/
import {
mkdtempSync,
mkdirSync,
writeFileSync,
rmSync,
existsSync,
symlinkSync,
readFileSync,
} from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import {
acquireSessionLock,
releaseSessionLock,
effectiveLockFile,
effectiveLockTarget,
} from "../session-lock.ts";
import { gsdRoot } from "../paths.ts";
import {
syncProjectRootToWorktree,
syncStateToProjectRoot,
} from "../auto-worktree.ts";
import { writeLock, readCrashLock, clearLock } from "../crash-recovery.ts";
import { describe, test, beforeEach, afterEach } from "node:test";
import assert from "node:assert/strict";
// ─── Bug 1: Per-milestone lock isolation ──────────────────────────────────────
describe("parallel-worker-lock-contention (#2184)", () => {
// Save and restore env vars between tests
const savedEnv: Record<string, string | undefined> = {};
beforeEach(() => {
savedEnv.GSD_PARALLEL_WORKER = process.env.GSD_PARALLEL_WORKER;
savedEnv.GSD_MILESTONE_LOCK = process.env.GSD_MILESTONE_LOCK;
});
afterEach(() => {
if (savedEnv.GSD_PARALLEL_WORKER === undefined) {
delete process.env.GSD_PARALLEL_WORKER;
} else {
process.env.GSD_PARALLEL_WORKER = savedEnv.GSD_PARALLEL_WORKER;
}
if (savedEnv.GSD_MILESTONE_LOCK === undefined) {
delete process.env.GSD_MILESTONE_LOCK;
} else {
process.env.GSD_MILESTONE_LOCK = savedEnv.GSD_MILESTONE_LOCK;
}
});
// ─── Bug 1a: effectiveLockFile returns per-milestone name ────────────────
test("Bug 1a: effectiveLockFile returns auto.lock without parallel env", () => {
delete process.env.GSD_PARALLEL_WORKER;
delete process.env.GSD_MILESTONE_LOCK;
assert.equal(effectiveLockFile(), "auto.lock");
});
test("Bug 1a: effectiveLockFile returns auto-<MID>.lock in parallel mode", () => {
process.env.GSD_PARALLEL_WORKER = "1";
process.env.GSD_MILESTONE_LOCK = "M003";
assert.equal(effectiveLockFile(), "auto-M003.lock");
});
// ─── Bug 1b: effectiveLockTarget returns per-milestone directory ─────────
test("Bug 1b: effectiveLockTarget returns gsdDir without parallel env", () => {
delete process.env.GSD_PARALLEL_WORKER;
const gsdDir = "/tmp/test/.gsd";
assert.equal(effectiveLockTarget(gsdDir), gsdDir);
});
test("Bug 1b: effectiveLockTarget returns parallel/<MID> in parallel mode", () => {
process.env.GSD_PARALLEL_WORKER = "1";
process.env.GSD_MILESTONE_LOCK = "M003";
const gsdDir = "/tmp/test/.gsd";
assert.equal(effectiveLockTarget(gsdDir), join(gsdDir, "parallel", "M003"));
});
// ─── Bug 1c: Two parallel workers acquire independent locks ──────────────
test("Bug 1c: parallel workers use per-milestone lock files, not shared auto.lock", () => {
const base = mkdtempSync(join(tmpdir(), "gsd-parallel-lock-"));
mkdirSync(join(base, ".gsd"), { recursive: true });
try {
// Simulate worker for M001
process.env.GSD_PARALLEL_WORKER = "1";
process.env.GSD_MILESTONE_LOCK = "M001";
const r1 = acquireSessionLock(base);
assert.ok(r1.acquired, "M001 worker acquires lock");
// Verify the lock file is per-milestone
const gsdDir = gsdRoot(base);
const m001LockFile = join(gsdDir, "auto-M001.lock");
assert.ok(existsSync(m001LockFile), "auto-M001.lock exists");
// The shared auto.lock should NOT exist
const sharedLockFile = join(gsdDir, "auto.lock");
assert.ok(!existsSync(sharedLockFile), "shared auto.lock does NOT exist");
// The per-milestone lock target directory should exist
const m001LockTarget = join(gsdDir, "parallel", "M001");
assert.ok(existsSync(m001LockTarget), "parallel/M001 directory exists");
releaseSessionLock(base);
// After release, per-milestone lock file should be cleaned
assert.ok(!existsSync(m001LockFile), "auto-M001.lock cleaned after release");
} finally {
delete process.env.GSD_PARALLEL_WORKER;
delete process.env.GSD_MILESTONE_LOCK;
rmSync(base, { recursive: true, force: true });
}
});
// ─── Bug 1d: crash-recovery uses per-milestone lock file ─────────────────
test("Bug 1d: crash-recovery writeLock/readCrashLock uses per-milestone lock in parallel mode", () => {
const base = mkdtempSync(join(tmpdir(), "gsd-parallel-crash-"));
mkdirSync(join(base, ".gsd"), { recursive: true });
try {
process.env.GSD_PARALLEL_WORKER = "1";
process.env.GSD_MILESTONE_LOCK = "M002";
writeLock(base, "execute-task", "M002/S01/T01");
const gsdDir = gsdRoot(base);
const lockFile = join(gsdDir, "auto-M002.lock");
assert.ok(existsSync(lockFile), "crash-recovery writes auto-M002.lock");
const data = readCrashLock(base);
assert.ok(data !== null, "readCrashLock reads per-milestone lock");
assert.equal(data!.unitId, "M002/S01/T01");
clearLock(base);
assert.ok(!existsSync(lockFile), "clearLock removes per-milestone lock");
} finally {
delete process.env.GSD_PARALLEL_WORKER;
delete process.env.GSD_MILESTONE_LOCK;
rmSync(base, { recursive: true, force: true });
}
});
// ─── Bug 3: syncProjectRootToWorktree skips same-path symlinks ───────────
test("Bug 3: syncProjectRootToWorktree skips when .gsd resolves to same path (symlink)", () => {
const base = mkdtempSync(join(tmpdir(), "gsd-symlink-sync-"));
const externalGsd = join(base, "external-gsd");
const projectRoot = join(base, "project");
const worktreePath = join(base, "worktree");
mkdirSync(externalGsd, { recursive: true });
mkdirSync(projectRoot, { recursive: true });
mkdirSync(worktreePath, { recursive: true });
// Create the external state directory with a milestone
mkdirSync(join(externalGsd, "milestones", "M001"), { recursive: true });
writeFileSync(
join(externalGsd, "milestones", "M001", "M001-ROADMAP.md"),
"# Roadmap",
);
// Symlink both project and worktree .gsd to the same external directory
symlinkSync(externalGsd, join(projectRoot, ".gsd"));
symlinkSync(externalGsd, join(worktreePath, ".gsd"));
try {
// This should NOT throw ERR_FS_CP_EINVAL — it should skip silently
let threw = false;
try {
syncProjectRootToWorktree(projectRoot, worktreePath, "M001");
} catch {
threw = true;
}
assert.ok(!threw, "syncProjectRootToWorktree does not throw on same-path symlink");
// Same for reverse direction
threw = false;
try {
syncStateToProjectRoot(worktreePath, projectRoot, "M001");
} catch {
threw = true;
}
assert.ok(!threw, "syncStateToProjectRoot does not throw on same-path symlink");
} finally {
rmSync(base, { recursive: true, force: true });
}
});
// ─── Bug 3b: sync still works when paths are different ───────────────────
test("Bug 3b: syncProjectRootToWorktree copies when .gsd paths are different", () => {
const base = mkdtempSync(join(tmpdir(), "gsd-diff-sync-"));
const projectRoot = join(base, "project");
const worktreePath = join(base, "worktree");
mkdirSync(join(projectRoot, ".gsd", "milestones", "M001"), { recursive: true });
mkdirSync(join(worktreePath, ".gsd", "milestones"), { recursive: true });
writeFileSync(
join(projectRoot, ".gsd", "milestones", "M001", "M001-ROADMAP.md"),
"# Roadmap content",
);
try {
syncProjectRootToWorktree(projectRoot, worktreePath, "M001");
// The roadmap should have been copied
const copied = join(worktreePath, ".gsd", "milestones", "M001", "M001-ROADMAP.md");
assert.ok(existsSync(copied), "milestone roadmap copied to worktree");
assert.equal(readFileSync(copied, "utf-8"), "# Roadmap content");
} finally {
rmSync(base, { recursive: true, force: true });
}
});
});