fix(parallel): three bugs preventing reliable parallel worker execution (#2801)

Bug 1 — Workers exit immediately (#2792):
spawnWorker() used `--print "/gsd auto"` which calls session.prompt()
that returns immediately when ctx.newSession() resets the session inside
the auto-loop. Changed to `headless --json auto` which uses an RPC
client that keeps the process alive until auto-mode completes.

Bug 2 — Dispatch guard blocks parallel workers (#2797):
getPriorSliceCompletionBlocker() checked ALL milestones in queue order,
blocking M012 when M011 had incomplete slices. When GSD_MILESTONE_LOCK
is set, the guard now only checks intra-milestone slice dependencies.
Added test covering cross-milestone bypass + intra-milestone preservation.

Bug 3 — Orphaned RPC children on stop (#2798):
stopParallel() gave only 750ms for SIGTERM before SIGKILL. The headless
parent needs ~1500ms to cascade shutdown to its RPC child via
client.stop(). Increased to 3000ms to prevent orphaned processes holding
auto.lock.

Updated tests:
- dispatch-guard.test.ts: new test for GSD_MILESTONE_LOCK bypass
- parallel-worker-monitoring.test.ts: updated spawn args assertion
This commit is contained in:
Jordan Gaytan 2026-03-27 19:10:02 -05:00 committed by GitHub
parent 7c5dae0298
commit 558b1f2081
4 changed files with 91 additions and 15 deletions

View file

@ -26,9 +26,20 @@ export function getPriorSliceCompletionBlocker(
const { milestone: targetMid, slice: targetSid } = parseUnitId(unitId);
if (!targetMid || !targetSid) return null;
// Parallel worker isolation: when GSD_MILESTONE_LOCK is set, this worker
// is scoped to a single milestone. Skip the cross-milestone dependency
// check — other milestones are being handled by their own workers.
// Without this, the dispatch guard sees incomplete slices in M010/M011
// (cloned into the worktree DB) and blocks M012 from ever starting. #2797
const milestoneLock = process.env.GSD_MILESTONE_LOCK;
// Use findMilestoneIds to respect custom queue order.
// Only check milestones that come BEFORE the target in queue order.
const allIds = findMilestoneIds(base);
// When locked to a specific milestone, only check that milestone's
// intra-slice dependencies — skip all cross-milestone checks.
const allIds = milestoneLock && targetMid === milestoneLock
? [targetMid]
: findMilestoneIds(base);
const targetIdx = allIds.indexOf(targetMid);
if (targetIdx < 0) return null;
const milestoneIds = allIds.slice(0, targetIdx + 1);

View file

@ -519,8 +519,19 @@ function createMilestoneWorktree(basePath: string, milestoneId: string): string
/**
* Spawn a worker process for a milestone.
* The worker runs `gsd --print "/gsd auto"` in the milestone's worktree
* The worker runs `gsd headless --json auto` in the milestone's worktree
* with GSD_MILESTONE_LOCK set to isolate state derivation.
*
* IMPORTANT: We use `headless --json auto` instead of `--print "/gsd auto"`.
* --print mode calls session.prompt() which returns immediately after the
* extension command handler fires, because auto-mode's ctx.newSession()
* resets the session and unblocks the outer prompt() await. This causes
* process.exit(0) to fire before any LLM work happens. See #2792.
*
* The headless subcommand uses an RPC client that keeps the process alive
* until auto-mode emits a terminal notification or the idle timer fires.
* It outputs NDJSON events to stdout (with --json), which our
* processWorkerLine() parser already understands.
*/
export function spawnWorker(
basePath: string,
@ -537,7 +548,7 @@ export function spawnWorker(
let child: ChildProcess;
try {
child = spawn(process.execPath, [binPath, "--mode", "json", "--print", "/gsd auto"], {
child = spawn(process.execPath, [binPath, "headless", "--json", "auto"], {
cwd: worker.worktreePath,
env: {
...process.env,
@ -577,9 +588,10 @@ export function spawnWorker(
}
// ── NDJSON stdout monitoring ────────────────────────────────────────
// Workers run with --mode json, emitting one JSON event per line.
// We parse message_end events to extract cost/token usage, keeping
// the coordinator's cost tracking in sync with actual API spend.
// Workers run via `headless --json`, which forwards all RPC events
// as NDJSON to stdout. We parse message_end events to extract
// cost/token usage, keeping the coordinator's cost tracking in sync
// with actual API spend.
if (child.stdout) {
let stdoutBuffer = "";
child.stdout.on("data", (data: Buffer) => {
@ -808,7 +820,12 @@ export async function stopParallel(
} catch { /* process may already be dead */ }
}
const exitedAfterTerm = await waitForWorkerExit(worker, 750);
// Wait for the headless process to cascade SIGTERM to its RPC child.
// The headless signal handler calls client.stop() which sends SIGTERM
// to the RPC child and waits up to 1000ms. The previous 750ms window
// was insufficient — the parent got SIGKILL before the child died,
// leaving orphaned RPC processes holding auto.lock. See #2798.
const exitedAfterTerm = await waitForWorkerExit(worker, 3000);
if (!exitedAfterTerm && worker.pid > 0) {
try {
if (worker.process) {

View file

@ -216,3 +216,50 @@ test("dispatch guard works without git repo", (t) => {
assert.equal(getPriorSliceCompletionBlocker(repo, "main", "plan-slice", "M001/S02"), null);
});
test("dispatch guard skips cross-milestone check when GSD_MILESTONE_LOCK is set (#2797)", (t) => {
const repo = setupRepo();
t.after(() => {
delete process.env.GSD_MILESTONE_LOCK;
teardownRepo(repo);
});
mkdirSync(join(repo, ".gsd", "milestones", "M010"), { recursive: true });
mkdirSync(join(repo, ".gsd", "milestones", "M011"), { recursive: true });
mkdirSync(join(repo, ".gsd", "milestones", "M012"), { recursive: true });
// M010 and M011 have incomplete slices
insertMilestone({ id: "M010", title: "Analytics" });
insertSlice({ id: "S01", milestoneId: "M010", title: "Data Quality", status: "pending", depends: [], sequence: 1 });
insertMilestone({ id: "M011", title: "Builder Onboarding" });
insertSlice({ id: "S01", milestoneId: "M011", title: "Schema", status: "pending", depends: [], sequence: 1 });
insertMilestone({ id: "M012", title: "Shared Components" });
insertSlice({ id: "S01", milestoneId: "M012", title: "Foundation", status: "pending", depends: [], sequence: 1 });
insertSlice({ id: "S02", milestoneId: "M012", title: "Migrate Pages", status: "pending", depends: ["S01"], sequence: 2 });
writeFileSync(join(repo, ".gsd", "milestones", "M010", "M010-ROADMAP.md"), "# M010\n");
writeFileSync(join(repo, ".gsd", "milestones", "M011", "M011-ROADMAP.md"), "# M011\n");
writeFileSync(join(repo, ".gsd", "milestones", "M012", "M012-ROADMAP.md"), "# M012\n");
// Without lock: M012 blocked by M010's incomplete S01
delete process.env.GSD_MILESTONE_LOCK;
assert.match(
getPriorSliceCompletionBlocker(repo, "main", "execute-task", "M012/S01/T01") ?? "",
/earlier slice M010\/S01 is not complete/,
);
// With lock: M012 only checks its own intra-milestone deps — S01 has none, so unblocked
process.env.GSD_MILESTONE_LOCK = "M012";
assert.equal(
getPriorSliceCompletionBlocker(repo, "main", "execute-task", "M012/S01/T01"),
null,
);
// With lock: M012/S02 still blocked by M012/S01 (intra-milestone dep preserved)
assert.equal(
getPriorSliceCompletionBlocker(repo, "main", "execute-task", "M012/S02/T01"),
"Cannot dispatch execute-task M012/S02/T01: dependency slice M012/S01 is not complete.",
);
});

View file

@ -131,14 +131,15 @@ describe("parallel-worker-monitoring", () => {
assert.ok(!(5.01 < ceiling), "5.01 is over ceiling");
});
it("worker spawn args include --mode json", () => {
// Verify the spawn command includes JSON mode for NDJSON output.
// We can't easily test the actual spawn, but we verify the args pattern.
const expectedArgs = ["--mode", "json", "--print", "/gsd auto"];
assert.ok(expectedArgs.includes("--mode"), "args include --mode");
assert.ok(expectedArgs.includes("json"), "args include json");
assert.ok(expectedArgs.indexOf("--mode") < expectedArgs.indexOf("json"),
"--mode comes before json");
it("worker spawn args use headless --json auto (#2792)", () => {
// Verify the spawn command uses headless mode (not --print which exits
// before auto-mode can run). See #2792.
const expectedArgs = ["headless", "--json", "auto"];
assert.ok(expectedArgs.includes("headless"), "args include headless");
assert.ok(expectedArgs.includes("--json"), "args include --json");
assert.ok(expectedArgs.includes("auto"), "args include auto");
assert.ok(expectedArgs.indexOf("headless") < expectedArgs.indexOf("auto"),
"headless comes before auto");
});
it("refreshWorkerStatuses restores persisted workers from disk", () => {