fix(parallel): three bugs preventing reliable parallel worker execution (#2801)
Bug 1 — Workers exit immediately (#2792): spawnWorker() used `--print "/gsd auto"`, which calls session.prompt(); that call returns immediately when ctx.newSession() resets the session inside the auto-loop, so the worker exits before doing any work. Changed to `headless --json auto`, which uses an RPC client that keeps the process alive until auto-mode completes.

Bug 2 — Dispatch guard blocks parallel workers (#2797): getPriorSliceCompletionBlocker() checked ALL milestones in queue order, blocking M012 whenever M011 had incomplete slices. When GSD_MILESTONE_LOCK is set, the guard now only checks intra-milestone slice dependencies. Added a test covering the cross-milestone bypass and intra-milestone preservation.

Bug 3 — Orphaned RPC children on stop (#2798): stopParallel() gave workers only 750ms after SIGTERM before escalating to SIGKILL. The headless parent needs ~1500ms to cascade shutdown to its RPC child via client.stop(). Increased the window to 3000ms to prevent orphaned processes holding auto.lock.

Updated tests:
- dispatch-guard.test.ts: new test for the GSD_MILESTONE_LOCK bypass
- parallel-worker-monitoring.test.ts: updated spawn args assertion
parent 7c5dae0298
commit 558b1f2081
4 changed files with 91 additions and 15 deletions
@@ -26,9 +26,20 @@ export function getPriorSliceCompletionBlocker(
   const { milestone: targetMid, slice: targetSid } = parseUnitId(unitId);
   if (!targetMid || !targetSid) return null;
 
+  // Parallel worker isolation: when GSD_MILESTONE_LOCK is set, this worker
+  // is scoped to a single milestone. Skip the cross-milestone dependency
+  // check — other milestones are being handled by their own workers.
+  // Without this, the dispatch guard sees incomplete slices in M010/M011
+  // (cloned into the worktree DB) and blocks M012 from ever starting. #2797
+  const milestoneLock = process.env.GSD_MILESTONE_LOCK;
+
-  // Use findMilestoneIds to respect custom queue order.
-  // Only check milestones that come BEFORE the target in queue order.
-  const allIds = findMilestoneIds(base);
+  // When locked to a specific milestone, only check that milestone's
+  // intra-slice dependencies — skip all cross-milestone checks.
+  const allIds = milestoneLock && targetMid === milestoneLock
+    ? [targetMid]
+    : findMilestoneIds(base);
   const targetIdx = allIds.indexOf(targetMid);
   if (targetIdx < 0) return null;
   const milestoneIds = allIds.slice(0, targetIdx + 1);
@@ -519,8 +519,19 @@ function createMilestoneWorktree(basePath: string, milestoneId: string): string
 
 /**
  * Spawn a worker process for a milestone.
- * The worker runs `gsd --print "/gsd auto"` in the milestone's worktree
+ * The worker runs `gsd headless --json auto` in the milestone's worktree
  * with GSD_MILESTONE_LOCK set to isolate state derivation.
+ *
+ * IMPORTANT: We use `headless --json auto` instead of `--print "/gsd auto"`.
+ * --print mode calls session.prompt() which returns immediately after the
+ * extension command handler fires, because auto-mode's ctx.newSession()
+ * resets the session and unblocks the outer prompt() await. This causes
+ * process.exit(0) to fire before any LLM work happens. See #2792.
+ *
+ * The headless subcommand uses an RPC client that keeps the process alive
+ * until auto-mode emits a terminal notification or the idle timer fires.
+ * It outputs NDJSON events to stdout (with --json), which our
+ * processWorkerLine() parser already understands.
  */
 export function spawnWorker(
   basePath: string,
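The lifecycle difference described in that doc comment comes down to what keeps the Node.js event loop busy: `--print` resolves its single prompt() and exits, while headless mode holds a pending wait until a terminal notification or an idle timeout. A minimal sketch of that keep-alive pattern follows; AutoEvent, the "auto_complete" event name, and the 60s idle window are illustrative assumptions, not the actual gsd implementation.

// Sketch only: keep the worker process alive until a terminal notification
// arrives or no events arrive for an idle window. The pending timer and
// unresolved promise are what prevent an early process.exit(0).
import { EventEmitter } from "node:events";

type AutoEvent = { type: string }; // assumed event shape

const IDLE_MS = 60_000; // assumed idle window

function waitForCompletion(events: EventEmitter): Promise<void> {
  return new Promise((resolve) => {
    let idleTimer = setTimeout(resolve, IDLE_MS);
    events.on("event", (ev: AutoEvent) => {
      clearTimeout(idleTimer);
      if (ev.type === "auto_complete") { // assumed terminal event name
        resolve();
        return;
      }
      idleTimer = setTimeout(resolve, IDLE_MS);
    });
  });
}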
@@ -537,7 +548,7 @@ export function spawnWorker(
 
   let child: ChildProcess;
   try {
-    child = spawn(process.execPath, [binPath, "--mode", "json", "--print", "/gsd auto"], {
+    child = spawn(process.execPath, [binPath, "headless", "--json", "auto"], {
      cwd: worker.worktreePath,
      env: {
        ...process.env,
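For context, the milestone lock that the dispatch-guard change reads is handed to each worker through this spawn environment. A minimal sketch of that shape; the workerEnv() helper and its milestoneId parameter are illustrative names, and the diff confirms only that GSD_MILESTONE_LOCK is set for the worker.

// Sketch only: the spawn environment for a worker locked to one milestone.
// workerEnv/milestoneId are illustrative; the diff confirms only that
// GSD_MILESTONE_LOCK is present in the worker's environment.
function workerEnv(milestoneId: string): NodeJS.ProcessEnv {
  return {
    ...process.env,
    GSD_MILESTONE_LOCK: milestoneId, // e.g. "M012"
  };
}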
@ -577,9 +588,10 @@ export function spawnWorker(
|
|||
}
|
||||
|
||||
// ── NDJSON stdout monitoring ────────────────────────────────────────
|
||||
// Workers run with --mode json, emitting one JSON event per line.
|
||||
// We parse message_end events to extract cost/token usage, keeping
|
||||
// the coordinator's cost tracking in sync with actual API spend.
|
||||
// Workers run via `headless --json`, which forwards all RPC events
|
||||
// as NDJSON to stdout. We parse message_end events to extract
|
||||
// cost/token usage, keeping the coordinator's cost tracking in sync
|
||||
// with actual API spend.
|
||||
if (child.stdout) {
|
||||
let stdoutBuffer = "";
|
||||
child.stdout.on("data", (data: Buffer) => {
|
||||
|
|
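The monitoring comment above relies on line-buffered NDJSON parsing of the worker's stdout. A minimal sketch of that pattern, assuming each event carries a `type` field with "message_end" as one value; the real processWorkerLine() and the exact usage fields may differ.

// Sketch only: buffer stdout chunks, split on newlines, parse each complete
// line as JSON, and pick out message_end events. Field names are assumptions.
type WorkerEvent = { type?: string; [key: string]: unknown };

let stdoutBuffer = "";

function onStdoutChunk(chunk: Buffer, onMessageEnd: (event: WorkerEvent) => void): void {
  stdoutBuffer += chunk.toString("utf8");
  const lines = stdoutBuffer.split("\n");
  stdoutBuffer = lines.pop() ?? ""; // keep the trailing partial line buffered
  for (const line of lines) {
    if (!line.trim()) continue;
    let event: WorkerEvent;
    try {
      event = JSON.parse(line) as WorkerEvent;
    } catch {
      continue; // ignore non-JSON noise on stdout
    }
    if (event.type === "message_end") onMessageEnd(event);
  }
}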
@@ -808,7 +820,12 @@ export async function stopParallel(
      } catch { /* process may already be dead */ }
    }
 
-    const exitedAfterTerm = await waitForWorkerExit(worker, 750);
+    // Wait for the headless process to cascade SIGTERM to its RPC child.
+    // The headless signal handler calls client.stop() which sends SIGTERM
+    // to the RPC child and waits up to 1000ms. The previous 750ms window
+    // was insufficient — the parent got SIGKILL before the child died,
+    // leaving orphaned RPC processes holding auto.lock. See #2798.
+    const exitedAfterTerm = await waitForWorkerExit(worker, 3000);
    if (!exitedAfterTerm && worker.pid > 0) {
      try {
        if (worker.process) {
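The timing change is the standard SIGTERM-then-SIGKILL escalation, with the grace window sized to outlast the parent's own shutdown cascade. A minimal sketch of that escalation, using a local waitForExit() as a stand-in for the real waitForWorkerExit().

import type { ChildProcess } from "node:child_process";

// Resolve true if the child exits within `ms`, false otherwise.
function waitForExit(child: ChildProcess, ms: number): Promise<boolean> {
  return new Promise((resolve) => {
    if (child.exitCode !== null || child.signalCode !== null) {
      resolve(true);
      return;
    }
    const timer = setTimeout(() => resolve(false), ms);
    child.once("exit", () => {
      clearTimeout(timer);
      resolve(true);
    });
  });
}

// SIGTERM first, then a grace window long enough for the headless parent to
// cascade shutdown to its RPC child (3000ms per #2798), then SIGKILL.
async function stopWorker(child: ChildProcess): Promise<void> {
  child.kill("SIGTERM");
  const exited = await waitForExit(child, 3000);
  if (!exited) child.kill("SIGKILL");
}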
@@ -216,3 +216,50 @@ test("dispatch guard works without git repo", (t) => {
 
   assert.equal(getPriorSliceCompletionBlocker(repo, "main", "plan-slice", "M001/S02"), null);
 });
+
+test("dispatch guard skips cross-milestone check when GSD_MILESTONE_LOCK is set (#2797)", (t) => {
+  const repo = setupRepo();
+  t.after(() => {
+    delete process.env.GSD_MILESTONE_LOCK;
+    teardownRepo(repo);
+  });
+
+  mkdirSync(join(repo, ".gsd", "milestones", "M010"), { recursive: true });
+  mkdirSync(join(repo, ".gsd", "milestones", "M011"), { recursive: true });
+  mkdirSync(join(repo, ".gsd", "milestones", "M012"), { recursive: true });
+
+  // M010 and M011 have incomplete slices
+  insertMilestone({ id: "M010", title: "Analytics" });
+  insertSlice({ id: "S01", milestoneId: "M010", title: "Data Quality", status: "pending", depends: [], sequence: 1 });
+
+  insertMilestone({ id: "M011", title: "Builder Onboarding" });
+  insertSlice({ id: "S01", milestoneId: "M011", title: "Schema", status: "pending", depends: [], sequence: 1 });
+
+  insertMilestone({ id: "M012", title: "Shared Components" });
+  insertSlice({ id: "S01", milestoneId: "M012", title: "Foundation", status: "pending", depends: [], sequence: 1 });
+  insertSlice({ id: "S02", milestoneId: "M012", title: "Migrate Pages", status: "pending", depends: ["S01"], sequence: 2 });
+
+  writeFileSync(join(repo, ".gsd", "milestones", "M010", "M010-ROADMAP.md"), "# M010\n");
+  writeFileSync(join(repo, ".gsd", "milestones", "M011", "M011-ROADMAP.md"), "# M011\n");
+  writeFileSync(join(repo, ".gsd", "milestones", "M012", "M012-ROADMAP.md"), "# M012\n");
+
+  // Without lock: M012 blocked by M010's incomplete S01
+  delete process.env.GSD_MILESTONE_LOCK;
+  assert.match(
+    getPriorSliceCompletionBlocker(repo, "main", "execute-task", "M012/S01/T01") ?? "",
+    /earlier slice M010\/S01 is not complete/,
+  );
+
+  // With lock: M012 only checks its own intra-milestone deps — S01 has none, so unblocked
+  process.env.GSD_MILESTONE_LOCK = "M012";
+  assert.equal(
+    getPriorSliceCompletionBlocker(repo, "main", "execute-task", "M012/S01/T01"),
+    null,
+  );
+
+  // With lock: M012/S02 still blocked by M012/S01 (intra-milestone dep preserved)
+  assert.equal(
+    getPriorSliceCompletionBlocker(repo, "main", "execute-task", "M012/S02/T01"),
+    "Cannot dispatch execute-task M012/S02/T01: dependency slice M012/S01 is not complete.",
+  );
+});
@@ -131,14 +131,15 @@ describe("parallel-worker-monitoring", () => {
     assert.ok(!(5.01 < ceiling), "5.01 is over ceiling");
   });
 
-  it("worker spawn args include --mode json", () => {
-    // Verify the spawn command includes JSON mode for NDJSON output.
-    // We can't easily test the actual spawn, but we verify the args pattern.
-    const expectedArgs = ["--mode", "json", "--print", "/gsd auto"];
-    assert.ok(expectedArgs.includes("--mode"), "args include --mode");
-    assert.ok(expectedArgs.includes("json"), "args include json");
-    assert.ok(expectedArgs.indexOf("--mode") < expectedArgs.indexOf("json"),
-      "--mode comes before json");
+  it("worker spawn args use headless --json auto (#2792)", () => {
+    // Verify the spawn command uses headless mode (not --print which exits
+    // before auto-mode can run). See #2792.
+    const expectedArgs = ["headless", "--json", "auto"];
+    assert.ok(expectedArgs.includes("headless"), "args include headless");
+    assert.ok(expectedArgs.includes("--json"), "args include --json");
+    assert.ok(expectedArgs.includes("auto"), "args include auto");
+    assert.ok(expectedArgs.indexOf("headless") < expectedArgs.indexOf("auto"),
+      "headless comes before auto");
   });
 
   it("refreshWorkerStatuses restores persisted workers from disk", () => {