fix: prevent gsd next from self-killing via stale crash lock (#2784)

* fix: prevent gsd next from self-killing via stale crash lock

cleanupAfterLoopExit() did not clear the crash lock or session lock.
On the next `/gsd next`, checkRemoteAutoSession() read the stale lock
with the current PID, reported it as a "remote" session, and
stopAutoRemote() sent SIGTERM to the current process.

Three fixes:
1. cleanupAfterLoopExit() now clears crash lock and releases session lock
2. checkRemoteAutoSession() returns { running: false } for own PID
3. stopAutoRemote() guards against self-kill for own PID

Closes #2730

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

* test: add regression tests for stale lock self-kill prevention

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
TÂCHES 2026-03-26 19:59:32 -06:00 committed by GitHub
parent 474d097c34
commit 4fe4dbf456
2 changed files with 108 additions and 0 deletions

View file

@ -414,6 +414,13 @@ export function stopAutoRemote(projectRoot: string): {
const lock = readCrashLock(projectRoot);
if (!lock) return { found: false };
// Never SIGTERM ourselves — a stale lock with our own PID is not a remote
// session, it is leftover from a prior loop exit in this process. (#2730)
if (lock.pid === process.pid) {
clearLock(projectRoot);
return { found: false };
}
if (!isLockProcessAlive(lock)) {
// Stale lock — clean it up
clearLock(projectRoot);
@ -445,6 +452,10 @@ export function checkRemoteAutoSession(projectRoot: string): {
const lock = readCrashLock(projectRoot);
if (!lock) return { running: false };
// Our own PID is not a "remote" session — it is a stale lock left by this
// process (e.g. after step-mode exit without full cleanup). (#2730)
if (lock.pid === process.pid) return { running: false };
if (!isLockProcessAlive(lock)) {
// Stale lock from a dead process — not a live remote session
return { running: false };
@ -548,6 +559,16 @@ function cleanupAfterLoopExit(ctx: ExtensionContext): void {
s.active = false;
clearUnitTimeout();
// Clear crash lock and release session lock so the next `/gsd next` does
// not see a stale lock with the current PID and treat it as a "remote"
// session (which would cause it to SIGTERM itself). (#2730)
try {
if (lockBase()) clearLock(lockBase());
if (lockBase()) releaseSessionLock(lockBase());
} catch {
/* best-effort — mirror stopAuto cleanup */
}
ctx.ui.setStatus("gsd-auto", undefined);
ctx.ui.setWidget("gsd-progress", undefined);
ctx.ui.setFooter(undefined);

View file

@ -0,0 +1,87 @@
import test from "node:test";
import assert from "node:assert/strict";
import { mkdirSync, mkdtempSync, writeFileSync, existsSync, rmSync } from "node:fs";
import { join } from "node:path";
import { tmpdir } from "node:os";
import { writeLock, readCrashLock, clearLock } from "../crash-recovery.ts";
import { checkRemoteAutoSession, stopAutoRemote } from "../auto.ts";
function makeTmpProject(): string {
const dir = mkdtempSync(join(tmpdir(), "gsd-stale-lock-test-"));
mkdirSync(join(dir, ".gsd"), { recursive: true });
return dir;
}
// ─── checkRemoteAutoSession: own-PID filtering (#2730) ───────────────────
test("#2730: checkRemoteAutoSession returns { running: false } when lock PID matches current process", (t) => {
const dir = makeTmpProject();
t.after(() => rmSync(dir, { recursive: true, force: true }));
// Write a lock with the current process PID — simulates a stale lock
// left behind after step-mode exit without full cleanup.
writeLock(dir, "execute-task", "M001/S01/T01");
const lock = readCrashLock(dir);
assert.ok(lock, "lock file should exist");
assert.equal(lock!.pid, process.pid, "lock should have our PID");
const result = checkRemoteAutoSession(dir);
assert.equal(result.running, false, "own PID must not be treated as a remote session");
});
test("#2730: checkRemoteAutoSession still detects a genuine remote session (different PID)", (t) => {
const dir = makeTmpProject();
t.after(() => rmSync(dir, { recursive: true, force: true }));
// Use parent PID — guaranteed alive, guaranteed not our PID.
const remotePid = process.ppid;
const lockData = {
pid: remotePid,
startedAt: new Date().toISOString(),
unitType: "execute-task",
unitId: "M001/S01/T02",
unitStartedAt: new Date().toISOString(),
};
writeFileSync(join(dir, ".gsd", "auto.lock"), JSON.stringify(lockData, null, 2));
const result = checkRemoteAutoSession(dir);
assert.equal(result.running, true, "different live PID should be detected as running");
assert.equal(result.pid, remotePid);
});
// ─── stopAutoRemote: self-kill prevention (#2730) ────────────────────────
test("#2730: stopAutoRemote does not send SIGTERM when lock PID matches current process", (t) => {
const dir = makeTmpProject();
t.after(() => rmSync(dir, { recursive: true, force: true }));
// Write a lock with our own PID
writeLock(dir, "execute-task", "M001/S01/T01");
const result = stopAutoRemote(dir);
assert.equal(result.found, false, "own PID must not be signalled");
// The lock should be cleared as part of the self-detection cleanup
assert.ok(!existsSync(join(dir, ".gsd", "auto.lock")), "stale self-lock should be cleared");
});
test("#2730: stopAutoRemote clears stale lock from dead remote process without error", (t) => {
const dir = makeTmpProject();
t.after(() => rmSync(dir, { recursive: true, force: true }));
// Simulate a stale lock from a process that no longer exists
const lockData = {
pid: 9999999,
startedAt: "2026-03-01T00:00:00Z",
unitType: "plan-slice",
unitId: "M001/S02",
unitStartedAt: "2026-03-01T00:05:00Z",
};
writeFileSync(join(dir, ".gsd", "auto.lock"), JSON.stringify(lockData, null, 2));
const result = stopAutoRemote(dir);
assert.equal(result.found, false, "dead remote PID should not be reported as found");
assert.ok(!existsSync(join(dir, ".gsd", "auto.lock")), "stale lock should be cleaned up");
});