fix: clarify session lock loss diagnostics (#1535)

This commit is contained in:
Jeremy McSpadden 2026-03-19 22:13:38 -05:00 committed by GitHub
parent df76eea764
commit 2fcbb40c09
5 changed files with 171 additions and 18 deletions

View file

@ -15,6 +15,7 @@ import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent";
import type { AutoSession } from "./auto/session.js";
import { NEW_SESSION_TIMEOUT_MS } from "./auto/session.js";
import type { GSDPreferences } from "./preferences.js";
import type { SessionLockStatus } from "./session-lock.js";
import type { GSDState } from "./types.js";
import type { CloseoutOptions } from "./auto-unit-closeout.js";
import type { PostUnitContext } from "./auto-post-unit.js";
@ -307,7 +308,7 @@ export interface LoopDeps {
checkResourcesStale: (version: string | null) => string | null;
// Session lock
validateSessionLock: (basePath: string) => boolean;
validateSessionLock: (basePath: string) => SessionLockStatus;
updateSessionLock: (
basePath: string,
unitType: string,
@ -315,7 +316,10 @@ export interface LoopDeps {
completedUnits: number,
sessionFile?: string,
) => void;
handleLostSessionLock: (ctx?: ExtensionContext) => void;
handleLostSessionLock: (
ctx?: ExtensionContext,
lockStatus?: SessionLockStatus,
) => void;
// Milestone transition functions
sendDesktopNotification: (
@ -559,10 +563,24 @@ export async function autoLoop(
try {
// ── Blanket try/catch: one bad iteration must not kill the session
if (deps.lockBase() && !deps.validateSessionLock(deps.lockBase())) {
deps.handleLostSessionLock(ctx);
debugLog("autoLoop", { phase: "exit", reason: "session-lock-lost" });
break;
const sessionLockBase = deps.lockBase();
if (sessionLockBase) {
const lockStatus = deps.validateSessionLock(sessionLockBase);
if (!lockStatus.valid) {
debugLog("autoLoop", {
phase: "session-lock-invalid",
reason: lockStatus.failureReason ?? "unknown",
existingPid: lockStatus.existingPid,
expectedPid: lockStatus.expectedPid,
});
deps.handleLostSessionLock(ctx, lockStatus);
debugLog("autoLoop", {
phase: "exit",
reason: "session-lock-lost",
detail: lockStatus.failureReason ?? "unknown",
});
break;
}
}
// ── Phase 1: Pre-dispatch ───────────────────────────────────────────

View file

@ -47,10 +47,11 @@ import {
} from "./crash-recovery.js";
import {
acquireSessionLock,
validateSessionLock,
getSessionLockStatus,
releaseSessionLock,
updateSessionLock,
} from "./session-lock.js";
import type { SessionLockStatus } from "./session-lock.js";
import {
clearUnitRuntimeRecord,
inspectExecuteTaskDurability,
@ -461,15 +462,33 @@ function buildSnapshotOpts(
};
}
function handleLostSessionLock(ctx?: ExtensionContext): void {
debugLog("session-lock-lost", { lockBase: lockBase() });
function handleLostSessionLock(
ctx?: ExtensionContext,
lockStatus?: SessionLockStatus,
): void {
debugLog("session-lock-lost", {
lockBase: lockBase(),
reason: lockStatus?.failureReason,
existingPid: lockStatus?.existingPid,
expectedPid: lockStatus?.expectedPid,
});
s.active = false;
s.paused = false;
clearUnitTimeout();
deregisterSigtermHandler();
clearCmuxSidebar(loadEffectiveGSDPreferences()?.preferences);
const message =
lockStatus?.failureReason === "pid-mismatch"
? lockStatus.existingPid
? `Session lock moved to PID ${lockStatus.existingPid} — another GSD process appears to have taken over. Stopping gracefully.`
: "Session lock moved to a different process — another GSD process appears to have taken over. Stopping gracefully."
: lockStatus?.failureReason === "missing-metadata"
? "Session lock metadata disappeared, so ownership could not be confirmed. Stopping gracefully."
: lockStatus?.failureReason === "compromised"
? "Session lock was compromised or invalidated during heartbeat checks; takeover was not confirmed. Stopping gracefully."
: "Session lock lost. Stopping gracefully.";
ctx?.ui.notify(
"Session lock lost — another GSD process appears to have taken over. Stopping gracefully.",
message,
"error",
);
ctx?.ui.setStatus("gsd-auto", undefined);
@ -736,7 +755,7 @@ function buildLoopDeps(): LoopDeps {
checkResourcesStale,
// Session lock
validateSessionLock,
validateSessionLock: getSessionLockStatus,
updateSessionLock,
handleLostSessionLock,

View file

@ -40,6 +40,19 @@ export type SessionLockResult =
| { acquired: true }
| { acquired: false; reason: string; existingPid?: number };
export type SessionLockFailureReason =
| "compromised"
| "missing-metadata"
| "pid-mismatch";
export interface SessionLockStatus {
valid: boolean;
failureReason?: SessionLockFailureReason;
existingPid?: number;
expectedPid?: number;
recovered?: boolean;
}
// ─── Module State ───────────────────────────────────────────────────────────
/** Release function from proper-lockfile — calling it releases the OS lock. */
@ -368,7 +381,7 @@ export function updateSessionLock(
*
* This is called periodically during the dispatch loop.
*/
export function validateSessionLock(basePath: string): boolean {
export function getSessionLockStatus(basePath: string): SessionLockStatus {
// Lock was compromised by proper-lockfile (mtime drift from sleep, stall, etc.)
if (_lockCompromised) {
// Recovery gate (#1512): Before declaring the lock lost, check if the lock
@ -385,18 +398,23 @@ export function validateSessionLock(basePath: string): boolean {
process.stderr.write(
`[gsd] Lock recovered after onCompromised — lock file PID matched, re-acquired.\n`,
);
return true;
return { valid: true, recovered: true };
}
} catch {
// Re-acquisition failed — fall through to return false
}
}
return false;
return {
valid: false,
failureReason: "compromised",
existingPid: existing?.pid,
expectedPid: process.pid,
};
}
// If we have an OS-level lock, we're still the owner
if (_releaseFunction && _lockedPath === basePath) {
return true;
return { valid: true };
}
// Fallback: check the lock file PID
@ -404,10 +422,27 @@ export function validateSessionLock(basePath: string): boolean {
const existing = readExistingLockData(lp);
if (!existing) {
// Lock file was deleted — we lost ownership
return false;
return {
valid: false,
failureReason: "missing-metadata",
expectedPid: process.pid,
};
}
return existing.pid === process.pid;
if (existing.pid !== process.pid) {
return {
valid: false,
failureReason: "pid-mismatch",
existingPid: existing.pid,
expectedPid: process.pid,
};
}
return { valid: true };
}
export function validateSessionLock(basePath: string): boolean {
return getSessionLockStatus(basePath).valid;
}
/**

View file

@ -14,6 +14,7 @@ import {
type AgentEndEvent,
type LoopDeps,
} from "../auto-loop.js";
import type { SessionLockStatus } from "../session-lock.js";
// ─── Helpers ─────────────────────────────────────────────────────────────────
@ -341,7 +342,7 @@ function makeMockDeps(
preDispatchHealthGate: async () => ({ proceed: true, fixesApplied: [] }),
syncProjectRootToWorktree: () => {},
checkResourcesStale: () => null,
validateSessionLock: () => true,
validateSessionLock: () => ({ valid: true } as SessionLockStatus),
updateSessionLock: () => {
callLog.push("updateSessionLock");
},
@ -532,6 +533,41 @@ test("autoLoop exits on terminal complete state", async (t) => {
);
});
test("autoLoop passes structured session-lock failure details to the handler", async () => {
_resetPendingResolve();
const ctx = makeMockCtx();
ctx.ui.setStatus = () => {};
const pi = makeMockPi();
const s = makeLoopSession();
let observedLockStatus: SessionLockStatus | undefined;
const deps = makeMockDeps({
validateSessionLock: () =>
({
valid: false,
failureReason: "compromised",
expectedPid: process.pid,
}) as SessionLockStatus,
handleLostSessionLock: (_ctx, lockStatus) => {
observedLockStatus = lockStatus;
deps.callLog.push("handleLostSessionLock");
},
});
await autoLoop(ctx, pi, s, deps);
assert.deepEqual(observedLockStatus, {
valid: false,
failureReason: "compromised",
expectedPid: process.pid,
});
assert.ok(
!deps.callLog.includes("resolveDispatch"),
"should stop before dispatch after lock validation fails",
);
});
test("autoLoop exits on terminal blocked state", async (t) => {
_resetPendingResolve();

View file

@ -17,6 +17,7 @@ import { tmpdir } from 'node:os';
import {
acquireSessionLock,
getSessionLockStatus,
validateSessionLock,
releaseSessionLock,
readSessionLockData,
@ -201,6 +202,50 @@ async function main(): Promise<void> {
}
}
// ─── 7b. getSessionLockStatus with missing metadata → reason surfaced ──
console.log('\n=== 7b. missing lock metadata → structured reason ===');
{
const base = mkdtempSync(join(tmpdir(), 'gsd-session-lock-'));
mkdirSync(join(base, '.gsd'), { recursive: true });
try {
const status = getSessionLockStatus(base);
assertEq(status.valid, false, 'missing lock metadata is invalid');
assertEq(status.failureReason, 'missing-metadata', 'missing metadata reason is surfaced');
assertEq(status.expectedPid, process.pid, 'expected PID is included');
} finally {
rmSync(base, { recursive: true, force: true });
}
}
// ─── 7c. getSessionLockStatus with foreign PID → reason surfaced ───────
console.log('\n=== 7c. foreign PID in lock file → structured reason ===');
{
const base = mkdtempSync(join(tmpdir(), 'gsd-session-lock-'));
mkdirSync(join(base, '.gsd'), { recursive: true });
try {
const foreignPid = process.pid + 1000;
const lockFile = join(gsdRoot(base), 'auto.lock');
writeFileSync(lockFile, JSON.stringify({
pid: foreignPid,
startedAt: new Date().toISOString(),
unitType: 'execute-task',
unitId: 'M001/S01/T01',
unitStartedAt: new Date().toISOString(),
completedUnits: 0,
}, null, 2));
const status = getSessionLockStatus(base);
assertEq(status.valid, false, 'foreign PID lock is invalid');
assertEq(status.failureReason, 'pid-mismatch', 'PID mismatch reason is surfaced');
assertEq(status.existingPid, foreignPid, 'existing PID is included');
assertEq(status.expectedPid, process.pid, 'expected PID is included');
} finally {
rmSync(base, { recursive: true, force: true });
}
}
// ─── 8. Acquire after release is possible ─────────────────────────────
console.log('\n=== 8. acquire after release → re-acquirable ===');
{