fix: clarify session lock loss diagnostics (#1535)
This commit is contained in:
parent
df76eea764
commit
2fcbb40c09
5 changed files with 171 additions and 18 deletions
|
|
@ -15,6 +15,7 @@ import type { ExtensionAPI, ExtensionContext } from "@gsd/pi-coding-agent";
|
|||
import type { AutoSession } from "./auto/session.js";
|
||||
import { NEW_SESSION_TIMEOUT_MS } from "./auto/session.js";
|
||||
import type { GSDPreferences } from "./preferences.js";
|
||||
import type { SessionLockStatus } from "./session-lock.js";
|
||||
import type { GSDState } from "./types.js";
|
||||
import type { CloseoutOptions } from "./auto-unit-closeout.js";
|
||||
import type { PostUnitContext } from "./auto-post-unit.js";
|
||||
|
|
@ -307,7 +308,7 @@ export interface LoopDeps {
|
|||
checkResourcesStale: (version: string | null) => string | null;
|
||||
|
||||
// Session lock
|
||||
validateSessionLock: (basePath: string) => boolean;
|
||||
validateSessionLock: (basePath: string) => SessionLockStatus;
|
||||
updateSessionLock: (
|
||||
basePath: string,
|
||||
unitType: string,
|
||||
|
|
@ -315,7 +316,10 @@ export interface LoopDeps {
|
|||
completedUnits: number,
|
||||
sessionFile?: string,
|
||||
) => void;
|
||||
handleLostSessionLock: (ctx?: ExtensionContext) => void;
|
||||
handleLostSessionLock: (
|
||||
ctx?: ExtensionContext,
|
||||
lockStatus?: SessionLockStatus,
|
||||
) => void;
|
||||
|
||||
// Milestone transition functions
|
||||
sendDesktopNotification: (
|
||||
|
|
@ -559,10 +563,24 @@ export async function autoLoop(
|
|||
try {
|
||||
// ── Blanket try/catch: one bad iteration must not kill the session
|
||||
|
||||
if (deps.lockBase() && !deps.validateSessionLock(deps.lockBase())) {
|
||||
deps.handleLostSessionLock(ctx);
|
||||
debugLog("autoLoop", { phase: "exit", reason: "session-lock-lost" });
|
||||
break;
|
||||
const sessionLockBase = deps.lockBase();
|
||||
if (sessionLockBase) {
|
||||
const lockStatus = deps.validateSessionLock(sessionLockBase);
|
||||
if (!lockStatus.valid) {
|
||||
debugLog("autoLoop", {
|
||||
phase: "session-lock-invalid",
|
||||
reason: lockStatus.failureReason ?? "unknown",
|
||||
existingPid: lockStatus.existingPid,
|
||||
expectedPid: lockStatus.expectedPid,
|
||||
});
|
||||
deps.handleLostSessionLock(ctx, lockStatus);
|
||||
debugLog("autoLoop", {
|
||||
phase: "exit",
|
||||
reason: "session-lock-lost",
|
||||
detail: lockStatus.failureReason ?? "unknown",
|
||||
});
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
// ── Phase 1: Pre-dispatch ───────────────────────────────────────────
|
||||
|
|
|
|||
|
|
@ -47,10 +47,11 @@ import {
|
|||
} from "./crash-recovery.js";
|
||||
import {
|
||||
acquireSessionLock,
|
||||
validateSessionLock,
|
||||
getSessionLockStatus,
|
||||
releaseSessionLock,
|
||||
updateSessionLock,
|
||||
} from "./session-lock.js";
|
||||
import type { SessionLockStatus } from "./session-lock.js";
|
||||
import {
|
||||
clearUnitRuntimeRecord,
|
||||
inspectExecuteTaskDurability,
|
||||
|
|
@ -461,15 +462,33 @@ function buildSnapshotOpts(
|
|||
};
|
||||
}
|
||||
|
||||
function handleLostSessionLock(ctx?: ExtensionContext): void {
|
||||
debugLog("session-lock-lost", { lockBase: lockBase() });
|
||||
function handleLostSessionLock(
|
||||
ctx?: ExtensionContext,
|
||||
lockStatus?: SessionLockStatus,
|
||||
): void {
|
||||
debugLog("session-lock-lost", {
|
||||
lockBase: lockBase(),
|
||||
reason: lockStatus?.failureReason,
|
||||
existingPid: lockStatus?.existingPid,
|
||||
expectedPid: lockStatus?.expectedPid,
|
||||
});
|
||||
s.active = false;
|
||||
s.paused = false;
|
||||
clearUnitTimeout();
|
||||
deregisterSigtermHandler();
|
||||
clearCmuxSidebar(loadEffectiveGSDPreferences()?.preferences);
|
||||
const message =
|
||||
lockStatus?.failureReason === "pid-mismatch"
|
||||
? lockStatus.existingPid
|
||||
? `Session lock moved to PID ${lockStatus.existingPid} — another GSD process appears to have taken over. Stopping gracefully.`
|
||||
: "Session lock moved to a different process — another GSD process appears to have taken over. Stopping gracefully."
|
||||
: lockStatus?.failureReason === "missing-metadata"
|
||||
? "Session lock metadata disappeared, so ownership could not be confirmed. Stopping gracefully."
|
||||
: lockStatus?.failureReason === "compromised"
|
||||
? "Session lock was compromised or invalidated during heartbeat checks; takeover was not confirmed. Stopping gracefully."
|
||||
: "Session lock lost. Stopping gracefully.";
|
||||
ctx?.ui.notify(
|
||||
"Session lock lost — another GSD process appears to have taken over. Stopping gracefully.",
|
||||
message,
|
||||
"error",
|
||||
);
|
||||
ctx?.ui.setStatus("gsd-auto", undefined);
|
||||
|
|
@ -736,7 +755,7 @@ function buildLoopDeps(): LoopDeps {
|
|||
checkResourcesStale,
|
||||
|
||||
// Session lock
|
||||
validateSessionLock,
|
||||
validateSessionLock: getSessionLockStatus,
|
||||
updateSessionLock,
|
||||
handleLostSessionLock,
|
||||
|
||||
|
|
|
|||
|
|
@ -40,6 +40,19 @@ export type SessionLockResult =
|
|||
| { acquired: true }
|
||||
| { acquired: false; reason: string; existingPid?: number };
|
||||
|
||||
export type SessionLockFailureReason =
|
||||
| "compromised"
|
||||
| "missing-metadata"
|
||||
| "pid-mismatch";
|
||||
|
||||
export interface SessionLockStatus {
|
||||
valid: boolean;
|
||||
failureReason?: SessionLockFailureReason;
|
||||
existingPid?: number;
|
||||
expectedPid?: number;
|
||||
recovered?: boolean;
|
||||
}
|
||||
|
||||
// ─── Module State ───────────────────────────────────────────────────────────
|
||||
|
||||
/** Release function from proper-lockfile — calling it releases the OS lock. */
|
||||
|
|
@ -368,7 +381,7 @@ export function updateSessionLock(
|
|||
*
|
||||
* This is called periodically during the dispatch loop.
|
||||
*/
|
||||
export function validateSessionLock(basePath: string): boolean {
|
||||
export function getSessionLockStatus(basePath: string): SessionLockStatus {
|
||||
// Lock was compromised by proper-lockfile (mtime drift from sleep, stall, etc.)
|
||||
if (_lockCompromised) {
|
||||
// Recovery gate (#1512): Before declaring the lock lost, check if the lock
|
||||
|
|
@ -385,18 +398,23 @@ export function validateSessionLock(basePath: string): boolean {
|
|||
process.stderr.write(
|
||||
`[gsd] Lock recovered after onCompromised — lock file PID matched, re-acquired.\n`,
|
||||
);
|
||||
return true;
|
||||
return { valid: true, recovered: true };
|
||||
}
|
||||
} catch {
|
||||
// Re-acquisition failed — fall through to return false
|
||||
}
|
||||
}
|
||||
return false;
|
||||
return {
|
||||
valid: false,
|
||||
failureReason: "compromised",
|
||||
existingPid: existing?.pid,
|
||||
expectedPid: process.pid,
|
||||
};
|
||||
}
|
||||
|
||||
// If we have an OS-level lock, we're still the owner
|
||||
if (_releaseFunction && _lockedPath === basePath) {
|
||||
return true;
|
||||
return { valid: true };
|
||||
}
|
||||
|
||||
// Fallback: check the lock file PID
|
||||
|
|
@ -404,10 +422,27 @@ export function validateSessionLock(basePath: string): boolean {
|
|||
const existing = readExistingLockData(lp);
|
||||
if (!existing) {
|
||||
// Lock file was deleted — we lost ownership
|
||||
return false;
|
||||
return {
|
||||
valid: false,
|
||||
failureReason: "missing-metadata",
|
||||
expectedPid: process.pid,
|
||||
};
|
||||
}
|
||||
|
||||
return existing.pid === process.pid;
|
||||
if (existing.pid !== process.pid) {
|
||||
return {
|
||||
valid: false,
|
||||
failureReason: "pid-mismatch",
|
||||
existingPid: existing.pid,
|
||||
expectedPid: process.pid,
|
||||
};
|
||||
}
|
||||
|
||||
return { valid: true };
|
||||
}
|
||||
|
||||
export function validateSessionLock(basePath: string): boolean {
|
||||
return getSessionLockStatus(basePath).valid;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -14,6 +14,7 @@ import {
|
|||
type AgentEndEvent,
|
||||
type LoopDeps,
|
||||
} from "../auto-loop.js";
|
||||
import type { SessionLockStatus } from "../session-lock.js";
|
||||
|
||||
// ─── Helpers ─────────────────────────────────────────────────────────────────
|
||||
|
||||
|
|
@ -341,7 +342,7 @@ function makeMockDeps(
|
|||
preDispatchHealthGate: async () => ({ proceed: true, fixesApplied: [] }),
|
||||
syncProjectRootToWorktree: () => {},
|
||||
checkResourcesStale: () => null,
|
||||
validateSessionLock: () => true,
|
||||
validateSessionLock: () => ({ valid: true } as SessionLockStatus),
|
||||
updateSessionLock: () => {
|
||||
callLog.push("updateSessionLock");
|
||||
},
|
||||
|
|
@ -532,6 +533,41 @@ test("autoLoop exits on terminal complete state", async (t) => {
|
|||
);
|
||||
});
|
||||
|
||||
test("autoLoop passes structured session-lock failure details to the handler", async () => {
|
||||
_resetPendingResolve();
|
||||
|
||||
const ctx = makeMockCtx();
|
||||
ctx.ui.setStatus = () => {};
|
||||
const pi = makeMockPi();
|
||||
const s = makeLoopSession();
|
||||
let observedLockStatus: SessionLockStatus | undefined;
|
||||
|
||||
const deps = makeMockDeps({
|
||||
validateSessionLock: () =>
|
||||
({
|
||||
valid: false,
|
||||
failureReason: "compromised",
|
||||
expectedPid: process.pid,
|
||||
}) as SessionLockStatus,
|
||||
handleLostSessionLock: (_ctx, lockStatus) => {
|
||||
observedLockStatus = lockStatus;
|
||||
deps.callLog.push("handleLostSessionLock");
|
||||
},
|
||||
});
|
||||
|
||||
await autoLoop(ctx, pi, s, deps);
|
||||
|
||||
assert.deepEqual(observedLockStatus, {
|
||||
valid: false,
|
||||
failureReason: "compromised",
|
||||
expectedPid: process.pid,
|
||||
});
|
||||
assert.ok(
|
||||
!deps.callLog.includes("resolveDispatch"),
|
||||
"should stop before dispatch after lock validation fails",
|
||||
);
|
||||
});
|
||||
|
||||
test("autoLoop exits on terminal blocked state", async (t) => {
|
||||
_resetPendingResolve();
|
||||
|
||||
|
|
|
|||
|
|
@ -17,6 +17,7 @@ import { tmpdir } from 'node:os';
|
|||
|
||||
import {
|
||||
acquireSessionLock,
|
||||
getSessionLockStatus,
|
||||
validateSessionLock,
|
||||
releaseSessionLock,
|
||||
readSessionLockData,
|
||||
|
|
@ -201,6 +202,50 @@ async function main(): Promise<void> {
|
|||
}
|
||||
}
|
||||
|
||||
// ─── 7b. getSessionLockStatus with missing metadata → reason surfaced ──
|
||||
console.log('\n=== 7b. missing lock metadata → structured reason ===');
|
||||
{
|
||||
const base = mkdtempSync(join(tmpdir(), 'gsd-session-lock-'));
|
||||
mkdirSync(join(base, '.gsd'), { recursive: true });
|
||||
|
||||
try {
|
||||
const status = getSessionLockStatus(base);
|
||||
assertEq(status.valid, false, 'missing lock metadata is invalid');
|
||||
assertEq(status.failureReason, 'missing-metadata', 'missing metadata reason is surfaced');
|
||||
assertEq(status.expectedPid, process.pid, 'expected PID is included');
|
||||
} finally {
|
||||
rmSync(base, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
// ─── 7c. getSessionLockStatus with foreign PID → reason surfaced ───────
|
||||
console.log('\n=== 7c. foreign PID in lock file → structured reason ===');
|
||||
{
|
||||
const base = mkdtempSync(join(tmpdir(), 'gsd-session-lock-'));
|
||||
mkdirSync(join(base, '.gsd'), { recursive: true });
|
||||
|
||||
try {
|
||||
const foreignPid = process.pid + 1000;
|
||||
const lockFile = join(gsdRoot(base), 'auto.lock');
|
||||
writeFileSync(lockFile, JSON.stringify({
|
||||
pid: foreignPid,
|
||||
startedAt: new Date().toISOString(),
|
||||
unitType: 'execute-task',
|
||||
unitId: 'M001/S01/T01',
|
||||
unitStartedAt: new Date().toISOString(),
|
||||
completedUnits: 0,
|
||||
}, null, 2));
|
||||
|
||||
const status = getSessionLockStatus(base);
|
||||
assertEq(status.valid, false, 'foreign PID lock is invalid');
|
||||
assertEq(status.failureReason, 'pid-mismatch', 'PID mismatch reason is surfaced');
|
||||
assertEq(status.existingPid, foreignPid, 'existing PID is included');
|
||||
assertEq(status.expectedPid, process.pid, 'expected PID is included');
|
||||
} finally {
|
||||
rmSync(base, { recursive: true, force: true });
|
||||
}
|
||||
}
|
||||
|
||||
// ─── 8. Acquire after release is possible ─────────────────────────────
|
||||
console.log('\n=== 8. acquire after release → re-acquirable ===');
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue