fix(auto): stale lock detection, SIGTERM handler, live-session guard (#362)

This commit is contained in:
Copilot 2026-03-14 08:43:33 -06:00 committed by GitHub
parent db2c864e9e
commit 8f8a9db7cb
2 changed files with 70 additions and 4 deletions

View file

@ -32,7 +32,7 @@ import {
} from "./paths.js";
import { saveActivityLog } from "./activity-log.js";
import { synthesizeCrashRecovery, getDeepDiagnostic } from "./session-forensics.js";
import { writeLock, clearLock, readCrashLock, formatCrashInfo } from "./crash-recovery.js";
import { writeLock, clearLock, readCrashLock, formatCrashInfo, isLockProcessAlive } from "./crash-recovery.js";
import {
clearUnitRuntimeRecord,
formatExecuteTaskRecoveryStatus,
@ -164,6 +164,32 @@ let unitTimeoutHandle: ReturnType<typeof setTimeout> | null = null;
let wrapupWarningHandle: ReturnType<typeof setTimeout> | null = null;
let idleWatchdogHandle: ReturnType<typeof setInterval> | null = null;
/** SIGTERM handler registered while auto-mode is active — cleared on stop/pause. */
let _sigtermHandler: (() => void) | null = null;
/**
* Register a SIGTERM handler that clears the lock file and exits cleanly.
* Captures the active base path at registration time so the handler
* always references the correct path even if the module variable changes.
* Removes any previously registered handler before installing the new one.
*/
function registerSigtermHandler(currentBasePath: string): void {
if (_sigtermHandler) process.off("SIGTERM", _sigtermHandler);
_sigtermHandler = () => {
clearLock(currentBasePath);
process.exit(0);
};
process.on("SIGTERM", _sigtermHandler);
}
/** Deregister the SIGTERM handler (called on stop/pause). */
function deregisterSigtermHandler(): void {
if (_sigtermHandler) {
process.off("SIGTERM", _sigtermHandler);
_sigtermHandler = null;
}
}
/** Format token counts for compact display */
function formatWidgetTokens(count: number): string {
if (count < 1000) return count.toString();
@ -251,7 +277,8 @@ export async function stopAuto(ctx?: ExtensionContext, pi?: ExtensionAPI): Promi
if (basePath) clearLock(basePath);
clearSkillSnapshot();
// Show final cost summary before resetting
// Remove SIGTERM handler registered at auto-mode start
deregisterSigtermHandler();
const ledger = getLedger();
if (ledger && ledger.units.length > 0) {
const totals = getProjectTotals(ledger.units);
@ -303,6 +330,10 @@ export async function pauseAuto(ctx?: ExtensionContext, _pi?: ExtensionAPI): Pro
if (!active) return;
clearUnitTimeout();
if (basePath) clearLock(basePath);
// Remove SIGTERM handler registered at auto-mode start
deregisterSigtermHandler();
active = false;
paused = true;
// Preserve: unitDispatchCount, currentUnit, basePath, verbose, cmdCtx,
@ -479,6 +510,10 @@ export async function startAuto(
if (!getLedger()) initMetrics(base);
// Ensure milestone ID is set on git service for integration branch resolution
if (currentMilestoneId) setActiveMilestoneId(base, currentMilestoneId);
// Re-register SIGTERM handler for the resumed session
registerSigtermHandler(base);
ctx.ui.setStatus("gsd-auto", stepMode ? "next" : "auto");
ctx.ui.setFooter(hideFooter);
ctx.ui.notify(stepMode ? "Step-mode resumed." : "Auto-mode resumed.", "info");
@ -525,8 +560,16 @@ export async function startAuto(
// Check for crash from previous session
const crashLock = readCrashLock(base);
if (crashLock) {
// Synthesize a rich recovery briefing from the surviving pi session file
// (pi writes entries incrementally, so it contains every tool call up to the crash)
if (isLockProcessAlive(crashLock)) {
// The lock belongs to a process that is still running — not a crash.
// Warn the user and abort to avoid two concurrent auto-mode sessions.
ctx.ui.notify(
`Another auto-mode session (PID ${crashLock.pid}) appears to be running.\nStop it with \`kill ${crashLock.pid}\` before starting a new session.`,
"error",
);
return;
}
// Stale lock from a dead process — synthesize crash recovery context.
const activityDir = join(gsdRoot(base), "activity");
const recovery = synthesizeCrashRecovery(
base, crashLock.unitType, crashLock.unitId,
@ -586,6 +629,9 @@ export async function startAuto(
originalModelId = ctx.model?.id ?? null;
originalModelProvider = ctx.model?.provider ?? null;
// Register a SIGTERM handler so `kill <pid>` cleans up the lock and exits.
registerSigtermHandler(base);
// Capture the integration branch — records the branch the user was on when
// auto-mode started. Slice branches will merge back to this branch instead
// of the repo's default (main/master). Idempotent when the branch is the

View file

@ -73,6 +73,26 @@ export function readCrashLock(basePath: string): LockData | null {
}
}
/**
* Check whether the process that wrote the lock is still running.
* Uses `process.kill(pid, 0)` which sends no signal but checks liveness.
* Returns false if the PID matches our own (recycled PID from a prior run).
*/
export function isLockProcessAlive(lock: LockData): boolean {
const pid = lock.pid;
if (!Number.isInteger(pid) || pid <= 0) return false;
if (pid === process.pid) return false;
try {
process.kill(pid, 0);
return true;
} catch (err) {
// EPERM means the process exists but we lack permission — treat as alive.
// ESRCH means the process does not exist — treat as dead (stale lock).
if ((err as NodeJS.ErrnoException).code === "EPERM") return true;
return false;
}
}
/** Format crash info for display or injection into a prompt. */
export function formatCrashInfo(lock: LockData): string {
return [