fix(sf): self-heal stale auto locks before resume

This commit is contained in:
Mikael Hugo 2026-05-02 14:10:16 +02:00
parent bba5a7f143
commit 64b46fcb8a
3 changed files with 84 additions and 7 deletions

View file

@ -65,12 +65,12 @@ import {
import { DISPATCH_RULES, resolveDispatch } from "./auto-dispatch.js";
import {
_resetPendingResolve,
autoLoop,
type ErrorContext,
isSessionSwitchInFlight,
type LoopDeps,
resolveAgentEnd,
resolveAgentEndCancelled,
autoLoop,
runUokKernelLoop,
} from "./auto-loop.js";
import {
@ -132,6 +132,7 @@ import { debugLog, isDebugEnabled, writeDebugSummary } from "./debug-logger.js";
import { getPriorSliceCompletionBlocker } from "./dispatch-guard.js";
import { rebuildState, runSFDoctor } from "./doctor.js";
import {
healAutoStartupRuntime,
preDispatchHealthGate,
resetProactiveHealing,
setLevelChangeCallback,
@ -424,7 +425,10 @@ export function getAutoDashboardData(): AutoDashboardData {
const ledger = getLedger();
const totals = ledger ? getProjectTotals(ledger.units) : null;
const sessionId = s.cmdCtx?.sessionManager?.getSessionId?.() ?? null;
const rtkSavings = sessionId && s.basePath ? getRtkSessionSavings(s.basePath, sessionId) : null;
const rtkSavings =
sessionId && s.basePath
? getRtkSessionSavings(s.basePath, sessionId)
: null;
const rtkEnabled =
loadEffectiveSFPreferences()?.preferences.experimental?.rtk === true;
// Pending capture count — lazy check, non-fatal
@ -1013,8 +1017,7 @@ export async function stopAuto(
// metadata.kind rather than text matching. blocking=true when the
// stop reason includes "blocked" (e.g. write-gate, guardrail block).
const isBlocked =
reason !== undefined &&
reason.toLowerCase().includes("block");
reason !== undefined && reason.toLowerCase().includes("block");
const stopMeta = {
kind: "terminal" as const,
...(isBlocked ? { blocking: true } : {}),
@ -1550,6 +1553,11 @@ export async function startAuto(
// Escape stale worktree cwd from a previous milestone (#608).
base = escapeStaleWorktree(base);
const startupFixes = healAutoStartupRuntime(base);
for (const fix of startupFixes) {
ctx.ui.notify(`Startup self-heal: ${fix}.`, "info");
}
const freshStartAssessment =
interruptedAssessment ?? (await assessInterruptedSession(base));

View file

@ -14,8 +14,8 @@
* after N units, escalates to LLM-assisted heal dispatch.
*/
import { existsSync } from "node:fs";
import { join } from "node:path";
import { existsSync, rmSync } from "node:fs";
import { basename, dirname, join } from "node:path";
import {
clearLock,
isLockProcessAlive,
@ -231,6 +231,46 @@ export interface PreDispatchHealthResult {
fixesApplied: string[];
}
/**
 * Startup-time self-heal for leftover auto runtime lock artifacts.
 *
 * Purpose: let proactive healing influence the very first auto decision
 * instead of only kicking in once a unit is about to be dispatched.
 *
 * Consumer: startAuto, which calls this before assessInterruptedSession
 * looks at auto.lock and paused-session state.
 *
 * Runs two independent, best-effort steps; a failure in either one is
 * ignored and never propagates to the caller:
 *  1. If a crash lock can be read and its process is not alive, clear it.
 *  2. If a `<sf root name>.lock` directory exists beside the sf root and no
 *     live lock holder can be found, remove that directory recursively.
 *
 * @param basePath Project base path used to locate the lock artifacts.
 * @returns Descriptions of the fixes that were applied, in the order they
 *   ran; empty when nothing was changed.
 */
export function healAutoStartupRuntime(basePath: string): string[] {
  const applied: string[] = [];

  // Step 1: drop an auto.lock whose recorded process is no longer alive.
  try {
    const crashLock = readCrashLock(basePath);
    const isStale = crashLock ? !isLockProcessAlive(crashLock) : false;
    if (isStale) {
      clearLock(basePath);
      applied.push("cleared stale auto.lock before auto startup");
    }
  } catch {
    // Non-fatal.
  }

  // Step 2: remove the sibling session lock directory when nobody holds it.
  try {
    const root = sfRoot(basePath);
    const sessionLockDir = join(dirname(root), `${basename(root)}.lock`);
    if (existsSync(sessionLockDir)) {
      // Re-read the lock here: step 1 may have cleared it, or may have
      // failed independently of this step.
      const holder = readCrashLock(basePath);
      if (!holder || !isLockProcessAlive(holder)) {
        rmSync(sessionLockDir, { recursive: true, force: true });
        applied.push("removed stranded session lock directory");
      }
    }
  } catch {
    // Non-fatal.
  }

  return applied;
}
/**
* Lightweight pre-dispatch health check. Runs fast checks that should
* block dispatch if they fail — this avoids dispatching into a broken state.

View file

@ -1,5 +1,5 @@
import assert from "node:assert/strict";
import { describe, test } from 'vitest';
import { describe, test } from "vitest";
/**
* doctor-proactive.test.ts — Tests for the proactive healing layer.
@ -28,6 +28,7 @@ import {
getConsecutiveErrorUnits,
getHealthHistory,
getHealthTrend,
healAutoStartupRuntime,
preDispatchHealthGate,
recordHealthSnapshot,
resetProactiveHealing,
@ -371,6 +372,34 @@ describe("doctor-proactive", async () => {
);
});
// Verifies that healAutoStartupRuntime reports and removes an auto.lock
// whose recorded pid does not belong to a running process.
test("startup self-heal clears stale crash lock before resume assessment", () => {
  // Throwaway project root; registered for removal by the suite's cleanup.
  const projectDir = realpathSync(
    mkdtempSync(join(tmpdir(), "doc-proactive-")),
  );
  cleanups.push(projectDir);

  const lockPath = join(projectDir, ".sf", "auto.lock");
  mkdirSync(join(projectDir, ".sf"), { recursive: true });

  // The pid is chosen to be very unlikely to match a live process, so the
  // lock should be treated as stale.
  const staleLock = {
    pid: 9999999,
    startedAt: "2026-03-10T00:00:00Z",
    unitType: "research-slice",
    unitId: "M008/S01",
    unitStartedAt: "2026-03-10T00:01:00Z",
  };
  writeFileSync(lockPath, JSON.stringify(staleLock));

  const reported = healAutoStartupRuntime(projectDir);

  const mentionsLockCleanup = reported.some((entry) =>
    entry.includes("cleared stale auto.lock"),
  );
  assert.ok(
    mentionsLockCleanup,
    "startup self-heal reports stale auto.lock cleanup",
  );

  const lockStillPresent = existsSync(lockPath);
  assert.ok(
    !lockStillPresent,
    "startup self-heal removes stale auto.lock before assessInterruptedSession",
  );
});
test("health gate: corrupt merge state auto-healed", async () => {
if (process.platform !== "win32") {
{