fix(sf): self-heal stale auto locks before resume
This commit is contained in:
parent
bba5a7f143
commit
64b46fcb8a
3 changed files with 84 additions and 7 deletions
|
|
@ -65,12 +65,12 @@ import {
|
|||
import { DISPATCH_RULES, resolveDispatch } from "./auto-dispatch.js";
|
||||
import {
|
||||
_resetPendingResolve,
|
||||
autoLoop,
|
||||
type ErrorContext,
|
||||
isSessionSwitchInFlight,
|
||||
type LoopDeps,
|
||||
resolveAgentEnd,
|
||||
resolveAgentEndCancelled,
|
||||
autoLoop,
|
||||
runUokKernelLoop,
|
||||
} from "./auto-loop.js";
|
||||
import {
|
||||
|
|
@ -132,6 +132,7 @@ import { debugLog, isDebugEnabled, writeDebugSummary } from "./debug-logger.js";
|
|||
import { getPriorSliceCompletionBlocker } from "./dispatch-guard.js";
|
||||
import { rebuildState, runSFDoctor } from "./doctor.js";
|
||||
import {
|
||||
healAutoStartupRuntime,
|
||||
preDispatchHealthGate,
|
||||
resetProactiveHealing,
|
||||
setLevelChangeCallback,
|
||||
|
|
@ -424,7 +425,10 @@ export function getAutoDashboardData(): AutoDashboardData {
|
|||
const ledger = getLedger();
|
||||
const totals = ledger ? getProjectTotals(ledger.units) : null;
|
||||
const sessionId = s.cmdCtx?.sessionManager?.getSessionId?.() ?? null;
|
||||
const rtkSavings = sessionId && s.basePath ? getRtkSessionSavings(s.basePath, sessionId) : null;
|
||||
const rtkSavings =
|
||||
sessionId && s.basePath
|
||||
? getRtkSessionSavings(s.basePath, sessionId)
|
||||
: null;
|
||||
const rtkEnabled =
|
||||
loadEffectiveSFPreferences()?.preferences.experimental?.rtk === true;
|
||||
// Pending capture count — lazy check, non-fatal
|
||||
|
|
@ -1013,8 +1017,7 @@ export async function stopAuto(
|
|||
// metadata.kind rather than text matching. blocking=true when the
|
||||
// stop reason contains "block", case-insensitively (e.g. write-gate, guardrail block).
|
||||
const isBlocked =
|
||||
reason !== undefined &&
|
||||
reason.toLowerCase().includes("block");
|
||||
reason !== undefined && reason.toLowerCase().includes("block");
|
||||
const stopMeta = {
|
||||
kind: "terminal" as const,
|
||||
...(isBlocked ? { blocking: true } : {}),
|
||||
|
|
@ -1550,6 +1553,11 @@ export async function startAuto(
|
|||
// Escape stale worktree cwd from a previous milestone (#608).
|
||||
base = escapeStaleWorktree(base);
|
||||
|
||||
const startupFixes = healAutoStartupRuntime(base);
|
||||
for (const fix of startupFixes) {
|
||||
ctx.ui.notify(`Startup self-heal: ${fix}.`, "info");
|
||||
}
|
||||
|
||||
const freshStartAssessment =
|
||||
interruptedAssessment ?? (await assessInterruptedSession(base));
|
||||
|
||||
|
|
|
|||
|
|
@ -14,8 +14,8 @@
|
|||
* after N units, escalates to LLM-assisted heal dispatch.
|
||||
*/
|
||||
|
||||
import { existsSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { existsSync, rmSync } from "node:fs";
|
||||
import { basename, dirname, join } from "node:path";
|
||||
import {
|
||||
clearLock,
|
||||
isLockProcessAlive,
|
||||
|
|
@ -231,6 +231,46 @@ export interface PreDispatchHealthResult {
|
|||
fixesApplied: string[];
|
||||
}
|
||||
|
||||
/**
 * Startup self-heal for stale auto runtime locks.
 *
 * Runs two independent, best-effort cleanup steps so that the first auto
 * decision (resume vs. fresh start) is not taken against leftovers of a
 * dead process:
 *
 * 1. If a crash lock can be read and its owning process is no longer alive,
 *    the lock is cleared.
 * 2. If a `<sfRoot basename>.lock` directory exists next to the SF root and
 *    no live lock holder is found, that directory is removed recursively.
 *
 * Each step swallows its own errors; a failure in one never prevents the
 * other from running and never propagates to the caller.
 *
 * Consumer: startAuto, which calls this before assessInterruptedSession and
 * surfaces every returned entry as a "Startup self-heal" notification.
 *
 * @param basePath - Project base path whose auto runtime state is inspected.
 * @returns Human-readable descriptions of the fixes that were applied, in
 *   the order they were performed; empty when nothing needed healing.
 */
export function healAutoStartupRuntime(basePath: string): string[] {
  const applied: string[] = [];

  // Step 1: drop a crash lock whose owning process is gone.
  try {
    const crashLock = readCrashLock(basePath);
    if (crashLock) {
      if (!isLockProcessAlive(crashLock)) {
        clearLock(basePath);
        applied.push("cleared stale auto.lock before auto startup");
      }
    }
  } catch {
    // Non-fatal.
  }

  // Step 2: remove the sibling "<root>.lock" directory when nobody alive holds it.
  try {
    const sfDir = sfRoot(basePath);
    const sessionLockDir = join(dirname(sfDir), `${basename(sfDir)}.lock`);
    if (existsSync(sessionLockDir)) {
      // Re-read the crash lock: step 1 may just have cleared it.
      const holder = readCrashLock(basePath);
      if (!holder || !isLockProcessAlive(holder)) {
        rmSync(sessionLockDir, { recursive: true, force: true });
        applied.push("removed stranded session lock directory");
      }
    }
  } catch {
    // Non-fatal.
  }

  return applied;
}
|
||||
|
||||
/**
|
||||
* Lightweight pre-dispatch health check. Runs fast checks that should
|
||||
* block dispatch if they fail — avoids dispatching into a broken state.
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import assert from "node:assert/strict";
|
||||
import { describe, test } from 'vitest';
|
||||
import { describe, test } from "vitest";
|
||||
|
||||
/**
|
||||
* doctor-proactive.test.ts — Tests for proactive healing layer.
|
||||
|
|
@ -28,6 +28,7 @@ import {
|
|||
getConsecutiveErrorUnits,
|
||||
getHealthHistory,
|
||||
getHealthTrend,
|
||||
healAutoStartupRuntime,
|
||||
preDispatchHealthGate,
|
||||
recordHealthSnapshot,
|
||||
resetProactiveHealing,
|
||||
|
|
@ -371,6 +372,34 @@ describe("doctor-proactive", async () => {
|
|||
);
|
||||
});
|
||||
|
||||
test("startup self-heal clears stale crash lock before resume assessment", () => {
  // Throwaway project root containing only an ".sf" directory; registered
  // in `cleanups` (declared elsewhere in this test file).
  const projectRoot = realpathSync(
    mkdtempSync(join(tmpdir(), "doc-proactive-")),
  );
  cleanups.push(projectRoot);

  const sfDir = join(projectRoot, ".sf");
  mkdirSync(sfDir, { recursive: true });

  // Crash lock attributed to a pid that is presumably not running on the
  // test host, so the lock should be treated as stale.
  const lockPath = join(sfDir, "auto.lock");
  const staleLock = {
    pid: 9999999,
    startedAt: "2026-03-10T00:00:00Z",
    unitType: "research-slice",
    unitId: "M008/S01",
    unitStartedAt: "2026-03-10T00:01:00Z",
  };
  writeFileSync(lockPath, JSON.stringify(staleLock));

  const reported = healAutoStartupRuntime(projectRoot);

  // The fix must be both reported to the caller and visible on disk.
  const mentionsLockCleanup = reported.some((message) =>
    message.includes("cleared stale auto.lock"),
  );
  assert.ok(
    mentionsLockCleanup,
    "startup self-heal reports stale auto.lock cleanup",
  );

  const lockStillPresent = existsSync(lockPath);
  assert.ok(
    !lockStillPresent,
    "startup self-heal removes stale auto.lock before assessInterruptedSession",
  );
});
|
||||
|
||||
test("health gate: corrupt merge state auto-healed", async () => {
|
||||
if (process.platform !== "win32") {
|
||||
{
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue