feat: run solver eval from autonomous lifecycle

This commit is contained in:
Mikael Hugo 2026-05-06 04:02:40 +02:00
parent 7a13dd82b1
commit adf28d69b4
11 changed files with 245 additions and 6 deletions

View file

@ -126,6 +126,8 @@ auto_supervisor:
soft_timeout_minutes: 20
idle_timeout_minutes: 10
hard_timeout_minutes: 30
solver_max_iterations: 30000
solver_eval_on_autonomous_exit: true
```
### Cost Tracking

View file

@ -294,6 +294,8 @@ auto_supervisor:
soft_timeout_minutes: 20 # warn LLM to wrap up
idle_timeout_minutes: 10 # detect stalls
hard_timeout_minutes: 30 # pause autonomous mode
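solver_max_iterations: 30000 # max solver iterations for one unit before pausing
solver_eval_on_autonomous_exit: true # run and record the built-in solver eval when /sf autonomous exits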
completion_nudge_after: 10 # complete-slice tool calls before nudging sf_slice_complete
```

View file

@ -36,6 +36,18 @@ import { getCurrentBranch } from "./worktree.js";
import { getActiveWorktreeName } from "./worktree-command.js";
const ACTIVITY_FRAMES = ["|", "/", "-", "\\"];
/**
 * Call `ctx.ui.setWidget` without letting a UI failure reach the caller.
 *
 * Purpose: keep widget updates cosmetic. A missing context, a UI that has no
 * `setWidget`, or a `setWidget` that throws must not abort the code path that
 * is updating the dashboard.
 *
 * @param {object} [ctx] - Extension context; `ctx.ui.setWidget` is invoked when present.
 * @param {string} key - Widget identifier, also included in the warning text.
 * @param {*} content - Passed through to `setWidget` unchanged.
 * @param {*} [options] - Passed through to `setWidget` unchanged.
 * @returns {boolean} `false` only when `setWidget` threw (a warning is logged);
 *   `true` otherwise, including when there was nothing to call.
 */
function safeSetWidget(ctx, key, content, options) {
  try {
    // `ctx?.` makes a missing context a silent no-op rather than a TypeError
    // that would be reported as a misleading "setWidget failed" warning.
    ctx?.ui?.setWidget?.(key, content, options);
    return true;
  } catch (err) {
    logWarning(
      "dashboard",
      `setWidget(${key}) failed: ${err instanceof Error ? err.message : String(err)}`,
    );
    return false;
  }
}
// ─── UAT Slice Extraction ─────────────────────────────────────────────────────
/**
* Extract the target slice ID from a run-uat unit ID (e.g. "M001/S01" → "S01").
@ -580,7 +592,7 @@ export function updateProgressWidget(
refreshLastCommit(accessors.getBasePath());
// Cache the effective service tier at widget creation time (reads preferences)
const effectiveServiceTier = getEffectiveServiceTier();
ctx.ui.setWidget("sf-progress", (tui, theme) => {
safeSetWidget(ctx, "sf-progress", (tui, theme) => {
let cachedLines;
let cachedWidth;
let cachedRtkLabel;

View file

@ -92,6 +92,21 @@ import { deriveState, isGhostMilestone } from "./state.js";
import { isClosedStatus } from "./status-guards.js";
import { reconcileStaleCompleteSliceRecords } from "./unit-runtime.js";
import { logError, logWarning } from "./workflow-logger.js";
/**
 * Forward a widget update to `ctx.ui.setWidget`, converting any exception
 * into a logged warning.
 *
 * @param {object} [ctx] - Extension context; nothing is called when `ctx`,
 *   `ctx.ui`, or `ctx.ui.setWidget` is absent.
 * @param {string} key - Widget identifier, also included in the warning text.
 * @param {*} content - Passed through to `setWidget` unchanged.
 * @param {*} [options] - Passed through to `setWidget` unchanged.
 * @returns {boolean} `false` when `setWidget` threw, otherwise `true`.
 */
function safeSetWidget(ctx, key, content, options) {
  let applied = true;
  try {
    ctx?.ui?.setWidget?.(key, content, options);
  } catch (failure) {
    applied = false;
    const reason = failure instanceof Error ? failure.message : String(failure);
    logWarning("ui", `setWidget(${key}) failed: ${reason}`, {
      file: "auto-start.ts",
    });
  }
  return applied;
}
import {
captureIntegrationBranch,
detectWorktreeName,
@ -1048,7 +1063,7 @@ export async function bootstrapAutoSession(
ctx.ui.setFooter(hideFooter);
// Hide sf-health during AUTO — sf-progress is the single source of truth
// for last-commit / cost / health signal while auto is running.
ctx.ui.setWidget("sf-health", undefined);
safeSetWidget(ctx, "sf-health", undefined);
const modeLabel = s.stepMode ? "Step-mode" : "Auto-mode";
const pendingCount = (state.registry ?? []).filter(
(m) => m.status !== "complete" && m.status !== "parked",

View file

@ -189,6 +189,20 @@ import {
} from "./worktree.js";
import { WorktreeResolver } from "./worktree-resolver.js";
/**
 * Best-effort wrapper around `ctx.ui.setWidget`.
 *
 * Any exception raised by the widget API is logged as a warning and reported
 * through the return value instead of being rethrown.
 *
 * @param {object} [ctx] - Extension context; a missing `ctx`, `ui`, or
 *   `setWidget` results in no call at all.
 * @param {string} key - Widget identifier, also included in the warning text.
 * @param {*} content - Passed through to `setWidget` unchanged.
 * @param {*} [options] - Passed through to `setWidget` unchanged.
 * @returns {boolean} `true` unless `setWidget` threw.
 */
function safeSetWidget(ctx, key, content, options) {
  try {
    ctx?.ui?.setWidget?.(key, content, options);
  } catch (cause) {
    let detail;
    if (cause instanceof Error) {
      detail = cause.message;
    } else {
      detail = String(cause);
    }
    logWarning("ui", `setWidget(${key}) failed: ${detail}`, { file: "auto.ts" });
    return false;
  }
  return true;
}
export {
MAX_LIFETIME_DISPATCHES,
MAX_UNIT_DISPATCHES,
@ -689,7 +703,7 @@ function handleLostSessionLock(ctx, lockStatus) {
: `Session lock lost (${lockFilePath}). Stopping gracefully.${recoverySuggestion}`;
ctx?.ui.notify(message, "error");
ctx?.ui.setStatus("sf-auto", undefined);
ctx?.ui.setWidget("sf-progress", undefined);
safeSetWidget(ctx, "sf-progress", undefined);
ctx?.ui.setFooter(undefined);
if (ctx) initHealthWidget(ctx);
}
@ -725,7 +739,7 @@ function cleanupAfterLoopExit(ctx) {
// visible so the user still has a resumable auto-mode signal on screen.
if (!s.paused) {
ctx.ui.setStatus("sf-auto", undefined);
ctx.ui.setWidget("sf-progress", undefined);
safeSetWidget(ctx, "sf-progress", undefined);
ctx.ui.setFooter(undefined);
initHealthWidget(ctx);
}
@ -1065,7 +1079,7 @@ export async function stopAuto(ctx, pi, reason) {
resetProactiveHealing();
// UI cleanup
ctx?.ui.setStatus("sf-auto", undefined);
ctx?.ui.setWidget("sf-progress", undefined);
safeSetWidget(ctx, "sf-progress", undefined);
ctx?.ui.setFooter(undefined);
if (ctx) initHealthWidget(ctx);
restoreProjectRootEnv();
@ -1215,7 +1229,7 @@ export async function pauseAuto(ctx, _pi, _errorContext) {
s.pendingVerificationRetry = null;
s.verificationRetryCount.clear();
ctx?.ui.setStatus("sf-auto", "paused");
ctx?.ui.setWidget("sf-progress", undefined);
safeSetWidget(ctx, "sf-progress", undefined);
ctx?.ui.setFooter(undefined);
if (ctx) initHealthWidget(ctx);
const resumeCmd = s.stepMode ? "/sf next" : "/sf autonomous";

View file

@ -11,6 +11,7 @@ import { mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
import { join } from "node:path";
import { atomicWriteSync } from "../atomic-write.js";
import { ModelPolicyDispatchBlockedError } from "../auto-model-selection.js";
import { runAutomaticAutonomousSolverEval } from "../autonomous-solver-eval.js";
import { debugLog } from "../debug-logger.js";
import { resolveEngine } from "../engine-resolver.js";
import { sfRoot } from "../paths.js";
@ -290,6 +291,58 @@ async function enforceMinRequestInterval(s, prefs) {
}
}
}
/**
 * Run the automatic solver eval once as the auto loop exits and report the
 * outcome to the user and the debug log.
 *
 * Purpose: this is a best-effort hook. A failure of the eval, of preference
 * loading, or of the UI used to announce the result is downgraded to a
 * warning / debug entry so it cannot interrupt the caller's own shutdown.
 *
 * @param {object} [ctx] - Extension context; `ctx.ui.notify(message, level)` is
 *   called when available.
 * @param {object} s - Auto session state; only `s.basePath` is read here.
 * @param {object} deps - Loop dependencies; uses `loadEffectiveSFPreferences()`
 *   and `emitJournalEvent(event)`.
 * @param {number} iteration - Loop iteration count, attached to debug entries.
 * @returns {Promise<void>} Resolves without rethrowing eval, preference, or
 *   notification failures.
 */
async function runExitSolverEval(ctx, s, deps, iteration) {
  // Notifications are advisory. Without this guard a throwing `notify` inside
  // the `try` would be caught, the handler would call `notify` again, and that
  // second throw would escape the hook.
  const notify = (message, level) => {
    try {
      ctx?.ui?.notify?.(message, level);
    } catch (notifyErr) {
      debugLog("autoLoop", {
        phase: "solver-eval-auto-notify-failed",
        iteration,
        error:
          notifyErr instanceof Error ? notifyErr.message : String(notifyErr),
      });
    }
  };
  try {
    const supervisor =
      deps.loadEffectiveSFPreferences()?.preferences?.auto_supervisor;
    // All journal events from this run share one flow id and get increasing
    // sequence numbers unless the event already carries its own.
    const flowId = randomUUID();
    let seq = 0;
    const emitJournalEvent = (event) =>
      deps.emitJournalEvent({
        ...event,
        flowId: event.flowId ?? flowId,
        seq: event.seq ?? ++seq,
      });
    const result = await runAutomaticAutonomousSolverEval({
      basePath: s.basePath,
      // Opt-out semantics: anything other than an explicit `false` enables it.
      enabled: supervisor?.solver_eval_on_autonomous_exit !== false,
      reason: "autonomous-exit",
      emitJournalEvent,
    });
    if (result.ok && result.report?.dbRecorded) {
      notify(
        `Autonomous solver eval recorded: ${result.report.reportPath}`,
        "info",
      );
    } else if (result.ok && result.report) {
      notify(
        `Autonomous solver eval wrote ${result.report.reportPath}, but DB evidence was not recorded.`,
        "warning",
      );
    } else if (!result.ok) {
      notify(
        `Autonomous solver eval did not record: ${result.error}`,
        "warning",
      );
    }
    // A skipped run (ok without a report) is intentionally silent in the UI.
    debugLog("autoLoop", {
      phase: "solver-eval-auto",
      iteration,
      ok: result.ok,
      skipped: result.skipped,
      runId: result.report?.runId,
      error: result.error,
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    notify(`Autonomous solver eval hook failed: ${message}`, "warning");
    debugLog("autoLoop", {
      phase: "solver-eval-auto-failed",
      iteration,
      error: message,
    });
  }
}
/**
* Main auto-mode execution loop. Iterates: derive → dispatch → guards →
* runUnit → finalize → repeat. Exits when s.active becomes false or a
@ -1108,6 +1161,7 @@ export async function autoLoop(ctx, pi, s, deps) {
finishTurn("retry", "execution", msg);
}
}
await runExitSolverEval(ctx, s, deps, iteration);
_clearCurrentResolve();
debugLog("autoLoop", { phase: "exit", totalIterations: iteration });
}

View file

@ -430,6 +430,66 @@ export function runAutonomousSolverEval(options) {
return finalReport;
}
/**
 * Best-effort lifecycle hook that executes the built-in autonomous solver eval
 * and attempts to persist the resulting report.
 *
 * Purpose: collect solver quality evidence as a side effect of `/sf autonomous`
 * sessions, so regressions show up without anyone running the eval by hand.
 *
 * Consumer: auto/loop.js, when an autonomous session exits.
 *
 * @param {object} options
 * @param {string} [options.basePath] - Project root; defaults to `process.cwd()`.
 * @param {boolean} [options.enabled] - Only an explicit `false` skips the eval.
 * @param {string} [options.reason] - Trigger label copied into every journal
 *   event; defaults to "autonomous-exit".
 * @param {Function} [options.emitJournalEvent] - Receives
 *   `{ ts, eventType, data }` records; anything that is not a function is ignored.
 * @returns {Promise<object>} `{ ok: true, skipped: true, reason: "disabled" }`
 *   when disabled, `{ ok: true, report }` after a run, or `{ ok: false, error }`
 *   when running or recording threw.
 */
export async function runAutomaticAutonomousSolverEval(options) {
  const projectRoot = resolve(options.basePath ?? process.cwd());
  const sink =
    typeof options.emitJournalEvent === "function"
      ? options.emitJournalEvent
      : () => {};
  const trigger = options.reason ?? "autonomous-exit";
  // Every lifecycle event shares one envelope; only type and payload vary.
  const journal = (eventType, data) => {
    sink({ ts: new Date().toISOString(), eventType, data });
  };

  if (options.enabled === false) {
    journal("solver-eval-auto-skipped", { reason: trigger, disabled: true });
    return { ok: true, skipped: true, reason: "disabled" };
  }

  journal("solver-eval-auto-start", { reason: trigger });
  try {
    const initialReport = runAutonomousSolverEval({
      basePath: projectRoot,
      runId: `auto-${nowRunId()}`,
      suiteSource: "auto-sample",
    });
    const persisted = await recordEvalRunBestEffort(projectRoot, initialReport);
    // Use the recorder's version of the report only when recording succeeded.
    const report = persisted.ok ? persisted.report : initialReport;
    journal("solver-eval-auto-complete", {
      reason: trigger,
      runId: report.runId,
      dbRecorded: report.dbRecorded,
      reportPath: report.reportPath,
      summary: report.summary,
      error: persisted.ok ? undefined : persisted.error,
    });
    return { ok: true, report };
  } catch (failure) {
    const message =
      failure instanceof Error ? failure.message : String(failure);
    journal("solver-eval-auto-failed", { reason: trigger, error: message });
    return { ok: false, error: message };
  }
}
/**
* Parse `/sf solver-eval` arguments.
*

View file

@ -126,6 +126,7 @@ Setting `prefer_skills: []` does **not** disable skill discovery — it just mea
- `idle_timeout_minutes`: minutes of inactivity before the supervisor intervenes (default: 10).
- `hard_timeout_minutes`: minutes before the supervisor forces termination (default: 30).
- `solver_max_iterations`: maximum autonomous solver iterations for one unit before pausing (default: `30000`, min: `1`, max: `100000`).
- `solver_eval_on_autonomous_exit`: automatically run and record the built-in solver eval when `/sf autonomous` exits (default: `true`; set `false` only to disable lifecycle eval evidence).
- `completion_nudge_after`: tool calls in a complete-slice unit before nudging the agent to call `sf_slice_complete` (default: 10; set `0` to disable).
- `runaway_guard_enabled`: enable active-loop diagnosis for long-running units (default: `true`).
- `runaway_tool_call_warning`: unit tool calls before a runaway warning (default: `60`; set `0` to disable this signal).

View file

@ -697,6 +697,8 @@ export function resolveAutoSupervisorConfig() {
)
? Math.max(1, Math.min(100000, Number(configured.solver_max_iterations)))
: 30000,
solver_eval_on_autonomous_exit:
configured.solver_eval_on_autonomous_exit !== false,
completion_nudge_after: configured.completion_nudge_after ?? 10,
runaway_guard_enabled: configured.runaway_guard_enabled ?? true,
runaway_tool_call_warning:

View file

@ -790,6 +790,10 @@ export function validatePreferences(preferences) {
);
}
}
if (as.solver_eval_on_autonomous_exit !== undefined) {
validatedAs.solver_eval_on_autonomous_exit =
!!as.solver_eval_on_autonomous_exit;
}
if (as.phase_timeout_minutes !== undefined) {
const val = Number(as.phase_timeout_minutes);
if (!Number.isNaN(val) && val >= 0)

View file

@ -2,10 +2,12 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, expect, test } from "vitest";
import { autoLoop } from "../auto/loop.js";
import {
handleAutonomousSolverEval,
loadAutonomousSolverEvalCases,
parseAutonomousSolverEvalArgs,
runAutomaticAutonomousSolverEval,
runAutonomousSolverEval,
sampleAutonomousSolverEvalCases,
} from "../autonomous-solver-eval.js";
@ -156,4 +158,75 @@ describe("autonomous solver eval", () => {
expect(notices[2].message).toContain("raw");
expect(notices[2].message).toContain("sf");
});
test("runAutomaticAutonomousSolverEval_records_lifecycle_evidence", async () => {
  // Arrange: scratch project with its `.sf` directory created up front.
  const projectDir = makeProject();
  mkdirSync(join(projectDir, ".sf"), { recursive: true });
  const journal = [];

  // Act: run the lifecycle hook with a capturing journal emitter.
  const outcome = await runAutomaticAutonomousSolverEval({
    basePath: projectDir,
    reason: "test-exit",
    emitJournalEvent: (entry) => journal.push(entry),
  });

  // Assert: the run succeeded and was recorded from the sample suite.
  expect(outcome.ok).toBe(true);
  const { report } = outcome;
  expect(report.dbRecorded).toBe(true);
  expect(report.suiteSource).toBe("auto-sample");

  // Exactly a start event followed by a complete event.
  const eventTypes = journal.map(({ eventType }) => eventType);
  expect(eventTypes).toEqual([
    "solver-eval-auto-start",
    "solver-eval-auto-complete",
  ]);

  // The stored run and its case results can be read back by run id.
  const storedRun = getSolverEvalRun(report.runId);
  const storedCases = getSolverEvalCaseResults(report.runId);
  expect(storedRun.summary.sfWins).toBe(1);
  expect(storedCases).toHaveLength(2);
});
test("runAutomaticAutonomousSolverEval_can_be_disabled_by_preference", async () => {
  const project = makeProject();
  const events = [];
  const result = await runAutomaticAutonomousSolverEval({
    basePath: project,
    enabled: false,
    emitJournalEvent: (event) => events.push(event),
  });
  expect(result).toMatchObject({ ok: true, skipped: true });
  // Disabled must mean the eval never ran: no report comes back, and the
  // skip event is the ONLY journal entry. Checking just events[0] would still
  // pass if start/complete events followed it.
  expect(result.report).toBeUndefined();
  expect(events.map((event) => event.eventType)).toEqual([
    "solver-eval-auto-skipped",
  ]);
  expect(events[0].data).toMatchObject({ disabled: true });
});
test("autoLoop_runs_solver_eval_on_autonomous_exit", async () => {
  // Arrange: scratch project, capturing journal, and a stub UI context.
  const projectDir = makeProject();
  mkdirSync(join(projectDir, ".sf"), { recursive: true });
  const journal = [];
  const { ctx, notices } = makeCtx();
  const session = { active: true, basePath: projectDir, cmdCtx: null };
  const loopDeps = {
    loadEffectiveSFPreferences: () => ({
      preferences: {
        auto_supervisor: { solver_eval_on_autonomous_exit: true },
      },
    }),
    emitJournalEvent: (entry) => journal.push(entry),
    uokObserver: null,
  };

  // Act: drive the loop to completion with the stubbed dependencies.
  await autoLoop(ctx, {}, session, loopDeps);

  // Assert: the exit hook journaled a start and a DB-recorded completion.
  const sawStart = journal.some(
    ({ eventType }) => eventType === "solver-eval-auto-start",
  );
  expect(sawStart).toBe(true);
  const completion = journal.find(
    ({ eventType }) => eventType === "solver-eval-auto-complete",
  );
  expect(completion?.data?.dbRecorded).toBe(true);

  // The last UI notice announces the recorded eval.
  const lastNotice = notices.at(-1);
  expect(lastNotice.message).toContain("Autonomous solver eval recorded");

  // The first listed run came from the automatic sample suite.
  const latestRun = listSolverEvalRuns(1)[0];
  expect(latestRun.suiteSource).toBe("auto-sample");
});
});