From adf28d69b4cb0cca9a8f6a5a2b94e74be6d7701b Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Wed, 6 May 2026 04:02:40 +0200 Subject: [PATCH] feat: run solver eval from autonomous lifecycle --- docs/user-docs/auto-mode.md | 2 + docs/user-docs/configuration.md | 2 + src/resources/extensions/sf/auto-dashboard.js | 14 +++- src/resources/extensions/sf/auto-start.js | 17 ++++- src/resources/extensions/sf/auto.js | 22 +++++- src/resources/extensions/sf/auto/loop.js | 54 ++++++++++++++ .../extensions/sf/autonomous-solver-eval.js | 60 +++++++++++++++ .../sf/docs/preferences-reference.md | 1 + .../extensions/sf/preferences-models.js | 2 + .../extensions/sf/preferences-validation.js | 4 + .../sf/tests/autonomous-solver-eval.test.mjs | 73 +++++++++++++++++++ 11 files changed, 245 insertions(+), 6 deletions(-) diff --git a/docs/user-docs/auto-mode.md b/docs/user-docs/auto-mode.md index b62cf9aca..cc7653f87 100644 --- a/docs/user-docs/auto-mode.md +++ b/docs/user-docs/auto-mode.md @@ -126,6 +126,8 @@ auto_supervisor: soft_timeout_minutes: 20 idle_timeout_minutes: 10 hard_timeout_minutes: 30 + solver_max_iterations: 30000 + solver_eval_on_autonomous_exit: true ``` ### Cost Tracking diff --git a/docs/user-docs/configuration.md b/docs/user-docs/configuration.md index cbcd3b5da..f0d2f8270 100644 --- a/docs/user-docs/configuration.md +++ b/docs/user-docs/configuration.md @@ -294,6 +294,8 @@ auto_supervisor: soft_timeout_minutes: 20 # warn LLM to wrap up idle_timeout_minutes: 10 # detect stalls hard_timeout_minutes: 30 # pause autonomous mode + solver_max_iterations: 30000 + solver_eval_on_autonomous_exit: true completion_nudge_after: 10 # complete-slice tool calls before nudging sf_slice_complete ``` diff --git a/src/resources/extensions/sf/auto-dashboard.js b/src/resources/extensions/sf/auto-dashboard.js index 32faf32b4..e63d21f04 100644 --- a/src/resources/extensions/sf/auto-dashboard.js +++ b/src/resources/extensions/sf/auto-dashboard.js @@ -36,6 +36,18 @@ import { 
/**
 * Set (or clear) a UI widget without letting a UI failure abort the caller.
 *
 * Callers pass `undefined` as `content` to hide a widget (for example
 * "sf-health" while auto mode is running), so `content` is forwarded as-is.
 *
 * @param {object} [ctx] - Extension context; `ctx.ui.setWidget` is used when present.
 * @param {string} key - Widget identifier, e.g. "sf-progress" or "sf-health".
 * @param {*} content - Widget content/factory, or `undefined` to clear the widget.
 * @param {object} [options] - Forwarded unchanged to `setWidget`.
 * @returns {boolean} `true` only when `setWidget` was actually invoked and
 *   returned normally; `false` when there is no widget API to call or the
 *   call threw (the failure is logged as a warning, never rethrown).
 */
function safeSetWidget(ctx, key, content, options) {
  try {
    const ui = ctx?.ui;
    if (typeof ui?.setWidget !== "function") {
      // No widget surface on this context: nothing was set, so do not
      // report success.
      return false;
    }
    // Call through `ui` so the method keeps its receiver.
    ui.setWidget(key, content, options);
    return true;
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    logWarning("ui", `setWidget(${key}) failed: ${detail}`, {
      file: "auto-start.ts",
    });
    return false;
  }
}
/**
 * Best-effort exit hook: run the automatic solver eval once as the auto loop
 * exits and surface the outcome to the user.
 *
 * The hook is awaited by the loop right before its exit cleanup, so it must
 * never reject: evaluation errors are reported as a warning, and UI errors
 * are only debug-logged.
 *
 * @param {object} [ctx] - Extension context; `ctx.ui.notify` is used when present.
 * @param {object} s - Auto session state; only `s.basePath` is read here.
 * @param {object} deps - Loop dependencies; uses `loadEffectiveSFPreferences()`
 *   and `emitJournalEvent(event)`.
 * @param {number} iteration - Loop iteration count, included in debug logs.
 * @returns {Promise<void>}
 */
async function runExitSolverEval(ctx, s, deps, iteration) {
  // The UI can be absent or failing while the session tears down. A failed
  // notification must not be mistaken for a failed eval, nor escape the hook.
  const notify = (message, level) => {
    try {
      ctx?.ui?.notify?.(message, level);
    } catch (notifyErr) {
      debugLog("autoLoop", {
        phase: "solver-eval-auto-notify-failed",
        iteration,
        error:
          notifyErr instanceof Error ? notifyErr.message : String(notifyErr),
      });
    }
  };

  try {
    const supervisor =
      deps.loadEffectiveSFPreferences()?.preferences?.auto_supervisor;

    // All journal events of this hook share one flow id and get a
    // monotonically increasing sequence number unless the event brings its own.
    const flowId = randomUUID();
    let seq = 0;
    const emitJournalEvent = (event) =>
      deps.emitJournalEvent({
        ...event,
        flowId: event.flowId ?? flowId,
        seq: event.seq ?? ++seq,
      });

    const result = await runAutomaticAutonomousSolverEval({
      basePath: s.basePath,
      // Opt-out preference: anything other than an explicit `false` enables it.
      enabled: supervisor?.solver_eval_on_autonomous_exit !== false,
      reason: "autonomous-exit",
      emitJournalEvent,
    });

    if (result.ok && result.report?.dbRecorded) {
      notify(
        `Autonomous solver eval recorded: ${result.report.reportPath}`,
        "info",
      );
    } else if (result.ok && result.report) {
      notify(
        `Autonomous solver eval wrote ${result.report.reportPath}, but DB evidence was not recorded.`,
        "warning",
      );
    } else if (!result.ok) {
      notify(
        `Autonomous solver eval did not record: ${result.error}`,
        "warning",
      );
    }
    // A skipped run (ok without a report) is intentionally silent in the UI.

    debugLog("autoLoop", {
      phase: "solver-eval-auto",
      iteration,
      ok: result.ok,
      skipped: result.skipped,
      runId: result.report?.runId,
      error: result.error,
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    notify(`Autonomous solver eval hook failed: ${message}`, "warning");
    debugLog("autoLoop", {
      phase: "solver-eval-auto-failed",
      iteration,
      error: message,
    });
  }
}
/**
 * Run and record the built-in autonomous solver eval as a best-effort lifecycle hook.
 *
 * Purpose: make solver quality evidence automatic for `/sf autonomous` sessions
 * so regressions are captured without requiring a separate manual command.
 *
 * Consumer: auto/loop.js when an autonomous session exits.
 *
 * This function does not throw for evaluation, recording, or journaling
 * failures: the outcome is always carried by the returned object.
 *
 * @param {object} [options]
 * @param {string} [options.basePath] - Project root; defaults to `process.cwd()`.
 * @param {boolean} [options.enabled] - Only an explicit `false` skips the eval.
 * @param {string} [options.reason] - Lifecycle reason recorded in journal events
 *   (default: "autonomous-exit").
 * @param {(event: object) => void} [options.emitJournalEvent] - Optional journal
 *   sink; receives `{ ts, eventType, data }` events.
 * @returns {Promise<object>} One of:
 *   `{ ok: true, skipped: true, reason: "disabled" }`,
 *   `{ ok: true, report }`, or `{ ok: false, error }`.
 *   A `journalError` string is added when the journal sink threw.
 */
export async function runAutomaticAutonomousSolverEval(options = {}) {
  const reason = options.reason ?? "autonomous-exit";
  const sink =
    typeof options.emitJournalEvent === "function"
      ? options.emitJournalEvent
      : null;

  // Journaling is observability only: a failing sink must not change the
  // eval outcome. The first failure is kept and reported on the result
  // instead of being dropped.
  let journalError;
  const emit = (eventType, data) => {
    if (!sink) return;
    try {
      sink({ ts: new Date().toISOString(), eventType, data });
    } catch (err) {
      journalError ??= err instanceof Error ? err.message : String(err);
    }
  };
  const finish = (outcome) =>
    journalError === undefined ? outcome : { ...outcome, journalError };

  if (options.enabled === false) {
    emit("solver-eval-auto-skipped", { reason, disabled: true });
    return finish({ ok: true, skipped: true, reason: "disabled" });
  }

  emit("solver-eval-auto-start", { reason });

  let report;
  let dbRecord;
  try {
    // Resolved inside the try so an invalid basePath is reported as a failed
    // run rather than thrown at the caller.
    const basePath = resolve(options.basePath ?? process.cwd());
    report = runAutonomousSolverEval({
      basePath,
      runId: `auto-${nowRunId()}`,
      suiteSource: "auto-sample",
    });
    dbRecord = await recordEvalRunBestEffort(basePath, report);
    // On success the recorder hands back the report to use from here on.
    if (dbRecord.ok) report = dbRecord.report;
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    emit("solver-eval-auto-failed", { reason, error: message });
    return finish({ ok: false, error: message });
  }

  // Emitted outside the try: a journal problem here must not relabel a
  // completed eval as failed.
  emit("solver-eval-auto-complete", {
    reason,
    runId: report.runId,
    dbRecorded: report.dbRecorded,
    reportPath: report.reportPath,
    summary: report.summary,
    error: dbRecord.ok ? undefined : dbRecord.error,
  });
  return finish({ ok: true, report });
}
+ - `solver_eval_on_autonomous_exit`: automatically run and record the built-in solver eval when `/sf autonomous` exits (default: `true`; set `false` only to disable lifecycle eval evidence). - `completion_nudge_after`: tool calls in a complete-slice unit before nudging the agent to call `sf_slice_complete` (default: 10; set `0` to disable). - `runaway_guard_enabled`: enable active-loop diagnosis for long-running units (default: `true`). - `runaway_tool_call_warning`: unit tool calls before a runaway warning (default: `60`; set `0` to disable this signal). diff --git a/src/resources/extensions/sf/preferences-models.js b/src/resources/extensions/sf/preferences-models.js index 24a8e7c1e..ba3455d47 100644 --- a/src/resources/extensions/sf/preferences-models.js +++ b/src/resources/extensions/sf/preferences-models.js @@ -697,6 +697,8 @@ export function resolveAutoSupervisorConfig() { ) ? Math.max(1, Math.min(100000, Number(configured.solver_max_iterations))) : 30000, + solver_eval_on_autonomous_exit: + configured.solver_eval_on_autonomous_exit !== false, completion_nudge_after: configured.completion_nudge_after ?? 10, runaway_guard_enabled: configured.runaway_guard_enabled ?? 
true, runaway_tool_call_warning: diff --git a/src/resources/extensions/sf/preferences-validation.js b/src/resources/extensions/sf/preferences-validation.js index d068904ae..e63f7221e 100644 --- a/src/resources/extensions/sf/preferences-validation.js +++ b/src/resources/extensions/sf/preferences-validation.js @@ -790,6 +790,10 @@ export function validatePreferences(preferences) { ); } } + if (as.solver_eval_on_autonomous_exit !== undefined) { + validatedAs.solver_eval_on_autonomous_exit = + !!as.solver_eval_on_autonomous_exit; + } if (as.phase_timeout_minutes !== undefined) { const val = Number(as.phase_timeout_minutes); if (!Number.isNaN(val) && val >= 0) diff --git a/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs b/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs index 5c94959b0..bc7eb1498 100644 --- a/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs +++ b/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs @@ -2,10 +2,12 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { afterEach, describe, expect, test } from "vitest"; +import { autoLoop } from "../auto/loop.js"; import { handleAutonomousSolverEval, loadAutonomousSolverEvalCases, parseAutonomousSolverEvalArgs, + runAutomaticAutonomousSolverEval, runAutonomousSolverEval, sampleAutonomousSolverEvalCases, } from "../autonomous-solver-eval.js"; @@ -156,4 +158,75 @@ describe("autonomous solver eval", () => { expect(notices[2].message).toContain("raw"); expect(notices[2].message).toContain("sf"); }); + + test("runAutomaticAutonomousSolverEval_records_lifecycle_evidence", async () => { + const project = makeProject(); + mkdirSync(join(project, ".sf"), { recursive: true }); + const events = []; + + const result = await runAutomaticAutonomousSolverEval({ + basePath: project, + reason: "test-exit", + emitJournalEvent: (event) => events.push(event), + }); 
+ + expect(result.ok).toBe(true); + expect(result.report.dbRecorded).toBe(true); + expect(result.report.suiteSource).toBe("auto-sample"); + expect(events.map((event) => event.eventType)).toEqual([ + "solver-eval-auto-start", + "solver-eval-auto-complete", + ]); + const run = getSolverEvalRun(result.report.runId); + const cases = getSolverEvalCaseResults(result.report.runId); + expect(run.summary.sfWins).toBe(1); + expect(cases).toHaveLength(2); + }); + + test("runAutomaticAutonomousSolverEval_can_be_disabled_by_preference", async () => { + const project = makeProject(); + const events = []; + + const result = await runAutomaticAutonomousSolverEval({ + basePath: project, + enabled: false, + emitJournalEvent: (event) => events.push(event), + }); + + expect(result).toMatchObject({ ok: true, skipped: true }); + expect(events[0].eventType).toBe("solver-eval-auto-skipped"); + }); + + test("autoLoop_runs_solver_eval_on_autonomous_exit", async () => { + const project = makeProject(); + mkdirSync(join(project, ".sf"), { recursive: true }); + const events = []; + const { ctx, notices } = makeCtx(); + + await autoLoop( + ctx, + {}, + { active: true, basePath: project, cmdCtx: null }, + { + loadEffectiveSFPreferences: () => ({ + preferences: { + auto_supervisor: { solver_eval_on_autonomous_exit: true }, + }, + }), + emitJournalEvent: (event) => events.push(event), + uokObserver: null, + }, + ); + + expect( + events.some((event) => event.eventType === "solver-eval-auto-start"), + ).toBe(true); + const complete = events.find( + (event) => event.eventType === "solver-eval-auto-complete", + ); + expect(complete?.data?.dbRecorded).toBe(true); + expect(notices.at(-1).message).toContain("Autonomous solver eval recorded"); + const runs = listSolverEvalRuns(1); + expect(runs[0].suiteSource).toBe("auto-sample"); + }); });