feat: run solver eval from autonomous lifecycle
This commit is contained in:
parent
7a13dd82b1
commit
adf28d69b4
11 changed files with 245 additions and 6 deletions
|
|
@ -126,6 +126,8 @@ auto_supervisor:
|
|||
soft_timeout_minutes: 20
|
||||
idle_timeout_minutes: 10
|
||||
hard_timeout_minutes: 30
|
||||
solver_max_iterations: 30000
|
||||
solver_eval_on_autonomous_exit: true
|
||||
```
|
||||
|
||||
### Cost Tracking
|
||||
|
|
|
|||
|
|
@ -294,6 +294,8 @@ auto_supervisor:
|
|||
soft_timeout_minutes: 20 # warn LLM to wrap up
|
||||
idle_timeout_minutes: 10 # detect stalls
|
||||
hard_timeout_minutes: 30 # pause autonomous mode
|
||||
solver_max_iterations: 30000
|
||||
solver_eval_on_autonomous_exit: true
|
||||
completion_nudge_after: 10 # complete-slice tool calls before nudging sf_slice_complete
|
||||
```
|
||||
|
||||
|
|
|
|||
|
|
@ -36,6 +36,18 @@ import { getCurrentBranch } from "./worktree.js";
|
|||
import { getActiveWorktreeName } from "./worktree-command.js";
|
||||
|
||||
const ACTIVITY_FRAMES = ["|", "/", "-", "\\"];
|
||||
/**
 * Set a dashboard widget without letting a UI failure reach the caller.
 *
 * The call is forwarded to `ctx.ui.setWidget(key, content, options)` when that
 * method exists. A missing context, a missing `ui`, or a missing `setWidget`
 * is treated as a silent no-op; only an exception thrown by `setWidget`
 * itself is reported through `logWarning`.
 *
 * @param {object} [ctx] - Context object; may be null/undefined.
 * @param {string} key - Widget identifier, also included in the warning text.
 * @param {*} content - Widget content or factory, forwarded unchanged.
 * @param {object} [options] - Widget options, forwarded unchanged.
 * @returns {boolean} `true` when nothing threw (including the no-op cases),
 *   `false` when `setWidget` threw and a warning was logged.
 */
function safeSetWidget(ctx, key, content, options) {
  try {
    // `ctx?.` rather than `ctx.`: an absent context must not be turned into a
    // TypeError that is then logged as if the widget call itself had failed.
    ctx?.ui?.setWidget?.(key, content, options);
    return true;
  } catch (err) {
    logWarning(
      "dashboard",
      `setWidget(${key}) failed: ${err instanceof Error ? err.message : String(err)}`,
    );
    return false;
  }
}
|
||||
// ─── UAT Slice Extraction ─────────────────────────────────────────────────────
|
||||
/**
|
||||
* Extract the target slice ID from a run-uat unit ID (e.g. "M001/S01" → "S01").
|
||||
|
|
@ -580,7 +592,7 @@ export function updateProgressWidget(
|
|||
refreshLastCommit(accessors.getBasePath());
|
||||
// Cache the effective service tier at widget creation time (reads preferences)
|
||||
const effectiveServiceTier = getEffectiveServiceTier();
|
||||
ctx.ui.setWidget("sf-progress", (tui, theme) => {
|
||||
safeSetWidget(ctx, "sf-progress", (tui, theme) => {
|
||||
let cachedLines;
|
||||
let cachedWidth;
|
||||
let cachedRtkLabel;
|
||||
|
|
|
|||
|
|
@ -92,6 +92,21 @@ import { deriveState, isGhostMilestone } from "./state.js";
|
|||
import { isClosedStatus } from "./status-guards.js";
|
||||
import { reconcileStaleCompleteSliceRecords } from "./unit-runtime.js";
|
||||
import { logError, logWarning } from "./workflow-logger.js";
|
||||
|
||||
/**
 * Forward a widget update to `ctx.ui.setWidget` and contain any exception.
 *
 * When the context, its `ui`, or `setWidget` is absent, the optional chain
 * makes the call a no-op. An exception thrown by `setWidget` is reported via
 * `logWarning` under the "ui" scope, tagged with this file's name.
 *
 * @param {object} [ctx] - Context object; may be null/undefined.
 * @param {string} key - Widget identifier, echoed in the warning message.
 * @param {*} content - Widget content, passed through untouched.
 * @param {object} [options] - Widget options, passed through untouched.
 * @returns {boolean} `false` only when `setWidget` threw; `true` otherwise.
 */
function safeSetWidget(ctx, key, content, options) {
  let succeeded = true;
  try {
    const ui = ctx?.ui;
    ui?.setWidget?.(key, content, options);
  } catch (failure) {
    const detail = failure instanceof Error ? failure.message : String(failure);
    logWarning("ui", `setWidget(${key}) failed: ${detail}`, {
      file: "auto-start.ts",
    });
    succeeded = false;
  }
  return succeeded;
}
|
||||
|
||||
import {
|
||||
captureIntegrationBranch,
|
||||
detectWorktreeName,
|
||||
|
|
@ -1048,7 +1063,7 @@ export async function bootstrapAutoSession(
|
|||
ctx.ui.setFooter(hideFooter);
|
||||
// Hide sf-health during AUTO — sf-progress is the single source of truth
|
||||
// for last-commit / cost / health signal while auto is running.
|
||||
ctx.ui.setWidget("sf-health", undefined);
|
||||
safeSetWidget(ctx, "sf-health", undefined);
|
||||
const modeLabel = s.stepMode ? "Step-mode" : "Auto-mode";
|
||||
const pendingCount = (state.registry ?? []).filter(
|
||||
(m) => m.status !== "complete" && m.status !== "parked",
|
||||
|
|
|
|||
|
|
@ -189,6 +189,20 @@ import {
|
|||
} from "./worktree.js";
|
||||
import { WorktreeResolver } from "./worktree-resolver.js";
|
||||
|
||||
/**
 * Update a UI widget, converting a thrown exception into a logged warning.
 *
 * The call goes to `ctx.ui.setWidget(key, content, options)` if every link of
 * that chain exists; otherwise nothing happens. Failures are logged through
 * `logWarning` with scope "ui" and this file's name as metadata.
 *
 * @param {object} [ctx] - Context object; may be null/undefined.
 * @param {string} key - Widget identifier, included in the warning text.
 * @param {*} content - Widget content, forwarded as given.
 * @param {object} [options] - Widget options, forwarded as given.
 * @returns {boolean} `true` if no exception was raised, `false` if one was
 *   caught and logged.
 */
function safeSetWidget(ctx, key, content, options) {
  try {
    ctx?.ui?.setWidget?.(key, content, options);
  } catch (cause) {
    const reason = cause instanceof Error ? cause.message : String(cause);
    const meta = { file: "auto.ts" };
    logWarning("ui", `setWidget(${key}) failed: ${reason}`, meta);
    return false;
  }
  return true;
}
|
||||
|
||||
export {
|
||||
MAX_LIFETIME_DISPATCHES,
|
||||
MAX_UNIT_DISPATCHES,
|
||||
|
|
@ -689,7 +703,7 @@ function handleLostSessionLock(ctx, lockStatus) {
|
|||
: `Session lock lost (${lockFilePath}). Stopping gracefully.${recoverySuggestion}`;
|
||||
ctx?.ui.notify(message, "error");
|
||||
ctx?.ui.setStatus("sf-auto", undefined);
|
||||
ctx?.ui.setWidget("sf-progress", undefined);
|
||||
safeSetWidget(ctx, "sf-progress", undefined);
|
||||
ctx?.ui.setFooter(undefined);
|
||||
if (ctx) initHealthWidget(ctx);
|
||||
}
|
||||
|
|
@ -725,7 +739,7 @@ function cleanupAfterLoopExit(ctx) {
|
|||
// visible so the user still has a resumable auto-mode signal on screen.
|
||||
if (!s.paused) {
|
||||
ctx.ui.setStatus("sf-auto", undefined);
|
||||
ctx.ui.setWidget("sf-progress", undefined);
|
||||
safeSetWidget(ctx, "sf-progress", undefined);
|
||||
ctx.ui.setFooter(undefined);
|
||||
initHealthWidget(ctx);
|
||||
}
|
||||
|
|
@ -1065,7 +1079,7 @@ export async function stopAuto(ctx, pi, reason) {
|
|||
resetProactiveHealing();
|
||||
// UI cleanup
|
||||
ctx?.ui.setStatus("sf-auto", undefined);
|
||||
ctx?.ui.setWidget("sf-progress", undefined);
|
||||
safeSetWidget(ctx, "sf-progress", undefined);
|
||||
ctx?.ui.setFooter(undefined);
|
||||
if (ctx) initHealthWidget(ctx);
|
||||
restoreProjectRootEnv();
|
||||
|
|
@ -1215,7 +1229,7 @@ export async function pauseAuto(ctx, _pi, _errorContext) {
|
|||
s.pendingVerificationRetry = null;
|
||||
s.verificationRetryCount.clear();
|
||||
ctx?.ui.setStatus("sf-auto", "paused");
|
||||
ctx?.ui.setWidget("sf-progress", undefined);
|
||||
safeSetWidget(ctx, "sf-progress", undefined);
|
||||
ctx?.ui.setFooter(undefined);
|
||||
if (ctx) initHealthWidget(ctx);
|
||||
const resumeCmd = s.stepMode ? "/sf next" : "/sf autonomous";
|
||||
|
|
|
|||
|
|
@ -11,6 +11,7 @@ import { mkdirSync, readFileSync, unlinkSync, writeFileSync } from "node:fs";
|
|||
import { join } from "node:path";
|
||||
import { atomicWriteSync } from "../atomic-write.js";
|
||||
import { ModelPolicyDispatchBlockedError } from "../auto-model-selection.js";
|
||||
import { runAutomaticAutonomousSolverEval } from "../autonomous-solver-eval.js";
|
||||
import { debugLog } from "../debug-logger.js";
|
||||
import { resolveEngine } from "../engine-resolver.js";
|
||||
import { sfRoot } from "../paths.js";
|
||||
|
|
@ -290,6 +291,58 @@ async function enforceMinRequestInterval(s, prefs) {
|
|||
}
|
||||
}
|
||||
}
|
||||
/**
 * Run the automatic solver eval while the auto loop is exiting and report the
 * outcome to the user and the debug log.
 *
 * This is a best-effort hook: a failing eval is reported as a warning, and a
 * failing or missing `ctx.ui.notify` is recorded in the debug log instead of
 * being allowed to reject this function and interrupt the caller's remaining
 * exit steps.
 *
 * @param {object} ctx - Context whose `ui.notify(message, level)` receives the
 *   outcome notice; may lack a usable UI.
 * @param {object} s - Session state; `s.basePath` is handed to the eval.
 * @param {object} deps - Provides `loadEffectiveSFPreferences()` and
 *   `emitJournalEvent(event)`.
 * @param {number} iteration - Loop iteration count, used only in debug logs.
 * @returns {Promise<void>}
 */
async function runExitSolverEval(ctx, s, deps, iteration) {
  // Notifications are cosmetic. If the UI throws, record it and carry on so
  // the catch handler below can never re-throw out of this hook.
  const notify = (message, level) => {
    try {
      ctx?.ui?.notify?.(message, level);
    } catch (err) {
      debugLog("autoLoop", {
        phase: "solver-eval-auto-notify-failed",
        iteration,
        error: err instanceof Error ? err.message : String(err),
      });
    }
  };

  try {
    const supervisor =
      deps.loadEffectiveSFPreferences()?.preferences?.auto_supervisor;
    // All journal events from this eval share one flow id and get an
    // increasing sequence number unless the event already carries its own.
    const flowId = randomUUID();
    let seq = 0;
    const emitJournalEvent = (event) =>
      deps.emitJournalEvent({
        ...event,
        flowId: event.flowId ?? flowId,
        seq: event.seq ?? ++seq,
      });
    const result = await runAutomaticAutonomousSolverEval({
      basePath: s.basePath,
      // Enabled unless the preference is explicitly `false`.
      enabled: supervisor?.solver_eval_on_autonomous_exit !== false,
      reason: "autonomous-exit",
      emitJournalEvent,
    });
    if (result.ok && result.report?.dbRecorded) {
      notify(
        `Autonomous solver eval recorded: ${result.report.reportPath}`,
        "info",
      );
    } else if (result.ok && result.report) {
      notify(
        `Autonomous solver eval wrote ${result.report.reportPath}, but DB evidence was not recorded.`,
        "warning",
      );
    } else if (!result.ok) {
      notify(
        `Autonomous solver eval did not record: ${result.error}`,
        "warning",
      );
    }
    // A skipped run (ok, no report) is intentionally silent in the UI.
    debugLog("autoLoop", {
      phase: "solver-eval-auto",
      iteration,
      ok: result.ok,
      skipped: result.skipped,
      runId: result.report?.runId,
      error: result.error,
    });
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    notify(`Autonomous solver eval hook failed: ${message}`, "warning");
    debugLog("autoLoop", {
      phase: "solver-eval-auto-failed",
      iteration,
      error: message,
    });
  }
}
|
||||
/**
|
||||
* Main auto-mode execution loop. Iterates: derive → dispatch → guards →
|
||||
* runUnit → finalize → repeat. Exits when s.active becomes false or a
|
||||
|
|
@ -1108,6 +1161,7 @@ export async function autoLoop(ctx, pi, s, deps) {
|
|||
finishTurn("retry", "execution", msg);
|
||||
}
|
||||
}
|
||||
await runExitSolverEval(ctx, s, deps, iteration);
|
||||
_clearCurrentResolve();
|
||||
debugLog("autoLoop", { phase: "exit", totalIterations: iteration });
|
||||
}
|
||||
|
|
|
|||
|
|
@ -430,6 +430,66 @@ export function runAutonomousSolverEval(options) {
|
|||
return finalReport;
|
||||
}
|
||||
|
||||
/**
 * Best-effort lifecycle hook that runs the built-in autonomous solver eval and
 * attempts to record it.
 *
 * Purpose: capture solver quality evidence automatically for `/sf autonomous`
 * sessions, so regressions show up without a separate manual command.
 *
 * Consumer: auto/loop.js when an autonomous session exits.
 *
 * Journal events emitted (each with an ISO `ts`):
 * - `solver-eval-auto-skipped` when `options.enabled === false`
 * - `solver-eval-auto-start` before the eval runs
 * - `solver-eval-auto-complete` after the eval ran (DB recording may have failed)
 * - `solver-eval-auto-failed` when the eval or recording step threw
 *
 * @param {object} options
 * @param {string} [options.basePath] - Project root; defaults to `process.cwd()`.
 * @param {boolean} [options.enabled] - Only an explicit `false` disables the run.
 * @param {string} [options.reason] - Trigger label; defaults to "autonomous-exit".
 * @param {Function} [options.emitJournalEvent] - Event sink; ignored unless a function.
 * @returns {Promise<object>} `{ ok: true, skipped: true, reason: "disabled" }`
 *   when disabled, `{ ok: true, report }` after a run, or
 *   `{ ok: false, error }` when the run threw.
 */
export async function runAutomaticAutonomousSolverEval(options) {
  const basePath = resolve(options.basePath ?? process.cwd());
  const sink =
    typeof options.emitJournalEvent === "function"
      ? options.emitJournalEvent
      : () => {};
  const reason = options.reason ?? "autonomous-exit";
  // Every journal event has the same envelope; only type and payload vary.
  const journal = (eventType, data) =>
    sink({ ts: new Date().toISOString(), eventType, data });

  if (options.enabled === false) {
    journal("solver-eval-auto-skipped", { reason, disabled: true });
    return { ok: true, skipped: true, reason: "disabled" };
  }

  journal("solver-eval-auto-start", { reason });
  try {
    const freshReport = runAutonomousSolverEval({
      basePath,
      runId: `auto-${nowRunId()}`,
      suiteSource: "auto-sample",
    });
    const dbRecord = await recordEvalRunBestEffort(basePath, freshReport);
    // Prefer the report returned by the DB step; otherwise keep the fresh one.
    const report = dbRecord.ok ? dbRecord.report : freshReport;
    journal("solver-eval-auto-complete", {
      reason,
      runId: report.runId,
      dbRecorded: report.dbRecorded,
      reportPath: report.reportPath,
      summary: report.summary,
      error: dbRecord.ok ? undefined : dbRecord.error,
    });
    return { ok: true, report };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    journal("solver-eval-auto-failed", { reason, error: message });
    return { ok: false, error: message };
  }
}
|
||||
|
||||
/**
|
||||
* Parse `/sf solver-eval` arguments.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -126,6 +126,7 @@ Setting `prefer_skills: []` does **not** disable skill discovery — it just mea
|
|||
- `idle_timeout_minutes`: minutes of inactivity before the supervisor intervenes (default: 10).
|
||||
- `hard_timeout_minutes`: minutes before the supervisor forces termination (default: 30).
|
||||
- `solver_max_iterations`: maximum autonomous solver iterations for one unit before pausing (default: `30000`, min: `1`, max: `100000`).
|
||||
- `solver_eval_on_autonomous_exit`: automatically run and record the built-in solver eval when `/sf autonomous` exits (default: `true`; set `false` only to disable lifecycle eval evidence).
|
||||
- `completion_nudge_after`: tool calls in a complete-slice unit before nudging the agent to call `sf_slice_complete` (default: 10; set `0` to disable).
|
||||
- `runaway_guard_enabled`: enable active-loop diagnosis for long-running units (default: `true`).
|
||||
- `runaway_tool_call_warning`: unit tool calls before a runaway warning (default: `60`; set `0` to disable this signal).
|
||||
|
|
|
|||
|
|
@ -697,6 +697,8 @@ export function resolveAutoSupervisorConfig() {
|
|||
)
|
||||
? Math.max(1, Math.min(100000, Number(configured.solver_max_iterations)))
|
||||
: 30000,
|
||||
solver_eval_on_autonomous_exit:
|
||||
configured.solver_eval_on_autonomous_exit !== false,
|
||||
completion_nudge_after: configured.completion_nudge_after ?? 10,
|
||||
runaway_guard_enabled: configured.runaway_guard_enabled ?? true,
|
||||
runaway_tool_call_warning:
|
||||
|
|
|
|||
|
|
@ -790,6 +790,10 @@ export function validatePreferences(preferences) {
|
|||
);
|
||||
}
|
||||
}
|
||||
if (as.solver_eval_on_autonomous_exit !== undefined) {
|
||||
validatedAs.solver_eval_on_autonomous_exit =
|
||||
!!as.solver_eval_on_autonomous_exit;
|
||||
}
|
||||
if (as.phase_timeout_minutes !== undefined) {
|
||||
const val = Number(as.phase_timeout_minutes);
|
||||
if (!Number.isNaN(val) && val >= 0)
|
||||
|
|
|
|||
|
|
@ -2,10 +2,12 @@ import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
|||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, describe, expect, test } from "vitest";
|
||||
import { autoLoop } from "../auto/loop.js";
|
||||
import {
|
||||
handleAutonomousSolverEval,
|
||||
loadAutonomousSolverEvalCases,
|
||||
parseAutonomousSolverEvalArgs,
|
||||
runAutomaticAutonomousSolverEval,
|
||||
runAutonomousSolverEval,
|
||||
sampleAutonomousSolverEvalCases,
|
||||
} from "../autonomous-solver-eval.js";
|
||||
|
|
@ -156,4 +158,75 @@ describe("autonomous solver eval", () => {
|
|||
expect(notices[2].message).toContain("raw");
|
||||
expect(notices[2].message).toContain("sf");
|
||||
});
|
||||
|
||||
test("runAutomaticAutonomousSolverEval_records_lifecycle_evidence", async () => {
|
||||
const project = makeProject();
|
||||
mkdirSync(join(project, ".sf"), { recursive: true });
|
||||
const events = [];
|
||||
|
||||
const result = await runAutomaticAutonomousSolverEval({
|
||||
basePath: project,
|
||||
reason: "test-exit",
|
||||
emitJournalEvent: (event) => events.push(event),
|
||||
});
|
||||
|
||||
expect(result.ok).toBe(true);
|
||||
expect(result.report.dbRecorded).toBe(true);
|
||||
expect(result.report.suiteSource).toBe("auto-sample");
|
||||
expect(events.map((event) => event.eventType)).toEqual([
|
||||
"solver-eval-auto-start",
|
||||
"solver-eval-auto-complete",
|
||||
]);
|
||||
const run = getSolverEvalRun(result.report.runId);
|
||||
const cases = getSolverEvalCaseResults(result.report.runId);
|
||||
expect(run.summary.sfWins).toBe(1);
|
||||
expect(cases).toHaveLength(2);
|
||||
});
|
||||
|
||||
test("runAutomaticAutonomousSolverEval_can_be_disabled_by_preference", async () => {
|
||||
const project = makeProject();
|
||||
const events = [];
|
||||
|
||||
const result = await runAutomaticAutonomousSolverEval({
|
||||
basePath: project,
|
||||
enabled: false,
|
||||
emitJournalEvent: (event) => events.push(event),
|
||||
});
|
||||
|
||||
expect(result).toMatchObject({ ok: true, skipped: true });
|
||||
expect(events[0].eventType).toBe("solver-eval-auto-skipped");
|
||||
});
|
||||
|
||||
test("autoLoop_runs_solver_eval_on_autonomous_exit", async () => {
|
||||
const project = makeProject();
|
||||
mkdirSync(join(project, ".sf"), { recursive: true });
|
||||
const events = [];
|
||||
const { ctx, notices } = makeCtx();
|
||||
|
||||
await autoLoop(
|
||||
ctx,
|
||||
{},
|
||||
{ active: true, basePath: project, cmdCtx: null },
|
||||
{
|
||||
loadEffectiveSFPreferences: () => ({
|
||||
preferences: {
|
||||
auto_supervisor: { solver_eval_on_autonomous_exit: true },
|
||||
},
|
||||
}),
|
||||
emitJournalEvent: (event) => events.push(event),
|
||||
uokObserver: null,
|
||||
},
|
||||
);
|
||||
|
||||
expect(
|
||||
events.some((event) => event.eventType === "solver-eval-auto-start"),
|
||||
).toBe(true);
|
||||
const complete = events.find(
|
||||
(event) => event.eventType === "solver-eval-auto-complete",
|
||||
);
|
||||
expect(complete?.data?.dbRecorded).toBe(true);
|
||||
expect(notices.at(-1).message).toContain("Autonomous solver eval recorded");
|
||||
const runs = listSolverEvalRuns(1);
|
||||
expect(runs[0].suiteSource).toBe("auto-sample");
|
||||
});
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue