feat: persist solver eval evidence in db

This commit is contained in:
Mikael Hugo 2026-05-06 03:49:32 +02:00
parent dc51baa19a
commit 7a13dd82b1
3 changed files with 446 additions and 13 deletions

View file

@ -18,6 +18,7 @@ import {
} from "node:fs";
import { dirname, isAbsolute, join, relative, resolve } from "node:path";
import { atomicWriteSync } from "./atomic-write.js";
import { ensureDbOpen } from "./bootstrap/dynamic-tools.js";
import { sfRoot } from "./paths.js";
const DEFAULT_TIMEOUT_MS = 5 * 60_000;
@ -374,6 +375,8 @@ export function runAutonomousSolverEval(options) {
const runId =
options.runId && RUN_ID_RE.test(options.runId) ? options.runId : nowRunId();
const cases = options.cases ?? sampleAutonomousSolverEvalCases();
const suiteSource =
options.suiteSource ?? (options.casesPath ? options.casesPath : "sample");
const outputDir = resolveOutputDir(basePath, runId);
const workspaceRoot = join(outputDir, "workspaces");
rmSync(outputDir, { recursive: true, force: true });
@ -404,23 +407,27 @@ export function runAutonomousSolverEval(options) {
runId,
createdAt: new Date().toISOString(),
basePath,
suiteSource,
summary: summarizeResults(results),
results,
dbRecorded: false,
};
atomicWriteSync(
join(outputDir, "report.json"),
`${JSON.stringify(report, null, 2)}\n`,
);
const reportPath = join(outputDir, "report.json");
const resultsPath = join(outputDir, "results.jsonl");
writeFileSync(
join(outputDir, "results.jsonl"),
resultsPath,
results.map((result) => JSON.stringify(result)).join("\n") + "\n",
"utf-8",
);
return {
const finalReport = {
...report,
outputDir,
relativeOutputDir: relative(basePath, outputDir),
reportPath: relative(basePath, reportPath),
resultsPath: relative(basePath, resultsPath),
};
atomicWriteSync(reportPath, `${JSON.stringify(finalReport, null, 2)}\n`);
return finalReport;
}
/**
@ -436,10 +443,42 @@ export function parseAutonomousSolverEvalArgs(raw) {
.trim()
.split(/\s+/)
.filter(Boolean);
const opts = { sample: false, casesPath: null, runId: null };
const opts = {
action: "run",
sample: false,
casesPath: null,
runId: null,
limit: 10,
};
for (let i = 0; i < tokens.length; i += 1) {
const token = tokens[i];
if (token === "run") continue;
if (token === "run") {
opts.action = "run";
continue;
}
if (token === "history") {
opts.action = "history";
continue;
}
if (token === "show") {
const value = tokens[i + 1];
if (!value || !RUN_ID_RE.test(value)) {
throw new Error("show requires a safe run id");
}
opts.action = "show";
opts.runId = value;
i += 1;
continue;
}
if (token === "--limit") {
const value = Number(tokens[i + 1]);
if (!Number.isFinite(value) || value < 1) {
throw new Error("--limit requires a positive number");
}
opts.limit = Math.floor(value);
i += 1;
continue;
}
if (token === "--sample") {
opts.sample = true;
continue;
@ -462,10 +501,89 @@ export function parseAutonomousSolverEvalArgs(raw) {
}
throw new Error(`unknown solver-eval argument: ${token}`);
}
if (!opts.sample && !opts.casesPath) opts.sample = true;
if (opts.action === "run" && !opts.sample && !opts.casesPath)
opts.sample = true;
return opts;
}
/**
 * Best-effort persistence of a finished eval report into the SF database.
 *
 * Never throws: any failure (DB unavailable, write error) is reported through
 * the returned `{ ok, error }` shape so the eval run itself still succeeds.
 * On success the on-disk report.json is rewritten with `dbRecorded: true`.
 */
async function recordEvalRunBestEffort(basePath, report) {
  try {
    const dbReady = await ensureDbOpen(basePath);
    if (!dbReady) {
      return { ok: false, error: "db-unavailable" };
    }
    // Lazy import keeps the eval module loadable when sf-db is unavailable.
    const dbModule = await import("./sf-db.js");
    dbModule.recordSolverEvalRun(report);
    const recordedReport = { ...report, dbRecorded: true };
    const serialized = `${JSON.stringify(recordedReport, null, 2)}\n`;
    // reportPath is relative to basePath, so re-anchor before rewriting.
    atomicWriteSync(join(basePath, recordedReport.reportPath), serialized);
    return { ok: true, report: recordedReport };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return { ok: false, error: message };
  }
}
/**
 * Notify the operator with the most recent solver eval runs from the DB.
 *
 * Emits a warning when no database is open, an info hint when no runs have
 * been recorded yet, otherwise one summary line per run.
 */
async function notifySolverEvalHistory(ctx, basePath, limit) {
  const dbReady = await ensureDbOpen(basePath);
  if (!dbReady) {
    ctx.ui.notify("No SF database available. Run /sf init first.", "warning");
    return;
  }
  const { listSolverEvalRuns } = await import("./sf-db.js");
  const runs = listSolverEvalRuns(limit);
  if (runs.length === 0) {
    ctx.ui.notify(
      "No solver eval runs recorded. Run /sf solver-eval --sample.",
      "info",
    );
    return;
  }
  // One " · "-separated line per run; summary fields fall back to 0 when the
  // stored summary JSON is missing a counter.
  const runLines = runs.map((run) =>
    [
      `- ${run.runId}`,
      `${run.summary?.cases ?? run.casesCount} case(s)`,
      `SF wins ${run.summary?.sfWins ?? 0}`,
      `raw wins ${run.summary?.rawWins ?? 0}`,
      `raw false-complete ${run.summary?.rawFalseCompletes ?? 0}`,
      run.reportPath,
    ].join(" · "),
  );
  ctx.ui.notify(
    ["Autonomous solver eval history", ...runLines].join("\n"),
    "info",
  );
}
/**
 * Notify the operator with one recorded solver eval run and its case rows.
 *
 * Warns when the database is unavailable or the run id is unknown; otherwise
 * prints a header plus one line per (case, mode) result.
 */
async function notifySolverEvalShow(ctx, basePath, runId) {
  const dbReady = await ensureDbOpen(basePath);
  if (!dbReady) {
    ctx.ui.notify("No SF database available. Run /sf init first.", "warning");
    return;
  }
  const dbModule = await import("./sf-db.js");
  const run = dbModule.getSolverEvalRun(runId);
  if (!run) {
    ctx.ui.notify(`No solver eval run found for ${runId}.`, "warning");
    return;
  }
  const caseLines = dbModule.getSolverEvalCaseResults(runId).map((result) => {
    const falseCompleteTag = result.falseComplete ? " false-complete" : "";
    const outcomeTag = result.solverOutcome
      ? ` outcome=${result.solverOutcome}`
      : "";
    return `- ${result.caseId} [${result.mode}] ${result.passed ? "pass" : "fail"}${falseCompleteTag}${outcomeTag}`;
  });
  ctx.ui.notify(
    [
      `Autonomous solver eval: ${run.runId}`,
      `Evidence: ${run.reportPath}`,
      `Summary: ${run.summary?.cases ?? run.casesCount} case(s), SF wins ${run.summary?.sfWins ?? 0}, raw wins ${run.summary?.rawWins ?? 0}`,
      "",
      ...caseLines,
    ].join("\n"),
    "info",
  );
}
/**
* Handle `/sf solver-eval`.
*
@ -484,11 +602,19 @@ export async function handleAutonomousSolverEval(
args = parseAutonomousSolverEvalArgs(rawArgs);
} catch (err) {
ctx.ui.notify(
`Usage: /sf solver-eval [run] [--sample | --cases <jsonl>] [--run-id <id>]\n${err instanceof Error ? err.message : String(err)}`,
`Usage: /sf solver-eval [run|history|show <run-id>] [--sample | --cases <jsonl>] [--run-id <id>] [--limit <n>]\n${err instanceof Error ? err.message : String(err)}`,
"warning",
);
return;
}
if (args.action === "history") {
await notifySolverEvalHistory(ctx, basePath, args.limit);
return;
}
if (args.action === "show") {
await notifySolverEvalShow(ctx, basePath, args.runId);
return;
}
const cases = args.casesPath
? loadAutonomousSolverEvalCases(
isAbsolute(args.casesPath)
@ -496,16 +622,21 @@ export async function handleAutonomousSolverEval(
: join(basePath, args.casesPath),
)
: sampleAutonomousSolverEvalCases();
const report = runAutonomousSolverEval({
let report = runAutonomousSolverEval({
basePath,
cases,
casesPath: args.casesPath,
suiteSource: args.casesPath ?? "sample",
runId: args.runId ?? undefined,
});
const dbRecord = await recordEvalRunBestEffort(basePath, report);
if (dbRecord.ok) report = dbRecord.report;
ctx.ui.notify(
[
"Autonomous solver eval complete",
`Run: ${report.runId}`,
`Evidence: ${report.relativeOutputDir}/report.json`,
`DB recorded: ${report.dbRecorded ? "yes" : `no (${dbRecord.error})`}`,
`Cases: ${report.summary.cases}`,
`SF wins: ${report.summary.sfWins}`,
`Raw wins: ${report.summary.rawWins}`,

View file

@ -78,7 +78,7 @@ function openRawDb(path) {
loadProvider();
return new DatabaseSync(path);
}
const SCHEMA_VERSION = 26;
const SCHEMA_VERSION = 27;
function indexExists(db, name) {
return !!db
.prepare(
@ -140,6 +140,48 @@ function ensureRepoProfileTables(db) {
"CREATE INDEX IF NOT EXISTS idx_repo_file_observations_status ON repo_file_observations(git_status, ownership)",
);
}
// Create the solver-eval evidence tables and their indexes.
// Idempotent (every statement uses IF NOT EXISTS) because it is invoked both
// from initSchema for fresh databases and from the version-27 migration step.
function ensureSolverEvalTables(db) {
// One row per eval run; summary_json stores the report summary as JSON text.
db.exec(`
CREATE TABLE IF NOT EXISTS solver_eval_runs (
run_id TEXT PRIMARY KEY,
suite_source TEXT NOT NULL DEFAULT '',
cases_count INTEGER NOT NULL DEFAULT 0,
summary_json TEXT NOT NULL DEFAULT '{}',
report_path TEXT NOT NULL DEFAULT '',
results_path TEXT NOT NULL DEFAULT '',
db_recorded INTEGER NOT NULL DEFAULT 1,
created_at TEXT NOT NULL,
updated_at TEXT NOT NULL
)
`);
// One row per (run, case, mode); result_json keeps the full raw result so
// future consumers are not limited to the extracted columns.
db.exec(`
CREATE TABLE IF NOT EXISTS solver_eval_case_results (
run_id TEXT NOT NULL,
case_id TEXT NOT NULL,
title TEXT NOT NULL DEFAULT '',
mode TEXT NOT NULL,
passed INTEGER NOT NULL DEFAULT 0,
false_complete INTEGER NOT NULL DEFAULT 0,
duration_ms INTEGER DEFAULT NULL,
command_status INTEGER DEFAULT NULL,
solver_outcome TEXT DEFAULT NULL,
pdd_complete INTEGER DEFAULT NULL,
result_json TEXT NOT NULL DEFAULT '{}',
created_at TEXT NOT NULL,
PRIMARY KEY (run_id, case_id, mode),
FOREIGN KEY (run_id) REFERENCES solver_eval_runs(run_id) ON DELETE CASCADE
)
`);
// Supports listSolverEvalRuns' ORDER BY created_at DESC listing.
db.exec(
"CREATE INDEX IF NOT EXISTS idx_solver_eval_runs_created ON solver_eval_runs(created_at DESC)",
);
// Supports per-run case lookups (getSolverEvalCaseResults).
db.exec(
"CREATE INDEX IF NOT EXISTS idx_solver_eval_case_lookup ON solver_eval_case_results(run_id, case_id)",
);
// Supports querying false-complete outcomes across runs by mode.
db.exec(
"CREATE INDEX IF NOT EXISTS idx_solver_eval_case_false_complete ON solver_eval_case_results(false_complete, mode)",
);
}
function initSchema(db, fileBacked) {
if (fileBacked) db.exec("PRAGMA journal_mode=WAL");
if (fileBacked) db.exec("PRAGMA busy_timeout = 5000");
@ -517,6 +559,7 @@ function initSchema(db, fileBacked) {
updated_at TEXT NOT NULL
)
`);
ensureSolverEvalTables(db);
db.exec(
"CREATE INDEX IF NOT EXISTS idx_memories_active ON memories(superseded_by)",
);
@ -578,6 +621,7 @@ function initSchema(db, fileBacked) {
"CREATE INDEX IF NOT EXISTS idx_uok_runs_session ON uok_runs(session_id, started_at DESC)",
);
ensureRepoProfileTables(db);
ensureSolverEvalTables(db);
db.exec(
`CREATE VIEW IF NOT EXISTS active_decisions AS SELECT * FROM decisions WHERE superseded_by IS NULL`,
);
@ -1475,6 +1519,15 @@ function migrateSchema(db) {
":applied_at": new Date().toISOString(),
});
}
if (currentVersion < 27) {
ensureSolverEvalTables(db);
db.prepare(
"INSERT INTO schema_version (version, applied_at) VALUES (:version, :applied_at)",
).run({
":version": 27,
":applied_at": new Date().toISOString(),
});
}
db.exec("COMMIT");
} catch (err) {
db.exec("ROLLBACK");
@ -3841,6 +3894,181 @@ export function getRepoFileObservations() {
adoptionUnitId: asStringOrNull(row["adoption_unit_id"]),
}));
}
/** Map an arbitrary truthy/falsy value onto the 0/1 integers SQLite stores. */
function intBool(value) {
  return Number(Boolean(value));
}
/**
 * Parse a JSON column value, returning `fallback` when the text is invalid.
 *
 * @param {string} raw - JSON text read from a DB column.
 * @param {object} [fallback={}] - Value returned for invalid or non-object JSON.
 * @returns {object} The parsed object/array, or `fallback`.
 */
function parseJsonObject(raw, fallback = {}) {
  try {
    const parsed = JSON.parse(raw);
    // JSON.parse also accepts scalars ("null", "5", "true"); callers expect
    // an object-shaped value, so treat anything else as corrupt data.
    return parsed !== null && typeof parsed === "object" ? parsed : fallback;
  } catch {
    // Invalid JSON (or undefined raw) degrades to the fallback.
    return fallback;
  }
}
/** Convert a solver_eval_runs row into the camelCase shape handlers consume. */
function solverEvalRunFromRow(row) {
  const summary = parseJsonObject(row["summary_json"], {});
  return {
    runId: row["run_id"],
    suiteSource: row["suite_source"],
    casesCount: row["cases_count"] ?? 0,
    summary,
    reportPath: row["report_path"],
    resultsPath: row["results_path"],
    dbRecorded: row["db_recorded"] === 1,
    createdAt: row["created_at"],
    updatedAt: row["updated_at"],
  };
}
/** Convert a solver_eval_case_results row into the camelCase handler shape. */
function solverEvalCaseFromRow(row) {
  // pdd_complete is tri-state in the DB: NULL (unknown) or 0/1.
  const rawPddComplete = row["pdd_complete"];
  const pddComplete =
    rawPddComplete === null || rawPddComplete === undefined
      ? null
      : rawPddComplete === 1;
  return {
    runId: row["run_id"],
    caseId: row["case_id"],
    title: row["title"],
    mode: row["mode"],
    passed: row["passed"] === 1,
    falseComplete: row["false_complete"] === 1,
    durationMs: row["duration_ms"],
    commandStatus: row["command_status"],
    solverOutcome: asStringOrNull(row["solver_outcome"]),
    pddComplete,
    result: parseJsonObject(row["result_json"], {}),
    createdAt: row["created_at"],
  };
}
/**
 * Persist an autonomous solver eval run and its per-mode case results.
 *
 * Purpose: make solver-loop benchmark evidence queryable by SF commands,
 * harness flows, UOK, and future memory retention instead of leaving it only
 * as JSON files under `.sf/evals`.
 *
 * Consumer: `/sf solver-eval` after each run completes.
 *
 * Idempotent per run id: re-recording the same run upserts the header row
 * and every (case_id, mode) result row.
 *
 * @param {object} report - Eval report: runId, suiteSource, summary, results,
 *   reportPath, resultsPath, createdAt. Missing fields fall back to '' / {} /
 *   the current timestamp.
 * @throws {SFError} SF_STALE_STATE when no database is open.
 */
export function recordSolverEvalRun(report) {
if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open");
const now = new Date().toISOString();
// Header row and all case rows are written atomically in one transaction.
transaction(() => {
// Upsert the run header; a re-record refreshes every column and forces
// db_recorded back to 1.
currentDb
.prepare(`INSERT INTO solver_eval_runs (
run_id, suite_source, cases_count, summary_json, report_path,
results_path, db_recorded, created_at, updated_at
) VALUES (
:run_id, :suite_source, :cases_count, :summary_json, :report_path,
:results_path, 1, :created_at, :updated_at
)
ON CONFLICT(run_id) DO UPDATE SET
suite_source = excluded.suite_source,
cases_count = excluded.cases_count,
summary_json = excluded.summary_json,
report_path = excluded.report_path,
results_path = excluded.results_path,
db_recorded = 1,
updated_at = excluded.updated_at`)
.run({
":run_id": report.runId,
":suite_source": report.suiteSource ?? "",
// Prefer the summary count; fall back to the raw result count, then 0.
":cases_count": report.summary?.cases ?? report.results?.length ?? 0,
":summary_json": JSON.stringify(report.summary ?? {}),
":report_path": report.reportPath ?? "",
":results_path": report.resultsPath ?? "",
":created_at": report.createdAt ?? now,
":updated_at": now,
});
// Prepare the per-case upsert once and reuse it for every result row.
const stmt = currentDb.prepare(`INSERT INTO solver_eval_case_results (
run_id, case_id, title, mode, passed, false_complete, duration_ms,
command_status, solver_outcome, pdd_complete, result_json, created_at
) VALUES (
:run_id, :case_id, :title, :mode, :passed, :false_complete, :duration_ms,
:command_status, :solver_outcome, :pdd_complete, :result_json, :created_at
)
ON CONFLICT(run_id, case_id, mode) DO UPDATE SET
title = excluded.title,
passed = excluded.passed,
false_complete = excluded.false_complete,
duration_ms = excluded.duration_ms,
command_status = excluded.command_status,
solver_outcome = excluded.solver_outcome,
pdd_complete = excluded.pdd_complete,
result_json = excluded.result_json,
created_at = excluded.created_at`);
for (const result of report.results ?? []) {
stmt.run({
":run_id": report.runId,
":case_id": result.caseId,
":title": result.title ?? "",
":mode": result.mode,
// Booleans are stored as 0/1 integers (SQLite has no boolean type).
":passed": intBool(result.passed),
":false_complete": intBool(result.falseComplete),
":duration_ms": result.command?.durationMs ?? null,
":command_status": result.command?.status ?? null,
":solver_outcome": result.solverSignals?.outcome ?? null,
// Tri-state: NULL when the signal is absent, else 0/1.
":pdd_complete":
result.solverSignals?.pddComplete === undefined
? null
: intBool(result.solverSignals.pddComplete),
// Keep the full result payload so future readers are not limited to
// the extracted columns above.
":result_json": JSON.stringify(result),
":created_at": report.createdAt ?? now,
});
}
});
}
/**
 * List recent autonomous solver eval runs, newest first.
 *
 * Purpose: let operators inspect benchmark history without scraping generated
 * report files.
 *
 * Consumer: `/sf solver-eval history`.
 *
 * @param {number} [limit=10] - Requested row count; clamped to [1, 100].
 * @throws {SFError} SF_STALE_STATE when no database is open.
 */
export function listSolverEvalRuns(limit = 10) {
  if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open");
  // Clamp to a sane window; non-numeric input falls back to the default 10.
  const clampedLimit = Math.max(1, Math.min(100, Number(limit) || 10));
  const rows = currentDb
    .prepare(`SELECT run_id, suite_source, cases_count, summary_json,
report_path, results_path, db_recorded, created_at, updated_at
FROM solver_eval_runs
ORDER BY created_at DESC, run_id DESC
LIMIT :limit`)
    .all({ ":limit": clampedLimit });
  return rows.map(solverEvalRunFromRow);
}
/**
 * Read one autonomous solver eval run by id.
 *
 * Purpose: support `/sf solver-eval show <run-id>` and future evidence
 * promotion without parsing JSON artifacts.
 *
 * Consumer: solver eval command handlers.
 *
 * @param {string} runId - Run identifier.
 * @returns {object|null} The run, or null when no row matches.
 * @throws {SFError} SF_STALE_STATE when no database is open.
 */
export function getSolverEvalRun(runId) {
  if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open");
  const row = currentDb
    .prepare(`SELECT run_id, suite_source, cases_count, summary_json,
report_path, results_path, db_recorded, created_at, updated_at
FROM solver_eval_runs
WHERE run_id = :run_id`)
    .get({ ":run_id": runId });
  if (!row) return null;
  return solverEvalRunFromRow(row);
}
/**
 * Read per-case results for one autonomous solver eval run.
 *
 * Purpose: show raw-vs-SF comparisons from DB evidence.
 *
 * Consumer: `/sf solver-eval show <run-id>`.
 *
 * @param {string} runId - Run identifier.
 * @returns {object[]} Case results ordered by case id, then mode.
 * @throws {SFError} SF_STALE_STATE when no database is open.
 */
export function getSolverEvalCaseResults(runId) {
  if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open");
  const rows = currentDb
    .prepare(`SELECT run_id, case_id, title, mode, passed, false_complete,
duration_ms, command_status, solver_outcome, pdd_complete,
result_json, created_at
FROM solver_eval_case_results
WHERE run_id = :run_id
ORDER BY case_id ASC, mode ASC`)
    .all({ ":run_id": runId });
  return rows.map(solverEvalCaseFromRow);
}
/**
* INSERT OR REPLACE a quality_gates row. Used by milestone-validation-gates.ts
* to persist milestone-level (MV*) gate outcomes after validate-milestone runs.

View file

@ -1,13 +1,22 @@
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, expect, test } from "vitest";
import {
handleAutonomousSolverEval,
loadAutonomousSolverEvalCases,
parseAutonomousSolverEvalArgs,
runAutonomousSolverEval,
sampleAutonomousSolverEvalCases,
} from "../autonomous-solver-eval.js";
import {
closeDatabase,
getSolverEvalCaseResults,
getSolverEvalRun,
listSolverEvalRuns,
openDatabase,
recordSolverEvalRun,
} from "../sf-db.js";
let tempDirs = [];
@ -17,7 +26,22 @@ function makeProject() {
return dir;
}
/**
 * Build a fake command context whose notifier appends every notification to
 * the returned `notices` array, so tests can assert on emitted messages.
 */
function makeCtx() {
  const notices = [];
  const ui = {
    notify: (message, level) => {
      notices.push({ message, level });
    },
  };
  return { notices, ctx: { ui } };
}
afterEach(() => {
closeDatabase();
for (const dir of tempDirs) {
rmSync(dir, { recursive: true, force: true });
}
@ -27,16 +51,28 @@ afterEach(() => {
describe("autonomous solver eval", () => {
test("parseAutonomousSolverEvalArgs_defaults_to_sample", () => {
  // A bare "run" falls back to the bundled sample suite.
  const sampleDefaults = {
    action: "run",
    sample: true,
    casesPath: null,
    runId: null,
    limit: 10,
  };
  expect(parseAutonomousSolverEvalArgs("run")).toEqual(sampleDefaults);
  // An explicit cases file disables the sample fallback.
  const explicitOpts = parseAutonomousSolverEvalArgs(
    "--cases cases.jsonl --run-id abc",
  );
  expect(explicitOpts).toEqual({
    action: "run",
    sample: false,
    casesPath: "cases.jsonl",
    runId: "abc",
    limit: 10,
  });
  // history/show subcommands select the matching action.
  expect(parseAutonomousSolverEvalArgs("history --limit 3")).toMatchObject({
    action: "history",
    limit: 3,
  });
  expect(parseAutonomousSolverEvalArgs("show run-1")).toMatchObject({
    action: "show",
    runId: "run-1",
  });
});
@ -82,4 +118,42 @@ describe("autonomous solver eval", () => {
expect(sfResult.solverSignals.hasCheckpoint).toBe(true);
expect(sfResult.solverSignals.pddComplete).toBe(true);
});
test("recordSolverEvalRun_persists_queryable_run_and_case_rows", () => {
  const projectDir = makeProject();
  openDatabase(":memory:");
  // Run the sample suite, then record the resulting report into the DB.
  const report = runAutonomousSolverEval({
    basePath: projectDir,
    runId: "db-run",
    cases: sampleAutonomousSolverEvalCases(),
  });
  recordSolverEvalRun({ ...report, dbRecorded: true });
  // The run header is readable by id and shows up first in the history list.
  const storedRun = getSolverEvalRun("db-run");
  expect(storedRun.runId).toBe("db-run");
  expect(storedRun.summary.sfWins).toBe(1);
  expect(listSolverEvalRuns(5)[0].runId).toBe("db-run");
  // One case row per mode; raw mode false-completes, sf mode reaches PDD.
  const caseRows = getSolverEvalCaseResults("db-run");
  expect(caseRows).toHaveLength(2);
  const byMode = (mode) => caseRows.find((r) => r.mode === mode);
  expect(byMode("raw").falseComplete).toBe(true);
  expect(byMode("sf").pddComplete).toBe(true);
});
test("handleAutonomousSolverEval_records_and_reads_db_history", async () => {
  const projectDir = makeProject();
  // The handler persists into .sf/, so the directory must exist up front.
  mkdirSync(join(projectDir, ".sf"), { recursive: true });
  const { ctx, notices } = makeCtx();
  await handleAutonomousSolverEval(
    "--sample --run-id cmd-run",
    ctx,
    projectDir,
  );
  await handleAutonomousSolverEval("history --limit 1", ctx, projectDir);
  await handleAutonomousSolverEval("show cmd-run", ctx, projectDir);
  // One notification per invocation: run summary, history list, show detail.
  const [runNotice, historyNotice, showNotice] = notices;
  expect(runNotice.message).toContain("DB recorded: yes");
  expect(historyNotice.message).toContain("cmd-run");
  expect(showNotice.message).toContain("Autonomous solver eval: cmd-run");
  expect(showNotice.message).toContain("raw");
  expect(showNotice.message).toContain("sf");
});
});