diff --git a/src/resources/extensions/sf/autonomous-solver-eval.js b/src/resources/extensions/sf/autonomous-solver-eval.js index e2f568841..c852f3c71 100644 --- a/src/resources/extensions/sf/autonomous-solver-eval.js +++ b/src/resources/extensions/sf/autonomous-solver-eval.js @@ -18,6 +18,7 @@ import { } from "node:fs"; import { dirname, isAbsolute, join, relative, resolve } from "node:path"; import { atomicWriteSync } from "./atomic-write.js"; +import { ensureDbOpen } from "./bootstrap/dynamic-tools.js"; import { sfRoot } from "./paths.js"; const DEFAULT_TIMEOUT_MS = 5 * 60_000; @@ -374,6 +375,8 @@ export function runAutonomousSolverEval(options) { const runId = options.runId && RUN_ID_RE.test(options.runId) ? options.runId : nowRunId(); const cases = options.cases ?? sampleAutonomousSolverEvalCases(); + const suiteSource = + options.suiteSource ?? (options.casesPath ? options.casesPath : "sample"); const outputDir = resolveOutputDir(basePath, runId); const workspaceRoot = join(outputDir, "workspaces"); rmSync(outputDir, { recursive: true, force: true }); @@ -404,23 +407,27 @@ export function runAutonomousSolverEval(options) { runId, createdAt: new Date().toISOString(), basePath, + suiteSource, summary: summarizeResults(results), results, + dbRecorded: false, }; - atomicWriteSync( - join(outputDir, "report.json"), - `${JSON.stringify(report, null, 2)}\n`, - ); + const reportPath = join(outputDir, "report.json"); + const resultsPath = join(outputDir, "results.jsonl"); writeFileSync( - join(outputDir, "results.jsonl"), + resultsPath, results.map((result) => JSON.stringify(result)).join("\n") + "\n", "utf-8", ); - return { + const finalReport = { ...report, outputDir, relativeOutputDir: relative(basePath, outputDir), + reportPath: relative(basePath, reportPath), + resultsPath: relative(basePath, resultsPath), }; + atomicWriteSync(reportPath, `${JSON.stringify(finalReport, null, 2)}\n`); + return finalReport; } /** @@ -436,10 +443,42 @@ export function 
parseAutonomousSolverEvalArgs(raw) { .trim() .split(/\s+/) .filter(Boolean); - const opts = { sample: false, casesPath: null, runId: null }; + const opts = { + action: "run", + sample: false, + casesPath: null, + runId: null, + limit: 10, + }; for (let i = 0; i < tokens.length; i += 1) { const token = tokens[i]; - if (token === "run") continue; + if (token === "run") { + opts.action = "run"; + continue; + } + if (token === "history") { + opts.action = "history"; + continue; + } + if (token === "show") { + const value = tokens[i + 1]; + if (!value || !RUN_ID_RE.test(value)) { + throw new Error("show requires a safe run id"); + } + opts.action = "show"; + opts.runId = value; + i += 1; + continue; + } + if (token === "--limit") { + const value = Number(tokens[i + 1]); + if (!Number.isFinite(value) || value < 1) { + throw new Error("--limit requires a positive number"); + } + opts.limit = Math.floor(value); + i += 1; + continue; + } if (token === "--sample") { opts.sample = true; continue; @@ -462,10 +501,89 @@ export function parseAutonomousSolverEvalArgs(raw) { } throw new Error(`unknown solver-eval argument: ${token}`); } - if (!opts.sample && !opts.casesPath) opts.sample = true; + if (opts.action === "run" && !opts.sample && !opts.casesPath) + opts.sample = true; return opts; } +async function recordEvalRunBestEffort(basePath, report) { + try { + if (!(await ensureDbOpen(basePath))) + return { ok: false, error: "db-unavailable" }; + const { recordSolverEvalRun } = await import("./sf-db.js"); + recordSolverEvalRun(report); + const updated = { ...report, dbRecorded: true }; + atomicWriteSync( + join(basePath, updated.reportPath), + `${JSON.stringify(updated, null, 2)}\n`, + ); + return { ok: true, report: updated }; + } catch (err) { + return { + ok: false, + error: err instanceof Error ? 
err.message : String(err), + }; + } +} + +async function notifySolverEvalHistory(ctx, basePath, limit) { + if (!(await ensureDbOpen(basePath))) { + ctx.ui.notify("No SF database available. Run /sf init first.", "warning"); + return; + } + const { listSolverEvalRuns } = await import("./sf-db.js"); + const runs = listSolverEvalRuns(limit); + if (runs.length === 0) { + ctx.ui.notify( + "No solver eval runs recorded. Run /sf solver-eval --sample.", + "info", + ); + return; + } + const lines = ["Autonomous solver eval history"]; + for (const run of runs) { + lines.push( + [ + `- ${run.runId}`, + `${run.summary?.cases ?? run.casesCount} case(s)`, + `SF wins ${run.summary?.sfWins ?? 0}`, + `raw wins ${run.summary?.rawWins ?? 0}`, + `raw false-complete ${run.summary?.rawFalseCompletes ?? 0}`, + run.reportPath, + ].join(" · "), + ); + } + ctx.ui.notify(lines.join("\n"), "info"); +} + +async function notifySolverEvalShow(ctx, basePath, runId) { + if (!(await ensureDbOpen(basePath))) { + ctx.ui.notify("No SF database available. Run /sf init first.", "warning"); + return; + } + const { getSolverEvalCaseResults, getSolverEvalRun } = await import( + "./sf-db.js" + ); + const run = getSolverEvalRun(runId); + if (!run) { + ctx.ui.notify(`No solver eval run found for ${runId}.`, "warning"); + return; + } + const cases = getSolverEvalCaseResults(runId); + const lines = [ + `Autonomous solver eval: ${run.runId}`, + `Evidence: ${run.reportPath}`, + `Summary: ${run.summary?.cases ?? run.casesCount} case(s), SF wins ${run.summary?.sfWins ?? 0}, raw wins ${run.summary?.rawWins ?? 0}`, + "", + ]; + for (const result of cases) { + lines.push( + `- ${result.caseId} [${result.mode}] ${result.passed ? "pass" : "fail"}${result.falseComplete ? " false-complete" : ""}${result.solverOutcome ? ` outcome=${result.solverOutcome}` : ""}`, + ); + } + ctx.ui.notify(lines.join("\n"), "info"); +} + /** * Handle `/sf solver-eval`. 
* @@ -484,11 +602,19 @@ export async function handleAutonomousSolverEval( args = parseAutonomousSolverEvalArgs(rawArgs); } catch (err) { ctx.ui.notify( - `Usage: /sf solver-eval [run] [--sample | --cases ] [--run-id ]\n${err instanceof Error ? err.message : String(err)}`, + `Usage: /sf solver-eval [run|history|show ] [--sample | --cases ] [--run-id ] [--limit ]\n${err instanceof Error ? err.message : String(err)}`, "warning", ); return; } + if (args.action === "history") { + await notifySolverEvalHistory(ctx, basePath, args.limit); + return; + } + if (args.action === "show") { + await notifySolverEvalShow(ctx, basePath, args.runId); + return; + } const cases = args.casesPath ? loadAutonomousSolverEvalCases( isAbsolute(args.casesPath) @@ -496,16 +622,21 @@ export async function handleAutonomousSolverEval( : join(basePath, args.casesPath), ) : sampleAutonomousSolverEvalCases(); - const report = runAutonomousSolverEval({ + let report = runAutonomousSolverEval({ basePath, cases, + casesPath: args.casesPath, + suiteSource: args.casesPath ?? "sample", runId: args.runId ?? undefined, }); + const dbRecord = await recordEvalRunBestEffort(basePath, report); + if (dbRecord.ok) report = dbRecord.report; ctx.ui.notify( [ "Autonomous solver eval complete", `Run: ${report.runId}`, `Evidence: ${report.relativeOutputDir}/report.json`, + `DB recorded: ${report.dbRecorded ? 
"yes" : `no (${dbRecord.error})`}`, `Cases: ${report.summary.cases}`, `SF wins: ${report.summary.sfWins}`, `Raw wins: ${report.summary.rawWins}`, diff --git a/src/resources/extensions/sf/sf-db.js b/src/resources/extensions/sf/sf-db.js index 3fd1f277e..c98653a72 100644 --- a/src/resources/extensions/sf/sf-db.js +++ b/src/resources/extensions/sf/sf-db.js @@ -78,7 +78,7 @@ function openRawDb(path) { loadProvider(); return new DatabaseSync(path); } -const SCHEMA_VERSION = 26; +const SCHEMA_VERSION = 27; function indexExists(db, name) { return !!db .prepare( @@ -140,6 +140,48 @@ function ensureRepoProfileTables(db) { "CREATE INDEX IF NOT EXISTS idx_repo_file_observations_status ON repo_file_observations(git_status, ownership)", ); } +function ensureSolverEvalTables(db) { + db.exec(` + CREATE TABLE IF NOT EXISTS solver_eval_runs ( + run_id TEXT PRIMARY KEY, + suite_source TEXT NOT NULL DEFAULT '', + cases_count INTEGER NOT NULL DEFAULT 0, + summary_json TEXT NOT NULL DEFAULT '{}', + report_path TEXT NOT NULL DEFAULT '', + results_path TEXT NOT NULL DEFAULT '', + db_recorded INTEGER NOT NULL DEFAULT 1, + created_at TEXT NOT NULL, + updated_at TEXT NOT NULL + ) + `); + db.exec(` + CREATE TABLE IF NOT EXISTS solver_eval_case_results ( + run_id TEXT NOT NULL, + case_id TEXT NOT NULL, + title TEXT NOT NULL DEFAULT '', + mode TEXT NOT NULL, + passed INTEGER NOT NULL DEFAULT 0, + false_complete INTEGER NOT NULL DEFAULT 0, + duration_ms INTEGER DEFAULT NULL, + command_status INTEGER DEFAULT NULL, + solver_outcome TEXT DEFAULT NULL, + pdd_complete INTEGER DEFAULT NULL, + result_json TEXT NOT NULL DEFAULT '{}', + created_at TEXT NOT NULL, + PRIMARY KEY (run_id, case_id, mode), + FOREIGN KEY (run_id) REFERENCES solver_eval_runs(run_id) ON DELETE CASCADE + ) + `); + db.exec( + "CREATE INDEX IF NOT EXISTS idx_solver_eval_runs_created ON solver_eval_runs(created_at DESC)", + ); + db.exec( + "CREATE INDEX IF NOT EXISTS idx_solver_eval_case_lookup ON solver_eval_case_results(run_id, 
case_id)", + ); + db.exec( + "CREATE INDEX IF NOT EXISTS idx_solver_eval_case_false_complete ON solver_eval_case_results(false_complete, mode)", + ); +} function initSchema(db, fileBacked) { if (fileBacked) db.exec("PRAGMA journal_mode=WAL"); if (fileBacked) db.exec("PRAGMA busy_timeout = 5000"); @@ -517,6 +559,7 @@ function initSchema(db, fileBacked) { updated_at TEXT NOT NULL ) `); + ensureSolverEvalTables(db); db.exec( "CREATE INDEX IF NOT EXISTS idx_memories_active ON memories(superseded_by)", ); @@ -578,6 +621,7 @@ function initSchema(db, fileBacked) { "CREATE INDEX IF NOT EXISTS idx_uok_runs_session ON uok_runs(session_id, started_at DESC)", ); ensureRepoProfileTables(db); + ensureSolverEvalTables(db); db.exec( `CREATE VIEW IF NOT EXISTS active_decisions AS SELECT * FROM decisions WHERE superseded_by IS NULL`, ); @@ -1475,6 +1519,15 @@ function migrateSchema(db) { ":applied_at": new Date().toISOString(), }); } + if (currentVersion < 27) { + ensureSolverEvalTables(db); + db.prepare( + "INSERT INTO schema_version (version, applied_at) VALUES (:version, :applied_at)", + ).run({ + ":version": 27, + ":applied_at": new Date().toISOString(), + }); + } db.exec("COMMIT"); } catch (err) { db.exec("ROLLBACK"); @@ -3841,6 +3894,181 @@ export function getRepoFileObservations() { adoptionUnitId: asStringOrNull(row["adoption_unit_id"]), })); } +function intBool(value) { + return value ? 1 : 0; +} +function parseJsonObject(raw, fallback = {}) { + try { + return JSON.parse(raw); + } catch { + return fallback; + } +} +function solverEvalRunFromRow(row) { + return { + runId: row["run_id"], + suiteSource: row["suite_source"], + casesCount: row["cases_count"] ?? 
0, + summary: parseJsonObject(row["summary_json"], {}), + reportPath: row["report_path"], + resultsPath: row["results_path"], + dbRecorded: row["db_recorded"] === 1, + createdAt: row["created_at"], + updatedAt: row["updated_at"], + }; +} +function solverEvalCaseFromRow(row) { + return { + runId: row["run_id"], + caseId: row["case_id"], + title: row["title"], + mode: row["mode"], + passed: row["passed"] === 1, + falseComplete: row["false_complete"] === 1, + durationMs: row["duration_ms"], + commandStatus: row["command_status"], + solverOutcome: asStringOrNull(row["solver_outcome"]), + pddComplete: + row["pdd_complete"] === null || row["pdd_complete"] === undefined + ? null + : row["pdd_complete"] === 1, + result: parseJsonObject(row["result_json"], {}), + createdAt: row["created_at"], + }; +} +/** + * Persist an autonomous solver eval run and its per-mode case results. + * + * Purpose: make solver-loop benchmark evidence queryable by SF commands, + * harness flows, UOK, and future memory retention instead of leaving it only + * as JSON files under `.sf/evals`. + * + * Consumer: `/sf solver-eval` after each run completes. + */ +export function recordSolverEvalRun(report) { + if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open"); + const now = new Date().toISOString(); + transaction(() => { + currentDb + .prepare(`INSERT INTO solver_eval_runs ( + run_id, suite_source, cases_count, summary_json, report_path, + results_path, db_recorded, created_at, updated_at + ) VALUES ( + :run_id, :suite_source, :cases_count, :summary_json, :report_path, + :results_path, 1, :created_at, :updated_at + ) + ON CONFLICT(run_id) DO UPDATE SET + suite_source = excluded.suite_source, + cases_count = excluded.cases_count, + summary_json = excluded.summary_json, + report_path = excluded.report_path, + results_path = excluded.results_path, + db_recorded = 1, + updated_at = excluded.updated_at`) + .run({ + ":run_id": report.runId, + ":suite_source": report.suiteSource ?? 
"", + ":cases_count": report.summary?.cases ?? report.results?.length ?? 0, + ":summary_json": JSON.stringify(report.summary ?? {}), + ":report_path": report.reportPath ?? "", + ":results_path": report.resultsPath ?? "", + ":created_at": report.createdAt ?? now, + ":updated_at": now, + }); + const stmt = currentDb.prepare(`INSERT INTO solver_eval_case_results ( + run_id, case_id, title, mode, passed, false_complete, duration_ms, + command_status, solver_outcome, pdd_complete, result_json, created_at + ) VALUES ( + :run_id, :case_id, :title, :mode, :passed, :false_complete, :duration_ms, + :command_status, :solver_outcome, :pdd_complete, :result_json, :created_at + ) + ON CONFLICT(run_id, case_id, mode) DO UPDATE SET + title = excluded.title, + passed = excluded.passed, + false_complete = excluded.false_complete, + duration_ms = excluded.duration_ms, + command_status = excluded.command_status, + solver_outcome = excluded.solver_outcome, + pdd_complete = excluded.pdd_complete, + result_json = excluded.result_json, + created_at = excluded.created_at`); + for (const result of report.results ?? []) { + stmt.run({ + ":run_id": report.runId, + ":case_id": result.caseId, + ":title": result.title ?? "", + ":mode": result.mode, + ":passed": intBool(result.passed), + ":false_complete": intBool(result.falseComplete), + ":duration_ms": result.command?.durationMs ?? null, + ":command_status": result.command?.status ?? null, + ":solver_outcome": result.solverSignals?.outcome ?? null, + ":pdd_complete": + result.solverSignals?.pddComplete === undefined + ? null + : intBool(result.solverSignals.pddComplete), + ":result_json": JSON.stringify(result), + ":created_at": report.createdAt ?? now, + }); + } + }); +} +/** + * List recent autonomous solver eval runs. + * + * Purpose: let operators inspect benchmark history without scraping generated + * report files. + * + * Consumer: `/sf solver-eval history`. 
+ */ +export function listSolverEvalRuns(limit = 10) { + if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open"); + return currentDb + .prepare(`SELECT run_id, suite_source, cases_count, summary_json, + report_path, results_path, db_recorded, created_at, updated_at + FROM solver_eval_runs + ORDER BY created_at DESC, run_id DESC + LIMIT :limit`) + .all({ ":limit": Math.max(1, Math.min(100, Number(limit) || 10)) }) + .map(solverEvalRunFromRow); +} +/** + * Read one autonomous solver eval run by id. + * + * Purpose: support `/sf solver-eval show ` and future evidence + * promotion without parsing JSON artifacts. + * + * Consumer: solver eval command handlers. + */ +export function getSolverEvalRun(runId) { + if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open"); + const row = currentDb + .prepare(`SELECT run_id, suite_source, cases_count, summary_json, + report_path, results_path, db_recorded, created_at, updated_at + FROM solver_eval_runs + WHERE run_id = :run_id`) + .get({ ":run_id": runId }); + return row ? solverEvalRunFromRow(row) : null; +} +/** + * Read per-case results for one autonomous solver eval run. + * + * Purpose: show raw-vs-SF comparisons from DB evidence. + * + * Consumer: `/sf solver-eval show `. + */ +export function getSolverEvalCaseResults(runId) { + if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open"); + return currentDb + .prepare(`SELECT run_id, case_id, title, mode, passed, false_complete, + duration_ms, command_status, solver_outcome, pdd_complete, + result_json, created_at + FROM solver_eval_case_results + WHERE run_id = :run_id + ORDER BY case_id ASC, mode ASC`) + .all({ ":run_id": runId }) + .map(solverEvalCaseFromRow); +} /** * INSERT OR REPLACE a quality_gates row. Used by milestone-validation-gates.ts * to persist milestone-level (MV*) gate outcomes after validate-milestone runs. 
diff --git a/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs b/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs index c605daf22..5c94959b0 100644 --- a/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs +++ b/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs @@ -1,13 +1,22 @@ -import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import { join } from "node:path"; import { afterEach, describe, expect, test } from "vitest"; import { + handleAutonomousSolverEval, loadAutonomousSolverEvalCases, parseAutonomousSolverEvalArgs, runAutonomousSolverEval, sampleAutonomousSolverEvalCases, } from "../autonomous-solver-eval.js"; +import { + closeDatabase, + getSolverEvalCaseResults, + getSolverEvalRun, + listSolverEvalRuns, + openDatabase, + recordSolverEvalRun, +} from "../sf-db.js"; let tempDirs = []; @@ -17,7 +26,22 @@ function makeProject() { return dir; } +function makeCtx() { + const notices = []; + return { + notices, + ctx: { + ui: { + notify(message, level) { + notices.push({ message, level }); + }, + }, + }, + }; +} + afterEach(() => { + closeDatabase(); for (const dir of tempDirs) { rmSync(dir, { recursive: true, force: true }); } @@ -27,16 +51,28 @@ afterEach(() => { describe("autonomous solver eval", () => { test("parseAutonomousSolverEvalArgs_defaults_to_sample", () => { expect(parseAutonomousSolverEvalArgs("run")).toEqual({ + action: "run", sample: true, casesPath: null, runId: null, + limit: 10, }); expect( parseAutonomousSolverEvalArgs("--cases cases.jsonl --run-id abc"), ).toEqual({ + action: "run", sample: false, casesPath: "cases.jsonl", runId: "abc", + limit: 10, + }); + expect(parseAutonomousSolverEvalArgs("history --limit 3")).toMatchObject({ + action: "history", + limit: 3, + }); + expect(parseAutonomousSolverEvalArgs("show run-1")).toMatchObject({ + action: "show", + runId: 
"run-1", }); }); @@ -82,4 +118,42 @@ describe("autonomous solver eval", () => { expect(sfResult.solverSignals.hasCheckpoint).toBe(true); expect(sfResult.solverSignals.pddComplete).toBe(true); }); + + test("recordSolverEvalRun_persists_queryable_run_and_case_rows", () => { + const project = makeProject(); + openDatabase(":memory:"); + const report = runAutonomousSolverEval({ + basePath: project, + runId: "db-run", + cases: sampleAutonomousSolverEvalCases(), + }); + + recordSolverEvalRun({ ...report, dbRecorded: true }); + + const run = getSolverEvalRun("db-run"); + const runs = listSolverEvalRuns(5); + const cases = getSolverEvalCaseResults("db-run"); + expect(run.runId).toBe("db-run"); + expect(run.summary.sfWins).toBe(1); + expect(runs[0].runId).toBe("db-run"); + expect(cases).toHaveLength(2); + expect(cases.find((r) => r.mode === "raw").falseComplete).toBe(true); + expect(cases.find((r) => r.mode === "sf").pddComplete).toBe(true); + }); + + test("handleAutonomousSolverEval_records_and_reads_db_history", async () => { + const project = makeProject(); + mkdirSync(join(project, ".sf"), { recursive: true }); + const { ctx, notices } = makeCtx(); + + await handleAutonomousSolverEval("--sample --run-id cmd-run", ctx, project); + await handleAutonomousSolverEval("history --limit 1", ctx, project); + await handleAutonomousSolverEval("show cmd-run", ctx, project); + + expect(notices[0].message).toContain("DB recorded: yes"); + expect(notices[1].message).toContain("cmd-run"); + expect(notices[2].message).toContain("Autonomous solver eval: cmd-run"); + expect(notices[2].message).toContain("raw"); + expect(notices[2].message).toContain("sf"); + }); });