feat: persist solver eval evidence in db
This commit is contained in:
parent
dc51baa19a
commit
7a13dd82b1
3 changed files with 446 additions and 13 deletions
|
|
@ -18,6 +18,7 @@ import {
|
|||
} from "node:fs";
|
||||
import { dirname, isAbsolute, join, relative, resolve } from "node:path";
|
||||
import { atomicWriteSync } from "./atomic-write.js";
|
||||
import { ensureDbOpen } from "./bootstrap/dynamic-tools.js";
|
||||
import { sfRoot } from "./paths.js";
|
||||
|
||||
const DEFAULT_TIMEOUT_MS = 5 * 60_000;
|
||||
|
|
@ -374,6 +375,8 @@ export function runAutonomousSolverEval(options) {
|
|||
const runId =
|
||||
options.runId && RUN_ID_RE.test(options.runId) ? options.runId : nowRunId();
|
||||
const cases = options.cases ?? sampleAutonomousSolverEvalCases();
|
||||
const suiteSource =
|
||||
options.suiteSource ?? (options.casesPath ? options.casesPath : "sample");
|
||||
const outputDir = resolveOutputDir(basePath, runId);
|
||||
const workspaceRoot = join(outputDir, "workspaces");
|
||||
rmSync(outputDir, { recursive: true, force: true });
|
||||
|
|
@ -404,23 +407,27 @@ export function runAutonomousSolverEval(options) {
|
|||
runId,
|
||||
createdAt: new Date().toISOString(),
|
||||
basePath,
|
||||
suiteSource,
|
||||
summary: summarizeResults(results),
|
||||
results,
|
||||
dbRecorded: false,
|
||||
};
|
||||
atomicWriteSync(
|
||||
join(outputDir, "report.json"),
|
||||
`${JSON.stringify(report, null, 2)}\n`,
|
||||
);
|
||||
const reportPath = join(outputDir, "report.json");
|
||||
const resultsPath = join(outputDir, "results.jsonl");
|
||||
writeFileSync(
|
||||
join(outputDir, "results.jsonl"),
|
||||
resultsPath,
|
||||
results.map((result) => JSON.stringify(result)).join("\n") + "\n",
|
||||
"utf-8",
|
||||
);
|
||||
return {
|
||||
const finalReport = {
|
||||
...report,
|
||||
outputDir,
|
||||
relativeOutputDir: relative(basePath, outputDir),
|
||||
reportPath: relative(basePath, reportPath),
|
||||
resultsPath: relative(basePath, resultsPath),
|
||||
};
|
||||
atomicWriteSync(reportPath, `${JSON.stringify(finalReport, null, 2)}\n`);
|
||||
return finalReport;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -436,10 +443,42 @@ export function parseAutonomousSolverEvalArgs(raw) {
|
|||
.trim()
|
||||
.split(/\s+/)
|
||||
.filter(Boolean);
|
||||
const opts = { sample: false, casesPath: null, runId: null };
|
||||
const opts = {
|
||||
action: "run",
|
||||
sample: false,
|
||||
casesPath: null,
|
||||
runId: null,
|
||||
limit: 10,
|
||||
};
|
||||
for (let i = 0; i < tokens.length; i += 1) {
|
||||
const token = tokens[i];
|
||||
if (token === "run") continue;
|
||||
if (token === "run") {
|
||||
opts.action = "run";
|
||||
continue;
|
||||
}
|
||||
if (token === "history") {
|
||||
opts.action = "history";
|
||||
continue;
|
||||
}
|
||||
if (token === "show") {
|
||||
const value = tokens[i + 1];
|
||||
if (!value || !RUN_ID_RE.test(value)) {
|
||||
throw new Error("show requires a safe run id");
|
||||
}
|
||||
opts.action = "show";
|
||||
opts.runId = value;
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
if (token === "--limit") {
|
||||
const value = Number(tokens[i + 1]);
|
||||
if (!Number.isFinite(value) || value < 1) {
|
||||
throw new Error("--limit requires a positive number");
|
||||
}
|
||||
opts.limit = Math.floor(value);
|
||||
i += 1;
|
||||
continue;
|
||||
}
|
||||
if (token === "--sample") {
|
||||
opts.sample = true;
|
||||
continue;
|
||||
|
|
@ -462,10 +501,89 @@ export function parseAutonomousSolverEvalArgs(raw) {
|
|||
}
|
||||
throw new Error(`unknown solver-eval argument: ${token}`);
|
||||
}
|
||||
if (!opts.sample && !opts.casesPath) opts.sample = true;
|
||||
if (opts.action === "run" && !opts.sample && !opts.casesPath)
|
||||
opts.sample = true;
|
||||
return opts;
|
||||
}
|
||||
|
||||
/**
 * Best-effort persistence of a solver eval report into the SF database.
 *
 * Never throws: every failure path collapses to `{ ok: false, error }` so the
 * caller can still surface the filesystem evidence even when the DB is down.
 * On success the on-disk report is rewritten with `dbRecorded: true` and the
 * updated report is returned as `{ ok: true, report }`.
 */
async function recordEvalRunBestEffort(basePath, report) {
  try {
    const dbReady = await ensureDbOpen(basePath);
    if (!dbReady) {
      return { ok: false, error: "db-unavailable" };
    }
    const sfDb = await import("./sf-db.js");
    sfDb.recordSolverEvalRun(report);
    // Mirror the successful DB insert back into the JSON evidence file so the
    // artifact on disk agrees with the database.
    const recorded = { ...report, dbRecorded: true };
    const reportFile = join(basePath, recorded.reportPath);
    atomicWriteSync(reportFile, `${JSON.stringify(recorded, null, 2)}\n`);
    return { ok: true, report: recorded };
  } catch (err) {
    const message = err instanceof Error ? err.message : String(err);
    return { ok: false, error: message };
  }
}
|
||||
|
||||
/**
 * Handle `/sf solver-eval history`: notify the UI with a one-line-per-run
 * summary of recent solver eval runs pulled from the SF database.
 */
async function notifySolverEvalHistory(ctx, basePath, limit) {
  const dbReady = await ensureDbOpen(basePath);
  if (!dbReady) {
    ctx.ui.notify("No SF database available. Run /sf init first.", "warning");
    return;
  }
  const { listSolverEvalRuns } = await import("./sf-db.js");
  const runs = listSolverEvalRuns(limit);
  if (runs.length === 0) {
    ctx.ui.notify(
      "No solver eval runs recorded. Run /sf solver-eval --sample.",
      "info",
    );
    return;
  }
  // One " · "-separated summary row per run; fall back to the stored
  // cases_count when the summary JSON is missing a cases field.
  const rows = runs.map((run) =>
    [
      `- ${run.runId}`,
      `${run.summary?.cases ?? run.casesCount} case(s)`,
      `SF wins ${run.summary?.sfWins ?? 0}`,
      `raw wins ${run.summary?.rawWins ?? 0}`,
      `raw false-complete ${run.summary?.rawFalseCompletes ?? 0}`,
      run.reportPath,
    ].join(" · "),
  );
  ctx.ui.notify(
    ["Autonomous solver eval history", ...rows].join("\n"),
    "info",
  );
}
|
||||
|
||||
/**
 * Handle `/sf solver-eval show <run-id>`: notify the UI with one run's
 * summary plus a per-case raw-vs-SF breakdown read from the SF database.
 */
async function notifySolverEvalShow(ctx, basePath, runId) {
  if (!(await ensureDbOpen(basePath))) {
    ctx.ui.notify("No SF database available. Run /sf init first.", "warning");
    return;
  }
  const sfDb = await import("./sf-db.js");
  const run = sfDb.getSolverEvalRun(runId);
  if (!run) {
    ctx.ui.notify(`No solver eval run found for ${runId}.`, "warning");
    return;
  }
  // Render one case per line; optional tags only appear when set.
  const describeCase = (result) => {
    const status = result.passed ? "pass" : "fail";
    const falseTag = result.falseComplete ? " false-complete" : "";
    const outcomeTag = result.solverOutcome
      ? ` outcome=${result.solverOutcome}`
      : "";
    return `- ${result.caseId} [${result.mode}] ${status}${falseTag}${outcomeTag}`;
  };
  const caseLines = sfDb.getSolverEvalCaseResults(runId).map(describeCase);
  const header = [
    `Autonomous solver eval: ${run.runId}`,
    `Evidence: ${run.reportPath}`,
    `Summary: ${run.summary?.cases ?? run.casesCount} case(s), SF wins ${run.summary?.sfWins ?? 0}, raw wins ${run.summary?.rawWins ?? 0}`,
    "",
  ];
  ctx.ui.notify([...header, ...caseLines].join("\n"), "info");
}
|
||||
|
||||
/**
|
||||
* Handle `/sf solver-eval`.
|
||||
*
|
||||
|
|
@ -484,11 +602,19 @@ export async function handleAutonomousSolverEval(
|
|||
args = parseAutonomousSolverEvalArgs(rawArgs);
|
||||
} catch (err) {
|
||||
ctx.ui.notify(
|
||||
`Usage: /sf solver-eval [run] [--sample | --cases <jsonl>] [--run-id <id>]\n${err instanceof Error ? err.message : String(err)}`,
|
||||
`Usage: /sf solver-eval [run|history|show <run-id>] [--sample | --cases <jsonl>] [--run-id <id>] [--limit <n>]\n${err instanceof Error ? err.message : String(err)}`,
|
||||
"warning",
|
||||
);
|
||||
return;
|
||||
}
|
||||
if (args.action === "history") {
|
||||
await notifySolverEvalHistory(ctx, basePath, args.limit);
|
||||
return;
|
||||
}
|
||||
if (args.action === "show") {
|
||||
await notifySolverEvalShow(ctx, basePath, args.runId);
|
||||
return;
|
||||
}
|
||||
const cases = args.casesPath
|
||||
? loadAutonomousSolverEvalCases(
|
||||
isAbsolute(args.casesPath)
|
||||
|
|
@ -496,16 +622,21 @@ export async function handleAutonomousSolverEval(
|
|||
: join(basePath, args.casesPath),
|
||||
)
|
||||
: sampleAutonomousSolverEvalCases();
|
||||
const report = runAutonomousSolverEval({
|
||||
let report = runAutonomousSolverEval({
|
||||
basePath,
|
||||
cases,
|
||||
casesPath: args.casesPath,
|
||||
suiteSource: args.casesPath ?? "sample",
|
||||
runId: args.runId ?? undefined,
|
||||
});
|
||||
const dbRecord = await recordEvalRunBestEffort(basePath, report);
|
||||
if (dbRecord.ok) report = dbRecord.report;
|
||||
ctx.ui.notify(
|
||||
[
|
||||
"Autonomous solver eval complete",
|
||||
`Run: ${report.runId}`,
|
||||
`Evidence: ${report.relativeOutputDir}/report.json`,
|
||||
`DB recorded: ${report.dbRecorded ? "yes" : `no (${dbRecord.error})`}`,
|
||||
`Cases: ${report.summary.cases}`,
|
||||
`SF wins: ${report.summary.sfWins}`,
|
||||
`Raw wins: ${report.summary.rawWins}`,
|
||||
|
|
|
|||
|
|
@ -78,7 +78,7 @@ function openRawDb(path) {
|
|||
loadProvider();
|
||||
return new DatabaseSync(path);
|
||||
}
|
||||
const SCHEMA_VERSION = 26;
|
||||
const SCHEMA_VERSION = 27;
|
||||
function indexExists(db, name) {
|
||||
return !!db
|
||||
.prepare(
|
||||
|
|
@ -140,6 +140,48 @@ function ensureRepoProfileTables(db) {
|
|||
"CREATE INDEX IF NOT EXISTS idx_repo_file_observations_status ON repo_file_observations(git_status, ownership)",
|
||||
);
|
||||
}
|
||||
/**
 * Create the solver eval evidence tables and their indexes.
 *
 * All statements are IF NOT EXISTS, so this is safe to run on both fresh
 * databases (initSchema) and existing ones (migration to schema v27).
 */
function ensureSolverEvalTables(db) {
  const ddl = [
    `
    CREATE TABLE IF NOT EXISTS solver_eval_runs (
      run_id TEXT PRIMARY KEY,
      suite_source TEXT NOT NULL DEFAULT '',
      cases_count INTEGER NOT NULL DEFAULT 0,
      summary_json TEXT NOT NULL DEFAULT '{}',
      report_path TEXT NOT NULL DEFAULT '',
      results_path TEXT NOT NULL DEFAULT '',
      db_recorded INTEGER NOT NULL DEFAULT 1,
      created_at TEXT NOT NULL,
      updated_at TEXT NOT NULL
    )
  `,
    `
    CREATE TABLE IF NOT EXISTS solver_eval_case_results (
      run_id TEXT NOT NULL,
      case_id TEXT NOT NULL,
      title TEXT NOT NULL DEFAULT '',
      mode TEXT NOT NULL,
      passed INTEGER NOT NULL DEFAULT 0,
      false_complete INTEGER NOT NULL DEFAULT 0,
      duration_ms INTEGER DEFAULT NULL,
      command_status INTEGER DEFAULT NULL,
      solver_outcome TEXT DEFAULT NULL,
      pdd_complete INTEGER DEFAULT NULL,
      result_json TEXT NOT NULL DEFAULT '{}',
      created_at TEXT NOT NULL,
      PRIMARY KEY (run_id, case_id, mode),
      FOREIGN KEY (run_id) REFERENCES solver_eval_runs(run_id) ON DELETE CASCADE
    )
  `,
    "CREATE INDEX IF NOT EXISTS idx_solver_eval_runs_created ON solver_eval_runs(created_at DESC)",
    "CREATE INDEX IF NOT EXISTS idx_solver_eval_case_lookup ON solver_eval_case_results(run_id, case_id)",
    "CREATE INDEX IF NOT EXISTS idx_solver_eval_case_false_complete ON solver_eval_case_results(false_complete, mode)",
  ];
  for (const sql of ddl) {
    db.exec(sql);
  }
}
|
||||
function initSchema(db, fileBacked) {
|
||||
if (fileBacked) db.exec("PRAGMA journal_mode=WAL");
|
||||
if (fileBacked) db.exec("PRAGMA busy_timeout = 5000");
|
||||
|
|
@ -517,6 +559,7 @@ function initSchema(db, fileBacked) {
|
|||
updated_at TEXT NOT NULL
|
||||
)
|
||||
`);
|
||||
ensureSolverEvalTables(db);
|
||||
db.exec(
|
||||
"CREATE INDEX IF NOT EXISTS idx_memories_active ON memories(superseded_by)",
|
||||
);
|
||||
|
|
@ -578,6 +621,7 @@ function initSchema(db, fileBacked) {
|
|||
"CREATE INDEX IF NOT EXISTS idx_uok_runs_session ON uok_runs(session_id, started_at DESC)",
|
||||
);
|
||||
ensureRepoProfileTables(db);
|
||||
ensureSolverEvalTables(db);
|
||||
db.exec(
|
||||
`CREATE VIEW IF NOT EXISTS active_decisions AS SELECT * FROM decisions WHERE superseded_by IS NULL`,
|
||||
);
|
||||
|
|
@ -1475,6 +1519,15 @@ function migrateSchema(db) {
|
|||
":applied_at": new Date().toISOString(),
|
||||
});
|
||||
}
|
||||
if (currentVersion < 27) {
|
||||
ensureSolverEvalTables(db);
|
||||
db.prepare(
|
||||
"INSERT INTO schema_version (version, applied_at) VALUES (:version, :applied_at)",
|
||||
).run({
|
||||
":version": 27,
|
||||
":applied_at": new Date().toISOString(),
|
||||
});
|
||||
}
|
||||
db.exec("COMMIT");
|
||||
} catch (err) {
|
||||
db.exec("ROLLBACK");
|
||||
|
|
@ -3841,6 +3894,181 @@ export function getRepoFileObservations() {
|
|||
adoptionUnitId: asStringOrNull(row["adoption_unit_id"]),
|
||||
}));
|
||||
}
|
||||
/** Coerce any truthy/falsy value to the SQLite-friendly integers 1/0. */
function intBool(value) {
  if (value) {
    return 1;
  }
  return 0;
}
|
||||
/**
 * Parse a JSON string that is expected to hold an object.
 *
 * Returns `fallback` both when `raw` is not valid JSON and when it parses to
 * a non-object value (`null`, a number, a string, a boolean). Callers feed
 * the result to optional-chained property reads (`summary?.cases`), so a
 * bare `JSON.parse` that could return `null` or a primitive would leak
 * surprising values out of `solverEvalRunFromRow`/`solverEvalCaseFromRow`.
 *
 * @param {string} raw - JSON text, typically a *_json column value.
 * @param {object} [fallback={}] - value returned on parse failure.
 * @returns {object}
 */
function parseJsonObject(raw, fallback = {}) {
  try {
    const parsed = JSON.parse(raw);
    // JSON.parse happily yields null/primitives; only accept real objects.
    if (parsed !== null && typeof parsed === "object") {
      return parsed;
    }
    return fallback;
  } catch {
    return fallback;
  }
}
|
||||
/** Map one `solver_eval_runs` row to the camelCase run shape used by callers. */
function solverEvalRunFromRow(row) {
  const summary = parseJsonObject(row["summary_json"], {});
  return {
    runId: row["run_id"],
    suiteSource: row["suite_source"],
    casesCount: row["cases_count"] ?? 0,
    summary,
    reportPath: row["report_path"],
    resultsPath: row["results_path"],
    // db_recorded is stored as an INTEGER 0/1 flag.
    dbRecorded: row["db_recorded"] === 1,
    createdAt: row["created_at"],
    updatedAt: row["updated_at"],
  };
}
|
||||
/** Map one `solver_eval_case_results` row to the camelCase case shape. */
function solverEvalCaseFromRow(row) {
  const rawPdd = row["pdd_complete"];
  // Three-state flag: SQL NULL (or absent column) means "unknown"; otherwise
  // the stored 0/1 integer becomes a boolean.
  const pddComplete = rawPdd == null ? null : rawPdd === 1;
  return {
    runId: row["run_id"],
    caseId: row["case_id"],
    title: row["title"],
    mode: row["mode"],
    passed: row["passed"] === 1,
    falseComplete: row["false_complete"] === 1,
    durationMs: row["duration_ms"],
    commandStatus: row["command_status"],
    solverOutcome: asStringOrNull(row["solver_outcome"]),
    pddComplete,
    result: parseJsonObject(row["result_json"], {}),
    createdAt: row["created_at"],
  };
}
|
||||
/**
 * Persist an autonomous solver eval run and its per-mode case results.
 *
 * Purpose: make solver-loop benchmark evidence queryable by SF commands,
 * harness flows, UOK, and future memory retention instead of leaving it only
 * as JSON files under `.sf/evals`.
 *
 * Consumer: `/sf solver-eval` after each run completes.
 *
 * @param {object} report - run report (`runId`, `suiteSource`, `summary`,
 *   `results`, `reportPath`, `resultsPath`, `createdAt`).
 * @throws {SFError} SF_STALE_STATE when no database is open.
 */
export function recordSolverEvalRun(report) {
  if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open");
  const now = new Date().toISOString();
  // Run header + all case rows committed atomically: re-recording the same
  // runId upserts rather than duplicating.
  transaction(() => {
    // Upsert the run header. Note: created_at is intentionally NOT in the
    // DO UPDATE list, so the original recording time survives re-runs;
    // updated_at always moves forward.
    currentDb
      .prepare(`INSERT INTO solver_eval_runs (
        run_id, suite_source, cases_count, summary_json, report_path,
        results_path, db_recorded, created_at, updated_at
      ) VALUES (
        :run_id, :suite_source, :cases_count, :summary_json, :report_path,
        :results_path, 1, :created_at, :updated_at
      )
      ON CONFLICT(run_id) DO UPDATE SET
        suite_source = excluded.suite_source,
        cases_count = excluded.cases_count,
        summary_json = excluded.summary_json,
        report_path = excluded.report_path,
        results_path = excluded.results_path,
        db_recorded = 1,
        updated_at = excluded.updated_at`)
      .run({
        ":run_id": report.runId,
        ":suite_source": report.suiteSource ?? "",
        // Prefer the summary's case count; fall back to counting results.
        ":cases_count": report.summary?.cases ?? report.results?.length ?? 0,
        ":summary_json": JSON.stringify(report.summary ?? {}),
        ":report_path": report.reportPath ?? "",
        ":results_path": report.resultsPath ?? "",
        ":created_at": report.createdAt ?? now,
        ":updated_at": now,
      });
    // One prepared statement reused for every case row; keyed by
    // (run_id, case_id, mode) so raw and sf results for a case coexist.
    const stmt = currentDb.prepare(`INSERT INTO solver_eval_case_results (
      run_id, case_id, title, mode, passed, false_complete, duration_ms,
      command_status, solver_outcome, pdd_complete, result_json, created_at
    ) VALUES (
      :run_id, :case_id, :title, :mode, :passed, :false_complete, :duration_ms,
      :command_status, :solver_outcome, :pdd_complete, :result_json, :created_at
    )
    ON CONFLICT(run_id, case_id, mode) DO UPDATE SET
      title = excluded.title,
      passed = excluded.passed,
      false_complete = excluded.false_complete,
      duration_ms = excluded.duration_ms,
      command_status = excluded.command_status,
      solver_outcome = excluded.solver_outcome,
      pdd_complete = excluded.pdd_complete,
      result_json = excluded.result_json,
      created_at = excluded.created_at`);
    for (const result of report.results ?? []) {
      stmt.run({
        ":run_id": report.runId,
        ":case_id": result.caseId,
        ":title": result.title ?? "",
        ":mode": result.mode,
        ":passed": intBool(result.passed),
        ":false_complete": intBool(result.falseComplete),
        ":duration_ms": result.command?.durationMs ?? null,
        ":command_status": result.command?.status ?? null,
        ":solver_outcome": result.solverSignals?.outcome ?? null,
        // Three-state: undefined stays NULL ("unknown") rather than false.
        ":pdd_complete":
          result.solverSignals?.pddComplete === undefined
            ? null
            : intBool(result.solverSignals.pddComplete),
        // Full result payload is kept verbatim for future consumers.
        ":result_json": JSON.stringify(result),
        ":created_at": report.createdAt ?? now,
      });
    }
  });
}
|
||||
/**
 * List recent autonomous solver eval runs.
 *
 * Purpose: let operators inspect benchmark history without scraping generated
 * report files.
 *
 * Consumer: `/sf solver-eval history`.
 *
 * @param {number} [limit=10] - page size, clamped to [1, 100].
 * @throws {SFError} SF_STALE_STATE when no database is open.
 */
export function listSolverEvalRuns(limit = 10) {
  if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open");
  // Non-numeric input falls back to the default page size before clamping.
  const requested = Number(limit) || 10;
  const pageSize = Math.max(1, Math.min(100, requested));
  const stmt = currentDb
    .prepare(`SELECT run_id, suite_source, cases_count, summary_json,
              report_path, results_path, db_recorded, created_at, updated_at
              FROM solver_eval_runs
              ORDER BY created_at DESC, run_id DESC
              LIMIT :limit`);
  const rows = stmt.all({ ":limit": pageSize });
  return rows.map(solverEvalRunFromRow);
}
|
||||
/**
 * Read one autonomous solver eval run by id.
 *
 * Purpose: support `/sf solver-eval show <run-id>` and future evidence
 * promotion without parsing JSON artifacts.
 *
 * Consumer: solver eval command handlers.
 *
 * @param {string} runId
 * @returns {object|null} the run, or null when no row matches.
 * @throws {SFError} SF_STALE_STATE when no database is open.
 */
export function getSolverEvalRun(runId) {
  if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open");
  const stmt = currentDb
    .prepare(`SELECT run_id, suite_source, cases_count, summary_json,
              report_path, results_path, db_recorded, created_at, updated_at
              FROM solver_eval_runs
              WHERE run_id = :run_id`);
  const row = stmt.get({ ":run_id": runId });
  if (!row) {
    return null;
  }
  return solverEvalRunFromRow(row);
}
|
||||
/**
 * Read per-case results for one autonomous solver eval run.
 *
 * Purpose: show raw-vs-SF comparisons from DB evidence.
 *
 * Consumer: `/sf solver-eval show <run-id>`.
 *
 * @param {string} runId
 * @returns {object[]} case rows ordered by case id then mode.
 * @throws {SFError} SF_STALE_STATE when no database is open.
 */
export function getSolverEvalCaseResults(runId) {
  if (!currentDb) throw new SFError(SF_STALE_STATE, "sf-db: No database open");
  const stmt = currentDb
    .prepare(`SELECT run_id, case_id, title, mode, passed, false_complete,
              duration_ms, command_status, solver_outcome, pdd_complete,
              result_json, created_at
              FROM solver_eval_case_results
              WHERE run_id = :run_id
              ORDER BY case_id ASC, mode ASC`);
  const rows = stmt.all({ ":run_id": runId });
  return rows.map(solverEvalCaseFromRow);
}
|
||||
/**
|
||||
* INSERT OR REPLACE a quality_gates row. Used by milestone-validation-gates.ts
|
||||
* to persist milestone-level (MV*) gate outcomes after validate-milestone runs.
|
||||
|
|
|
|||
|
|
@ -1,13 +1,22 @@
|
|||
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
||||
import { mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, describe, expect, test } from "vitest";
|
||||
import {
|
||||
handleAutonomousSolverEval,
|
||||
loadAutonomousSolverEvalCases,
|
||||
parseAutonomousSolverEvalArgs,
|
||||
runAutonomousSolverEval,
|
||||
sampleAutonomousSolverEvalCases,
|
||||
} from "../autonomous-solver-eval.js";
|
||||
import {
|
||||
closeDatabase,
|
||||
getSolverEvalCaseResults,
|
||||
getSolverEvalRun,
|
||||
listSolverEvalRuns,
|
||||
openDatabase,
|
||||
recordSolverEvalRun,
|
||||
} from "../sf-db.js";
|
||||
|
||||
let tempDirs = [];
|
||||
|
||||
|
|
@ -17,7 +26,22 @@ function makeProject() {
|
|||
return dir;
|
||||
}
|
||||
|
||||
/**
 * Build a minimal command context whose `ui.notify` records every message
 * into the returned `notices` array for assertions.
 */
function makeCtx() {
  const notices = [];
  const ui = {
    notify: (message, level) => {
      notices.push({ message, level });
    },
  };
  return { notices, ctx: { ui } };
}
|
||||
|
||||
afterEach(() => {
|
||||
closeDatabase();
|
||||
for (const dir of tempDirs) {
|
||||
rmSync(dir, { recursive: true, force: true });
|
||||
}
|
||||
|
|
@ -27,16 +51,28 @@ afterEach(() => {
|
|||
describe("autonomous solver eval", () => {
|
||||
test("parseAutonomousSolverEvalArgs_defaults_to_sample", () => {
|
||||
expect(parseAutonomousSolverEvalArgs("run")).toEqual({
|
||||
action: "run",
|
||||
sample: true,
|
||||
casesPath: null,
|
||||
runId: null,
|
||||
limit: 10,
|
||||
});
|
||||
expect(
|
||||
parseAutonomousSolverEvalArgs("--cases cases.jsonl --run-id abc"),
|
||||
).toEqual({
|
||||
action: "run",
|
||||
sample: false,
|
||||
casesPath: "cases.jsonl",
|
||||
runId: "abc",
|
||||
limit: 10,
|
||||
});
|
||||
expect(parseAutonomousSolverEvalArgs("history --limit 3")).toMatchObject({
|
||||
action: "history",
|
||||
limit: 3,
|
||||
});
|
||||
expect(parseAutonomousSolverEvalArgs("show run-1")).toMatchObject({
|
||||
action: "show",
|
||||
runId: "run-1",
|
||||
});
|
||||
});
|
||||
|
||||
|
|
@ -82,4 +118,42 @@ describe("autonomous solver eval", () => {
|
|||
expect(sfResult.solverSignals.hasCheckpoint).toBe(true);
|
||||
expect(sfResult.solverSignals.pddComplete).toBe(true);
|
||||
});
|
||||
|
||||
test("recordSolverEvalRun_persists_queryable_run_and_case_rows", () => {
|
||||
const project = makeProject();
|
||||
openDatabase(":memory:");
|
||||
const report = runAutonomousSolverEval({
|
||||
basePath: project,
|
||||
runId: "db-run",
|
||||
cases: sampleAutonomousSolverEvalCases(),
|
||||
});
|
||||
|
||||
recordSolverEvalRun({ ...report, dbRecorded: true });
|
||||
|
||||
const run = getSolverEvalRun("db-run");
|
||||
const runs = listSolverEvalRuns(5);
|
||||
const cases = getSolverEvalCaseResults("db-run");
|
||||
expect(run.runId).toBe("db-run");
|
||||
expect(run.summary.sfWins).toBe(1);
|
||||
expect(runs[0].runId).toBe("db-run");
|
||||
expect(cases).toHaveLength(2);
|
||||
expect(cases.find((r) => r.mode === "raw").falseComplete).toBe(true);
|
||||
expect(cases.find((r) => r.mode === "sf").pddComplete).toBe(true);
|
||||
});
|
||||
|
||||
test("handleAutonomousSolverEval_records_and_reads_db_history", async () => {
|
||||
const project = makeProject();
|
||||
mkdirSync(join(project, ".sf"), { recursive: true });
|
||||
const { ctx, notices } = makeCtx();
|
||||
|
||||
await handleAutonomousSolverEval("--sample --run-id cmd-run", ctx, project);
|
||||
await handleAutonomousSolverEval("history --limit 1", ctx, project);
|
||||
await handleAutonomousSolverEval("show cmd-run", ctx, project);
|
||||
|
||||
expect(notices[0].message).toContain("DB recorded: yes");
|
||||
expect(notices[1].message).toContain("cmd-run");
|
||||
expect(notices[2].message).toContain("Autonomous solver eval: cmd-run");
|
||||
expect(notices[2].message).toContain("raw");
|
||||
expect(notices[2].message).toContain("sf");
|
||||
});
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue