feat: add autonomous solver eval command

This commit is contained in:
Mikael Hugo 2026-05-06 03:37:58 +02:00
parent 34140fff38
commit dc51baa19a
5 changed files with 622 additions and 1 deletions

View file

@ -0,0 +1,517 @@
/**
* autonomous-solver-eval.ts first-class eval runner for solver-loop value.
*
* Purpose: compare raw agent loops against SF's autonomous solver control
* plane using the same task fixtures, deterministic assertions, and solver
* observability signals.
*
* Consumer: `/sf solver-eval` and focused regression tests.
*/
import { spawnSync } from "node:child_process";
import {
existsSync,
mkdirSync,
readdirSync,
readFileSync,
rmSync,
writeFileSync,
} from "node:fs";
import { dirname, isAbsolute, join, relative, resolve } from "node:path";
import { atomicWriteSync } from "./atomic-write.js";
import { sfRoot } from "./paths.js";
// Upper bound for a single eval command before spawnSync kills it (5 min).
const DEFAULT_TIMEOUT_MS = 5 * 60_000;
// Cap on captured stdout/stderr per command so reports stay bounded.
const MAX_OUTPUT_CHARS = 20_000;
// Safe identifier shape shared by case ids and --run-id values (1-81 chars).
const RUN_ID_RE = /^[a-z0-9][a-z0-9._-]{0,80}$/i;
/**
 * Build a filesystem-safe run id from the current UTC timestamp.
 * Colons and dots are unsafe in directory names, so both become dashes.
 */
function nowRunId() {
  const stamp = new Date().toISOString();
  return stamp.replace(/[:.]/g, "-");
}
/**
 * Normalize a fixture-relative path and reject anything that could escape
 * the workspace: empty paths, POSIX absolutes, Windows drive-letter
 * absolutes, and any `..` traversal segment.
 *
 * @param {unknown} path candidate relative path from untrusted case input
 * @returns {string} forward-slash normalized relative path
 * @throws {Error} when the path is absolute, empty, or traverses upward
 */
function safeRelPath(path) {
  const normalized = String(path ?? "").replaceAll("\\", "/");
  if (
    !normalized ||
    normalized.startsWith("/") ||
    /^[a-zA-Z]:/.test(normalized) || // e.g. "C:/x" — absolute on Windows
    normalized.split("/").includes("..")
  ) {
    throw new Error(`unsafe relative path: ${path}`);
  }
  return normalized;
}
/**
 * Parse one JSONL line, rethrowing with `<file>:<line>` context on failure.
 */
function parseJsonLine(line, index, filePath) {
  try {
    return JSON.parse(line);
  } catch (err) {
    const detail = err instanceof Error ? err.message : String(err);
    throw new Error(`${filePath}:${index + 1}: invalid JSON: ${detail}`);
  }
}
/**
 * Validate a `[command, ...args]` array and split it into a trimmed command
 * name plus string-coerced argument list.
 */
function normalizeCommand(raw, label) {
  if (!Array.isArray(raw) || raw.length === 0) {
    throw new Error(`${label} must be a non-empty string array`);
  }
  const [head, ...rest] = raw;
  const command = String(head ?? "").trim();
  if (!command) throw new Error(`${label}[0] must be non-empty`);
  return { command, args: rest.map((arg) => String(arg)) };
}
/**
 * Normalize raw assertion specs into `{ kind, path, value }` records.
 * Non-array input yields no assertions; unknown kinds are rejected.
 */
function normalizeAssertions(raw) {
  if (!Array.isArray(raw)) return [];
  const supported = ["exists", "contains", "equals", "not_contains"];
  return raw.map((assertion, index) => {
    // Path is validated first so unsafe paths fail before kind checks.
    const path = safeRelPath(assertion?.path);
    const kind = String(assertion?.kind ?? "exists");
    if (!supported.includes(kind)) {
      throw new Error(`assertions[${index}].kind is unsupported: ${kind}`);
    }
    const value =
      assertion?.value === undefined ? undefined : String(assertion.value);
    return { kind, path, value };
  });
}
/**
 * Normalize one raw eval case into the canonical case shape.
 *
 * @param {any} raw parsed JSONL object for the case
 * @param {number} index zero-based case position (default ids, error text)
 * @param {string} source file path or label used in error messages
 * @returns normalized case: id, title, files, commands, assertions, timeout
 * @throws {Error} on invalid ids, commands, paths, or assertion kinds
 */
function normalizeCase(raw, index, source) {
  const id = String(raw?.id ?? `case-${index + 1}`).trim();
  if (!RUN_ID_RE.test(id)) {
    throw new Error(`${source}:${index + 1}: invalid case id: ${id}`);
  }
  const files = raw.files && typeof raw.files === "object" ? raw.files : {};
  const normalizedFiles = Object.fromEntries(
    Object.entries(files).map(([path, content]) => [
      safeRelPath(path),
      String(content ?? ""),
    ]),
  );
  // Treat absent/null/empty timeout as "use default". Previously
  // Number(null) and Number("") coerced to 0, which silently clamped the
  // timeout to the 1s floor instead of falling back to DEFAULT_TIMEOUT_MS.
  const rawTimeout = raw?.timeoutMs;
  const timeoutMs =
    rawTimeout != null &&
    rawTimeout !== "" &&
    Number.isFinite(Number(rawTimeout))
      ? Math.max(1_000, Math.floor(Number(rawTimeout)))
      : DEFAULT_TIMEOUT_MS;
  return {
    id,
    title: String(raw?.title ?? id),
    files: normalizedFiles,
    rawCommand: normalizeCommand(raw?.rawCommand, `${id}.rawCommand`),
    sfCommand: normalizeCommand(raw?.sfCommand, `${id}.sfCommand`),
    assertions: normalizeAssertions(raw?.assertions),
    timeoutMs,
  };
}
/**
 * Load JSONL solver eval cases from disk.
 *
 * Purpose: make solver claims reproducible from versioned or shared fixtures
 * instead of ad hoc manual demos.
 *
 * Consumer: `/sf solver-eval --cases <path>`.
 *
 * @param {string} casesPath path to a JSONL file (blank / `#` lines skipped)
 * @returns {Array} normalized cases in file order
 * @throws {Error} on unreadable files, invalid JSON, or invalid case shapes
 */
export function loadAutonomousSolverEvalCases(casesPath) {
  const abs = resolve(casesPath);
  const raw = readFileSync(abs, "utf-8");
  const cases = [];
  raw.split("\n").forEach((rawLine, lineIndex) => {
    const line = rawLine.trim();
    if (!line || line.startsWith("#")) return;
    // Parse errors cite the physical 1-based file line (previously the
    // post-filter index, which was wrong whenever blanks/comments preceded);
    // default case ids still use the logical position among real cases.
    const parsed = parseJsonLine(line, lineIndex, abs);
    cases.push(normalizeCase(parsed, cases.length, abs));
  });
  return cases;
}
/**
 * Return a built-in deterministic sample case.
 *
 * Purpose: let operators verify the eval harness itself without spending model
 * quota or configuring external benchmark datasets.
 *
 * Consumer: `/sf solver-eval --sample` and tests.
 */
export function sampleAutonomousSolverEvalCases() {
  return [
    normalizeCase(
      {
        id: "sample-false-complete",
        title: "Raw loop says done without satisfying artifact contract",
        files: {
          "package.json": JSON.stringify(
            { name: "solver-eval-sample", version: "1.0.0" },
            null,
            2,
          ),
        },
        // Raw mode: claims completion (exit 0, writes done.txt) but never
        // produces the asserted target.txt — a deliberate false-complete.
        rawCommand: [
          process.execPath,
          "-e",
          "require('node:fs').writeFileSync('done.txt','done without target')",
        ],
        // SF mode: writes the target artifact AND the solver state files
        // (active.json + iterations.jsonl) that collectSolverSignals reads,
        // including a fully-populated PDD block so pddComplete is true.
        sfCommand: [
          process.execPath,
          "-e",
          [
            "const fs=require('node:fs');",
            "fs.mkdirSync('.sf/runtime/autonomous-solver',{recursive:true});",
            "fs.writeFileSync('target.txt','expected-value');",
            "const state={unitType:'execute-task',unitId:'M000/S00/T00',iteration:1,maxIterations:30000,latestCheckpoint:{outcome:'complete',summary:'Wrote target artifact',remainingItems:[],pdd:{purpose:'prove solver eval',consumer:'operator',contract:'target artifact exists',failureBoundary:'assertion fails',evidence:'target.txt',nonGoals:'no model call',invariants:'same fixture',assumptions:'node works'}}};",
            "fs.writeFileSync('.sf/runtime/autonomous-solver/active.json',JSON.stringify(state,null,2));",
            "fs.writeFileSync('.sf/runtime/autonomous-solver/iterations.jsonl',JSON.stringify(state.latestCheckpoint)+'\\n');",
          ].join(""),
        ],
        // Only the SF command satisfies this, so raw should false-complete.
        assertions: [
          { kind: "contains", path: "target.txt", value: "expected-value" },
        ],
      },
      0,
      "sample",
    ),
  ];
}
/**
 * Materialize a case's fixture files inside the workspace directory,
 * creating intermediate directories as needed.
 */
function writeFixtureFiles(workspace, files) {
  Object.entries(files).forEach(([relPath, content]) => {
    const destination = join(workspace, relPath);
    mkdirSync(dirname(destination), { recursive: true });
    writeFileSync(destination, content, "utf-8");
  });
}
/**
 * Execute one eval command inside the workspace, capturing a bounded
 * stdout/stderr transcript plus timing, exit, and timeout metadata.
 * SF_PROJECT_ROOT is pointed at the workspace so SF tooling stays sandboxed.
 */
function runCommand(workspace, command, timeoutMs) {
  const startedAt = Date.now();
  const spawned = spawnSync(command.command, command.args, {
    cwd: workspace,
    encoding: "utf-8",
    timeout: timeoutMs,
    stdio: ["ignore", "pipe", "pipe"],
    env: { ...process.env, SF_PROJECT_ROOT: workspace },
  });
  const durationMs = Date.now() - startedAt;
  const clip = (text) => String(text ?? "").slice(0, MAX_OUTPUT_CHARS);
  return {
    command: [command.command, ...command.args],
    status: spawned.status,
    signal: spawned.signal,
    error: spawned.error ? String(spawned.error.message ?? spawned.error) : null,
    timedOut: spawned.error?.code === "ETIMEDOUT",
    durationMs,
    stdout: clip(spawned.stdout),
    stderr: clip(spawned.stderr),
  };
}
/**
 * Check every assertion against files in the workspace and report a
 * per-assertion pass/fail plus a truncated view of the actual content.
 * Missing files fail every kind except `not_contains`, which passes.
 */
function evaluateAssertions(workspace, assertions) {
  return assertions.map((assertion) => {
    const filePath = join(workspace, assertion.path);
    const exists = existsSync(filePath);
    const content = exists ? readFileSync(filePath, "utf-8") : "";
    const expected = assertion.value ?? "";
    let passed;
    switch (assertion.kind) {
      case "contains":
        passed = exists && content.includes(expected);
        break;
      case "not_contains":
        passed = !exists || !content.includes(expected);
        break;
      case "equals":
        passed = exists && content === expected;
        break;
      default: // "exists"
        passed = exists;
    }
    return {
      ...assertion,
      passed,
      actual: exists ? content.slice(0, 1000) : null,
    };
  });
}
/**
 * Read and parse a JSON file; returns null when missing or malformed.
 */
function readJsonIfExists(path) {
  try {
    const text = readFileSync(path, "utf-8");
    return JSON.parse(text);
  } catch {
    // best-effort observability read: absence and corruption look the same
    return null;
  }
}
/**
 * Read a JSONL file, skipping blank, malformed, and falsy-valued lines;
 * returns [] when the file is unreadable.
 */
function readJsonlIfExists(path) {
  let text;
  try {
    text = readFileSync(path, "utf-8");
  } catch {
    return [];
  }
  const entries = [];
  for (const line of text.split("\n")) {
    if (!line.trim()) continue;
    try {
      const parsed = JSON.parse(line);
      // matches the original filter(Boolean): falsy JSON values are dropped
      if (parsed) entries.push(parsed);
    } catch {
      // malformed line: skip — these files are best-effort observability
    }
  }
  return entries;
}
/**
 * Gather all journal events from `.sf/journal/*.jsonl` in the workspace.
 * Best-effort: a missing journal directory yields [].
 */
function collectJournalEvents(workspace) {
  const journalDir = join(sfRoot(workspace), "journal");
  let names;
  try {
    names = readdirSync(journalDir);
  } catch {
    return [];
  }
  return names
    .filter((name) => name.endsWith(".jsonl"))
    .flatMap((name) => readJsonlIfExists(join(journalDir, name)));
}
/**
 * True when the checkpoint carries every required PDD field as a
 * non-blank string; false for missing/null checkpoints or partial PDDs.
 */
function hasPddFields(checkpoint) {
  const required = [
    "purpose",
    "consumer",
    "contract",
    "failureBoundary",
    "evidence",
    "nonGoals",
    "invariants",
    "assumptions",
  ];
  const pdd = checkpoint?.pdd ?? {};
  return required.every((field) => {
    const value = pdd[field];
    return typeof value === "string" && value.trim() !== "";
  });
}
/**
 * Read solver observability signals for a workspace after an sf-mode run:
 * active state file, iteration checkpoints, and journal event types.
 */
function collectSolverSignals(workspace) {
  const solverDir = join(sfRoot(workspace), "runtime", "autonomous-solver");
  const state = readJsonIfExists(join(solverDir, "active.json"));
  const checkpoints = readJsonlIfExists(join(solverDir, "iterations.jsonl"));
  // Prefer the state file's checkpoint; fall back to the last JSONL entry.
  const latestCheckpoint =
    state?.latestCheckpoint ?? checkpoints[checkpoints.length - 1] ?? null;
  const outcome = latestCheckpoint?.outcome ?? null;
  const remaining = latestCheckpoint?.remainingItems;
  const journalEvents = collectJournalEvents(workspace);
  return {
    hasState: Boolean(state),
    hasCheckpoint: Boolean(latestCheckpoint),
    outcome,
    iteration: state?.iteration ?? latestCheckpoint?.iteration ?? null,
    remainingCount: Array.isArray(remaining) ? remaining.length : null,
    pddComplete: hasPddFields(latestCheckpoint),
    blockedOrDecisionSurfaced: outcome === "blocked" || outcome === "decide",
    continueCount: checkpoints.filter((c) => c.outcome === "continue").length,
    journalEventTypes: journalEvents.map((event) => event.eventType),
  };
}
/**
 * Run one case in one mode ("raw" | "sf"): write fixtures, execute the
 * command, evaluate assertions, and (sf only) collect solver signals.
 * `falseComplete` flags a success claim that the assertions contradict.
 */
function evaluateMode({ caseDef, workspace, mode, command }) {
  writeFixtureFiles(workspace, caseDef.files);
  const commandResult = runCommand(workspace, command, caseDef.timeoutMs);
  const assertions = evaluateAssertions(workspace, caseDef.assertions);
  const allPassed = assertions.every((entry) => entry.passed);
  const isSf = mode === "sf";
  const solverSignals = isSf ? collectSolverSignals(workspace) : undefined;
  // "claimed success" differs by mode: sf trusts the solver checkpoint
  // outcome, raw trusts the process exit code.
  const claimedSuccess = isSf
    ? solverSignals?.outcome === "complete"
    : commandResult.status === 0;
  const result = {
    mode,
    workspace,
    command: commandResult,
    assertions,
    passed: commandResult.status === 0 && allPassed,
    falseComplete: claimedSuccess && !allPassed,
  };
  if (solverSignals) result.solverSignals = solverSignals;
  return result;
}
/**
 * Aggregate per-mode results into per-case win/tie/false-complete counts.
 * A case is an SF win when sf passed and raw did not (and vice versa);
 * everything else (both pass, both fail, missing mode) counts as a tie.
 */
function summarizeResults(results) {
  const byCase = new Map();
  for (const result of results) {
    if (!byCase.has(result.caseId)) byCase.set(result.caseId, {});
    byCase.get(result.caseId)[result.mode] = result;
  }
  const summary = {
    cases: byCase.size,
    sfWins: 0,
    rawWins: 0,
    ties: 0,
    rawFalseCompletes: 0,
    sfFalseCompletes: 0,
  };
  for (const { raw, sf } of byCase.values()) {
    if (raw?.falseComplete) summary.rawFalseCompletes += 1;
    if (sf?.falseComplete) summary.sfFalseCompletes += 1;
    if (sf?.passed && !raw?.passed) summary.sfWins += 1;
    else if (raw?.passed && !sf?.passed) summary.rawWins += 1;
    else summary.ties += 1;
  }
  return summary;
}
/**
 * Evidence directory for one run: `<sfRoot>/evals/autonomous-solver/<runId>`.
 */
function resolveOutputDir(basePath, runId) {
  const evalsDir = join(sfRoot(basePath), "evals", "autonomous-solver");
  return join(evalsDir, runId);
}
/**
 * Run the autonomous solver comparison eval.
 *
 * Purpose: produce local evidence for whether SF's solver loop improves
 * completion quality over a raw loop under identical task fixtures.
 *
 * Consumer: `/sf solver-eval run` and regression tests.
 *
 * @param {object} options `{ basePath?, runId?, cases? }` — runId falls back
 *   to a timestamp id when missing or unsafe; cases default to the sample.
 * @returns report object (summary + per-mode results) plus outputDir and
 *   relativeOutputDir for operator display
 */
export function runAutonomousSolverEval(options) {
  const basePath = resolve(options.basePath ?? process.cwd());
  const runId =
    options.runId && RUN_ID_RE.test(options.runId) ? options.runId : nowRunId();
  const cases = options.cases ?? sampleAutonomousSolverEvalCases();
  const outputDir = resolveOutputDir(basePath, runId);
  const workspaceRoot = join(outputDir, "workspaces");
  // Fresh run directory: stale workspaces would poison the assertions.
  rmSync(outputDir, { recursive: true, force: true });
  mkdirSync(workspaceRoot, { recursive: true });
  const results = [];
  for (const caseDef of cases) {
    for (const mode of ["raw", "sf"]) {
      const workspace = join(workspaceRoot, caseDef.id, mode);
      mkdirSync(workspace, { recursive: true });
      const modeResult = evaluateMode({
        caseDef,
        workspace,
        mode,
        command: mode === "raw" ? caseDef.rawCommand : caseDef.sfCommand,
      });
      results.push({
        caseId: caseDef.id,
        title: caseDef.title,
        ...modeResult,
        workspace: relative(basePath, workspace),
      });
    }
  }
  const report = {
    schemaVersion: "sf-autonomous-solver-eval/v1",
    runId,
    createdAt: new Date().toISOString(),
    basePath,
    summary: summarizeResults(results),
    results,
  };
  atomicWriteSync(
    join(outputDir, "report.json"),
    `${JSON.stringify(report, null, 2)}\n`,
  );
  // Same atomic write as report.json: a crash mid-write must not leave a
  // truncated results.jsonl next to a complete report (was writeFileSync).
  atomicWriteSync(
    join(outputDir, "results.jsonl"),
    `${results.map((result) => JSON.stringify(result)).join("\n")}\n`,
  );
  return {
    ...report,
    outputDir,
    relativeOutputDir: relative(basePath, outputDir),
  };
}
/**
 * Parse `/sf solver-eval` arguments.
 *
 * Purpose: keep command behavior explicit and reproducible while avoiding
 * shell parsing or hidden defaults.
 *
 * Consumer: `/sf solver-eval` handler.
 */
export function parseAutonomousSolverEvalArgs(raw) {
  const tokens = String(raw ?? "")
    .trim()
    .split(/\s+/)
    .filter(Boolean);
  const opts = { sample: false, casesPath: null, runId: null };
  let cursor = 0;
  while (cursor < tokens.length) {
    const token = tokens[cursor];
    cursor += 1;
    if (token === "run") continue; // optional verb, accepted for symmetry
    if (token === "--sample") {
      opts.sample = true;
    } else if (token === "--cases") {
      const value = tokens[cursor];
      if (!value) throw new Error("--cases requires a path");
      opts.casesPath = value;
      cursor += 1;
    } else if (token === "--run-id") {
      const value = tokens[cursor];
      if (!value || !RUN_ID_RE.test(value)) {
        throw new Error("--run-id requires a safe id");
      }
      opts.runId = value;
      cursor += 1;
    } else {
      throw new Error(`unknown solver-eval argument: ${token}`);
    }
  }
  // With no explicit source of cases, fall back to the built-in sample.
  if (!opts.sample && !opts.casesPath) opts.sample = true;
  return opts;
}
/**
 * Handle `/sf solver-eval`.
 *
 * Purpose: expose solver-loop benchmarking as a first-class SF operation with
 * evidence stored under `.sf`, not as an external script.
 *
 * Consumer: ops command dispatcher.
 *
 * @param {string} rawArgs raw argument string after "solver-eval"
 * @param {object} ctx command context providing `ctx.ui.notify`
 * @param {string} basePath project root (defaults to process.cwd())
 */
export async function handleAutonomousSolverEval(
  rawArgs,
  ctx,
  basePath = process.cwd(),
) {
  let args;
  try {
    args = parseAutonomousSolverEvalArgs(rawArgs);
  } catch (err) {
    ctx.ui.notify(
      `Usage: /sf solver-eval [run] [--sample | --cases <jsonl>] [--run-id <id>]\n${err instanceof Error ? err.message : String(err)}`,
      "warning",
    );
    return;
  }
  let report;
  try {
    const cases = args.casesPath
      ? loadAutonomousSolverEvalCases(
          isAbsolute(args.casesPath)
            ? args.casesPath
            : join(basePath, args.casesPath),
        )
      : sampleAutonomousSolverEvalCases();
    report = runAutonomousSolverEval({
      basePath,
      cases,
      runId: args.runId ?? undefined,
    });
  } catch (err) {
    // Surface fixture/IO failures (missing --cases file, invalid JSONL,
    // unwritable evidence dir) as a notification instead of letting them
    // escape the command dispatcher, matching the arg-parse handling above.
    ctx.ui.notify(
      `solver-eval failed: ${err instanceof Error ? err.message : String(err)}`,
      "warning",
    );
    return;
  }
  ctx.ui.notify(
    [
      "Autonomous solver eval complete",
      `Run: ${report.runId}`,
      `Evidence: ${report.relativeOutputDir}/report.json`,
      `Cases: ${report.summary.cases}`,
      `SF wins: ${report.summary.sfWins}`,
      `Raw wins: ${report.summary.rawWins}`,
      `Raw false-complete: ${report.summary.rawFalseCompletes}`,
      `SF false-complete: ${report.summary.sfFalseCompletes}`,
    ].join("\n"),
    "info",
  );
}

View file

@ -60,6 +60,10 @@ const TOP_LEVEL_SUBCOMMANDS = [
cmd: "codebase",
desc: "Generate, refresh, and inspect the codebase map cache",
},
{
cmd: "solver-eval",
desc: "Compare raw agent loops against SF autonomous solver control",
},
{
cmd: "scaffold",
desc: "Inspect or refresh ADR-021 versioned scaffold docs",

View file

@ -12,7 +12,7 @@ const sfHome = process.env.SF_HOME || join(homedir(), ".sf");
* Comprehensive description of all available SF commands for help text.
*/
export const SF_COMMAND_DESCRIPTION =
"SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|reload|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|show-config|prefs|config|keys|hooks|run-hook|skill-health|doctor|uok|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|extract-learnings|eval-review|plan";
"SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|reload|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|show-config|prefs|config|keys|hooks|run-hook|skill-health|doctor|uok|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|solver-eval|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|extract-learnings|eval-review|plan";
/**
* Top-level SF subcommands with descriptions.
*/
@ -119,6 +119,10 @@ export const TOP_LEVEL_SUBCOMMANDS = [
cmd: "harness",
desc: "Repo-native harness evolution (profile, status)",
},
{
cmd: "solver-eval",
desc: "Compare raw agent loops against SF autonomous solver control",
},
{
cmd: "new-milestone",
desc: "Create a milestone from a specification document (headless)",

View file

@ -278,6 +278,17 @@ Examples:
await handleHarness(trimmed.replace(/^harness\s*/, "").trim(), ctx);
return true;
}
if (trimmed === "solver-eval" || trimmed.startsWith("solver-eval ")) {
const { handleAutonomousSolverEval } = await import(
"../../autonomous-solver-eval.js"
);
await handleAutonomousSolverEval(
trimmed.replace(/^solver-eval\s*/, "").trim(),
ctx,
projectRoot(),
);
return true;
}
if (trimmed === "migrate" || trimmed.startsWith("migrate ")) {
const { handleMigrate } = await import("../../migrate/command.js");
await handleMigrate(trimmed.replace(/^migrate\s*/, "").trim(), ctx, pi);

View file

@ -0,0 +1,85 @@
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, expect, test } from "vitest";
import {
loadAutonomousSolverEvalCases,
parseAutonomousSolverEvalArgs,
runAutonomousSolverEval,
sampleAutonomousSolverEvalCases,
} from "../autonomous-solver-eval.js";
// Temp project dirs created during this suite; removed in afterEach.
let tempDirs = [];
function makeProject() {
  const projectDir = mkdtempSync(join(tmpdir(), "sf-solver-eval-"));
  tempDirs.push(projectDir);
  return projectDir;
}
afterEach(() => {
  // Remove every workspace this test created, leaving the list empty.
  while (tempDirs.length > 0) {
    rmSync(tempDirs.pop(), { recursive: true, force: true });
  }
});
describe("autonomous solver eval", () => {
  // Bare "run" must fall back to the built-in sample; explicit flags win.
  test("parseAutonomousSolverEvalArgs_defaults_to_sample", () => {
    expect(parseAutonomousSolverEvalArgs("run")).toEqual({
      sample: true,
      casesPath: null,
      runId: null,
    });
    expect(
      parseAutonomousSolverEvalArgs("--cases cases.jsonl --run-id abc"),
    ).toEqual({
      sample: false,
      casesPath: "cases.jsonl",
      runId: "abc",
    });
  });
  // A single JSONL line round-trips into the normalized case shape,
  // including the default value: undefined on "exists" assertions.
  test("loadAutonomousSolverEvalCases_reads_jsonl_contract", () => {
    const project = makeProject();
    const casesPath = join(project, "cases.jsonl");
    writeFileSync(
      casesPath,
      `${JSON.stringify({
        id: "case-a",
        files: { "input.txt": "start" },
        rawCommand: [process.execPath, "-e", ""],
        sfCommand: [process.execPath, "-e", ""],
        assertions: [{ kind: "exists", path: "input.txt" }],
      })}\n`,
    );
    const cases = loadAutonomousSolverEvalCases(casesPath);
    expect(cases).toHaveLength(1);
    expect(cases[0].id).toBe("case-a");
    expect(cases[0].assertions[0]).toEqual({
      kind: "exists",
      path: "input.txt",
      value: undefined,
    });
  });
  // End-to-end over the deterministic sample: the raw loop false-completes,
  // the sf loop passes and surfaces checkpoint + PDD solver signals.
  test("runAutonomousSolverEval_compares_raw_and_sf_solver_signals", () => {
    const project = makeProject();
    const report = runAutonomousSolverEval({
      basePath: project,
      runId: "sample-run",
      cases: sampleAutonomousSolverEvalCases(),
    });
    expect(report.summary.cases).toBe(1);
    expect(report.summary.sfWins).toBe(1);
    expect(report.summary.rawFalseCompletes).toBe(1);
    expect(report.summary.sfFalseCompletes).toBe(0);
    const sfResult = report.results.find((r) => r.mode === "sf");
    expect(sfResult.passed).toBe(true);
    expect(sfResult.solverSignals.hasCheckpoint).toBe(true);
    expect(sfResult.solverSignals.pddComplete).toBe(true);
  });
});