From dc51baa19a3ddeac10c0893464a20a2efef55e10 Mon Sep 17 00:00:00 2001
From: Mikael Hugo
Date: Wed, 6 May 2026 03:37:58 +0200
Subject: [PATCH] feat: add autonomous solver eval command

---
 .../extensions/sf/autonomous-solver-eval.js | 517 ++++++++++++++++++
 .../extensions/sf/commands-bootstrap.js | 4 +
 .../extensions/sf/commands/catalog.js | 6 +-
 .../extensions/sf/commands/handlers/ops.js | 11 +
 .../sf/tests/autonomous-solver-eval.test.mjs | 85 +++
 5 files changed, 622 insertions(+), 1 deletion(-)
 create mode 100644 src/resources/extensions/sf/autonomous-solver-eval.js
 create mode 100644 src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs

diff --git a/src/resources/extensions/sf/autonomous-solver-eval.js b/src/resources/extensions/sf/autonomous-solver-eval.js
new file mode 100644
index 000000000..e2f568841
--- /dev/null
+++ b/src/resources/extensions/sf/autonomous-solver-eval.js
@@ -0,0 +1,517 @@
+/**
+ * autonomous-solver-eval.js — first-class eval runner for solver-loop value.
+ *
+ * Purpose: compare raw agent loops against SF's autonomous solver control
+ * plane using the same task fixtures, deterministic assertions, and solver
+ * observability signals.
+ *
+ * Consumer: `/sf solver-eval` and focused regression tests.
+ */
+import { spawnSync } from "node:child_process";
+import {
+  existsSync,
+  mkdirSync,
+  readdirSync,
+  readFileSync,
+  rmSync,
+  writeFileSync,
+} from "node:fs";
+import { dirname, isAbsolute, join, relative, resolve } from "node:path";
+import { atomicWriteSync } from "./atomic-write.js";
+import { sfRoot } from "./paths.js";
+
+const DEFAULT_TIMEOUT_MS = 5 * 60_000;
+const MAX_OUTPUT_CHARS = 20_000;
+const RUN_ID_RE = /^[a-z0-9][a-z0-9._-]{0,80}$/i;
+
+function nowRunId() {
+  return new Date().toISOString().replace(/[:.]/g, "-");
+}
+
+function safeRelPath(path) {
+  const normalized = String(path ?? 
"").replaceAll("\\", "/"); + if ( + !normalized || + normalized.startsWith("/") || + normalized.split("/").includes("..") + ) { + throw new Error(`unsafe relative path: ${path}`); + } + return normalized; +} + +function parseJsonLine(line, index, filePath) { + try { + return JSON.parse(line); + } catch (err) { + throw new Error( + `${filePath}:${index + 1}: invalid JSON: ${err instanceof Error ? err.message : String(err)}`, + ); + } +} + +function normalizeCommand(raw, label) { + if (!Array.isArray(raw) || raw.length === 0) { + throw new Error(`${label} must be a non-empty string array`); + } + const command = String(raw[0] ?? "").trim(); + if (!command) throw new Error(`${label}[0] must be non-empty`); + return { + command, + args: raw.slice(1).map((arg) => String(arg)), + }; +} + +function normalizeAssertions(raw) { + if (!Array.isArray(raw)) return []; + return raw.map((assertion, index) => { + const path = safeRelPath(assertion?.path); + const kind = String(assertion?.kind ?? "exists"); + if (!["exists", "contains", "equals", "not_contains"].includes(kind)) { + throw new Error(`assertions[${index}].kind is unsupported: ${kind}`); + } + return { + kind, + path, + value: + assertion?.value === undefined ? undefined : String(assertion.value), + }; + }); +} + +function normalizeCase(raw, index, source) { + const id = String(raw?.id ?? `case-${index + 1}`).trim(); + if (!RUN_ID_RE.test(id)) { + throw new Error(`${source}:${index + 1}: invalid case id: ${id}`); + } + const files = raw.files && typeof raw.files === "object" ? raw.files : {}; + const normalizedFiles = Object.fromEntries( + Object.entries(files).map(([path, content]) => [ + safeRelPath(path), + String(content ?? ""), + ]), + ); + return { + id, + title: String(raw?.title ?? 
id), + files: normalizedFiles, + rawCommand: normalizeCommand(raw?.rawCommand, `${id}.rawCommand`), + sfCommand: normalizeCommand(raw?.sfCommand, `${id}.sfCommand`), + assertions: normalizeAssertions(raw?.assertions), + timeoutMs: Number.isFinite(Number(raw?.timeoutMs)) + ? Math.max(1_000, Math.floor(Number(raw.timeoutMs))) + : DEFAULT_TIMEOUT_MS, + }; +} + +/** + * Load JSONL solver eval cases from disk. + * + * Purpose: make solver claims reproducible from versioned or shared fixtures + * instead of ad hoc manual demos. + * + * Consumer: `/sf solver-eval --cases `. + */ +export function loadAutonomousSolverEvalCases(casesPath) { + const abs = resolve(casesPath); + const raw = readFileSync(abs, "utf-8"); + return raw + .split("\n") + .map((line) => line.trim()) + .filter((line) => line && !line.startsWith("#")) + .map((line, index) => + normalizeCase(parseJsonLine(line, index, abs), index, abs), + ); +} + +/** + * Return a built-in deterministic sample case. + * + * Purpose: let operators verify the eval harness itself without spending model + * quota or configuring external benchmark datasets. + * + * Consumer: `/sf solver-eval --sample` and tests. 
+ */ +export function sampleAutonomousSolverEvalCases() { + return [ + normalizeCase( + { + id: "sample-false-complete", + title: "Raw loop says done without satisfying artifact contract", + files: { + "package.json": JSON.stringify( + { name: "solver-eval-sample", version: "1.0.0" }, + null, + 2, + ), + }, + rawCommand: [ + process.execPath, + "-e", + "require('node:fs').writeFileSync('done.txt','done without target')", + ], + sfCommand: [ + process.execPath, + "-e", + [ + "const fs=require('node:fs');", + "fs.mkdirSync('.sf/runtime/autonomous-solver',{recursive:true});", + "fs.writeFileSync('target.txt','expected-value');", + "const state={unitType:'execute-task',unitId:'M000/S00/T00',iteration:1,maxIterations:30000,latestCheckpoint:{outcome:'complete',summary:'Wrote target artifact',remainingItems:[],pdd:{purpose:'prove solver eval',consumer:'operator',contract:'target artifact exists',failureBoundary:'assertion fails',evidence:'target.txt',nonGoals:'no model call',invariants:'same fixture',assumptions:'node works'}}};", + "fs.writeFileSync('.sf/runtime/autonomous-solver/active.json',JSON.stringify(state,null,2));", + "fs.writeFileSync('.sf/runtime/autonomous-solver/iterations.jsonl',JSON.stringify(state.latestCheckpoint)+'\\n');", + ].join(""), + ], + assertions: [ + { kind: "contains", path: "target.txt", value: "expected-value" }, + ], + }, + 0, + "sample", + ), + ]; +} + +function writeFixtureFiles(workspace, files) { + for (const [path, content] of Object.entries(files)) { + const target = join(workspace, path); + mkdirSync(dirname(target), { recursive: true }); + writeFileSync(target, content, "utf-8"); + } +} + +function runCommand(workspace, command, timeoutMs) { + const startedAt = Date.now(); + const result = spawnSync(command.command, command.args, { + cwd: workspace, + encoding: "utf-8", + timeout: timeoutMs, + stdio: ["ignore", "pipe", "pipe"], + env: { + ...process.env, + SF_PROJECT_ROOT: workspace, + }, + }); + const finishedAt = Date.now(); + 
return { + command: [command.command, ...command.args], + status: result.status, + signal: result.signal, + error: result.error ? String(result.error.message ?? result.error) : null, + timedOut: result.error?.code === "ETIMEDOUT", + durationMs: finishedAt - startedAt, + stdout: String(result.stdout ?? "").slice(0, MAX_OUTPUT_CHARS), + stderr: String(result.stderr ?? "").slice(0, MAX_OUTPUT_CHARS), + }; +} + +function evaluateAssertions(workspace, assertions) { + return assertions.map((assertion) => { + const filePath = join(workspace, assertion.path); + const exists = existsSync(filePath); + let content = ""; + if (exists) content = readFileSync(filePath, "utf-8"); + let passed = exists; + if (assertion.kind === "contains") { + passed = exists && content.includes(assertion.value ?? ""); + } else if (assertion.kind === "not_contains") { + passed = !exists || !content.includes(assertion.value ?? ""); + } else if (assertion.kind === "equals") { + passed = exists && content === (assertion.value ?? ""); + } + return { + ...assertion, + passed, + actual: exists ? content.slice(0, 1000) : null, + }; + }); +} + +function readJsonIfExists(path) { + try { + return JSON.parse(readFileSync(path, "utf-8")); + } catch { + return null; + } +} + +function readJsonlIfExists(path) { + try { + return readFileSync(path, "utf-8") + .split("\n") + .filter((line) => line.trim()) + .map((line) => { + try { + return JSON.parse(line); + } catch { + return null; + } + }) + .filter(Boolean); + } catch { + return []; + } +} + +function collectJournalEvents(workspace) { + const dir = join(sfRoot(workspace), "journal"); + try { + return readdirSync(dir) + .filter((file) => file.endsWith(".jsonl")) + .flatMap((file) => readJsonlIfExists(join(dir, file))); + } catch { + return []; + } +} + +function hasPddFields(checkpoint) { + const pdd = checkpoint?.pdd ?? 
{}; + return [ + "purpose", + "consumer", + "contract", + "failureBoundary", + "evidence", + "nonGoals", + "invariants", + "assumptions", + ].every((field) => typeof pdd[field] === "string" && pdd[field].trim()); +} + +function collectSolverSignals(workspace) { + const state = readJsonIfExists( + join(sfRoot(workspace), "runtime", "autonomous-solver", "active.json"), + ); + const checkpoints = readJsonlIfExists( + join(sfRoot(workspace), "runtime", "autonomous-solver", "iterations.jsonl"), + ); + const latestCheckpoint = + state?.latestCheckpoint ?? checkpoints[checkpoints.length - 1] ?? null; + const journalEvents = collectJournalEvents(workspace); + return { + hasState: Boolean(state), + hasCheckpoint: Boolean(latestCheckpoint), + outcome: latestCheckpoint?.outcome ?? null, + iteration: state?.iteration ?? latestCheckpoint?.iteration ?? null, + remainingCount: Array.isArray(latestCheckpoint?.remainingItems) + ? latestCheckpoint.remainingItems.length + : null, + pddComplete: hasPddFields(latestCheckpoint), + blockedOrDecisionSurfaced: + latestCheckpoint?.outcome === "blocked" || + latestCheckpoint?.outcome === "decide", + continueCount: checkpoints.filter((entry) => entry.outcome === "continue") + .length, + journalEventTypes: journalEvents.map((entry) => entry.eventType), + }; +} + +function evaluateMode({ caseDef, workspace, mode, command }) { + writeFixtureFiles(workspace, caseDef.files); + const commandResult = runCommand(workspace, command, caseDef.timeoutMs); + const assertionResults = evaluateAssertions(workspace, caseDef.assertions); + const passedAssertions = assertionResults.every((result) => result.passed); + const solverSignals = + mode === "sf" ? collectSolverSignals(workspace) : undefined; + const falseComplete = + mode === "sf" + ? 
solverSignals?.outcome === "complete" && !passedAssertions + : commandResult.status === 0 && !passedAssertions; + return { + mode, + workspace, + command: commandResult, + assertions: assertionResults, + passed: commandResult.status === 0 && passedAssertions, + falseComplete, + ...(solverSignals ? { solverSignals } : {}), + }; +} + +function summarizeResults(results) { + const byCase = new Map(); + for (const result of results) { + const entry = byCase.get(result.caseId) ?? {}; + entry[result.mode] = result; + byCase.set(result.caseId, entry); + } + let sfWins = 0; + let rawWins = 0; + let ties = 0; + let rawFalseCompletes = 0; + let sfFalseCompletes = 0; + for (const modes of byCase.values()) { + if (modes.raw?.falseComplete) rawFalseCompletes += 1; + if (modes.sf?.falseComplete) sfFalseCompletes += 1; + if (modes.sf?.passed && !modes.raw?.passed) sfWins += 1; + else if (modes.raw?.passed && !modes.sf?.passed) rawWins += 1; + else ties += 1; + } + return { + cases: byCase.size, + sfWins, + rawWins, + ties, + rawFalseCompletes, + sfFalseCompletes, + }; +} + +function resolveOutputDir(basePath, runId) { + return join(sfRoot(basePath), "evals", "autonomous-solver", runId); +} + +/** + * Run the autonomous solver comparison eval. + * + * Purpose: produce local evidence for whether SF's solver loop improves + * completion quality over a raw loop under identical task fixtures. + * + * Consumer: `/sf solver-eval run` and regression tests. + */ +export function runAutonomousSolverEval(options) { + const basePath = resolve(options.basePath ?? process.cwd()); + const runId = + options.runId && RUN_ID_RE.test(options.runId) ? options.runId : nowRunId(); + const cases = options.cases ?? 
sampleAutonomousSolverEvalCases(); + const outputDir = resolveOutputDir(basePath, runId); + const workspaceRoot = join(outputDir, "workspaces"); + rmSync(outputDir, { recursive: true, force: true }); + mkdirSync(workspaceRoot, { recursive: true }); + + const results = []; + for (const caseDef of cases) { + for (const mode of ["raw", "sf"]) { + const workspace = join(workspaceRoot, caseDef.id, mode); + mkdirSync(workspace, { recursive: true }); + const modeResult = evaluateMode({ + caseDef, + workspace, + mode, + command: mode === "raw" ? caseDef.rawCommand : caseDef.sfCommand, + }); + results.push({ + caseId: caseDef.id, + title: caseDef.title, + ...modeResult, + workspace: relative(basePath, workspace), + }); + } + } + + const report = { + schemaVersion: "sf-autonomous-solver-eval/v1", + runId, + createdAt: new Date().toISOString(), + basePath, + summary: summarizeResults(results), + results, + }; + atomicWriteSync( + join(outputDir, "report.json"), + `${JSON.stringify(report, null, 2)}\n`, + ); + writeFileSync( + join(outputDir, "results.jsonl"), + results.map((result) => JSON.stringify(result)).join("\n") + "\n", + "utf-8", + ); + return { + ...report, + outputDir, + relativeOutputDir: relative(basePath, outputDir), + }; +} + +/** + * Parse `/sf solver-eval` arguments. + * + * Purpose: keep command behavior explicit and reproducible while avoiding + * shell parsing or hidden defaults. + * + * Consumer: `/sf solver-eval` handler. + */ +export function parseAutonomousSolverEvalArgs(raw) { + const tokens = String(raw ?? 
"") + .trim() + .split(/\s+/) + .filter(Boolean); + const opts = { sample: false, casesPath: null, runId: null }; + for (let i = 0; i < tokens.length; i += 1) { + const token = tokens[i]; + if (token === "run") continue; + if (token === "--sample") { + opts.sample = true; + continue; + } + if (token === "--cases") { + const value = tokens[i + 1]; + if (!value) throw new Error("--cases requires a path"); + opts.casesPath = value; + i += 1; + continue; + } + if (token === "--run-id") { + const value = tokens[i + 1]; + if (!value || !RUN_ID_RE.test(value)) { + throw new Error("--run-id requires a safe id"); + } + opts.runId = value; + i += 1; + continue; + } + throw new Error(`unknown solver-eval argument: ${token}`); + } + if (!opts.sample && !opts.casesPath) opts.sample = true; + return opts; +} + +/** + * Handle `/sf solver-eval`. + * + * Purpose: expose solver-loop benchmarking as a first-class SF operation with + * evidence stored under `.sf`, not as an external script. + * + * Consumer: ops command dispatcher. + */ +export async function handleAutonomousSolverEval( + rawArgs, + ctx, + basePath = process.cwd(), +) { + let args; + try { + args = parseAutonomousSolverEvalArgs(rawArgs); + } catch (err) { + ctx.ui.notify( + `Usage: /sf solver-eval [run] [--sample | --cases ] [--run-id ]\n${err instanceof Error ? err.message : String(err)}`, + "warning", + ); + return; + } + const cases = args.casesPath + ? loadAutonomousSolverEvalCases( + isAbsolute(args.casesPath) + ? args.casesPath + : join(basePath, args.casesPath), + ) + : sampleAutonomousSolverEvalCases(); + const report = runAutonomousSolverEval({ + basePath, + cases, + runId: args.runId ?? 
undefined, + }); + ctx.ui.notify( + [ + "Autonomous solver eval complete", + `Run: ${report.runId}`, + `Evidence: ${report.relativeOutputDir}/report.json`, + `Cases: ${report.summary.cases}`, + `SF wins: ${report.summary.sfWins}`, + `Raw wins: ${report.summary.rawWins}`, + `Raw false-complete: ${report.summary.rawFalseCompletes}`, + `SF false-complete: ${report.summary.sfFalseCompletes}`, + ].join("\n"), + "info", + ); +} diff --git a/src/resources/extensions/sf/commands-bootstrap.js b/src/resources/extensions/sf/commands-bootstrap.js index 4808c7080..5ef7d1ae3 100644 --- a/src/resources/extensions/sf/commands-bootstrap.js +++ b/src/resources/extensions/sf/commands-bootstrap.js @@ -60,6 +60,10 @@ const TOP_LEVEL_SUBCOMMANDS = [ cmd: "codebase", desc: "Generate, refresh, and inspect the codebase map cache", }, + { + cmd: "solver-eval", + desc: "Compare raw agent loops against SF autonomous solver control", + }, { cmd: "scaffold", desc: "Inspect or refresh ADR-021 versioned scaffold docs", diff --git a/src/resources/extensions/sf/commands/catalog.js b/src/resources/extensions/sf/commands/catalog.js index 1a96d00a8..e08dd8855 100644 --- a/src/resources/extensions/sf/commands/catalog.js +++ b/src/resources/extensions/sf/commands/catalog.js @@ -12,7 +12,7 @@ const sfHome = process.env.SF_HOME || join(homedir(), ".sf"); * Comprehensive description of all available SF commands for help text. 
*/ export const SF_COMMAND_DESCRIPTION = - "SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|reload|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|show-config|prefs|config|keys|hooks|run-hook|skill-health|doctor|uok|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|extract-learnings|eval-review|plan"; + "SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|reload|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|show-config|prefs|config|keys|hooks|run-hook|skill-health|doctor|uok|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|solver-eval|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|extract-learnings|eval-review|plan"; /** * Top-level SF subcommands with descriptions. 
*/ @@ -119,6 +119,10 @@ export const TOP_LEVEL_SUBCOMMANDS = [ cmd: "harness", desc: "Repo-native harness evolution (profile, status)", }, + { + cmd: "solver-eval", + desc: "Compare raw agent loops against SF autonomous solver control", + }, { cmd: "new-milestone", desc: "Create a milestone from a specification document (headless)", diff --git a/src/resources/extensions/sf/commands/handlers/ops.js b/src/resources/extensions/sf/commands/handlers/ops.js index 97716b006..6d43c2c33 100644 --- a/src/resources/extensions/sf/commands/handlers/ops.js +++ b/src/resources/extensions/sf/commands/handlers/ops.js @@ -278,6 +278,17 @@ Examples: await handleHarness(trimmed.replace(/^harness\s*/, "").trim(), ctx); return true; } + if (trimmed === "solver-eval" || trimmed.startsWith("solver-eval ")) { + const { handleAutonomousSolverEval } = await import( + "../../autonomous-solver-eval.js" + ); + await handleAutonomousSolverEval( + trimmed.replace(/^solver-eval\s*/, "").trim(), + ctx, + projectRoot(), + ); + return true; + } if (trimmed === "migrate" || trimmed.startsWith("migrate ")) { const { handleMigrate } = await import("../../migrate/command.js"); await handleMigrate(trimmed.replace(/^migrate\s*/, "").trim(), ctx, pi); diff --git a/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs b/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs new file mode 100644 index 000000000..c605daf22 --- /dev/null +++ b/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs @@ -0,0 +1,85 @@ +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, expect, test } from "vitest"; +import { + loadAutonomousSolverEvalCases, + parseAutonomousSolverEvalArgs, + runAutonomousSolverEval, + sampleAutonomousSolverEvalCases, +} from "../autonomous-solver-eval.js"; + +let tempDirs = []; + +function makeProject() { + const dir = mkdtempSync(join(tmpdir(), 
"sf-solver-eval-")); + tempDirs.push(dir); + return dir; +} + +afterEach(() => { + for (const dir of tempDirs) { + rmSync(dir, { recursive: true, force: true }); + } + tempDirs = []; +}); + +describe("autonomous solver eval", () => { + test("parseAutonomousSolverEvalArgs_defaults_to_sample", () => { + expect(parseAutonomousSolverEvalArgs("run")).toEqual({ + sample: true, + casesPath: null, + runId: null, + }); + expect( + parseAutonomousSolverEvalArgs("--cases cases.jsonl --run-id abc"), + ).toEqual({ + sample: false, + casesPath: "cases.jsonl", + runId: "abc", + }); + }); + + test("loadAutonomousSolverEvalCases_reads_jsonl_contract", () => { + const project = makeProject(); + const casesPath = join(project, "cases.jsonl"); + writeFileSync( + casesPath, + `${JSON.stringify({ + id: "case-a", + files: { "input.txt": "start" }, + rawCommand: [process.execPath, "-e", ""], + sfCommand: [process.execPath, "-e", ""], + assertions: [{ kind: "exists", path: "input.txt" }], + })}\n`, + ); + + const cases = loadAutonomousSolverEvalCases(casesPath); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe("case-a"); + expect(cases[0].assertions[0]).toEqual({ + kind: "exists", + path: "input.txt", + value: undefined, + }); + }); + + test("runAutonomousSolverEval_compares_raw_and_sf_solver_signals", () => { + const project = makeProject(); + const report = runAutonomousSolverEval({ + basePath: project, + runId: "sample-run", + cases: sampleAutonomousSolverEvalCases(), + }); + + expect(report.summary.cases).toBe(1); + expect(report.summary.sfWins).toBe(1); + expect(report.summary.rawFalseCompletes).toBe(1); + expect(report.summary.sfFalseCompletes).toBe(0); + const sfResult = report.results.find((r) => r.mode === "sf"); + expect(sfResult.passed).toBe(true); + expect(sfResult.solverSignals.hasCheckpoint).toBe(true); + expect(sfResult.solverSignals.pddComplete).toBe(true); + }); +});