feat: add autonomous solver eval command
parent 34140fff38
commit dc51baa19a
5 changed files with 622 additions and 1 deletion

src/resources/extensions/sf/autonomous-solver-eval.js (new file, 517 lines)

@@ -0,0 +1,517 @@
/**
 * autonomous-solver-eval.js — first-class eval runner for solver-loop value.
 *
 * Purpose: compare raw agent loops against SF's autonomous solver control
 * plane using the same task fixtures, deterministic assertions, and solver
 * observability signals.
 *
 * Consumer: `/sf solver-eval` and focused regression tests.
 */
import { spawnSync } from "node:child_process";
import {
  existsSync,
  mkdirSync,
  readdirSync,
  readFileSync,
  rmSync,
  writeFileSync,
} from "node:fs";
import { dirname, isAbsolute, join, relative, resolve } from "node:path";
import { atomicWriteSync } from "./atomic-write.js";
import { sfRoot } from "./paths.js";

const DEFAULT_TIMEOUT_MS = 5 * 60_000;
const MAX_OUTPUT_CHARS = 20_000;
const RUN_ID_RE = /^[a-z0-9][a-z0-9._-]{0,80}$/i;

function nowRunId() {
  return new Date().toISOString().replace(/[:.]/g, "-");
}

function safeRelPath(path) {
  const normalized = String(path ?? "").replaceAll("\\", "/");
  if (
    !normalized ||
    normalized.startsWith("/") ||
    normalized.split("/").includes("..")
  ) {
    throw new Error(`unsafe relative path: ${path}`);
  }
  return normalized;
}

function parseJsonLine(line, index, filePath) {
  try {
    return JSON.parse(line);
  } catch (err) {
    throw new Error(
      `${filePath}:${index + 1}: invalid JSON: ${err instanceof Error ? err.message : String(err)}`,
    );
  }
}

function normalizeCommand(raw, label) {
  if (!Array.isArray(raw) || raw.length === 0) {
    throw new Error(`${label} must be a non-empty string array`);
  }
  const command = String(raw[0] ?? "").trim();
  if (!command) throw new Error(`${label}[0] must be non-empty`);
  return {
    command,
    args: raw.slice(1).map((arg) => String(arg)),
  };
}

function normalizeAssertions(raw) {
  if (!Array.isArray(raw)) return [];
  return raw.map((assertion, index) => {
    const path = safeRelPath(assertion?.path);
    const kind = String(assertion?.kind ?? "exists");
    if (!["exists", "contains", "equals", "not_contains"].includes(kind)) {
      throw new Error(`assertions[${index}].kind is unsupported: ${kind}`);
    }
    return {
      kind,
      path,
      value:
        assertion?.value === undefined ? undefined : String(assertion.value),
    };
  });
}

function normalizeCase(raw, index, source) {
  const id = String(raw?.id ?? `case-${index + 1}`).trim();
  if (!RUN_ID_RE.test(id)) {
    throw new Error(`${source}:${index + 1}: invalid case id: ${id}`);
  }
  const files = raw.files && typeof raw.files === "object" ? raw.files : {};
  const normalizedFiles = Object.fromEntries(
    Object.entries(files).map(([path, content]) => [
      safeRelPath(path),
      String(content ?? ""),
    ]),
  );
  return {
    id,
    title: String(raw?.title ?? id),
    files: normalizedFiles,
    rawCommand: normalizeCommand(raw?.rawCommand, `${id}.rawCommand`),
    sfCommand: normalizeCommand(raw?.sfCommand, `${id}.sfCommand`),
    assertions: normalizeAssertions(raw?.assertions),
    timeoutMs: Number.isFinite(Number(raw?.timeoutMs))
      ? Math.max(1_000, Math.floor(Number(raw.timeoutMs)))
      : DEFAULT_TIMEOUT_MS,
  };
}

/**
 * Load JSONL solver eval cases from disk.
 *
 * Purpose: make solver claims reproducible from versioned or shared fixtures
 * instead of ad hoc manual demos.
 *
 * Consumer: `/sf solver-eval --cases <path>`.
 */
export function loadAutonomousSolverEvalCases(casesPath) {
  const abs = resolve(casesPath);
  const raw = readFileSync(abs, "utf-8");
  return raw
    .split("\n")
    .map((line) => line.trim())
    .filter((line) => line && !line.startsWith("#"))
    .map((line, index) =>
      normalizeCase(parseJsonLine(line, index, abs), index, abs),
    );
}

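// A minimal cases.jsonl line that satisfies the loader contract above (an
// illustrative sketch, not a shipped fixture; the id, files, commands, and
// assertion are made up for this example). Lines starting with "#" are skipped.
//
// {"id":"readme-note","title":"Add a note to README","files":{"README.md":"# demo\n"},"rawCommand":["node","-e",""],"sfCommand":["node","-e","require('node:fs').appendFileSync('README.md','note')"],"assertions":[{"kind":"contains","path":"README.md","value":"note"}],"timeoutMs":60000}
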
/**
 * Return a built-in deterministic sample case.
 *
 * Purpose: let operators verify the eval harness itself without spending model
 * quota or configuring external benchmark datasets.
 *
 * Consumer: `/sf solver-eval --sample` and tests.
 */
export function sampleAutonomousSolverEvalCases() {
  return [
    normalizeCase(
      {
        id: "sample-false-complete",
        title: "Raw loop says done without satisfying artifact contract",
        files: {
          "package.json": JSON.stringify(
            { name: "solver-eval-sample", version: "1.0.0" },
            null,
            2,
          ),
        },
        rawCommand: [
          process.execPath,
          "-e",
          "require('node:fs').writeFileSync('done.txt','done without target')",
        ],
        sfCommand: [
          process.execPath,
          "-e",
          [
            "const fs=require('node:fs');",
            "fs.mkdirSync('.sf/runtime/autonomous-solver',{recursive:true});",
            "fs.writeFileSync('target.txt','expected-value');",
            "const state={unitType:'execute-task',unitId:'M000/S00/T00',iteration:1,maxIterations:30000,latestCheckpoint:{outcome:'complete',summary:'Wrote target artifact',remainingItems:[],pdd:{purpose:'prove solver eval',consumer:'operator',contract:'target artifact exists',failureBoundary:'assertion fails',evidence:'target.txt',nonGoals:'no model call',invariants:'same fixture',assumptions:'node works'}}};",
            "fs.writeFileSync('.sf/runtime/autonomous-solver/active.json',JSON.stringify(state,null,2));",
            "fs.writeFileSync('.sf/runtime/autonomous-solver/iterations.jsonl',JSON.stringify(state.latestCheckpoint)+'\\n');",
          ].join(""),
        ],
        assertions: [
          { kind: "contains", path: "target.txt", value: "expected-value" },
        ],
      },
      0,
      "sample",
    ),
  ];
}

function writeFixtureFiles(workspace, files) {
  for (const [path, content] of Object.entries(files)) {
    const target = join(workspace, path);
    mkdirSync(dirname(target), { recursive: true });
    writeFileSync(target, content, "utf-8");
  }
}

function runCommand(workspace, command, timeoutMs) {
  const startedAt = Date.now();
  const result = spawnSync(command.command, command.args, {
    cwd: workspace,
    encoding: "utf-8",
    timeout: timeoutMs,
    stdio: ["ignore", "pipe", "pipe"],
    env: {
      ...process.env,
      SF_PROJECT_ROOT: workspace,
    },
  });
  const finishedAt = Date.now();
  return {
    command: [command.command, ...command.args],
    status: result.status,
    signal: result.signal,
    error: result.error ? String(result.error.message ?? result.error) : null,
    timedOut: result.error?.code === "ETIMEDOUT",
    durationMs: finishedAt - startedAt,
    stdout: String(result.stdout ?? "").slice(0, MAX_OUTPUT_CHARS),
    stderr: String(result.stderr ?? "").slice(0, MAX_OUTPUT_CHARS),
  };
}

function evaluateAssertions(workspace, assertions) {
  return assertions.map((assertion) => {
    const filePath = join(workspace, assertion.path);
    const exists = existsSync(filePath);
    let content = "";
    if (exists) content = readFileSync(filePath, "utf-8");
    let passed = exists;
    if (assertion.kind === "contains") {
      passed = exists && content.includes(assertion.value ?? "");
    } else if (assertion.kind === "not_contains") {
      passed = !exists || !content.includes(assertion.value ?? "");
    } else if (assertion.kind === "equals") {
      passed = exists && content === (assertion.value ?? "");
    }
    return {
      ...assertion,
      passed,
      actual: exists ? content.slice(0, 1000) : null,
    };
  });
}

function readJsonIfExists(path) {
  try {
    return JSON.parse(readFileSync(path, "utf-8"));
  } catch {
    return null;
  }
}

function readJsonlIfExists(path) {
  try {
    return readFileSync(path, "utf-8")
      .split("\n")
      .filter((line) => line.trim())
      .map((line) => {
        try {
          return JSON.parse(line);
        } catch {
          return null;
        }
      })
      .filter(Boolean);
  } catch {
    return [];
  }
}

function collectJournalEvents(workspace) {
  const dir = join(sfRoot(workspace), "journal");
  try {
    return readdirSync(dir)
      .filter((file) => file.endsWith(".jsonl"))
      .flatMap((file) => readJsonlIfExists(join(dir, file)));
  } catch {
    return [];
  }
}

function hasPddFields(checkpoint) {
  const pdd = checkpoint?.pdd ?? {};
  return [
    "purpose",
    "consumer",
    "contract",
    "failureBoundary",
    "evidence",
    "nonGoals",
    "invariants",
    "assumptions",
  ].every((field) => typeof pdd[field] === "string" && pdd[field].trim());
}

function collectSolverSignals(workspace) {
  const state = readJsonIfExists(
    join(sfRoot(workspace), "runtime", "autonomous-solver", "active.json"),
  );
  const checkpoints = readJsonlIfExists(
    join(sfRoot(workspace), "runtime", "autonomous-solver", "iterations.jsonl"),
  );
  const latestCheckpoint =
    state?.latestCheckpoint ?? checkpoints[checkpoints.length - 1] ?? null;
  const journalEvents = collectJournalEvents(workspace);
  return {
    hasState: Boolean(state),
    hasCheckpoint: Boolean(latestCheckpoint),
    outcome: latestCheckpoint?.outcome ?? null,
    iteration: state?.iteration ?? latestCheckpoint?.iteration ?? null,
    remainingCount: Array.isArray(latestCheckpoint?.remainingItems)
      ? latestCheckpoint.remainingItems.length
      : null,
    pddComplete: hasPddFields(latestCheckpoint),
    blockedOrDecisionSurfaced:
      latestCheckpoint?.outcome === "blocked" ||
      latestCheckpoint?.outcome === "decide",
    continueCount: checkpoints.filter((entry) => entry.outcome === "continue")
      .length,
    journalEventTypes: journalEvents.map((entry) => entry.eventType),
  };
}

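// For the built-in sample case above, the signals collected here come out
// roughly as follows (an illustrative sketch derived from the fixture's
// active.json and iterations.jsonl, not from a live solver run):
//
//   {
//     hasState: true,
//     hasCheckpoint: true,
//     outcome: "complete",
//     iteration: 1,
//     remainingCount: 0,
//     pddComplete: true,
//     blockedOrDecisionSurfaced: false,
//     continueCount: 0,
//     journalEventTypes: [],
//   }
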
function evaluateMode({ caseDef, workspace, mode, command }) {
  writeFixtureFiles(workspace, caseDef.files);
  const commandResult = runCommand(workspace, command, caseDef.timeoutMs);
  const assertionResults = evaluateAssertions(workspace, caseDef.assertions);
  const passedAssertions = assertionResults.every((result) => result.passed);
  const solverSignals =
    mode === "sf" ? collectSolverSignals(workspace) : undefined;
  const falseComplete =
    mode === "sf"
      ? solverSignals?.outcome === "complete" && !passedAssertions
      : commandResult.status === 0 && !passedAssertions;
  return {
    mode,
    workspace,
    command: commandResult,
    assertions: assertionResults,
    passed: commandResult.status === 0 && passedAssertions,
    falseComplete,
    ...(solverSignals ? { solverSignals } : {}),
  };
}

function summarizeResults(results) {
  const byCase = new Map();
  for (const result of results) {
    const entry = byCase.get(result.caseId) ?? {};
    entry[result.mode] = result;
    byCase.set(result.caseId, entry);
  }
  let sfWins = 0;
  let rawWins = 0;
  let ties = 0;
  let rawFalseCompletes = 0;
  let sfFalseCompletes = 0;
  for (const modes of byCase.values()) {
    if (modes.raw?.falseComplete) rawFalseCompletes += 1;
    if (modes.sf?.falseComplete) sfFalseCompletes += 1;
    if (modes.sf?.passed && !modes.raw?.passed) sfWins += 1;
    else if (modes.raw?.passed && !modes.sf?.passed) rawWins += 1;
    else ties += 1;
  }
  return {
    cases: byCase.size,
    sfWins,
    rawWins,
    ties,
    rawFalseCompletes,
    sfFalseCompletes,
  };
}

function resolveOutputDir(basePath, runId) {
  return join(sfRoot(basePath), "evals", "autonomous-solver", runId);
}

/**
 * Run the autonomous solver comparison eval.
 *
 * Purpose: produce local evidence for whether SF's solver loop improves
 * completion quality over a raw loop under identical task fixtures.
 *
 * Consumer: `/sf solver-eval run` and regression tests.
 */
export function runAutonomousSolverEval(options) {
  const basePath = resolve(options.basePath ?? process.cwd());
  const runId =
    options.runId && RUN_ID_RE.test(options.runId) ? options.runId : nowRunId();
  const cases = options.cases ?? sampleAutonomousSolverEvalCases();
  const outputDir = resolveOutputDir(basePath, runId);
  const workspaceRoot = join(outputDir, "workspaces");
  rmSync(outputDir, { recursive: true, force: true });
  mkdirSync(workspaceRoot, { recursive: true });

  const results = [];
  for (const caseDef of cases) {
    for (const mode of ["raw", "sf"]) {
      const workspace = join(workspaceRoot, caseDef.id, mode);
      mkdirSync(workspace, { recursive: true });
      const modeResult = evaluateMode({
        caseDef,
        workspace,
        mode,
        command: mode === "raw" ? caseDef.rawCommand : caseDef.sfCommand,
      });
      results.push({
        caseId: caseDef.id,
        title: caseDef.title,
        ...modeResult,
        workspace: relative(basePath, workspace),
      });
    }
  }

  const report = {
    schemaVersion: "sf-autonomous-solver-eval/v1",
    runId,
    createdAt: new Date().toISOString(),
    basePath,
    summary: summarizeResults(results),
    results,
  };
  atomicWriteSync(
    join(outputDir, "report.json"),
    `${JSON.stringify(report, null, 2)}\n`,
  );
  writeFileSync(
    join(outputDir, "results.jsonl"),
    results.map((result) => JSON.stringify(result)).join("\n") + "\n",
    "utf-8",
  );
  return {
    ...report,
    outputDir,
    relativeOutputDir: relative(basePath, outputDir),
  };
}

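// The report written by runAutonomousSolverEval above has roughly this shape
// (an illustrative sketch; the summary numbers shown are what the built-in
// sample case produces, matching this module's regression test):
//
//   {
//     "schemaVersion": "sf-autonomous-solver-eval/v1",
//     "runId": "sample-run",
//     "createdAt": "<ISO timestamp>",
//     "basePath": "<resolved base path>",
//     "summary": {
//       "cases": 1, "sfWins": 1, "rawWins": 0, "ties": 0,
//       "rawFalseCompletes": 1, "sfFalseCompletes": 0
//     },
//     "results": [
//       { "caseId": "sample-false-complete", "mode": "raw", "passed": false, "falseComplete": true, ... },
//       { "caseId": "sample-false-complete", "mode": "sf", "passed": true, "falseComplete": false, "solverSignals": { ... } }
//     ]
//   }
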
/**
 * Parse `/sf solver-eval` arguments.
 *
 * Purpose: keep command behavior explicit and reproducible while avoiding
 * shell parsing or hidden defaults.
 *
 * Consumer: `/sf solver-eval` handler.
 */
export function parseAutonomousSolverEvalArgs(raw) {
  const tokens = String(raw ?? "")
    .trim()
    .split(/\s+/)
    .filter(Boolean);
  const opts = { sample: false, casesPath: null, runId: null };
  for (let i = 0; i < tokens.length; i += 1) {
    const token = tokens[i];
    if (token === "run") continue;
    if (token === "--sample") {
      opts.sample = true;
      continue;
    }
    if (token === "--cases") {
      const value = tokens[i + 1];
      if (!value) throw new Error("--cases requires a path");
      opts.casesPath = value;
      i += 1;
      continue;
    }
    if (token === "--run-id") {
      const value = tokens[i + 1];
      if (!value || !RUN_ID_RE.test(value)) {
        throw new Error("--run-id requires a safe id");
      }
      opts.runId = value;
      i += 1;
      continue;
    }
    throw new Error(`unknown solver-eval argument: ${token}`);
  }
  if (!opts.sample && !opts.casesPath) opts.sample = true;
  return opts;
}

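// Invocations accepted by the parser above (any other token is rejected):
//
//   /sf solver-eval                              -> built-in sample case (default)
//   /sf solver-eval run --sample                 -> same, with explicit flags
//   /sf solver-eval --cases cases.jsonl          -> load cases from a JSONL file
//   /sf solver-eval --cases cases.jsonl --run-id nightly-01
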
/**
 * Handle `/sf solver-eval`.
 *
 * Purpose: expose solver-loop benchmarking as a first-class SF operation with
 * evidence stored under `.sf`, not as an external script.
 *
 * Consumer: ops command dispatcher.
 */
export async function handleAutonomousSolverEval(
  rawArgs,
  ctx,
  basePath = process.cwd(),
) {
  let args;
  try {
    args = parseAutonomousSolverEvalArgs(rawArgs);
  } catch (err) {
    ctx.ui.notify(
      `Usage: /sf solver-eval [run] [--sample | --cases <jsonl>] [--run-id <id>]\n${err instanceof Error ? err.message : String(err)}`,
      "warning",
    );
    return;
  }
  const cases = args.casesPath
    ? loadAutonomousSolverEvalCases(
        isAbsolute(args.casesPath)
          ? args.casesPath
          : join(basePath, args.casesPath),
      )
    : sampleAutonomousSolverEvalCases();
  const report = runAutonomousSolverEval({
    basePath,
    cases,
    runId: args.runId ?? undefined,
  });
  ctx.ui.notify(
    [
      "Autonomous solver eval complete",
      `Run: ${report.runId}`,
      `Evidence: ${report.relativeOutputDir}/report.json`,
      `Cases: ${report.summary.cases}`,
      `SF wins: ${report.summary.sfWins}`,
      `Raw wins: ${report.summary.rawWins}`,
      `Raw false-complete: ${report.summary.rawFalseCompletes}`,
      `SF false-complete: ${report.summary.sfFalseCompletes}`,
    ].join("\n"),
    "info",
  );
}

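The same eval can also be driven from a script without the command handler; a minimal sketch, assuming it runs from the repository root so the import path resolves:

import {
  runAutonomousSolverEval,
  sampleAutonomousSolverEvalCases,
} from "./src/resources/extensions/sf/autonomous-solver-eval.js";

// Runs the built-in sample case and prints the aggregate summary plus the
// evidence directory for the run, relative to basePath.
const report = runAutonomousSolverEval({
  basePath: process.cwd(),
  runId: "local-check",
  cases: sampleAutonomousSolverEvalCases(),
});
console.log(report.summary, report.relativeOutputDir);
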
@@ -60,6 +60,10 @@ const TOP_LEVEL_SUBCOMMANDS = [
    cmd: "codebase",
    desc: "Generate, refresh, and inspect the codebase map cache",
  },
  {
    cmd: "solver-eval",
    desc: "Compare raw agent loops against SF autonomous solver control",
  },
  {
    cmd: "scaffold",
    desc: "Inspect or refresh ADR-021 versioned scaffold docs",

@@ -12,7 +12,7 @@ const sfHome = process.env.SF_HOME || join(homedir(), ".sf");
 * Comprehensive description of all available SF commands for help text.
 */
export const SF_COMMAND_DESCRIPTION =
  "SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|reload|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|show-config|prefs|config|keys|hooks|run-hook|skill-health|doctor|uok|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|extract-learnings|eval-review|plan";
  "SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|reload|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|show-config|prefs|config|keys|hooks|run-hook|skill-health|doctor|uok|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|solver-eval|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|extract-learnings|eval-review|plan";
/**
 * Top-level SF subcommands with descriptions.
 */

@@ -119,6 +119,10 @@ export const TOP_LEVEL_SUBCOMMANDS = [
    cmd: "harness",
    desc: "Repo-native harness evolution (profile, status)",
  },
  {
    cmd: "solver-eval",
    desc: "Compare raw agent loops against SF autonomous solver control",
  },
  {
    cmd: "new-milestone",
    desc: "Create a milestone from a specification document (headless)",

@@ -278,6 +278,17 @@ Examples:
    await handleHarness(trimmed.replace(/^harness\s*/, "").trim(), ctx);
    return true;
  }
  if (trimmed === "solver-eval" || trimmed.startsWith("solver-eval ")) {
    const { handleAutonomousSolverEval } = await import(
      "../../autonomous-solver-eval.js"
    );
    await handleAutonomousSolverEval(
      trimmed.replace(/^solver-eval\s*/, "").trim(),
      ctx,
      projectRoot(),
    );
    return true;
  }
  if (trimmed === "migrate" || trimmed.startsWith("migrate ")) {
    const { handleMigrate } = await import("../../migrate/command.js");
    await handleMigrate(trimmed.replace(/^migrate\s*/, "").trim(), ctx, pi);

@@ -0,0 +1,85 @@
import { mkdtempSync, rmSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, describe, expect, test } from "vitest";
import {
  loadAutonomousSolverEvalCases,
  parseAutonomousSolverEvalArgs,
  runAutonomousSolverEval,
  sampleAutonomousSolverEvalCases,
} from "../autonomous-solver-eval.js";

let tempDirs = [];

function makeProject() {
  const dir = mkdtempSync(join(tmpdir(), "sf-solver-eval-"));
  tempDirs.push(dir);
  return dir;
}

afterEach(() => {
  for (const dir of tempDirs) {
    rmSync(dir, { recursive: true, force: true });
  }
  tempDirs = [];
});

describe("autonomous solver eval", () => {
  test("parseAutonomousSolverEvalArgs_defaults_to_sample", () => {
    expect(parseAutonomousSolverEvalArgs("run")).toEqual({
      sample: true,
      casesPath: null,
      runId: null,
    });
    expect(
      parseAutonomousSolverEvalArgs("--cases cases.jsonl --run-id abc"),
    ).toEqual({
      sample: false,
      casesPath: "cases.jsonl",
      runId: "abc",
    });
  });

  test("loadAutonomousSolverEvalCases_reads_jsonl_contract", () => {
    const project = makeProject();
    const casesPath = join(project, "cases.jsonl");
    writeFileSync(
      casesPath,
      `${JSON.stringify({
        id: "case-a",
        files: { "input.txt": "start" },
        rawCommand: [process.execPath, "-e", ""],
        sfCommand: [process.execPath, "-e", ""],
        assertions: [{ kind: "exists", path: "input.txt" }],
      })}\n`,
    );

    const cases = loadAutonomousSolverEvalCases(casesPath);

    expect(cases).toHaveLength(1);
    expect(cases[0].id).toBe("case-a");
    expect(cases[0].assertions[0]).toEqual({
      kind: "exists",
      path: "input.txt",
      value: undefined,
    });
  });

  test("runAutonomousSolverEval_compares_raw_and_sf_solver_signals", () => {
    const project = makeProject();
    const report = runAutonomousSolverEval({
      basePath: project,
      runId: "sample-run",
      cases: sampleAutonomousSolverEvalCases(),
    });

    expect(report.summary.cases).toBe(1);
    expect(report.summary.sfWins).toBe(1);
    expect(report.summary.rawFalseCompletes).toBe(1);
    expect(report.summary.sfFalseCompletes).toBe(0);
    const sfResult = report.results.find((r) => r.mode === "sf");
    expect(sfResult.passed).toBe(true);
    expect(sfResult.solverSignals.hasCheckpoint).toBe(true);
    expect(sfResult.solverSignals.pddComplete).toBe(true);
  });
});