From dc51baa19a3ddeac10c0893464a20a2efef55e10 Mon Sep 17 00:00:00 2001
From: Mikael Hugo
Date: Wed, 6 May 2026 03:37:58 +0200
Subject: [PATCH] feat: add autonomous solver eval command

---
 .../extensions/sf/autonomous-solver-eval.js | 517 ++++++++++++++++++
 .../extensions/sf/commands-bootstrap.js | 4 +
 .../extensions/sf/commands/catalog.js | 6 +-
 .../extensions/sf/commands/handlers/ops.js | 11 +
 .../sf/tests/autonomous-solver-eval.test.mjs | 85 +++
 5 files changed, 622 insertions(+), 1 deletion(-)
 create mode 100644 src/resources/extensions/sf/autonomous-solver-eval.js
 create mode 100644 src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs

diff --git a/src/resources/extensions/sf/autonomous-solver-eval.js b/src/resources/extensions/sf/autonomous-solver-eval.js
new file mode 100644
index 000000000..e2f568841
--- /dev/null
+++ b/src/resources/extensions/sf/autonomous-solver-eval.js
@@ -0,0 +1,517 @@
+/**
+ * autonomous-solver-eval.js — first-class eval runner for solver-loop value.
+ *
+ * Purpose: compare raw agent loops against SF's autonomous solver control
+ * plane using the same task fixtures, deterministic assertions, and solver
+ * observability signals.
+ *
+ * Consumer: `/sf solver-eval` and focused regression tests.
+ */
+import { spawnSync } from "node:child_process";
+import {
+  existsSync,
+  mkdirSync,
+  readdirSync,
+  readFileSync,
+  rmSync,
+  writeFileSync,
+} from "node:fs";
+import { dirname, isAbsolute, join, relative, resolve } from "node:path";
+import { atomicWriteSync } from "./atomic-write.js";
+import { sfRoot } from "./paths.js";
+
+const DEFAULT_TIMEOUT_MS = 5 * 60_000;
+const MAX_OUTPUT_CHARS = 20_000;
+const RUN_ID_RE = /^[a-z0-9][a-z0-9._-]{0,80}$/i;
+
+function nowRunId() {
+  return new Date().toISOString().replace(/[:.]/g, "-");
+}
+
+function safeRelPath(path) {
+  const normalized = String(path ?? 
"").replaceAll("\\", "/"); + if ( + !normalized || + normalized.startsWith("/") || + normalized.split("/").includes("..") + ) { + throw new Error(`unsafe relative path: ${path}`); + } + return normalized; +} + +function parseJsonLine(line, index, filePath) { + try { + return JSON.parse(line); + } catch (err) { + throw new Error( + `${filePath}:${index + 1}: invalid JSON: ${err instanceof Error ? err.message : String(err)}`, + ); + } +} + +function normalizeCommand(raw, label) { + if (!Array.isArray(raw) || raw.length === 0) { + throw new Error(`${label} must be a non-empty string array`); + } + const command = String(raw[0] ?? "").trim(); + if (!command) throw new Error(`${label}[0] must be non-empty`); + return { + command, + args: raw.slice(1).map((arg) => String(arg)), + }; +} + +function normalizeAssertions(raw) { + if (!Array.isArray(raw)) return []; + return raw.map((assertion, index) => { + const path = safeRelPath(assertion?.path); + const kind = String(assertion?.kind ?? "exists"); + if (!["exists", "contains", "equals", "not_contains"].includes(kind)) { + throw new Error(`assertions[${index}].kind is unsupported: ${kind}`); + } + return { + kind, + path, + value: + assertion?.value === undefined ? undefined : String(assertion.value), + }; + }); +} + +function normalizeCase(raw, index, source) { + const id = String(raw?.id ?? `case-${index + 1}`).trim(); + if (!RUN_ID_RE.test(id)) { + throw new Error(`${source}:${index + 1}: invalid case id: ${id}`); + } + const files = raw.files && typeof raw.files === "object" ? raw.files : {}; + const normalizedFiles = Object.fromEntries( + Object.entries(files).map(([path, content]) => [ + safeRelPath(path), + String(content ?? ""), + ]), + ); + return { + id, + title: String(raw?.title ?? 
id), + files: normalizedFiles, + rawCommand: normalizeCommand(raw?.rawCommand, `${id}.rawCommand`), + sfCommand: normalizeCommand(raw?.sfCommand, `${id}.sfCommand`), + assertions: normalizeAssertions(raw?.assertions), + timeoutMs: Number.isFinite(Number(raw?.timeoutMs)) + ? Math.max(1_000, Math.floor(Number(raw.timeoutMs))) + : DEFAULT_TIMEOUT_MS, + }; +} + +/** + * Load JSONL solver eval cases from disk. + * + * Purpose: make solver claims reproducible from versioned or shared fixtures + * instead of ad hoc manual demos. + * + * Consumer: `/sf solver-eval --cases `. + */ +export function loadAutonomousSolverEvalCases(casesPath) { + const abs = resolve(casesPath); + const raw = readFileSync(abs, "utf-8"); + return raw + .split("\n") + .map((line) => line.trim()) + .filter((line) => line && !line.startsWith("#")) + .map((line, index) => + normalizeCase(parseJsonLine(line, index, abs), index, abs), + ); +} + +/** + * Return a built-in deterministic sample case. + * + * Purpose: let operators verify the eval harness itself without spending model + * quota or configuring external benchmark datasets. + * + * Consumer: `/sf solver-eval --sample` and tests. 
+ */ +export function sampleAutonomousSolverEvalCases() { + return [ + normalizeCase( + { + id: "sample-false-complete", + title: "Raw loop says done without satisfying artifact contract", + files: { + "package.json": JSON.stringify( + { name: "solver-eval-sample", version: "1.0.0" }, + null, + 2, + ), + }, + rawCommand: [ + process.execPath, + "-e", + "require('node:fs').writeFileSync('done.txt','done without target')", + ], + sfCommand: [ + process.execPath, + "-e", + [ + "const fs=require('node:fs');", + "fs.mkdirSync('.sf/runtime/autonomous-solver',{recursive:true});", + "fs.writeFileSync('target.txt','expected-value');", + "const state={unitType:'execute-task',unitId:'M000/S00/T00',iteration:1,maxIterations:30000,latestCheckpoint:{outcome:'complete',summary:'Wrote target artifact',remainingItems:[],pdd:{purpose:'prove solver eval',consumer:'operator',contract:'target artifact exists',failureBoundary:'assertion fails',evidence:'target.txt',nonGoals:'no model call',invariants:'same fixture',assumptions:'node works'}}};", + "fs.writeFileSync('.sf/runtime/autonomous-solver/active.json',JSON.stringify(state,null,2));", + "fs.writeFileSync('.sf/runtime/autonomous-solver/iterations.jsonl',JSON.stringify(state.latestCheckpoint)+'\\n');", + ].join(""), + ], + assertions: [ + { kind: "contains", path: "target.txt", value: "expected-value" }, + ], + }, + 0, + "sample", + ), + ]; +} + +function writeFixtureFiles(workspace, files) { + for (const [path, content] of Object.entries(files)) { + const target = join(workspace, path); + mkdirSync(dirname(target), { recursive: true }); + writeFileSync(target, content, "utf-8"); + } +} + +function runCommand(workspace, command, timeoutMs) { + const startedAt = Date.now(); + const result = spawnSync(command.command, command.args, { + cwd: workspace, + encoding: "utf-8", + timeout: timeoutMs, + stdio: ["ignore", "pipe", "pipe"], + env: { + ...process.env, + SF_PROJECT_ROOT: workspace, + }, + }); + const finishedAt = Date.now(); + 
return { + command: [command.command, ...command.args], + status: result.status, + signal: result.signal, + error: result.error ? String(result.error.message ?? result.error) : null, + timedOut: result.error?.code === "ETIMEDOUT", + durationMs: finishedAt - startedAt, + stdout: String(result.stdout ?? "").slice(0, MAX_OUTPUT_CHARS), + stderr: String(result.stderr ?? "").slice(0, MAX_OUTPUT_CHARS), + }; +} + +function evaluateAssertions(workspace, assertions) { + return assertions.map((assertion) => { + const filePath = join(workspace, assertion.path); + const exists = existsSync(filePath); + let content = ""; + if (exists) content = readFileSync(filePath, "utf-8"); + let passed = exists; + if (assertion.kind === "contains") { + passed = exists && content.includes(assertion.value ?? ""); + } else if (assertion.kind === "not_contains") { + passed = !exists || !content.includes(assertion.value ?? ""); + } else if (assertion.kind === "equals") { + passed = exists && content === (assertion.value ?? ""); + } + return { + ...assertion, + passed, + actual: exists ? content.slice(0, 1000) : null, + }; + }); +} + +function readJsonIfExists(path) { + try { + return JSON.parse(readFileSync(path, "utf-8")); + } catch { + return null; + } +} + +function readJsonlIfExists(path) { + try { + return readFileSync(path, "utf-8") + .split("\n") + .filter((line) => line.trim()) + .map((line) => { + try { + return JSON.parse(line); + } catch { + return null; + } + }) + .filter(Boolean); + } catch { + return []; + } +} + +function collectJournalEvents(workspace) { + const dir = join(sfRoot(workspace), "journal"); + try { + return readdirSync(dir) + .filter((file) => file.endsWith(".jsonl")) + .flatMap((file) => readJsonlIfExists(join(dir, file))); + } catch { + return []; + } +} + +function hasPddFields(checkpoint) { + const pdd = checkpoint?.pdd ?? 
{}; + return [ + "purpose", + "consumer", + "contract", + "failureBoundary", + "evidence", + "nonGoals", + "invariants", + "assumptions", + ].every((field) => typeof pdd[field] === "string" && pdd[field].trim()); +} + +function collectSolverSignals(workspace) { + const state = readJsonIfExists( + join(sfRoot(workspace), "runtime", "autonomous-solver", "active.json"), + ); + const checkpoints = readJsonlIfExists( + join(sfRoot(workspace), "runtime", "autonomous-solver", "iterations.jsonl"), + ); + const latestCheckpoint = + state?.latestCheckpoint ?? checkpoints[checkpoints.length - 1] ?? null; + const journalEvents = collectJournalEvents(workspace); + return { + hasState: Boolean(state), + hasCheckpoint: Boolean(latestCheckpoint), + outcome: latestCheckpoint?.outcome ?? null, + iteration: state?.iteration ?? latestCheckpoint?.iteration ?? null, + remainingCount: Array.isArray(latestCheckpoint?.remainingItems) + ? latestCheckpoint.remainingItems.length + : null, + pddComplete: hasPddFields(latestCheckpoint), + blockedOrDecisionSurfaced: + latestCheckpoint?.outcome === "blocked" || + latestCheckpoint?.outcome === "decide", + continueCount: checkpoints.filter((entry) => entry.outcome === "continue") + .length, + journalEventTypes: journalEvents.map((entry) => entry.eventType), + }; +} + +function evaluateMode({ caseDef, workspace, mode, command }) { + writeFixtureFiles(workspace, caseDef.files); + const commandResult = runCommand(workspace, command, caseDef.timeoutMs); + const assertionResults = evaluateAssertions(workspace, caseDef.assertions); + const passedAssertions = assertionResults.every((result) => result.passed); + const solverSignals = + mode === "sf" ? collectSolverSignals(workspace) : undefined; + const falseComplete = + mode === "sf" + ? 
solverSignals?.outcome === "complete" && !passedAssertions + : commandResult.status === 0 && !passedAssertions; + return { + mode, + workspace, + command: commandResult, + assertions: assertionResults, + passed: commandResult.status === 0 && passedAssertions, + falseComplete, + ...(solverSignals ? { solverSignals } : {}), + }; +} + +function summarizeResults(results) { + const byCase = new Map(); + for (const result of results) { + const entry = byCase.get(result.caseId) ?? {}; + entry[result.mode] = result; + byCase.set(result.caseId, entry); + } + let sfWins = 0; + let rawWins = 0; + let ties = 0; + let rawFalseCompletes = 0; + let sfFalseCompletes = 0; + for (const modes of byCase.values()) { + if (modes.raw?.falseComplete) rawFalseCompletes += 1; + if (modes.sf?.falseComplete) sfFalseCompletes += 1; + if (modes.sf?.passed && !modes.raw?.passed) sfWins += 1; + else if (modes.raw?.passed && !modes.sf?.passed) rawWins += 1; + else ties += 1; + } + return { + cases: byCase.size, + sfWins, + rawWins, + ties, + rawFalseCompletes, + sfFalseCompletes, + }; +} + +function resolveOutputDir(basePath, runId) { + return join(sfRoot(basePath), "evals", "autonomous-solver", runId); +} + +/** + * Run the autonomous solver comparison eval. + * + * Purpose: produce local evidence for whether SF's solver loop improves + * completion quality over a raw loop under identical task fixtures. + * + * Consumer: `/sf solver-eval run` and regression tests. + */ +export function runAutonomousSolverEval(options) { + const basePath = resolve(options.basePath ?? process.cwd()); + const runId = + options.runId && RUN_ID_RE.test(options.runId) ? options.runId : nowRunId(); + const cases = options.cases ?? 
sampleAutonomousSolverEvalCases(); + const outputDir = resolveOutputDir(basePath, runId); + const workspaceRoot = join(outputDir, "workspaces"); + rmSync(outputDir, { recursive: true, force: true }); + mkdirSync(workspaceRoot, { recursive: true }); + + const results = []; + for (const caseDef of cases) { + for (const mode of ["raw", "sf"]) { + const workspace = join(workspaceRoot, caseDef.id, mode); + mkdirSync(workspace, { recursive: true }); + const modeResult = evaluateMode({ + caseDef, + workspace, + mode, + command: mode === "raw" ? caseDef.rawCommand : caseDef.sfCommand, + }); + results.push({ + caseId: caseDef.id, + title: caseDef.title, + ...modeResult, + workspace: relative(basePath, workspace), + }); + } + } + + const report = { + schemaVersion: "sf-autonomous-solver-eval/v1", + runId, + createdAt: new Date().toISOString(), + basePath, + summary: summarizeResults(results), + results, + }; + atomicWriteSync( + join(outputDir, "report.json"), + `${JSON.stringify(report, null, 2)}\n`, + ); + writeFileSync( + join(outputDir, "results.jsonl"), + results.map((result) => JSON.stringify(result)).join("\n") + "\n", + "utf-8", + ); + return { + ...report, + outputDir, + relativeOutputDir: relative(basePath, outputDir), + }; +} + +/** + * Parse `/sf solver-eval` arguments. + * + * Purpose: keep command behavior explicit and reproducible while avoiding + * shell parsing or hidden defaults. + * + * Consumer: `/sf solver-eval` handler. + */ +export function parseAutonomousSolverEvalArgs(raw) { + const tokens = String(raw ?? 
"") + .trim() + .split(/\s+/) + .filter(Boolean); + const opts = { sample: false, casesPath: null, runId: null }; + for (let i = 0; i < tokens.length; i += 1) { + const token = tokens[i]; + if (token === "run") continue; + if (token === "--sample") { + opts.sample = true; + continue; + } + if (token === "--cases") { + const value = tokens[i + 1]; + if (!value) throw new Error("--cases requires a path"); + opts.casesPath = value; + i += 1; + continue; + } + if (token === "--run-id") { + const value = tokens[i + 1]; + if (!value || !RUN_ID_RE.test(value)) { + throw new Error("--run-id requires a safe id"); + } + opts.runId = value; + i += 1; + continue; + } + throw new Error(`unknown solver-eval argument: ${token}`); + } + if (!opts.sample && !opts.casesPath) opts.sample = true; + return opts; +} + +/** + * Handle `/sf solver-eval`. + * + * Purpose: expose solver-loop benchmarking as a first-class SF operation with + * evidence stored under `.sf`, not as an external script. + * + * Consumer: ops command dispatcher. + */ +export async function handleAutonomousSolverEval( + rawArgs, + ctx, + basePath = process.cwd(), +) { + let args; + try { + args = parseAutonomousSolverEvalArgs(rawArgs); + } catch (err) { + ctx.ui.notify( + `Usage: /sf solver-eval [run] [--sample | --cases ] [--run-id ]\n${err instanceof Error ? err.message : String(err)}`, + "warning", + ); + return; + } + const cases = args.casesPath + ? loadAutonomousSolverEvalCases( + isAbsolute(args.casesPath) + ? args.casesPath + : join(basePath, args.casesPath), + ) + : sampleAutonomousSolverEvalCases(); + const report = runAutonomousSolverEval({ + basePath, + cases, + runId: args.runId ?? 
undefined, + }); + ctx.ui.notify( + [ + "Autonomous solver eval complete", + `Run: ${report.runId}`, + `Evidence: ${report.relativeOutputDir}/report.json`, + `Cases: ${report.summary.cases}`, + `SF wins: ${report.summary.sfWins}`, + `Raw wins: ${report.summary.rawWins}`, + `Raw false-complete: ${report.summary.rawFalseCompletes}`, + `SF false-complete: ${report.summary.sfFalseCompletes}`, + ].join("\n"), + "info", + ); +} diff --git a/src/resources/extensions/sf/commands-bootstrap.js b/src/resources/extensions/sf/commands-bootstrap.js index 4808c7080..5ef7d1ae3 100644 --- a/src/resources/extensions/sf/commands-bootstrap.js +++ b/src/resources/extensions/sf/commands-bootstrap.js @@ -60,6 +60,10 @@ const TOP_LEVEL_SUBCOMMANDS = [ cmd: "codebase", desc: "Generate, refresh, and inspect the codebase map cache", }, + { + cmd: "solver-eval", + desc: "Compare raw agent loops against SF autonomous solver control", + }, { cmd: "scaffold", desc: "Inspect or refresh ADR-021 versioned scaffold docs", diff --git a/src/resources/extensions/sf/commands/catalog.js b/src/resources/extensions/sf/commands/catalog.js index 1a96d00a8..e08dd8855 100644 --- a/src/resources/extensions/sf/commands/catalog.js +++ b/src/resources/extensions/sf/commands/catalog.js @@ -12,7 +12,7 @@ const sfHome = process.env.SF_HOME || join(homedir(), ".sf"); * Comprehensive description of all available SF commands for help text. 
*/ export const SF_COMMAND_DESCRIPTION = - "SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|reload|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|show-config|prefs|config|keys|hooks|run-hook|skill-health|doctor|uok|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|extract-learnings|eval-review|plan"; + "SF — Singularity Forge: /sf help|start|templates|next|autonomous|stop|pause|reload|status|widget|visualize|queue|quick|discuss|capture|triage|todo|dispatch|history|undo|undo-task|reset-slice|rate|skip|export|cleanup|model|mode|show-config|prefs|config|keys|hooks|run-hook|skill-health|doctor|uok|logs|forensics|changelog|migrate|remote|steer|knowledge|harness|solver-eval|new-milestone|parallel|cmux|park|unpark|init|setup|inspect|extensions|update|fast|mcp|rethink|codebase|notifications|ship|do|session-report|backlog|pr-branch|add-tests|scan|scaffold|extract-learnings|eval-review|plan"; /** * Top-level SF subcommands with descriptions. 
*/ @@ -119,6 +119,10 @@ export const TOP_LEVEL_SUBCOMMANDS = [ cmd: "harness", desc: "Repo-native harness evolution (profile, status)", }, + { + cmd: "solver-eval", + desc: "Compare raw agent loops against SF autonomous solver control", + }, { cmd: "new-milestone", desc: "Create a milestone from a specification document (headless)", diff --git a/src/resources/extensions/sf/commands/handlers/ops.js b/src/resources/extensions/sf/commands/handlers/ops.js index 97716b006..6d43c2c33 100644 --- a/src/resources/extensions/sf/commands/handlers/ops.js +++ b/src/resources/extensions/sf/commands/handlers/ops.js @@ -278,6 +278,17 @@ Examples: await handleHarness(trimmed.replace(/^harness\s*/, "").trim(), ctx); return true; } + if (trimmed === "solver-eval" || trimmed.startsWith("solver-eval ")) { + const { handleAutonomousSolverEval } = await import( + "../../autonomous-solver-eval.js" + ); + await handleAutonomousSolverEval( + trimmed.replace(/^solver-eval\s*/, "").trim(), + ctx, + projectRoot(), + ); + return true; + } if (trimmed === "migrate" || trimmed.startsWith("migrate ")) { const { handleMigrate } = await import("../../migrate/command.js"); await handleMigrate(trimmed.replace(/^migrate\s*/, "").trim(), ctx, pi); diff --git a/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs b/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs new file mode 100644 index 000000000..c605daf22 --- /dev/null +++ b/src/resources/extensions/sf/tests/autonomous-solver-eval.test.mjs @@ -0,0 +1,85 @@ +import { mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, describe, expect, test } from "vitest"; +import { + loadAutonomousSolverEvalCases, + parseAutonomousSolverEvalArgs, + runAutonomousSolverEval, + sampleAutonomousSolverEvalCases, +} from "../autonomous-solver-eval.js"; + +let tempDirs = []; + +function makeProject() { + const dir = mkdtempSync(join(tmpdir(), 
"sf-solver-eval-")); + tempDirs.push(dir); + return dir; +} + +afterEach(() => { + for (const dir of tempDirs) { + rmSync(dir, { recursive: true, force: true }); + } + tempDirs = []; +}); + +describe("autonomous solver eval", () => { + test("parseAutonomousSolverEvalArgs_defaults_to_sample", () => { + expect(parseAutonomousSolverEvalArgs("run")).toEqual({ + sample: true, + casesPath: null, + runId: null, + }); + expect( + parseAutonomousSolverEvalArgs("--cases cases.jsonl --run-id abc"), + ).toEqual({ + sample: false, + casesPath: "cases.jsonl", + runId: "abc", + }); + }); + + test("loadAutonomousSolverEvalCases_reads_jsonl_contract", () => { + const project = makeProject(); + const casesPath = join(project, "cases.jsonl"); + writeFileSync( + casesPath, + `${JSON.stringify({ + id: "case-a", + files: { "input.txt": "start" }, + rawCommand: [process.execPath, "-e", ""], + sfCommand: [process.execPath, "-e", ""], + assertions: [{ kind: "exists", path: "input.txt" }], + })}\n`, + ); + + const cases = loadAutonomousSolverEvalCases(casesPath); + + expect(cases).toHaveLength(1); + expect(cases[0].id).toBe("case-a"); + expect(cases[0].assertions[0]).toEqual({ + kind: "exists", + path: "input.txt", + value: undefined, + }); + }); + + test("runAutonomousSolverEval_compares_raw_and_sf_solver_signals", () => { + const project = makeProject(); + const report = runAutonomousSolverEval({ + basePath: project, + runId: "sample-run", + cases: sampleAutonomousSolverEvalCases(), + }); + + expect(report.summary.cases).toBe(1); + expect(report.summary.sfWins).toBe(1); + expect(report.summary.rawFalseCompletes).toBe(1); + expect(report.summary.sfFalseCompletes).toBe(0); + const sfResult = report.results.find((r) => r.mode === "sf"); + expect(sfResult.passed).toBe(true); + expect(sfResult.solverSignals.hasCheckpoint).toBe(true); + expect(sfResult.solverSignals.pddComplete).toBe(true); + }); +});