Harden SF model routing and harness contracts

parent 37c5db3dd3
commit cd69e85608
69 changed files with 3967 additions and 247 deletions

package-lock.json (generated): 1129 lines changed; diff suppressed because it is too large.

package.json: 20 lines changed
|
|
@ -49,7 +49,7 @@
|
|||
"build:rpc-client": "bun run --filter @singularity-forge/rpc-client build",
|
||||
"build:pi": "npm run build:native-pkg && npm run build:pi-tui && npm run build:pi-ai && npm run build:pi-agent-core && npm run build:pi-coding-agent",
|
||||
"build:mcp-server": "bun run --filter @singularity-forge/mcp-server build",
|
||||
"build:core": "npm run build:pi && npm run build:rpc-client && npm run build:mcp-server && tsc && npm run copy-resources && npm run copy-themes && npm run copy-export-html",
|
||||
"build:core": "npm run build:pi && npm run build:rpc-client && npm run build:mcp-server && npm run check:versioned-json && tsc && npm run copy-resources && npm run copy-themes && npm run copy-export-html",
|
||||
"build": "npm run build:core && node scripts/build-web-if-stale.cjs",
|
||||
"stage:web-host": "node scripts/stage-web-standalone.cjs",
|
||||
"build:web-host": "npm --prefix web run build && npm run stage:web-host",
|
||||
|
|
@ -88,9 +88,10 @@
|
|||
"sync-pkg-version": "node scripts/sync-pkg-version.cjs",
|
||||
"sync-platform-versions": "node native/scripts/sync-platform-versions.cjs",
|
||||
"validate-pack": "node scripts/validate-pack.js",
|
||||
"typecheck:extensions": "tsc --noEmit --project tsconfig.extensions.json",
|
||||
"lint": "biome check src/",
|
||||
"lint:fix": "biome check src/ --write",
|
||||
"typecheck:extensions": "npm run check:versioned-json && tsc --noEmit --project tsconfig.extensions.json",
|
||||
"check:versioned-json": "node scripts/check-versioned-json.mjs",
|
||||
"lint": "npm run check:versioned-json && biome check src/",
|
||||
"lint:fix": "npm run check:versioned-json && biome check src/ --write",
|
||||
"pipeline:version-stamp": "node scripts/version-stamp.mjs",
|
||||
"release:changelog": "node scripts/generate-changelog.mjs",
|
||||
"release:bump": "node scripts/bump-version.mjs",
|
||||
|
|
@ -111,19 +112,22 @@
|
|||
"@modelcontextprotocol/sdk": "^1.27.1",
|
||||
"@octokit/rest": "^22.0.1",
|
||||
"@silvia-odwyer/photon-node": "^0.3.4",
|
||||
"@sinclair/typebox": "^0.34.41",
|
||||
"@sinclair/typebox": "^0.34.49",
|
||||
"@types/mime-types": "^2.1.4",
|
||||
"ajv": "^8.17.1",
|
||||
"ajv": "^8.20.0",
|
||||
"ajv-formats": "^3.0.1",
|
||||
"chalk": "^5.6.2",
|
||||
"chokidar": "^5.0.0",
|
||||
"diff": "^8.0.2",
|
||||
"extract-zip": "^2.0.1",
|
||||
"fast-check": "^4.7.0",
|
||||
"file-type": "^21.1.1",
|
||||
"get-east-asian-width": "^1.3.0",
|
||||
"glob": "^13.0.1",
|
||||
"hosted-git-info": "^9.0.2",
|
||||
"ignore": "^7.0.5",
|
||||
"jsonrepair": "^3.14.0",
|
||||
"markdownlint": "^0.40.0",
|
||||
"marked": "^15.0.12",
|
||||
"mime-types": "^3.0.1",
|
||||
"minimatch": "^10.2.3",
|
||||
|
|
@ -132,12 +136,16 @@
|
|||
"playwright": "^1.58.2",
|
||||
"proper-lockfile": "^4.1.2",
|
||||
"proxy-agent": "^6.5.0",
|
||||
"remark-parse": "^11.0.0",
|
||||
"sharp": "^0.34.5",
|
||||
"shell-quote": "^1.8.3",
|
||||
"sql.js": "^1.14.1",
|
||||
"strip-ansi": "^7.1.0",
|
||||
"undici": "^7.24.2",
|
||||
"unified": "^11.0.5",
|
||||
"unist-util-visit": "^5.1.0",
|
||||
"yaml": "^2.8.2",
|
||||
"zod": "^4.4.1",
|
||||
"zod-to-json-schema": "^3.24.6"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
|
|
|||
|
|
@ -34,9 +34,11 @@
|
|||
"ajv-formats": "^3.0.1",
|
||||
"chalk": "^5.6.2",
|
||||
"gaxios": "^6",
|
||||
"jsonrepair": "^3.14.0",
|
||||
"openai": "^6.26.0",
|
||||
"proxy-agent": "^6.5.0",
|
||||
"undici": "^7.24.2",
|
||||
"yaml": "^2.8.3",
|
||||
"zod-to-json-schema": "^3.24.6"
|
||||
},
|
||||
"devDependencies": {
|
||||
|
|
|
|||
|
|
@ -1,5 +1,9 @@
|
|||
import { parseStreamingJson as nativeParseStreamingJson } from "@singularity-forge/native";
|
||||
import { hasXmlParameterTags, hasYamlBulletLists, repairToolJson } from "./repair-tool-json.js";
|
||||
import {
|
||||
hasXmlParameterTags,
|
||||
hasYamlBulletLists,
|
||||
repairToolJsonWithReport,
|
||||
} from "./repair-tool-json.js";
|
||||
|
||||
/**
|
||||
* Attempts to parse potentially incomplete JSON during streaming.
|
||||
|
|
@ -17,6 +21,9 @@ export function parseStreamingJson<T = any>(partialJson: string | undefined): T
|
|||
if (!partialJson || partialJson.trim() === "") {
|
||||
return {} as T;
|
||||
}
|
||||
if (looksLikeIncompleteObjectValue(partialJson)) {
|
||||
return {} as T;
|
||||
}
|
||||
|
||||
// Fast path: try native streaming parser first
|
||||
const result = nativeParseStreamingJson<T>(partialJson);
|
||||
|
|
@ -25,23 +32,24 @@ export function parseStreamingJson<T = any>(partialJson: string | undefined): T
|
|||
// so run repair before trusting the native parse result.
|
||||
if (hasXmlParameterTags(partialJson)) {
|
||||
try {
|
||||
return JSON.parse(repairToolJson(partialJson)) as T;
|
||||
return JSON.parse(repairToolJsonWithReport(partialJson).output) as T;
|
||||
} catch {
|
||||
// Fall through to the native parser result on incomplete partials
|
||||
}
|
||||
}
|
||||
|
||||
// If the native parser returned a non-empty result, use it.
|
||||
// Only attempt repair when the result is empty AND the input
|
||||
// contains YAML bullet patterns (avoids unnecessary work).
|
||||
// Only attempt repair when the result is empty AND the input looks like a
|
||||
// complete malformed object or YAML-shaped map. This avoids inventing
|
||||
// values for ordinary incomplete streaming chunks.
|
||||
if (
|
||||
result &&
|
||||
typeof result === "object" &&
|
||||
Object.keys(result as object).length === 0 &&
|
||||
hasYamlBulletLists(partialJson)
|
||||
shouldAttemptRepair(partialJson)
|
||||
) {
|
||||
try {
|
||||
return JSON.parse(repairToolJson(partialJson)) as T;
|
||||
return JSON.parse(repairToolJsonWithReport(partialJson).output) as T;
|
||||
} catch {
|
||||
// Repair failed — return the empty object from native parser
|
||||
}
|
||||
|
|
@ -49,3 +57,25 @@ export function parseStreamingJson<T = any>(partialJson: string | undefined): T
|
|||
|
||||
return result;
|
||||
}
|
||||
|
||||
function looksLikeIncompleteObjectValue(input: string): boolean {
|
||||
const trimmed = input.trim();
|
||||
return trimmed.startsWith("{") && /:\s*$/.test(trimmed);
|
||||
}
|
||||
|
||||
function shouldAttemptRepair(input: string): boolean {
|
||||
if (hasXmlParameterTags(input) || hasYamlBulletLists(input)) return true;
|
||||
|
||||
const trimmed = input.trim();
|
||||
if (!trimmed) return false;
|
||||
if (
|
||||
(trimmed.startsWith("{") && trimmed.endsWith("}")) ||
|
||||
(trimmed.startsWith("[") && trimmed.endsWith("]"))
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
// Full YAML map/list tool arguments from weaker models. Require a newline
|
||||
// so normal prose with a colon does not get parsed as a scalar/map.
|
||||
return /^[A-Za-z_][A-Za-z0-9_-]*\s*:/m.test(trimmed) && trimmed.includes("\n");
|
||||
}
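A minimal usage sketch of the gating above, mirroring the new tests added later in this commit (the relative import path is an assumption, not part of the diff):

import assert from "node:assert/strict";
import { parseStreamingJson } from "./parse-streaming-json.js"; // path assumed

// Incomplete streaming chunk: a key is still waiting on its value, so no repair runs.
assert.deepEqual(parseStreamingJson('{"title":'), {});

// Complete but malformed object (unquoted keys, trailing comma): repair kicks in.
assert.deepEqual(
  parseStreamingJson("{title: 'Done', verificationPassed: true,}"),
  { title: "Done", verificationPassed: true },
);

// Full YAML-shaped arguments: accepted only because the input spans multiple lines,
// so ordinary prose containing a colon is not misread as a map.
assert.deepEqual(
  parseStreamingJson("title: Done\nverificationPassed: true\n"),
  { title: "Done", verificationPassed: true },
);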
|
||||
|
|
|
|||
|
|
@ -17,6 +17,20 @@
|
|||
* @see https://github.com/singularity-forge/sf-run/issues/2660
|
||||
*/
|
||||
|
||||
import { jsonrepair } from "jsonrepair";
|
||||
import { parse as parseYaml } from "yaml";
|
||||
|
||||
export const TOOL_JSON_REPAIR_PIPELINE_VERSION = 1;
|
||||
|
||||
export interface ToolJsonRepairReport {
|
||||
version: number;
|
||||
input: string;
|
||||
output: string;
|
||||
changed: boolean;
|
||||
repairs: string[];
|
||||
parseable: boolean;
|
||||
}
|
||||
|
||||
/**
|
||||
* Detect whether a JSON string contains YAML-style bullet-list values
|
||||
* (i.e. `"key": - item` instead of `"key": ["item"]`).
|
||||
|
|
@ -148,6 +162,69 @@ function repairTruncatedNumbers(json: string): string {
|
|||
return repaired;
|
||||
}
|
||||
|
||||
function isParseableJson(json: string): boolean {
|
||||
try {
|
||||
JSON.parse(json);
|
||||
return true;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function repairWithJsonRepair(json: string): string {
|
||||
try {
|
||||
const repaired = jsonrepair(json);
|
||||
return isParseableJson(repaired) ? repaired : json;
|
||||
} catch {
|
||||
return json;
|
||||
}
|
||||
}
|
||||
|
||||
function repairWithYaml(json: string): string {
|
||||
try {
|
||||
const parsed = parseYaml(json);
|
||||
if (
|
||||
parsed === null ||
|
||||
typeof parsed !== "object" ||
|
||||
parsed instanceof Date
|
||||
) {
|
||||
return json;
|
||||
}
|
||||
const repaired = JSON.stringify(parsed);
|
||||
return isParseableJson(repaired) ? repaired : json;
|
||||
} catch {
|
||||
return json;
|
||||
}
|
||||
}
|
||||
|
||||
function applyGenericRepairs(json: string): { output: string; repairs: string[] } {
|
||||
if (isParseableJson(json)) return { output: json, repairs: [] };
|
||||
|
||||
if (looksLikeYamlObject(json)) {
|
||||
const yamlRepaired = repairWithYaml(json);
|
||||
if (yamlRepaired !== json) {
|
||||
return { output: yamlRepaired, repairs: ["yaml"] };
|
||||
}
|
||||
}
|
||||
|
||||
const jsonRepaired = repairWithJsonRepair(json);
|
||||
if (jsonRepaired !== json) {
|
||||
return { output: jsonRepaired, repairs: ["jsonrepair"] };
|
||||
}
|
||||
|
||||
const yamlRepaired = repairWithYaml(json);
|
||||
if (yamlRepaired !== json) {
|
||||
return { output: yamlRepaired, repairs: ["yaml"] };
|
||||
}
|
||||
|
||||
return { output: json, repairs: [] };
|
||||
}
|
||||
|
||||
function looksLikeYamlObject(input: string): boolean {
|
||||
const trimmed = input.trim();
|
||||
return /^[A-Za-z_][A-Za-z0-9_-]*\s*:/m.test(trimmed) && trimmed.includes("\n");
|
||||
}
|
||||
|
||||
/**
|
||||
* Attempt to repair malformed JSON in LLM tool-call arguments.
|
||||
*
|
||||
|
|
@ -160,23 +237,37 @@ function repairTruncatedNumbers(json: string): string {
|
|||
* Returns the original string unchanged if no patterns are detected
|
||||
* or if the repair itself would produce invalid JSON.
|
||||
*/
|
||||
export function repairToolJson(json: string): string {
|
||||
export function repairToolJsonWithReport(json: string): ToolJsonRepairReport {
|
||||
let repaired = json;
|
||||
const repairs: string[] = [];
|
||||
|
||||
// Phase 1: Strip XML parameter tags
|
||||
if (hasXmlParameterTags(repaired)) {
|
||||
repaired = promoteXmlParametersToTopLevel(repaired);
|
||||
repairs.push("xml-parameter-tags");
|
||||
}
|
||||
|
||||
// Phase 2: Repair truncated numbers
|
||||
if (hasTruncatedNumbers(repaired)) {
|
||||
repaired = repairTruncatedNumbers(repaired);
|
||||
repairs.push("truncated-numbers");
|
||||
}
|
||||
|
||||
// Phase 3: Repair YAML bullet lists
|
||||
if (!hasYamlBulletLists(repaired)) {
|
||||
return repaired;
|
||||
const generic = applyGenericRepairs(repaired);
|
||||
repairs.push(...generic.repairs);
|
||||
const output = generic.output;
|
||||
return {
|
||||
version: TOOL_JSON_REPAIR_PIPELINE_VERSION,
|
||||
input: json,
|
||||
output,
|
||||
changed: output !== json,
|
||||
repairs,
|
||||
parseable: isParseableJson(output),
|
||||
};
|
||||
}
|
||||
repairs.push("yaml-bullet-lists");
|
||||
|
||||
// Strategy: find each `"key": - item1\n - item2\n - item3` region and
|
||||
// wrap items in a JSON array.
|
||||
|
|
@ -216,5 +307,23 @@ export function repairToolJson(json: string): string {
|
|||
// Strip trailing commas before } or ] (common in repaired JSON)
|
||||
repaired = repaired.replace(/,(\s*[}\]])/g, "$1");
|
||||
|
||||
return repaired;
|
||||
// Final phase: general-purpose repair for common JSON-ish model output:
|
||||
// unquoted keys, single quotes, trailing commas, missing quotes, etc.
|
||||
// This runs after SF-specific repairs so battle-tested generic repair
|
||||
// handles broad syntax cleanup without weakening known field semantics.
|
||||
const generic = applyGenericRepairs(repaired);
|
||||
repairs.push(...generic.repairs);
|
||||
const output = generic.output;
|
||||
return {
|
||||
version: TOOL_JSON_REPAIR_PIPELINE_VERSION,
|
||||
input: json,
|
||||
output,
|
||||
changed: output !== json,
|
||||
repairs,
|
||||
parseable: isParseableJson(output),
|
||||
};
|
||||
}
|
||||
|
||||
export function repairToolJson(json: string): string {
|
||||
return repairToolJsonWithReport(json).output;
|
||||
}
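A short sketch of how a caller could consume the versioned report (field names come from the ToolJsonRepairReport interface above; the import path and logging sink are assumptions):

import { repairToolJsonWithReport } from "./repair-tool-json.js"; // path assumed

const report = repairToolJsonWithReport("{'milestoneId':'M001','title':'Plan'}");

if (report.changed) {
  // Provenance of the repair, e.g. ["jsonrepair"] or ["xml-parameter-tags", "yaml"].
  console.warn(`tool-json repair v${report.version} applied: ${report.repairs.join(", ")}`);
}

// Only trust the output when the pipeline itself could parse it.
const args = report.parseable ? JSON.parse(report.output) : {};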
|
||||
|
|
|
|||
|
|
@ -15,3 +15,29 @@ describe("parseStreamingJson — XML parameter recovery (#3751)", () => {
|
|||
assert.equal(parsed.oneLiner, "done");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseStreamingJson — generic malformed tool argument recovery", () => {
|
||||
test("repairs complete JSON-ish objects with unquoted keys", () => {
|
||||
const parsed = parseStreamingJson<Record<string, unknown>>(
|
||||
"{title: 'Done', verificationPassed: true,}",
|
||||
);
|
||||
|
||||
assert.equal(parsed.title, "Done");
|
||||
assert.equal(parsed.verificationPassed, true);
|
||||
});
|
||||
|
||||
test("repairs full YAML-shaped object arguments", () => {
|
||||
const parsed = parseStreamingJson<Record<string, unknown>>(
|
||||
"title: Done\nverificationPassed: true\n",
|
||||
);
|
||||
|
||||
assert.equal(parsed.title, "Done");
|
||||
assert.equal(parsed.verificationPassed, true);
|
||||
});
|
||||
|
||||
test("does not repair incomplete streaming chunks into fabricated values", () => {
|
||||
const parsed = parseStreamingJson<Record<string, unknown>>('{"title":');
|
||||
|
||||
assert.deepEqual(parsed, {});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -1,6 +1,13 @@
|
|||
import { describe, test } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { repairToolJson, hasYamlBulletLists, hasXmlParameterTags, hasTruncatedNumbers } from "../repair-tool-json.js";
|
||||
import {
|
||||
repairToolJson,
|
||||
repairToolJsonWithReport,
|
||||
hasYamlBulletLists,
|
||||
hasXmlParameterTags,
|
||||
hasTruncatedNumbers,
|
||||
TOOL_JSON_REPAIR_PIPELINE_VERSION,
|
||||
} from "../repair-tool-json.js";
|
||||
|
||||
describe("repairToolJson — YAML bullet list repair (#2660)", () => {
|
||||
// ── Detection ──────────────────────────────────────────────────────────
|
||||
|
|
@ -101,6 +108,62 @@ describe("repairToolJson — YAML bullet list repair (#2660)", () => {
|
|||
});
|
||||
});
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// General JSON repair via jsonrepair
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
describe("repairToolJson — general JSON repair via jsonrepair", () => {
|
||||
test("repairs unquoted keys and trailing commas", () => {
|
||||
const malformed = "{title: 'Done', count: 2,}";
|
||||
const repaired = repairToolJson(malformed);
|
||||
const parsed = JSON.parse(repaired);
|
||||
|
||||
assert.deepEqual(parsed, { title: "Done", count: 2 });
|
||||
});
|
||||
|
||||
test("repairs single-quoted strings", () => {
|
||||
const malformed = "{'milestoneId':'M001','title':'Plan'}";
|
||||
const repaired = repairToolJson(malformed);
|
||||
const parsed = JSON.parse(repaired);
|
||||
|
||||
assert.deepEqual(parsed, { milestoneId: "M001", title: "Plan" });
|
||||
});
|
||||
|
||||
test("returns a versioned repair report with provenance", () => {
|
||||
const report = repairToolJsonWithReport("{title: 'Done', count: 2,}");
|
||||
|
||||
assert.equal(report.version, TOOL_JSON_REPAIR_PIPELINE_VERSION);
|
||||
assert.equal(report.changed, true);
|
||||
assert.equal(report.parseable, true);
|
||||
assert.ok(report.repairs.includes("jsonrepair"));
|
||||
assert.deepEqual(JSON.parse(report.output), { title: "Done", count: 2 });
|
||||
});
|
||||
});
|
||||
|
||||
describe("repairToolJson — full YAML object fallback", () => {
|
||||
test("repairs YAML-shaped tool arguments to JSON", () => {
|
||||
const malformed = [
|
||||
"title: Done",
|
||||
"keyDecisions:",
|
||||
" - Keep semantic model aliases",
|
||||
" - Prefer strict validation",
|
||||
"verificationPassed: true",
|
||||
].join("\n");
|
||||
const report = repairToolJsonWithReport(malformed);
|
||||
const parsed = JSON.parse(report.output);
|
||||
|
||||
assert.ok(report.repairs.includes("yaml"));
|
||||
assert.deepEqual(parsed, {
|
||||
title: "Done",
|
||||
keyDecisions: [
|
||||
"Keep semantic model aliases",
|
||||
"Prefer strict validation",
|
||||
],
|
||||
verificationPassed: true,
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// XML parameter tag repair (#3403)
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
|
|
|||
packages/pi-coding-agent/src/cli/list-models.test.ts (new file): 70 lines added
|
|
@ -0,0 +1,70 @@
|
|||
import assert from "node:assert/strict";
|
||||
import { afterEach, beforeEach, describe, it } from "node:test";
|
||||
import type { Model } from "@singularity-forge/pi-ai";
|
||||
import type { ModelRegistry } from "../core/model-registry.js";
|
||||
import { listModels } from "./list-models.js";
|
||||
|
||||
const model = (provider: string, id: string): Model<any> => ({
|
||||
id,
|
||||
name: id,
|
||||
api: "openai-completions",
|
||||
provider,
|
||||
baseUrl: "https://example.invalid",
|
||||
reasoning: false,
|
||||
input: ["text"],
|
||||
cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: 128000,
|
||||
maxTokens: 16000,
|
||||
});
|
||||
|
||||
let originalLog: typeof console.log;
|
||||
let output: string[];
|
||||
|
||||
beforeEach(() => {
|
||||
originalLog = console.log;
|
||||
output = [];
|
||||
console.log = (...args: unknown[]) => {
|
||||
output.push(args.join(" "));
|
||||
};
|
||||
});
|
||||
|
||||
afterEach(() => {
|
||||
console.log = originalLog;
|
||||
});
|
||||
|
||||
describe("listModels", () => {
|
||||
it("exact live provider search uses discovery and replaces static rows", async () => {
|
||||
const registry = {
|
||||
discoverModels: async (providers?: string[]) => {
|
||||
assert.deepEqual(providers, ["zai"]);
|
||||
return [{ provider: "zai", models: [{ id: "glm-5.1" }], fetchedAt: Date.now() }];
|
||||
},
|
||||
getAvailable: () => [model("zai", "glm-4.5-air"), model("zai", "glm-5.1")],
|
||||
getDiscoveredModels: () => [model("zai", "glm-5.1")],
|
||||
isDiscovered: (m: Model<any>) => m.provider === "zai" && m.id === "glm-5.1",
|
||||
} as unknown as ModelRegistry;
|
||||
|
||||
await listModels(registry, { searchPattern: "zai" });
|
||||
|
||||
const rendered = output.join("\n");
|
||||
assert.match(rendered, /glm-5\.1/);
|
||||
assert.doesNotMatch(rendered, /glm-4\.5-air/);
|
||||
});
|
||||
|
||||
it("discovery errors hide stale static rows for attempted live providers", async () => {
|
||||
const registry = {
|
||||
discoverModels: async () => [
|
||||
{ provider: "zai", models: [], fetchedAt: Date.now(), error: "401 Unauthorized" },
|
||||
],
|
||||
getAvailable: () => [model("zai", "glm-5.1"), model("minimax", "MiniMax-M2.7")],
|
||||
getDiscoveredModels: () => [],
|
||||
isDiscovered: () => false,
|
||||
} as unknown as ModelRegistry;
|
||||
|
||||
await listModels(registry, { discover: true });
|
||||
|
||||
const rendered = output.join("\n");
|
||||
assert.doesNotMatch(rendered, /zai/);
|
||||
assert.match(rendered, /MiniMax-M2\.7/);
|
||||
});
|
||||
});
|
||||
|
|
@ -4,6 +4,7 @@
|
|||
|
||||
import type { Api, Model } from "@singularity-forge/pi-ai";
|
||||
import { fuzzyFilter } from "@singularity-forge/pi-tui";
|
||||
import { getDiscoverableProviders } from "../core/model-discovery.js";
|
||||
import type { ModelRegistry } from "../core/model-registry.js";
|
||||
|
||||
export interface ListModelsOptions {
|
||||
|
|
@ -62,14 +63,27 @@ export async function listModels(
|
|||
? { searchPattern: optionsOrSearch }
|
||||
: optionsOrSearch ?? {};
|
||||
|
||||
// If discover flag is set, run discovery first
|
||||
if (options.discover) {
|
||||
await modelRegistry.discoverModels();
|
||||
}
|
||||
const exactDiscoveryProvider = resolveExactDiscoveryProvider(options.searchPattern);
|
||||
const shouldDiscover = options.discover || exactDiscoveryProvider !== undefined;
|
||||
const discoveryResults = shouldDiscover
|
||||
? await modelRegistry.discoverModels(
|
||||
exactDiscoveryProvider ? [exactDiscoveryProvider] : undefined,
|
||||
)
|
||||
: [];
|
||||
const discoveredProviders = new Set(discoveryResults.map((r) => r.provider));
|
||||
|
||||
// Get models — include discovered if discovery was run
|
||||
const models = options.discover
|
||||
? modelRegistry.getAllWithDiscovered()
|
||||
// Live-listed providers must not fall back to stale static catalog rows once a
|
||||
// discovery pass was attempted. If the provider returns 401/429/empty, it
|
||||
// contributes zero rows to this diagnostic output.
|
||||
const models = shouldDiscover
|
||||
? [
|
||||
...modelRegistry
|
||||
.getAvailable()
|
||||
.filter((m) => !discoveredProviders.has(m.provider)),
|
||||
...modelRegistry
|
||||
.getDiscoveredModels()
|
||||
.filter((m) => discoveredProviders.has(m.provider)),
|
||||
]
|
||||
: modelRegistry.getAvailable();
|
||||
|
||||
if (models.length === 0) {
|
||||
|
|
@ -78,8 +92,10 @@ export async function listModels(
|
|||
}
|
||||
|
||||
// Apply fuzzy filter if search pattern provided
|
||||
let filteredModels: Model<Api>[] = models;
|
||||
if (options.searchPattern) {
|
||||
let filteredModels: Model<Api>[] = exactDiscoveryProvider
|
||||
? models.filter((m) => m.provider.toLowerCase() === exactDiscoveryProvider)
|
||||
: models;
|
||||
if (options.searchPattern && !exactDiscoveryProvider) {
|
||||
filteredModels = fuzzyFilter(models, options.searchPattern, (m) => `${m.provider} ${m.id}`);
|
||||
}
|
||||
|
||||
|
|
@ -162,3 +178,11 @@ export async function listModels(
|
|||
console.log(line);
|
||||
}
|
||||
}
|
||||
|
||||
function resolveExactDiscoveryProvider(searchPattern?: string): string | undefined {
|
||||
const query = searchPattern?.trim().toLowerCase();
|
||||
if (!query) return undefined;
|
||||
return getDiscoverableProviders().find(
|
||||
(provider) => provider.toLowerCase() === query,
|
||||
);
|
||||
}
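A hedged sketch of the three list paths this enables (the package export name and registry construction are assumptions; the behavior follows the comments above and the new list-models tests):

import { listModels, ModelRegistry } from "@singularity-forge/pi-coding-agent"; // export path assumed

declare const registry: ModelRegistry; // constructed elsewhere with auth storage

// Exact discoverable provider name: runs a live /models pass for that provider only
// and replaces its static catalog rows with the live response.
await listModels(registry, { searchPattern: "zai" });

// Non-exact pattern: no discovery, the static catalog is fuzzy-filtered as before.
await listModels(registry, { searchPattern: "glm" });

// Explicit discovery: live pass for every discoverable provider; a provider that
// errors (401/429/empty) contributes zero rows instead of stale static entries.
await listModels(registry, { discover: true });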
|
||||
|
|
|
|||
|
|
@ -134,8 +134,11 @@ describe("ModelRegistry — public discovery providers", () => {
|
|||
)) as typeof fetch;
|
||||
|
||||
try {
|
||||
const registry = new ModelRegistry(AuthStorage.inMemory({}), undefined);
|
||||
registry.getDiscoveryCache().clear("ollama-cloud");
|
||||
const registry = new ModelRegistry(
|
||||
AuthStorage.inMemory({}),
|
||||
undefined,
|
||||
new ModelDiscoveryCache(join(testDir, "ollama-cloud-cache.json")),
|
||||
);
|
||||
const results = await registry.discoverModels(["ollama-cloud"]);
|
||||
|
||||
assert.equal(results[0]?.provider, "ollama-cloud");
|
||||
|
|
@ -165,6 +168,9 @@ describe("ModelRegistry — public discovery providers", () => {
|
|||
name: "GLM-5.2",
|
||||
context_length: 200000,
|
||||
},
|
||||
{
|
||||
id: "glm-4.5-air",
|
||||
},
|
||||
],
|
||||
}),
|
||||
{ status: 200 },
|
||||
|
|
@ -176,18 +182,27 @@ describe("ModelRegistry — public discovery providers", () => {
|
|||
zai: { type: "api_key", key: "zai-test" },
|
||||
}),
|
||||
undefined,
|
||||
new ModelDiscoveryCache(join(testDir, "zai-cache.json")),
|
||||
);
|
||||
registry.getDiscoveryCache().clear("zai");
|
||||
const results = await registry.discoverModels(["zai"]);
|
||||
|
||||
assert.equal(results[0]?.provider, "zai");
|
||||
assert.deepEqual(results[0]?.models.map((m) => m.id), ["glm-5.2"]);
|
||||
assert.deepEqual(results[0]?.models.map((m) => m.id), [
|
||||
"glm-5.2",
|
||||
"glm-4.5-air",
|
||||
]);
|
||||
const model = registry
|
||||
.getAllWithDiscovered()
|
||||
.find((m) => m.provider === "zai" && m.id === "glm-5.2");
|
||||
assert.ok(model, "discovered direct model should be available under zai");
|
||||
assert.equal(model.api, "openai-completions");
|
||||
assert.equal(model.baseUrl, "https://api.z.ai/api/coding/paas/v4");
|
||||
const knownModel = registry
|
||||
.getDiscoveredModels()
|
||||
.find((m) => m.provider === "zai" && m.id === "glm-4.5-air");
|
||||
assert.ok(knownModel, "known direct model should be materialized from live discovery");
|
||||
assert.equal(knownModel.name, "GLM-4.5-Air");
|
||||
assert.equal(knownModel.reasoning, true);
|
||||
} finally {
|
||||
globalThis.fetch = originalFetch;
|
||||
}
|
||||
|
|
@ -217,8 +232,11 @@ describe("ModelRegistry — public discovery providers", () => {
|
|||
)) as typeof fetch;
|
||||
|
||||
try {
|
||||
const registry = new ModelRegistry(AuthStorage.inMemory({}), undefined);
|
||||
registry.getDiscoveryCache().clear("singularity-memory");
|
||||
const registry = new ModelRegistry(
|
||||
AuthStorage.inMemory({}),
|
||||
undefined,
|
||||
new ModelDiscoveryCache(join(testDir, "memory-cache.json")),
|
||||
);
|
||||
const results = await registry.discoverModelCatalogs(["singularity-memory"]);
|
||||
|
||||
assert.equal(results[0]?.provider, "singularity-memory");
|
||||
|
|
|
|||
|
|
@ -136,6 +136,65 @@ describe("ModelRegistry.getModelsForProxy — basic", () => {
|
|||
"paid OpenCode should not be a fallback candidate",
|
||||
);
|
||||
});
|
||||
|
||||
it("hides explicit Xiaomi token-plan regional aliases while keeping the default Xiaomi provider", () => {
|
||||
const registry = createRegistry(() => true);
|
||||
const available = registry.getAvailable();
|
||||
|
||||
assert.ok(
|
||||
available.some((m) => m.provider === "xiaomi" && m.id === "mimo-v2-pro"),
|
||||
"xiaomi/default AMS provider should remain available",
|
||||
);
|
||||
assert.ok(
|
||||
!available.some((m) => m.provider.startsWith("xiaomi-token-plan-")),
|
||||
"regional Xiaomi token-plan aliases should not be listed",
|
||||
);
|
||||
assert.equal(
|
||||
registry.find("xiaomi-token-plan-ams", "mimo-v2-pro"),
|
||||
undefined,
|
||||
"direct lookup should also hide regional Xiaomi token-plan aliases",
|
||||
);
|
||||
});
|
||||
|
||||
it("hides Claude Code because it is not part of the managed provider pool", () => {
|
||||
const registry = createRegistry(() => true);
|
||||
const available = registry.getAvailable();
|
||||
|
||||
assert.ok(
|
||||
!available.some((m) => m.provider === "claude-code"),
|
||||
"Claude Code should not be listed or selected by SF provider policy",
|
||||
);
|
||||
assert.equal(
|
||||
registry.find("claude-code", "sonnet"),
|
||||
undefined,
|
||||
"direct lookup should also hide Claude Code models",
|
||||
);
|
||||
});
|
||||
|
||||
it("hides Mistral non-selection endpoints while keeping chat and coding models", () => {
|
||||
const registry = createRegistry(() => true);
|
||||
registry.registerProvider("mistral", {
|
||||
authMode: "none",
|
||||
baseUrl: "https://api.mistral.ai",
|
||||
api: "mistral-conversations",
|
||||
streamSimple: noopStream,
|
||||
models: [
|
||||
{ id: "mistral-large-latest", name: "Mistral Large", api: "mistral-conversations", reasoning: false, input: ["text"], cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, contextWindow: 128000, maxTokens: 16384 },
|
||||
{ id: "codestral-latest", name: "Codestral", api: "mistral-conversations", reasoning: false, input: ["text"], cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, contextWindow: 128000, maxTokens: 16384 },
|
||||
{ id: "mistral-embed", name: "Mistral Embed", api: "mistral-conversations", reasoning: false, input: ["text"], cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, contextWindow: 8192, maxTokens: 8192 },
|
||||
{ id: "mistral-ocr-latest", name: "Mistral OCR", api: "mistral-conversations", reasoning: false, input: ["text", "image"], cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, contextWindow: 8192, maxTokens: 8192 },
|
||||
{ id: "voxtral-mini-tts-latest", name: "Voxtral TTS", api: "mistral-conversations", reasoning: false, input: ["text"], cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, contextWindow: 8192, maxTokens: 8192 },
|
||||
{ id: "ft:codestral-latest:abc", name: "Private Fine Tune", api: "mistral-conversations", reasoning: false, input: ["text"], cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 }, contextWindow: 8192, maxTokens: 8192 },
|
||||
],
|
||||
});
|
||||
|
||||
const available = registry
|
||||
.getAvailable()
|
||||
.filter((m) => m.provider === "mistral")
|
||||
.map((m) => m.id);
|
||||
|
||||
assert.deepEqual(available, ["mistral-large-latest", "codestral-latest"]);
|
||||
});
|
||||
});
|
||||
|
||||
// ── getModelsForProxy — family priority ordering ──────────────────────────────
|
||||
|
|
|
|||
|
|
@ -214,6 +214,13 @@ const OPENCODE_FREE_MODEL_IDS = new Set([
|
|||
"nemotron-3-super-free",
|
||||
]);
|
||||
|
||||
const HIDDEN_MODEL_PROVIDERS = new Set([
|
||||
"claude-code",
|
||||
"xiaomi-token-plan-ams",
|
||||
"xiaomi-token-plan-cn",
|
||||
"xiaomi-token-plan-sgp",
|
||||
]);
|
||||
|
||||
function providerModelAllowEntryMatches(allowedModel: string, modelKey: string): boolean {
|
||||
const allowedKey = allowedModel.trim().toLowerCase();
|
||||
if (!allowedKey) return false;
|
||||
|
|
@ -235,9 +242,32 @@ function isZeroCost(cost: Model<Api>["cost"] | undefined): boolean {
|
|||
return !!cost && cost.input === 0 && cost.output === 0 && cost.cacheRead === 0 && cost.cacheWrite === 0;
|
||||
}
|
||||
|
||||
function isMistralSelectionModel(modelId: string): boolean {
|
||||
const modelKey = modelId.trim().toLowerCase();
|
||||
if (
|
||||
modelKey.startsWith("ft:") ||
|
||||
modelKey.includes("embed") ||
|
||||
modelKey.includes("moderation") ||
|
||||
modelKey.includes("ocr") ||
|
||||
modelKey.includes("voxtral") ||
|
||||
modelKey.includes("transcribe") ||
|
||||
modelKey.includes("tts") ||
|
||||
modelKey.includes("realtime")
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
function isModelAllowedByBuiltInProviderPolicy(model: ProviderPolicyModel): boolean {
|
||||
const provider = model.provider.toLowerCase();
|
||||
const modelKey = model.id.trim().toLowerCase();
|
||||
if (HIDDEN_MODEL_PROVIDERS.has(provider)) {
|
||||
return false;
|
||||
}
|
||||
if (provider === "mistral") {
|
||||
return isMistralSelectionModel(model.id);
|
||||
}
|
||||
if (provider === "openrouter") {
|
||||
return providerModelAllowEntryMatches(":free", modelKey);
|
||||
}
|
||||
|
|
@ -355,8 +385,9 @@ export class ModelRegistry {
|
|||
constructor(
|
||||
readonly authStorage: AuthStorage,
|
||||
readonly modelsJsonPath: string | undefined = join(getAgentDir(), "models.json"),
|
||||
discoveryCache?: ModelDiscoveryCache,
|
||||
) {
|
||||
this.discoveryCache = new ModelDiscoveryCache();
|
||||
this.discoveryCache = discoveryCache ?? new ModelDiscoveryCache();
|
||||
|
||||
// Set up fallback resolver for custom provider API keys
|
||||
this.authStorage.setFallbackResolver((provider) => {
|
||||
|
|
@ -1056,6 +1087,17 @@ export class ModelRegistry {
|
|||
return this.filterProviderModelAllow([...this.models, ...unique]);
|
||||
}
|
||||
|
||||
/**
|
||||
* Return only models from the most recent discovery pass.
|
||||
*
|
||||
* Purpose: let diagnostic list commands replace stale static rows for live-listed
|
||||
* providers with the provider's actual `/models` response.
|
||||
* Consumer: cli/list-models.ts when `--discover` or an exact provider query is used.
|
||||
*/
|
||||
getDiscoveredModels(providerModelAllow?: ProviderModelAllowList): Model<Api>[] {
|
||||
return this.filterProviderModelAllow(this.discoveredModels, providerModelAllow);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check if a model was added via discovery (not built-in or custom).
|
||||
*/
|
||||
|
|
@ -1075,20 +1117,29 @@ export class ModelRegistry {
|
|||
*/
|
||||
private convertDiscoveredModels(results: DiscoveryResult[]): Model<Api>[] {
|
||||
const converted: Model<Api>[] = [];
|
||||
const seen = new Set<string>();
|
||||
for (const result of results) {
|
||||
if (result.error) continue;
|
||||
for (const dm of result.models) {
|
||||
const provider = dm.provider ?? result.provider;
|
||||
const key = `${provider}/${dm.id}`;
|
||||
if (seen.has(key)) continue;
|
||||
seen.add(key);
|
||||
const known = this.models.find((m) => m.provider === provider && m.id === dm.id);
|
||||
const discoveredName =
|
||||
dm.name && dm.name !== dm.id ? dm.name : undefined;
|
||||
converted.push({
|
||||
...known,
|
||||
id: dm.id,
|
||||
name: dm.name ?? dm.id,
|
||||
api: (dm.api ?? "openai") as Api,
|
||||
provider: dm.provider ?? result.provider,
|
||||
baseUrl: dm.baseUrl ?? "",
|
||||
reasoning: dm.reasoning ?? false,
|
||||
input: dm.input ?? ["text"],
|
||||
cost: dm.cost ?? { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: dm.contextWindow ?? 128000,
|
||||
maxTokens: dm.maxTokens ?? 16384,
|
||||
name: discoveredName ?? known?.name ?? dm.id,
|
||||
api: (dm.api ?? known?.api ?? "openai") as Api,
|
||||
provider,
|
||||
baseUrl: dm.baseUrl ?? known?.baseUrl ?? "",
|
||||
reasoning: dm.reasoning ?? known?.reasoning ?? false,
|
||||
input: dm.input ?? known?.input ?? ["text"],
|
||||
cost: dm.cost ?? known?.cost ?? { input: 0, output: 0, cacheRead: 0, cacheWrite: 0 },
|
||||
contextWindow: dm.contextWindow ?? known?.contextWindow ?? 128000,
|
||||
maxTokens: dm.maxTokens ?? known?.maxTokens ?? 16384,
|
||||
} as Model<Api>);
|
||||
}
|
||||
}
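An illustration of the merge above, using the zai fixture from the registry test earlier in this commit (shapes abbreviated; this is not code from the commit):

// Live /models response only carried an id for a known model...
const discovered = { id: "glm-4.5-air" };

// ...so the static catalog entry supplies the rest instead of generic defaults.
const known = {
  provider: "zai",
  id: "glm-4.5-air",
  name: "GLM-4.5-Air",
  api: "openai-completions",
  baseUrl: "https://api.z.ai/api/coding/paas/v4",
  reasoning: true, // previously forced to false when discovery omitted the flag
};

// Effective result of convertDiscoveredModels for this entry: discovered fields win
// where present, known metadata fills the gaps, hard-coded defaults apply last.
const merged = { ...known, id: discovered.id };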
|
||||
|
|
|
|||
|
|
@ -160,6 +160,7 @@ export type { DiscoveredModel, DiscoveryResult, DiscoverySourceType, ProviderDis
|
|||
export { getDiscoverableCatalogSources, getDiscoverableProviders, getDiscoveryAdapter } from "./core/model-discovery.js";
|
||||
export { ModelRegistry } from "./core/model-registry.js";
|
||||
export { ModelsJsonWriter } from "./core/models-json-writer.js";
|
||||
export { discoverAndPrintModels, listModels } from "./cli/list-models.js";
|
||||
export type {
|
||||
PackageManager,
|
||||
PathMetadata,
|
||||
|
|
|
|||
scripts/check-versioned-json.mjs (new file): 89 lines added
|
|
@ -0,0 +1,89 @@
|
|||
#!/usr/bin/env node
|
||||
/**
|
||||
* Enforce schema/version markers on SF-owned JSON contracts.
|
||||
*
|
||||
* This intentionally does not scan ecosystem configuration files such as
|
||||
* tsconfig.json, package.json, Biome config, or lockfiles. Those files are
|
||||
* versioned by their owning tools. This check covers JSON that SF owns as
|
||||
* runtime data, persisted contracts, or generated artifact templates.
|
||||
*/
|
||||
|
||||
import { execFileSync } from "node:child_process";
|
||||
import { readFileSync } from "node:fs";
|
||||
|
||||
const REQUIRED_PREFIXES = ["src/resources/extensions/sf/"];
|
||||
const EXEMPT_SUFFIXES = ["/package.json"];
|
||||
const VERSION_KEYS = ["schemaVersion", "version"];
|
||||
|
||||
function trackedJsonFiles() {
|
||||
try {
|
||||
const out = execFileSync("git", ["ls-files", "*.json"], {
|
||||
encoding: "utf8",
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
});
|
||||
return out
|
||||
.split("\n")
|
||||
.map((line) => line.trim())
|
||||
.filter(Boolean);
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
throw new Error(`failed to list tracked JSON files: ${message}`);
|
||||
}
|
||||
}
|
||||
|
||||
function shouldCheck(path) {
|
||||
return (
|
||||
REQUIRED_PREFIXES.some((prefix) => path.startsWith(prefix)) &&
|
||||
!EXEMPT_SUFFIXES.some((suffix) => path.endsWith(suffix))
|
||||
);
|
||||
}
|
||||
|
||||
function hasOwn(object, key) {
|
||||
return Object.prototype.hasOwnProperty.call(object, key);
|
||||
}
|
||||
|
||||
function hasVersionMarker(parsed) {
|
||||
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) return false;
|
||||
if (VERSION_KEYS.some((key) => hasOwn(parsed, key))) return true;
|
||||
|
||||
const meta = parsed._meta;
|
||||
return Boolean(
|
||||
meta &&
|
||||
typeof meta === "object" &&
|
||||
!Array.isArray(meta) &&
|
||||
VERSION_KEYS.some((key) => hasOwn(meta, key)),
|
||||
);
|
||||
}
|
||||
|
||||
const failures = [];
|
||||
let checked = 0;
|
||||
|
||||
for (const path of trackedJsonFiles()) {
|
||||
if (!shouldCheck(path)) continue;
|
||||
checked++;
|
||||
|
||||
let parsed;
|
||||
try {
|
||||
parsed = JSON.parse(readFileSync(path, "utf8"));
|
||||
} catch (error) {
|
||||
const message = error instanceof Error ? error.message : String(error);
|
||||
failures.push(`${path}: invalid JSON (${message})`);
|
||||
continue;
|
||||
}
|
||||
|
||||
if (!hasVersionMarker(parsed)) {
|
||||
failures.push(
|
||||
`${path}: missing schemaVersion/version marker (top-level or _meta)`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
if (failures.length > 0) {
|
||||
console.error("Versioned JSON check failed:");
|
||||
for (const failure of failures) {
|
||||
console.error(` - ${failure}`);
|
||||
}
|
||||
process.exit(1);
|
||||
}
|
||||
|
||||
console.log(`Versioned JSON check passed (${checked} file${checked === 1 ? "" : "s"}).`);
|
||||
|
|
@ -28,6 +28,7 @@ export interface CliFlags {
|
|||
worktree?: boolean | string;
|
||||
model?: string;
|
||||
listModels?: string | true;
|
||||
discover?: boolean;
|
||||
extensions: string[];
|
||||
appendSystemPrompt?: string;
|
||||
tools?: string[];
|
||||
|
|
@ -119,6 +120,8 @@ export function parseCliArgs(argv: string[]): CliFlags {
|
|||
} else if (arg === "--list-models") {
|
||||
flags.listModels =
|
||||
i + 1 < args.length && !args[i + 1].startsWith("-") ? args[++i] : true;
|
||||
} else if (arg === "--discover") {
|
||||
flags.discover = true;
|
||||
} else if (!arg.startsWith("--") && !arg.startsWith("-")) {
|
||||
flags.messages.push(arg);
|
||||
}
|
||||
|
|
|
|||
src/cli.ts: 49 lines changed
|
|
@ -5,6 +5,7 @@ import {
|
|||
createAgentSession,
|
||||
DefaultResourceLoader,
|
||||
InteractiveMode,
|
||||
listModels,
|
||||
ModelRegistry,
|
||||
runPackageCommand,
|
||||
runPrintMode,
|
||||
|
|
@ -599,54 +600,12 @@ if (cliFlags.listModels !== undefined) {
|
|||
}
|
||||
listModelsExtensions.runtime.pendingProviderRegistrations = [];
|
||||
|
||||
const models = modelRegistry.getAvailable();
|
||||
if (models.length === 0) {
|
||||
console.log("No models available. Set API keys in environment variables.");
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const searchPattern =
|
||||
typeof cliFlags.listModels === "string" ? cliFlags.listModels : undefined;
|
||||
let filtered = models;
|
||||
if (searchPattern) {
|
||||
const q = searchPattern.toLowerCase();
|
||||
filtered = models.filter((m) =>
|
||||
`${m.provider} ${m.id} ${m.name}`.toLowerCase().includes(q),
|
||||
);
|
||||
}
|
||||
|
||||
// Sort by name descending (newest first), then provider, then id
|
||||
filtered.sort((a, b) => {
|
||||
const nameCmp = b.name.localeCompare(a.name);
|
||||
if (nameCmp !== 0) return nameCmp;
|
||||
const provCmp = a.provider.localeCompare(b.provider);
|
||||
if (provCmp !== 0) return provCmp;
|
||||
return a.id.localeCompare(b.id);
|
||||
await listModels(modelRegistry, {
|
||||
searchPattern,
|
||||
discover: cliFlags.discover || searchPattern === undefined,
|
||||
});
|
||||
|
||||
const fmt = (n: number) =>
|
||||
n >= 1_000_000
|
||||
? `${n / 1_000_000}M`
|
||||
: n >= 1_000
|
||||
? `${n / 1_000}K`
|
||||
: `${n}`;
|
||||
const rows = filtered.map((m) => [
|
||||
m.provider,
|
||||
m.id,
|
||||
m.name,
|
||||
fmt(m.contextWindow),
|
||||
fmt(m.maxTokens),
|
||||
m.reasoning ? "yes" : "no",
|
||||
]);
|
||||
const hdrs = ["provider", "model", "name", "context", "max-out", "thinking"];
|
||||
const widths = hdrs.map((h, i) =>
|
||||
Math.max(h.length, ...rows.map((r) => r[i].length)),
|
||||
);
|
||||
const pad = (s: string, w: number) => s.padEnd(w);
|
||||
console.log(hdrs.map((h, i) => pad(h, widths[i])).join(" "));
|
||||
for (const row of rows) {
|
||||
console.log(row.map((c, i) => pad(c, widths[i])).join(" "));
|
||||
}
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -148,7 +148,11 @@ export function isMilestoneReadyNotification(
|
|||
): boolean {
|
||||
if (event.type !== "extension_ui_request" || event.method !== "notify")
|
||||
return false;
|
||||
return /milestone\s+m\d+.*ready/i.test(String(event.message ?? ""));
|
||||
return isMilestoneReadyText(String(event.message ?? ""));
|
||||
}
|
||||
|
||||
export function isMilestoneReadyText(text: string): boolean {
|
||||
return /milestone\s+m\d+.*ready/i.test(text);
|
||||
}
|
||||
|
||||
export function isInteractiveHeadlessTool(
|
||||
|
|
|
|||
|
|
@ -4,7 +4,8 @@
|
|||
* Single read-only command that returns the full project snapshot as JSON
|
||||
* to stdout, without spawning an LLM session. Instant (~50ms).
|
||||
*
|
||||
* Output: { state, next, cost }
|
||||
* Output: { schemaVersion, state, next, cost }
|
||||
* schemaVersion — output contract version
|
||||
* state — deriveState() output (phase, milestones, progress, blockers)
|
||||
* next — dry-run dispatch preview (what auto-mode would do next)
|
||||
* cost — aggregated parallel worker costs
|
||||
|
|
@ -87,6 +88,7 @@ async function loadExtensionModules() {
|
|||
// ─── Types ──────────────────────────────────────────────────────────────────
|
||||
|
||||
export interface QuerySnapshot {
|
||||
schemaVersion: 1;
|
||||
state: SFState;
|
||||
next: {
|
||||
action: "dispatch" | "stop" | "skip";
|
||||
|
|
@ -164,6 +166,7 @@ export async function buildQuerySnapshot(
|
|||
}));
|
||||
|
||||
const snapshot: QuerySnapshot = {
|
||||
schemaVersion: 1,
|
||||
state,
|
||||
next,
|
||||
cost: { workers, total: workers.reduce((sum, w) => sum + w.cost, 0) },
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ export const VALID_OUTPUT_FORMATS: ReadonlySet<string> = new Set([
|
|||
// ---------------------------------------------------------------------------
|
||||
|
||||
export interface HeadlessJsonResult {
|
||||
schemaVersion: 1;
|
||||
status: "success" | "error" | "blocked" | "cancelled" | "timeout";
|
||||
exitCode: number;
|
||||
sessionId?: string;
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ import {
|
|||
isBlockedNotification,
|
||||
isInteractiveHeadlessTool,
|
||||
isMilestoneReadyNotification,
|
||||
isMilestoneReadyText,
|
||||
isPauseNotification,
|
||||
isQuickCommand,
|
||||
isTerminalNotification,
|
||||
|
|
@ -761,6 +762,7 @@ async function runHeadlessOnce(
|
|||
: "timeout"
|
||||
: "success";
|
||||
const result: HeadlessJsonResult = {
|
||||
schemaVersion: 1,
|
||||
status,
|
||||
exitCode,
|
||||
sessionId: lastSessionId,
|
||||
|
|
@ -1034,6 +1036,10 @@ async function runHeadlessOnce(
|
|||
const ame = eventObj.assistantMessageEvent as
|
||||
| Record<string, unknown>
|
||||
| undefined;
|
||||
const deltaText = String(ame?.delta ?? ame?.text ?? "");
|
||||
if (deltaText && isNewMilestone && options.auto) {
|
||||
milestoneReady ||= isMilestoneReadyText(deltaText);
|
||||
}
|
||||
if (ame && options.verbose) {
|
||||
const ameType = String(ame.type ?? "");
|
||||
|
||||
|
|
|
|||
|
|
@ -237,6 +237,9 @@ export function printHelp(version: string): void {
|
|||
process.stdout.write(
|
||||
" --list-models [search] List available models and exit\n",
|
||||
);
|
||||
process.stdout.write(
|
||||
" --discover Force live verification for filtered --list-models\n",
|
||||
);
|
||||
process.stdout.write(" --version, -v Print version and exit\n");
|
||||
process.stdout.write(" --help, -h Print this help and exit\n");
|
||||
process.stdout.write("\nSubcommands:\n");
|
||||
|
|
|
|||
|
|
@ -154,13 +154,7 @@ const BARE_MODEL_FAMILY_PRIORITY: Array<{
|
|||
{ match: /^MiniMax-|^minimax-/i, providers: ["minimax", "minimax-cn"] },
|
||||
{
|
||||
match: /^mimo-|^xiaomi-/i,
|
||||
providers: [
|
||||
"xiaomi",
|
||||
"xiaomi-token-plan-ams",
|
||||
"xiaomi-token-plan-sgp",
|
||||
"xiaomi-token-plan-cn",
|
||||
"opencode-go",
|
||||
],
|
||||
providers: ["xiaomi", "opencode-go"],
|
||||
},
|
||||
];
|
||||
|
||||
|
|
|
|||
|
|
@ -66,6 +66,7 @@ import {
|
|||
} from "./pre-execution-checks.js";
|
||||
import { loadEffectiveSFPreferences } from "./preferences.js";
|
||||
import { loadPrompt } from "./prompt-loader.js";
|
||||
import { recordSelfFeedback } from "./self-feedback.js";
|
||||
// crossReferenceEvidence available for future use when verification_evidence is stored in DB
|
||||
// import { crossReferenceEvidence, type ClaimedEvidence } from "./safety/evidence-cross-ref.js";
|
||||
import { validateContent } from "./safety/content-validator.js";
|
||||
|
|
@ -610,6 +611,25 @@ export async function postUnitPreVerification(
|
|||
`Git stage failed: ${stageErrMsg.split("\n")[0]}`,
|
||||
"warning",
|
||||
);
|
||||
// Record as self-feedback so future runs can drain it from the
|
||||
// backlog. Empty-pathspec failures are low-severity (the upstream
|
||||
// guard in nativeAddPaths now no-ops; if we still hit this branch
|
||||
// the cause is something else worth flagging at medium).
|
||||
const isEmptyPathspec = /\(none\)|add -- failed|empty pathspec/i.test(
|
||||
stageErrMsg,
|
||||
);
|
||||
recordSelfFeedback(
|
||||
{
|
||||
kind: isEmptyPathspec
|
||||
? "git-empty-pathspec"
|
||||
: "git-stage-failure",
|
||||
severity: isEmptyPathspec ? "low" : "medium",
|
||||
summary: `git stage failed during postUnit: ${stageErrMsg.split("\n")[0]}`,
|
||||
evidence: stageErrMsg,
|
||||
source: "detector",
|
||||
},
|
||||
s.basePath,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
const gitResult = runTurnGitAction({
|
||||
|
|
|
|||
|
|
@ -43,6 +43,29 @@ export interface RunawayGuardMetrics {
|
|||
topTools?: Record<string, number>;
|
||||
}
|
||||
|
||||
export interface RunawayGuardPauseMetadata {
|
||||
reason: string;
|
||||
pausedAt: number;
|
||||
unitType: string;
|
||||
unitId: string;
|
||||
diagnosticTurns: number;
|
||||
warningsSent: number;
|
||||
thresholdReasons: string[];
|
||||
metrics: RunawayGuardMetrics;
|
||||
lastWarningMetrics: Pick<
|
||||
RunawayGuardMetrics,
|
||||
"toolCalls" | "sessionTokens" | "elapsedMs"
|
||||
>;
|
||||
thresholds: Pick<
|
||||
RunawayGuardConfig,
|
||||
| "toolCallWarning"
|
||||
| "tokenWarning"
|
||||
| "elapsedMs"
|
||||
| "changedFilesWarning"
|
||||
| "minIntervalMs"
|
||||
>;
|
||||
}
|
||||
|
||||
interface RunawayGuardState {
|
||||
unitKey: string;
|
||||
baselineSessionTokens: number;
|
||||
|
|
@ -202,7 +225,7 @@ export function collectWorktreeFingerprint(cwd: string): string | null {
|
|||
export type RunawayGuardDecision =
|
||||
| { action: "none" }
|
||||
| { action: "warn"; message: string; final: boolean }
|
||||
| { action: "pause"; reason: string };
|
||||
| { action: "pause"; reason: string; metadata: RunawayGuardPauseMetadata };
|
||||
|
||||
export function evaluateRunawayGuard(
|
||||
unitType: string,
|
||||
|
|
@ -235,12 +258,35 @@ export function evaluateRunawayGuard(
|
|||
s.finalWarningSent &&
|
||||
hasMeaningfulGrowth(unitMetrics, s, config)
|
||||
) {
|
||||
return {
|
||||
action: "pause",
|
||||
reason:
|
||||
const reason =
|
||||
`Runaway guard paused ${unitType} ${unitId}: budget kept growing after ` +
|
||||
`${config.diagnosticTurns} diagnostic turn(s). ` +
|
||||
formatMetricSummary(unitMetrics),
|
||||
formatMetricSummary(unitMetrics);
|
||||
return {
|
||||
action: "pause",
|
||||
reason,
|
||||
metadata: {
|
||||
reason,
|
||||
pausedAt: now,
|
||||
unitType,
|
||||
unitId,
|
||||
diagnosticTurns: config.diagnosticTurns,
|
||||
warningsSent: s.warningsSent,
|
||||
thresholdReasons: reasons,
|
||||
metrics: unitMetrics,
|
||||
lastWarningMetrics: {
|
||||
toolCalls: s.lastToolCalls,
|
||||
sessionTokens: s.lastSessionTokens,
|
||||
elapsedMs: s.lastElapsedMs,
|
||||
},
|
||||
thresholds: {
|
||||
toolCallWarning: config.toolCallWarning,
|
||||
tokenWarning: config.tokenWarning,
|
||||
elapsedMs: config.elapsedMs,
|
||||
changedFilesWarning: config.changedFilesWarning,
|
||||
minIntervalMs: config.minIntervalMs,
|
||||
},
|
||||
},
|
||||
};
|
||||
}
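A sketch of how a supervisor could use the structured pause metadata instead of re-parsing the reason string (import path assumed; field names come from RunawayGuardPauseMetadata above):

import type { RunawayGuardDecision } from "./auto-runaway-guard.js"; // path assumed

function onGuardDecision(decision: RunawayGuardDecision): void {
  if (decision.action !== "pause") return;

  // The human-readable reason still drives the operator notification...
  console.warn(decision.reason);

  // ...while the metadata is machine-checkable: which thresholds tripped, how many
  // warnings were sent, and the metrics at pause time versus the configured limits.
  const { thresholdReasons, warningsSent, metrics, thresholds } = decision.metadata;
  console.warn(
    `tripped: ${thresholdReasons.join(", ")}; warnings sent: ${warningsSent}; ` +
      `tool calls ${metrics.toolCalls} (warn at ${thresholds.toolCallWarning})`,
  );
}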
|
||||
|
||||
|
|
|
|||
|
|
@ -13,11 +13,6 @@ import type {
|
|||
import { saveActivityLog } from "./activity-log.js";
|
||||
import { resolveAgentEndCancelled } from "./auto/resolve.js";
|
||||
import type { AutoSession } from "./auto/session.js";
|
||||
import { detectWorkingTreeActivity } from "./auto-supervisor.js";
|
||||
import {
|
||||
type RecoveryContext,
|
||||
recoverTimedOutUnit,
|
||||
} from "./auto-timeout-recovery.js";
|
||||
import {
|
||||
collectSessionTokenUsage,
|
||||
collectWorktreeFingerprint,
|
||||
|
|
@ -25,6 +20,11 @@ import {
|
|||
evaluateRunawayGuard,
|
||||
resolveRunawayGuardConfig,
|
||||
} from "./auto-runaway-guard.js";
|
||||
import { detectWorkingTreeActivity } from "./auto-supervisor.js";
|
||||
import {
|
||||
type RecoveryContext,
|
||||
recoverTimedOutUnit,
|
||||
} from "./auto-timeout-recovery.js";
|
||||
import {
|
||||
clearInFlightTools,
|
||||
getInFlightToolCount,
|
||||
|
|
@ -40,6 +40,7 @@ import {
|
|||
} from "./context-budget.js";
|
||||
import type { SFPreferences } from "./preferences.js";
|
||||
import { resolveAutoSupervisorConfig } from "./preferences.js";
|
||||
import { recordSelfFeedback } from "./self-feedback.js";
|
||||
import { getMilestoneSlices, getSliceTasks, isDbAvailable } from "./sf-db.js";
|
||||
import {
|
||||
readUnitRuntimeRecord,
|
||||
|
|
@ -328,8 +329,28 @@ export function startUnitSupervision(sctx: SupervisionContext): void {
|
|||
phase: "paused",
|
||||
lastProgressAt: Date.now(),
|
||||
lastProgressKind: "runaway-guard",
|
||||
runawayGuardPause: decision.metadata,
|
||||
},
|
||||
);
|
||||
const unitParts = unitId.split("/");
|
||||
recordSelfFeedback(
|
||||
{
|
||||
kind: "runaway-guard-hard-pause",
|
||||
severity: "medium",
|
||||
summary: decision.reason,
|
||||
evidence: JSON.stringify(decision.metadata, null, 2),
|
||||
suggestedFix:
|
||||
"Review the paused unit's warning responses and runtime metrics to distinguish legitimate scope from loop/churn.",
|
||||
occurredIn: {
|
||||
unitType,
|
||||
milestone: unitParts[0],
|
||||
slice: unitParts[1],
|
||||
task: unitParts.slice(2).join("/") || undefined,
|
||||
},
|
||||
source: "detector",
|
||||
},
|
||||
s.basePath,
|
||||
);
|
||||
ctx.ui.notify(decision.reason, "warning");
|
||||
await pauseAuto(ctx, pi);
|
||||
return;
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ import {
|
|||
nextMilestoneId,
|
||||
} from "../guided-flow.js";
|
||||
import { loadEffectiveSFPreferences } from "../preferences.js";
|
||||
import { recordSelfFeedback } from "../self-feedback.js";
|
||||
import {
|
||||
executeCompleteMilestone,
|
||||
executePlanMilestone,
|
||||
|
|
@ -634,6 +635,193 @@ export function registerDbTools(pi: ExtensionAPI): void {
|
|||
"sf_milestone_generate_id",
|
||||
);
|
||||
|
||||
// ─── sf_self_report ─────────────────────────────────────────────────
|
||||
// Agent-callable bug-report channel. Records anomalies the agent observes
|
||||
// in sf's own behavior so they accumulate in a backlog (forge's own
|
||||
// .sf/BACKLOG.md when running on forge itself, ~/.sf/agent/upstream-feedback.jsonl
|
||||
// otherwise). Severity drives whether the originating unit is also blocked
|
||||
// pending an sf version bump.
|
||||
|
||||
const selfReportExecute = async (
|
||||
_toolCallId: string,
|
||||
params: any,
|
||||
_signal: AbortSignal | undefined,
|
||||
_onUpdate: unknown,
|
||||
_ctx: unknown,
|
||||
) => {
|
||||
try {
|
||||
const result = recordSelfFeedback(
|
||||
{
|
||||
kind: params.kind,
|
||||
severity: params.severity,
|
||||
summary: params.summary,
|
||||
evidence: params.evidence,
|
||||
suggestedFix: params.suggested_fix,
|
||||
acceptanceCriteria: params.acceptance_criteria,
|
||||
occurredIn: params.occurred_in,
|
||||
source: "agent",
|
||||
},
|
||||
process.cwd(),
|
||||
);
|
||||
if (!result) {
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: "text" as const,
|
||||
text: "Error: failed to write self-feedback entry",
|
||||
},
|
||||
],
|
||||
details: {
|
||||
operation: "self_report",
|
||||
error: "write_failed",
|
||||
} as any,
|
||||
};
|
||||
}
|
||||
const e = result.entry;
|
||||
const blockNote = result.blocking
|
||||
? ` (BLOCKING — unit will be held until sf is bumped past ${e.sfVersion} or entry ${e.id} is resolved)`
|
||||
: "";
|
||||
return {
|
||||
content: [
|
||||
{
|
||||
type: "text" as const,
|
||||
text: `Recorded self-feedback ${e.id} [${e.severity}] ${e.kind}${blockNote}`,
|
||||
},
|
||||
],
|
||||
details: {
|
||||
operation: "self_report",
|
||||
id: e.id,
|
||||
blocking: e.blocking,
|
||||
repoIdentity: e.repoIdentity,
|
||||
sfVersion: e.sfVersion,
|
||||
} as any,
|
||||
};
|
||||
} catch (err) {
|
||||
const msg = err instanceof Error ? err.message : String(err);
|
||||
logError("tool", `sf_self_report tool failed: ${msg}`, {
|
||||
tool: "sf_self_report",
|
||||
error: String(err),
|
||||
});
|
||||
return {
|
||||
content: [
|
||||
{ type: "text" as const, text: `Error in sf_self_report: ${msg}` },
|
||||
],
|
||||
details: { operation: "self_report", error: msg } as any,
|
||||
};
|
||||
}
|
||||
};
|
||||
|
||||
const selfReportTool = {
|
||||
name: "sf_self_report",
|
||||
label: "Self Report",
|
||||
description:
|
||||
"Record any thought about sf itself — bugs, missing features, prompt-quality issues, ideas, " +
|
||||
"design speculations, agent friction — so it can be addressed in a future unit. " +
|
||||
"Use this for any sf-internal observation: brittle gate predicates, advisory-downgrade " +
|
||||
"swallowing real failures, but ALSO ambiguous prompts, missing context, friction in agent " +
|
||||
"workflows, or speculative improvements. Over-reporting is preferred to under-reporting; " +
|
||||
"dedup happens later. Do NOT use this for bugs in the user's project or for your own task " +
|
||||
"work — only for sf-the-tool observations. Entries route automatically: when working on " +
|
||||
"singularity-forge itself they land in .sf/BACKLOG.md; otherwise they land in a global " +
|
||||
"~/.sf/upstream-feedback.jsonl.",
|
||||
promptSnippet:
|
||||
"Report any sf-internal observation: bug, missing feature, prompt issue, idea, friction",
|
||||
promptGuidelines: [
|
||||
"Use sf_self_report for ANY sf-internal observation — not just bugs. Acceptable kinds include: 'prompt-quality-issue' (you found a prompt ambiguous, contradictory, or missing context), 'improvement-idea' (a non-bug enhancement that would help), 'agent-friction' (workflow friction you worked around), 'design-thought' (broader speculation), 'missing-feature' (capability you wished sf had), as well as classic bug kinds like 'brittle-predicate' or 'git-empty-pathspec'.",
|
||||
"Do NOT use this for bugs in the user's project, for your own task work, or to track your task's todo list. ONLY for observations about sf-the-tool itself.",
|
||||
"Over-reporting is preferred to under-reporting at this stage. If you noticed it about sf, file it. Dedup and threshold-to-roadmap promotion are tracked as their own backlog items and will eventually clean noise.",
|
||||
"Severity guide: low = cosmetic / nice-to-have / improvement idea. medium = noisy or imperfect or recurring friction. high = blocked the unit (sf-the-tool prevented you from completing the task). critical = needs immediate fix (currently treated as high until inline-fix dispatch lands).",
|
||||
"high/critical entries mark the originating unit as blocked: it will not seal as success, and will be re-queued only after sf is bumped past the recorded version.",
|
||||
"Provide concrete evidence — log excerpt, command, file path, error message, the literal prompt text that confused you, etc. Vague reports are not actionable; specific ones are.",
|
||||
"If you have a hypothesis about the fix, include it as suggested_fix. Even a half-baked idea is more useful than nothing.",
|
||||
"For high/critical entries, include acceptance_criteria — concrete conditions a future resolver must satisfy before calling this resolved. Without it, 'resolved' is just trust; with it, the resolver has a falsifiable bar. Phrase as 1. ... 2. ... 3. ... so each can be checked off independently.",
|
||||
"occurred_in is auto-filled from the active auto.lock; only override if you're reporting from outside the current unit.",
|
||||
],
|
||||
parameters: Type.Object({
|
||||
kind: Type.String({
|
||||
description:
|
||||
"Short stable identifier for the anomaly class (e.g. 'git-empty-pathspec', 'brittle-predicate', 'advisory-downgrade'). Reuse existing kinds when applicable.",
|
||||
}),
|
||||
severity: Type.Union(
|
||||
[
|
||||
Type.Literal("low"),
|
||||
Type.Literal("medium"),
|
||||
Type.Literal("high"),
|
||||
Type.Literal("critical"),
|
||||
],
|
||||
{
|
||||
description:
|
||||
"low/medium = log and continue. high/critical = block this unit until sf is bumped or the entry is resolved.",
|
||||
},
|
||||
),
|
||||
summary: Type.String({
|
||||
description: "One-line description of the anomaly",
|
||||
}),
|
||||
evidence: Type.Optional(
|
||||
Type.String({
|
||||
description:
|
||||
"Concrete artifact: log excerpt, command, file path, error message, etc.",
|
||||
}),
|
||||
),
|
||||
suggested_fix: Type.Optional(
|
||||
Type.String({
|
||||
description:
|
||||
"Optional hypothesis about how to fix this in sf source",
|
||||
}),
|
||||
),
|
||||
acceptance_criteria: Type.Optional(
|
||||
Type.String({
|
||||
description:
|
||||
"Optional reporter-written list of conditions a future resolver must satisfy before marking this resolved. Phrase as bullet points or a short numbered list. Example: '1. plan-quality.ts rejects grep -c predicates with a clear error. 2. existing predicates of that shape are flagged in BACKLOG. 3. test in plan-quality.test.ts covers the rejection.' Without this, resolution is just trust — with it, the resolver has a falsifiable bar to meet.",
|
||||
}),
|
||||
),
|
||||
occurred_in: Type.Optional(
|
||||
Type.Object(
|
||||
{
|
||||
milestone: Type.Optional(Type.String()),
|
||||
slice: Type.Optional(Type.String()),
|
||||
task: Type.Optional(Type.String()),
|
||||
unitType: Type.Optional(Type.String()),
|
||||
},
|
||||
{
|
||||
description:
|
||||
"Override the auto-detected current unit. Usually leave unset — the tool reads .sf/auto.lock by default.",
|
||||
},
|
||||
),
|
||||
),
|
||||
}),
|
||||
execute: selfReportExecute,
|
||||
renderCall(args: any, theme: any) {
|
||||
let text = theme.fg("toolTitle", theme.bold("self_report "));
|
||||
if (args.severity)
|
||||
text += theme.fg(
|
||||
args.severity === "critical" || args.severity === "high"
|
||||
? "error"
|
||||
: "accent",
|
||||
`[${args.severity}] `,
|
||||
);
|
||||
if (args.kind) text += theme.fg("muted", args.kind);
|
||||
if (args.summary) text += theme.fg("dim", ` — ${args.summary}`);
|
||||
return new Text(text, 0, 0);
|
||||
},
|
||||
renderResult(result: any, _options: any, theme: any) {
|
||||
const d = result.details;
|
||||
if (result.isError || d?.error) {
|
||||
return new Text(
|
||||
theme.fg("error", `Error: ${d?.error ?? "unknown"}`),
|
||||
0,
|
||||
0,
|
||||
);
|
||||
}
|
||||
const blocking = d?.blocking ? " · BLOCKING" : "";
|
||||
let text = theme.fg("success", `Recorded ${d?.id ?? ""}`);
|
||||
text += theme.fg("dim", `${blocking}`);
|
||||
return new Text(text, 0, 0);
|
||||
},
|
||||
};
|
||||
|
||||
pi.registerTool(selfReportTool);
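As a concrete illustration of the contract described above, an agent-side call would pass arguments shaped roughly like the sketch below. The values are invented for illustration; only the field names and the severity semantics come from the schema and guidelines in this tool definition.

```ts
// Hypothetical sf_self_report arguments; values are examples, not real entries.
const exampleReportArgs = {
  kind: "brittle-predicate", // reuse an existing kind where one applies
  severity: "medium", // low/medium log and continue; high/critical block the unit
  summary: "Gate predicate uses `grep -c`, which exits non-zero on zero matches",
  evidence: "verify step ran: grep -c 'TODO' src/app.ts and exited 1 on a clean file",
  suggested_fix: "Count matches in a small script, or tolerate zero matches explicitly",
  acceptance_criteria:
    "1. plan-quality rejects bare grep -c predicates. 2. A test covers the rejection.",
  // occurred_in is omitted: it is auto-filled from the active .sf/auto.lock.
};
```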
|
||||
|
||||
// ─── sf_plan_milestone (sf_milestone_plan alias) ─────────────────────
|
||||
|
||||
const planMilestoneExecute = async (
|
||||
|
|
|
|||
|
|
@ -185,6 +185,49 @@ export function registerHooks(
|
|||
}
|
||||
}
|
||||
loadToolApiKeys();
|
||||
// Drain self-feedback backlog: auto-resolve entries whose blocking
|
||||
// sf-version constraint has been satisfied by the current sf bump,
|
||||
// and surface entries that remain blocked to the operator. Done after
|
||||
// other init so notifications appear in the same session-start sweep.
|
||||
try {
|
||||
const { triageBlockedEntries, markResolved } = await import(
|
||||
"../self-feedback.js"
|
||||
);
|
||||
const triage = triageBlockedEntries(process.cwd());
|
||||
const currentSfVersion = process.env.SF_VERSION || "unknown";
|
||||
for (const e of triage.retry) {
|
||||
markResolved(
|
||||
e.id,
|
||||
{
|
||||
reason: `sf bumped past ${e.sfVersion} (was blocking on this version)`,
|
||||
evidence: {
|
||||
kind: "auto-version-bump",
|
||||
fromVersion: e.sfVersion,
|
||||
toVersion: currentSfVersion,
|
||||
},
|
||||
},
|
||||
process.cwd(),
|
||||
);
|
||||
const occ = e.occurredIn;
|
||||
const unit = occ
|
||||
? [occ.milestone, occ.slice, occ.task].filter(Boolean).join("/") ||
|
||||
occ.unitType ||
|
||||
"(unknown unit)"
|
||||
: "(unknown unit)";
|
||||
ctx.ui?.notify?.(
|
||||
`Self-feedback ${e.id} (${e.kind}) auto-resolved — sf bumped past ${e.sfVersion}. Originating unit ${unit} should be re-run.`,
|
||||
"info",
|
||||
);
|
||||
}
|
||||
if (triage.stillBlocked.length > 0) {
|
||||
ctx.ui?.notify?.(
|
||||
`${triage.stillBlocked.length} self-feedback entr${triage.stillBlocked.length === 1 ? "y" : "ies"} still blocked on prior sf versions. See .sf/BACKLOG.md or ~/.sf/agent/upstream-feedback.jsonl.`,
|
||||
"warning",
|
||||
);
|
||||
}
|
||||
} catch {
|
||||
/* non-fatal — self-feedback drain must never block session start */
|
||||
}
|
||||
});
|
||||
|
||||
pi.on("session_switch", async (_event, ctx) => {
|
||||
|
|
|
|||
|
|
@ -429,7 +429,11 @@ exact model IDs by default, and also support `*` globs plus `:suffix` shorthand.
|
|||
SF always restricts OpenRouter to `:free` models and OpenCode to zero-cost/free
|
||||
models. OpenCode Go stays unrestricted because it is the subscribed tier. You
|
||||
can still add an explicit OpenRouter allow-list to narrow that free subset
|
||||
further:
|
||||
further. Xiaomi regional token-plan aliases are hidden from normal selection;
|
||||
use `xiaomi/*` for the default AMS-backed endpoint. Claude Code is hidden from
|
||||
normal SF model selection because SF routes through managed provider keys here.
|
||||
Mistral live discovery hides non-selection endpoints such as embeddings, OCR,
|
||||
moderation, TTS/transcription/realtime audio, and private fine-tunes.
|
||||
|
||||
```yaml
|
||||
---
|
||||
|
|
|
|||
|
|
@ -77,6 +77,7 @@ export function writeExportFile(
|
|||
|
||||
if (format === "json") {
|
||||
const report = {
|
||||
schemaVersion: 1,
|
||||
exportedAt: new Date().toISOString(),
|
||||
project: projectName,
|
||||
totals: visualizerData?.totals ?? getProjectTotals(units),
|
||||
|
|
@ -290,6 +291,7 @@ export async function handleExport(
|
|||
|
||||
if (format === "json") {
|
||||
const report = {
|
||||
schemaVersion: 1,
|
||||
exportedAt: new Date().toISOString(),
|
||||
project: projectName,
|
||||
totals: getProjectTotals(units),
|
||||
|
|
|
|||
|
|
@ -14,7 +14,8 @@
|
|||
"sf_decision_save",
|
||||
"sf_summary_save",
|
||||
"sf_requirement_update",
|
||||
"sf_milestone_generate_id"
|
||||
"sf_milestone_generate_id",
|
||||
"sf_self_report"
|
||||
],
|
||||
"commands": ["sf", "kill", "worktree", "exit"],
|
||||
"hooks": [
|
||||
|
|
|
|||
|
|
@ -227,27 +227,27 @@
|
|||
"long_context_ruler": 95,
|
||||
"arena_elo": 1495,
|
||||
"instruction_following": null,
|
||||
"source": "MiniMax M2.7 model card + openrouter (SWE-Pro 56.22, Terminal Bench 2 57.0, GDPval-AA ELO 1495) + inheriting stable M2-family numbers (LCB, HLE, AIME, GPQA, MMLU-Pro) that M2.5/M2.7 didn't re-run but carry from the same weights family. SWE-bench Verified 80.2 published for M2.5 (≤ M2.7), BrowseComp 76.3 from M2.5 card. Context: weights support 1M tokens; individual endpoints (opencode-go, openrouter) may cap lower",
|
||||
"context_window": 1048576,
|
||||
"source": "MiniMax M2.7 model card + API docs (SWE-Pro 56.22, Terminal Bench 2 57.0, GDPval-AA ELO 1495) + inheriting stable M2-family numbers (LCB, HLE, AIME, GPQA, MMLU-Pro) that M2.5/M2.7 did not re-run but carry from the same weights family. SWE-bench Verified 80.2 published for M2.5, BrowseComp 76.3 from M2.5 card. API docs list M2.7 and M2.7-highspeed at the same performance tier with 204,800 context and 131,072 max output; highspeed changes serving speed/cost",
|
||||
"context_window": 204800,
|
||||
"max_output_tokens": 131072
|
||||
},
|
||||
"MiniMax-M2.7-highspeed": {
|
||||
"swe_bench": null,
|
||||
"swe_bench_verified": 76,
|
||||
"live_code_bench": 80,
|
||||
"swe_bench_verified": 80.2,
|
||||
"live_code_bench": 83,
|
||||
"human_eval": null,
|
||||
"hle": 11,
|
||||
"aime_2026": 74,
|
||||
"gpqa": 74,
|
||||
"mmlu_pro": 78,
|
||||
"hle": 31.8,
|
||||
"aime_2026": 78,
|
||||
"gpqa": 78,
|
||||
"mmlu_pro": 82,
|
||||
"bbh": null,
|
||||
"browse_comp": 72,
|
||||
"browse_comp": 76.3,
|
||||
"simple_qa": null,
|
||||
"long_context_ruler": 95,
|
||||
"arena_elo": null,
|
||||
"arena_elo": 1495,
|
||||
"instruction_following": null,
|
||||
"source": "MiniMax M2.7-highspeed — fast tier of M2.7 trading ~5pp quality for throughput. Scores estimated from M2.7 baseline minus published highspeed tradeoff; same context/output limits",
|
||||
"context_window": 131072,
|
||||
"source": "MiniMax M2.7-highspeed API docs: same performance as M2.7, faster/agile serving at approximately 100 tps and higher cost. Use M2.7 benchmark scores; let cost/latency policy decide between tiers",
|
||||
"context_window": 204800,
|
||||
"max_output_tokens": 131072
|
||||
},
|
||||
"MiniMax-M2.5": {
|
||||
|
|
@ -265,8 +265,27 @@
|
|||
"long_context_ruler": 92,
|
||||
"arena_elo": null,
|
||||
"instruction_following": null,
|
||||
"source": "MiniMax M2.5 official card: SWE-Bench Verified 80.2, Multi-SWE-Bench 51.3, BrowseComp 76.3 (w/ context mgmt). LCB/HLE/AIME/GPQA/MMLU-Pro inherited from M2 family baseline (same weights lineage). Context: 1M weights-level, endpoints may serve less",
|
||||
"context_window": 1048576,
|
||||
"source": "MiniMax M2.5 official card: SWE-Bench Verified 80.2, Multi-SWE-Bench 51.3, BrowseComp 76.3 (w/ context mgmt). LCB/HLE/AIME/GPQA/MMLU-Pro inherited from M2 family baseline (same weights lineage). API docs list M2.5 and M2.5-highspeed at the same performance tier with 204,800 context and 131,072 max output",
|
||||
"context_window": 204800,
|
||||
"max_output_tokens": 131072
|
||||
},
|
||||
"MiniMax-M2.5-highspeed": {
|
||||
"swe_bench": null,
|
||||
"swe_bench_verified": 80.2,
|
||||
"live_code_bench": 83,
|
||||
"human_eval": null,
|
||||
"hle": 31.8,
|
||||
"aime_2026": 78,
|
||||
"gpqa": 78,
|
||||
"mmlu_pro": 82,
|
||||
"bbh": null,
|
||||
"browse_comp": 76.3,
|
||||
"simple_qa": null,
|
||||
"long_context_ruler": 92,
|
||||
"arena_elo": null,
|
||||
"instruction_following": null,
|
||||
"source": "MiniMax M2.5-highspeed API docs: same performance as M2.5, faster/agile serving at approximately 100 tps and higher cost. Use M2.5 benchmark scores; let cost/latency policy decide between tiers",
|
||||
"context_window": 204800,
|
||||
"max_output_tokens": 131072
|
||||
},
|
||||
"MiniMax-M2.1": {
|
||||
|
|
|
|||
|
|
@ -1,3 +1,6 @@
|
|||
[
|
||||
{
|
||||
"schemaVersion": 1,
|
||||
"entries": [
|
||||
{ "provider": "kimi-coding", "model": "kimi-for-coding", "priority": 0 }
|
||||
]
|
||||
]
|
||||
}
|
||||
|
|
|
|||
|
|
@ -74,7 +74,7 @@ import { dirname, join, resolve } from "node:path";
|
|||
import { cwd as getCwd } from "node:process";
|
||||
|
||||
import { blendedRanking } from "./bayesian-blender.mjs";
|
||||
import primaryProviderChainEntries from "./data/primary-provider-chain.json" with {
|
||||
import primaryProviderChainConfig from "./data/primary-provider-chain.json" with {
|
||||
type: "json",
|
||||
};
|
||||
import { computeUnitTypeScore } from "./loadCapabilityOverrides.mjs";
|
||||
|
|
@ -85,6 +85,9 @@ const PRIORITY_STEP = 10;
|
|||
const DEFAULT_CHAIN_NAME = "default";
|
||||
const MAIN_CHAIN_NAME = "main";
|
||||
const PROJECT_SETTINGS_SUBPATH = ".sf/agent/settings.json";
|
||||
const primaryProviderChainEntries = Array.isArray(primaryProviderChainConfig)
|
||||
? primaryProviderChainConfig
|
||||
: (primaryProviderChainConfig.entries ?? []);
|
||||
|
||||
/**
|
||||
* Compute blended ranking for a single unit type across every model we
|
||||
|
|
|
|||
|
|
@ -235,10 +235,43 @@ test("loadCapabilityOverrides: all 40 models have populated dimension profiles",
|
|||
}
|
||||
});
|
||||
|
||||
test("loadCapabilityOverrides: MiniMax highspeed tiers do not take a quality penalty", async () => {
|
||||
const { benchmarks } = await loadCapabilityOverrides();
|
||||
const m27 = benchmarks["MiniMax-M2.7"];
|
||||
const m27Highspeed = benchmarks["MiniMax-M2.7-highspeed"];
|
||||
const m25 = benchmarks["MiniMax-M2.5"];
|
||||
const m25Highspeed = benchmarks["MiniMax-M2.5-highspeed"];
|
||||
|
||||
assert.ok(m27, "MiniMax-M2.7 benchmark entry exists");
|
||||
assert.ok(m27Highspeed, "MiniMax-M2.7-highspeed benchmark entry exists");
|
||||
assert.ok(m25, "MiniMax-M2.5 benchmark entry exists");
|
||||
assert.ok(m25Highspeed, "MiniMax-M2.5-highspeed benchmark entry exists");
|
||||
|
||||
for (const key of [
|
||||
"swe_bench_verified",
|
||||
"live_code_bench",
|
||||
"hle",
|
||||
"aime_2026",
|
||||
"gpqa",
|
||||
"mmlu_pro",
|
||||
"browse_comp",
|
||||
]) {
|
||||
assert.strictEqual(m27Highspeed[key], m27[key], `M2.7 highspeed ${key}`);
|
||||
assert.strictEqual(m25Highspeed[key], m25[key], `M2.5 highspeed ${key}`);
|
||||
}
|
||||
|
||||
assert.strictEqual(m27Highspeed.context_window, 204800);
|
||||
assert.strictEqual(m25Highspeed.context_window, 204800);
|
||||
});
|
||||
|
||||
test("loadCapabilityOverrides: computeUnitTypeScore resolves provider wire ids to semantic benchmark keys", async () => {
|
||||
const { overrides, weights } = await loadCapabilityOverrides();
|
||||
assert.ok(overrides["kimi-k2.5"], "canonical Kimi K2.5 benchmark key exists");
|
||||
assert.equal(overrides.k2p5, undefined, "k2p5 is a wire alias, not a benchmark key");
|
||||
assert.equal(
|
||||
overrides.k2p5,
|
||||
undefined,
|
||||
"k2p5 is a wire alias, not a benchmark key",
|
||||
);
|
||||
|
||||
const prefixed = computeUnitTypeScore(
|
||||
"kimi-coding/k2p5",
|
||||
|
|
|
|||
|
|
@ -906,8 +906,15 @@ export function nativeAddAllWithExclusions(
|
|||
* Stage specific files.
|
||||
* Native: libgit2 index add.
|
||||
* Fallback: `git add -- <paths>`.
|
||||
*
|
||||
* No-ops on an empty paths array — `git add --` with no pathspec is an error
|
||||
* shaped like `git add -- (none) failed`, surfaced to users via
|
||||
* auto-post-unit's stage-failure path. Callers are expected to filter empty
|
||||
* lists upstream; the early-return is defense-in-depth so a missed filter
|
||||
* doesn't manifest as a confusing diagnostic.
|
||||
*/
|
||||
export function nativeAddPaths(basePath: string, paths: string[]): void {
|
||||
if (paths.length === 0) return;
|
||||
const native = loadNative();
|
||||
if (native) {
|
||||
native.gitAddPaths(basePath, paths);
|
||||
|
|
|
|||
|
|
@ -71,6 +71,13 @@ const OPENCODE_FREE_MODEL_IDS = new Set([
|
|||
"nemotron-3-super-free",
|
||||
]);
|
||||
|
||||
const HIDDEN_MODEL_PROVIDERS = new Set([
|
||||
"claude-code",
|
||||
"xiaomi-token-plan-ams",
|
||||
"xiaomi-token-plan-cn",
|
||||
"xiaomi-token-plan-sgp",
|
||||
]);
|
||||
|
||||
function resolveProviderModelAllowList(
|
||||
providerModelAllow: ProviderModelAllowList | undefined,
|
||||
provider: string,
|
||||
|
|
@ -109,6 +116,23 @@ function isZeroCost(cost: ProviderPolicyModel["cost"] | undefined): boolean {
|
|||
return !!cost && cost.input === 0 && cost.output === 0 && cost.cacheRead === 0 && cost.cacheWrite === 0;
|
||||
}
|
||||
|
||||
function isMistralSelectionModel(modelId: string): boolean {
|
||||
const modelKey = modelId.trim().toLowerCase();
|
||||
if (
|
||||
modelKey.startsWith("ft:") ||
|
||||
modelKey.includes("embed") ||
|
||||
modelKey.includes("moderation") ||
|
||||
modelKey.includes("ocr") ||
|
||||
modelKey.includes("voxtral") ||
|
||||
modelKey.includes("transcribe") ||
|
||||
modelKey.includes("tts") ||
|
||||
modelKey.includes("realtime")
|
||||
) {
|
||||
return false;
|
||||
}
|
||||
return true;
|
||||
}
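Concretely, the filter keeps ordinary chat/completion models and drops the endpoint families named above. A few hedged examples (model ids are illustrative, not a live catalogue):

```ts
// Behaviour sketch for isMistralSelectionModel; ids are examples only.
isMistralSelectionModel("mistral-large-latest"); // true: ordinary chat model
isMistralSelectionModel("mistral-embed"); // false: embeddings endpoint
isMistralSelectionModel("voxtral-mini-latest"); // false: audio family is excluded
isMistralSelectionModel("ft:open-mistral-7b-custom"); // false: private fine-tune prefix
```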
|
||||
|
||||
function isModelAllowedByBuiltInProviderPolicy(
|
||||
provider: string,
|
||||
modelId: string,
|
||||
|
|
@ -116,6 +140,12 @@ function isModelAllowedByBuiltInProviderPolicy(
|
|||
): boolean {
|
||||
const providerKey = provider.toLowerCase();
|
||||
const modelKey = modelId.trim().toLowerCase();
|
||||
if (HIDDEN_MODEL_PROVIDERS.has(providerKey)) {
|
||||
return false;
|
||||
}
|
||||
if (providerKey === "mistral") {
|
||||
return isMistralSelectionModel(modelId);
|
||||
}
|
||||
if (providerKey === "openrouter") {
|
||||
return providerModelAllowEntryMatches(":free", modelKey);
|
||||
}
|
||||
|
|
@ -170,7 +200,8 @@ export function filterModelsByProviderModelAllow<
|
|||
(!providerModelAllow || Object.keys(providerModelAllow).length === 0) &&
|
||||
(!providerModelBlock || Object.keys(providerModelBlock).length === 0) &&
|
||||
!models.some((model) =>
|
||||
["openrouter", "opencode"].includes(model.provider.toLowerCase()),
|
||||
["mistral", "openrouter", "opencode"].includes(model.provider.toLowerCase()) ||
|
||||
HIDDEN_MODEL_PROVIDERS.has(model.provider.toLowerCase()),
|
||||
)
|
||||
)
|
||||
return [...models];
|
||||
|
|
|
|||
|
|
@ -70,6 +70,7 @@ If work falls into the second bucket, do not fail the milestone just because it
|
|||
- `deviations` (string) — Deviations from the original plan
|
||||
12. Update `.sf/PROJECT.md`: use the `write` tool with `path: ".sf/PROJECT.md"` and `content` containing the full updated document reflecting milestone completion and current project state. Do NOT use the `edit` tool for this — PROJECT.md is a full-document refresh.
|
||||
13. Review all slice summaries for cross-cutting lessons, patterns, or gotchas that emerged during this milestone. Append any non-obvious, reusable insights to `.sf/KNOWLEDGE.md`.
|
||||
13b. Review `.sf/BACKLOG.md` (if present — it exists only when sf is dogfooded on the forge repo itself) and the global `~/.sf/agent/upstream-feedback.jsonl`. For any sf-internal anomaly that recurred across multiple slices in this milestone but is not yet captured in either log, file it now via `sf_self_report`. The milestone-close agent is the last line of defense for systemic sf bugs that single-task agents missed.
|
||||
14. Do not commit manually — the system auto-commits your changes after this unit completes.
|
||||
- Say: "Milestone {{milestoneId}} complete."
|
||||
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ Then:
|
|||
8. Draft the UAT content you will pass as `uatContent` — a concrete UAT script with real test cases derived from the slice plan and task summaries. Include preconditions, numbered steps with expected outcomes, and edge cases. This must NOT be a placeholder or generic template — tailor every test case to what this slice actually built.
|
||||
9. Review task summaries for `key_decisions`. Append any significant decisions to `.sf/DECISIONS.md` if missing.
|
||||
10. Review task summaries for patterns, gotchas, or non-obvious lessons learned. If any would save future agents from repeating investigation or hitting the same issues, append them to `.sf/KNOWLEDGE.md`. Only add entries that are genuinely useful — don't pad with obvious observations.
|
||||
10b. Scan task summaries and the slice's activity log for sf-internal anomalies that the per-task agents may not have reported individually — repeated `Git stage failed`, `Verification failed … advisory`, `Safety: N unexpected file change(s)`, brittle gate predicates, etc. For any genuine sf-the-tool defect that surfaced during this slice but was NOT already filed via `sf_self_report`, file it now via `sf_self_report` with appropriate severity. This is the slice-level sweep — task-level agents file individual reports during execution; the slice-close agent catches systemic issues only visible across multiple tasks.
|
||||
11. Call `sf_complete_slice` with the camelCase fields `milestoneId`, `sliceId`, `sliceTitle`, `oneLiner`, `narrative`, `verification`, and `uatContent`, plus any optional enrichment fields you have. Do NOT manually mark the roadmap checkbox — the tool writes to the DB, renders `{{sliceSummaryPath}}` and `{{sliceUatPath}}`, and updates the ROADMAP.md projection automatically.
|
||||
12. Do not run git commands — the system commits your changes and handles any merge after this unit succeeds.
|
||||
13. Update `.sf/PROJECT.md` if it exists — refresh current state if needed: use the `write` tool with `path: ".sf/PROJECT.md"` and `content` containing the full updated document reflecting current project state. Do NOT use the `edit` tool for this — PROJECT.md is a full-document refresh.
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ Then:
|
|||
- If you need a one-off script, scratch file, generated fixture, or temporary helper to understand or verify the work, either delete it before completion or promote it into the durable artifact named by the task plan.
|
||||
- Do not leave duplicate sources of truth. When temporary/seed data is normalized into a canonical location, update downstream code to read only the canonical path and remove or clearly mark the old copy as non-authoritative.
|
||||
- Do not satisfy verification with an ad-hoc helper when the task asks for a durable harness, command, test, or report. The durable planned artifact must own the repeatable check.
|
||||
- Before calling `sf_complete_task`, inspect `git status --short` and make sure every changed/untracked file is intentional, in-scope, and either listed in the task plan/summary or explicitly explained as a local adaptation.
|
||||
- Before calling `sf_complete_task` with `milestoneId`, `sliceId`, and `taskId`, inspect `git status --short` and make sure every changed/untracked file is intentional, in-scope, and either listed in the task plan/summary or explicitly explained as a local adaptation.
|
||||
6. Write or update tests as part of execution — tests are verification, not an afterthought. If the slice plan defines test files in its Verification section and this is the first task, create them (they should initially fail).
|
||||
7. When implementing non-trivial runtime behavior (async flows, API boundaries, background processes, error paths), add or preserve agent-usable observability. Skip this for simple changes where it doesn't apply.
|
||||
|
||||
|
|
@ -81,6 +81,8 @@ Then:
|
|||
- For compile/typecheck failures that mention an unknown method, function, type, import, or package member, verify the exact local API before editing. Use the language's own tooling or installed source (`go doc`, `go list`, module cache/source grep, `tsc` types, generated declarations, or equivalent). Do not invent adjacent method names.
|
||||
- After a compile-repair edit, rerun the narrow failing command immediately before more feature work. If two repair attempts leave the same unknown-symbol class, stop broad edits and write a precise handoff/blocker summary.
|
||||
17. **Blocker discovery:** If execution reveals that the remaining slice plan is fundamentally invalid — not just a bug or minor deviation, but a plan-invalidating finding like a wrong API, missing capability, or architectural mismatch — set `blocker_discovered: true` in the task summary frontmatter and describe the blocker clearly in the summary narrative. Do NOT set `blocker_discovered: true` for ordinary debugging, minor deviations, or issues that can be fixed within the current task or the remaining plan. This flag triggers an automatic replan of the slice.
|
||||
17b. **sf-internal anomalies and observations:** If during execution you observe sf-the-tool misbehaving (empty `git add --` pathspecs, brittle gate predicates, advisory-downgrade hiding real failures, false safety floods), find a prompt ambiguous or contradictory, hit workflow friction, or have an idea that would make sf better — call `sf_self_report`. Use `prompt-quality-issue`, `improvement-idea`, `agent-friction`, or `design-thought` kinds for non-bug observations alongside the classic bug kinds. Severity guide: `low`/`medium` for cosmetic / noisy / nice-to-have (sf continues); `high`/`critical` only when the sf issue actually prevents the task from sealing correctly (this blocks the unit). For high/critical, include `acceptance_criteria` so a future resolver has a falsifiable bar. This is distinct from `blocker_discovered` (which is about the user's plan, not about sf). Over-reporting is preferred to under-reporting at this stage.
|
||||
17c. **If your task picks up a backlog entry:** Read the entry's `acceptanceCriteria` (in `.sf/BACKLOG.md` or `.sf/self-feedback.jsonl`). Confirm each criterion is satisfied by your fix before considering the entry resolved. When you complete the task, the system will eventually run `markResolved` with structured evidence — for now, cite the entry id and which criteria you met in your task summary's `narrative` so the resolution is traceable. Do not silently fix and move on.
|
||||
18. If you made an architectural, pattern, library, or observability decision during this task that downstream work should know about, append it to `.sf/DECISIONS.md` (read the template at `~/.sf/agent/extensions/sf/templates/decisions.md` if the file doesn't exist yet). Not every task produces decisions — only append when a meaningful choice was made.
|
||||
19. If you discover a non-obvious rule, recurring gotcha, or useful pattern during execution, append it to `.sf/KNOWLEDGE.md`. Only add entries that would save future agents from repeating your investigation. Don't add obvious things.
|
||||
20. Read the template at `~/.sf/agent/extensions/sf/templates/task-summary.md`
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
Plan milestone {{milestoneId}} ("{{milestoneTitle}}"). Read `.sf/DECISIONS.md` if it exists — respect existing decisions. Read `.sf/REQUIREMENTS.md` if it exists and treat Active requirements as the capability contract. If `REQUIREMENTS.md` is missing, continue in legacy compatibility mode but explicitly note missing requirement coverage. Use the **Roadmap** output template below to shape the milestone planning payload you send to `sf_plan_milestone`. Call `sf_plan_milestone` to persist the milestone planning fields and render `{{milestoneId}}-ROADMAP.md` from DB state. Do **not** write `{{milestoneId}}-ROADMAP.md`, `ROADMAP.md`, or other planning artifacts manually. If planning produces structural decisions, append them to `.sf/DECISIONS.md`. {{skillActivation}} Fill the Horizontal Checklist section with cross-cutting concerns considered during planning (requirements re-read, decisions re-evaluated, graceful shutdown, revenue paths, auth boundary, shared resources, reconnection). Omit for trivial milestones.
|
||||
|
||||
Before calling `sf_plan_milestone`, run a bounded **Vision Alignment Meeting** for the milestone and roadmap. This is allowed to be broader and more nuanced than slice planning. Include at least these participant lenses:
|
||||
Before calling `sf_plan_milestone`, run a bounded **Vision Alignment Meeting** for the milestone and roadmap as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is allowed to be broader and more nuanced than slice planning. Include at least these participant lenses:
|
||||
- Product Manager
|
||||
- User Advocate
|
||||
- Customer Panel
|
||||
|
|
|
|||
|
|
@ -43,7 +43,7 @@ If milestone research exists (inlined above), trust those findings and skip redu
|
|||
|
||||
Narrate your decomposition reasoning — why you're grouping work this way, what risks are driving the order, what verification strategy you're choosing and why. Use complete sentences rather than planner shorthand or fragmentary notes.
|
||||
|
||||
Before you persist the roadmap, run a bounded **Vision Alignment Meeting**. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses:
|
||||
Before you persist the roadmap, run a bounded **Vision Alignment Meeting** as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses:
|
||||
- **Product Manager:** what is the real product move and what should the roadmap prove?
|
||||
- **User Advocate:** what must matter for the user experience and trust surface?
|
||||
- **Customer Panel:** multiple likely customer viewpoints, not a single flattened “user”.
|
||||
|
|
|
|||
|
|
@ -6,6 +6,7 @@
|
|||
*/
|
||||
|
||||
import { existsSync, readFileSync } from "node:fs";
|
||||
import { lint as lintMarkdown } from "markdownlint/sync";
|
||||
import { inspectMilestoneRoadmapMarkdown } from "../milestone-quality.js";
|
||||
import { inspectSlicePlanMarkdown } from "../plan-quality.js";
|
||||
import { parseRoadmapSlices } from "../roadmap-slices.js";
|
||||
|
|
@ -52,14 +53,71 @@ export function validateContent(
|
|||
|
||||
type ContentValidatorFn = (content: string) => ContentViolation[];
|
||||
|
||||
const LEAKED_JSON_FIELD_RE =
|
||||
/(^|[,{]\s*)"(?:successCriteria|proofLevel|integrationClosure|observabilityImpact|planningMeeting|visionMeeting|tasks|updatedTasks|removedTaskIds|verify|expectedOutput)"\s*:|",\s*"[A-Za-z][A-Za-z0-9_]*"\s*:/m;
|
||||
|
||||
const MARKDOWNLINT_CONFIG = {
|
||||
default: true,
|
||||
// Generated plans often contain long commands/tables and repeated headings
|
||||
// across role sections; keep markdownlint focused on structural hazards.
|
||||
MD013: false, // line length
|
||||
MD024: false, // duplicate heading text
|
||||
MD033: false, // inline HTML
|
||||
};
|
||||
|
||||
const VALIDATORS: Record<string, ContentValidatorFn> = {
|
||||
"plan-slice": validatePlanSlice,
|
||||
"plan-milestone": validatePlanMilestone,
|
||||
"plan-task": validateGenericMarkdownArtifact,
|
||||
"replan-slice": validatePlanSlice,
|
||||
"execute-task": validateGenericMarkdownArtifact,
|
||||
"complete-slice": validateGenericMarkdownArtifact,
|
||||
"validate-milestone": validateGenericMarkdownArtifact,
|
||||
};
|
||||
|
||||
function validatePlanSlice(content: string): ContentViolation[] {
|
||||
function validateCommonArtifactText(content: string): ContentViolation[] {
|
||||
const violations: ContentViolation[] = [];
|
||||
|
||||
if (content.includes("\\n")) {
|
||||
violations.push({
|
||||
severity: "warning",
|
||||
reason:
|
||||
"Artifact contains literal escaped newline text (`\\n`) instead of rendered newlines",
|
||||
});
|
||||
}
|
||||
|
||||
const leakedJson = content.match(LEAKED_JSON_FIELD_RE);
|
||||
if (leakedJson) {
|
||||
violations.push({
|
||||
severity: "warning",
|
||||
reason: `Artifact appears to contain leaked JSON field syntax near '${leakedJson[0].slice(0, 80)}'`,
|
||||
});
|
||||
}
|
||||
|
||||
return violations;
|
||||
}
|
||||
|
||||
function validateMarkdownHygiene(content: string): ContentViolation[] {
|
||||
const results = lintMarkdown({
|
||||
strings: { artifact: content },
|
||||
config: MARKDOWNLINT_CONFIG,
|
||||
});
|
||||
return (results.artifact ?? []).slice(0, 10).map((issue) => ({
|
||||
severity: "warning",
|
||||
reason: `Markdown lint ${issue.ruleNames.join("/")} at line ${issue.lineNumber}: ${issue.ruleDescription}`,
|
||||
}));
|
||||
}
|
||||
|
||||
function validateGenericMarkdownArtifact(content: string): ContentViolation[] {
|
||||
return [
|
||||
...validateCommonArtifactText(content),
|
||||
...validateMarkdownHygiene(content),
|
||||
];
|
||||
}
|
||||
|
||||
function validatePlanSlice(content: string): ContentViolation[] {
|
||||
const violations: ContentViolation[] = validateGenericMarkdownArtifact(content);
|
||||
|
||||
// Must have at least 1 task entry — single-task slices are valid (#3649)
|
||||
const taskCount = (content.match(/- \[[ x]\] \*\*T\d+/g) || []).length;
|
||||
if (taskCount < 1) {
|
||||
|
|
@ -99,7 +157,7 @@ function validatePlanSlice(content: string): ContentViolation[] {
|
|||
}
|
||||
|
||||
function validatePlanMilestone(content: string): ContentViolation[] {
|
||||
const violations: ContentViolation[] = [];
|
||||
const violations: ContentViolation[] = validateGenericMarkdownArtifact(content);
|
||||
|
||||
// Must have at least 1 slice entry. Roadmaps are normally rendered as a
|
||||
// Slice Overview table, so use the canonical parser instead of only looking
|
||||
|
|
|
|||
433
src/resources/extensions/sf/self-feedback.ts
Normal file
|
|
@ -0,0 +1,433 @@
|
|||
/**
|
||||
* Self-Feedback channel — sf records its own anomalies (caught by runtime
|
||||
* detectors or reported via the sf_self_report tool) so they can be addressed
|
||||
* by future units.
|
||||
*
|
||||
* Routing:
|
||||
* - When the current project IS singularity-forge itself (detected via
|
||||
* package.json `name`), entries land in two places:
|
||||
* • `<basePath>/.sf/BACKLOG.md` — human-readable summary
|
||||
* • `<basePath>/.sf/self-feedback.jsonl` — structured source of truth
|
||||
* The jsonl is what reads use. The markdown is for humans browsing the dir.
|
||||
* - For any other project, entries land in
|
||||
* `~/.sf/agent/upstream-feedback.jsonl` — a global cross-project log so
|
||||
* anomalies in sf's behavior are not lost when sf is dogfooded on
|
||||
* third-party codebases.
|
||||
*
|
||||
* Severity → blocking semantics:
|
||||
* - low/medium: log-and-continue. Bug accumulates in the backlog.
|
||||
* - high: blocking — the unit that produced the report must not seal
|
||||
* successfully. On next auto session-start, getBlockedEntries()
|
||||
* returns this entry; the dispatcher checks whether sfVersion
|
||||
* has bumped since the report and re-queues if so.
|
||||
* - critical: reserved for inline-fix mode (forge only). For now treated
|
||||
* as `high`. The dispatcher hook is wired in a follow-up.
|
||||
*
|
||||
* This module is intentionally I/O-only and side-effect-free relative to
|
||||
* the in-memory workflow state. Write failures must NEVER propagate —
|
||||
* sf keeps running.
|
||||
*/
|
||||
|
||||
import {
|
||||
appendFileSync,
|
||||
existsSync,
|
||||
mkdirSync,
|
||||
readFileSync,
|
||||
writeFileSync,
|
||||
} from "node:fs";
|
||||
import { homedir } from "node:os";
|
||||
import { dirname, join } from "node:path";
|
||||
|
||||
const SF_HOME = process.env.SF_HOME || join(homedir(), ".sf");
|
||||
const UPSTREAM_LOG = join(SF_HOME, "agent", "upstream-feedback.jsonl");
|
||||
const BACKLOG_HEADER =
|
||||
"# SF Self-Feedback Backlog\n\n" +
|
||||
"Anomalies caught during auto runs (by runtime detectors or via the\n" +
|
||||
"`sf_self_report` tool). Each row is a candidate work item for sf to\n" +
|
||||
"address in itself. Source-of-truth records live in `self-feedback.jsonl`.\n\n" +
|
||||
"Blocking entries (severity high+) hold their originating unit until\n" +
|
||||
"`sf` is bumped past the version recorded with the entry, or the entry\n" +
|
||||
"is explicitly resolved.\n\n" +
|
||||
"| Timestamp | Kind | Severity | Blocking | sfVersion | Unit | Summary |\n" +
|
||||
"|---|---|---|---|---|---|---|\n";
|
||||
|
||||
export type SelfFeedbackSeverity = "critical" | "high" | "medium" | "low";
|
||||
|
||||
export interface SelfFeedbackOccurredIn {
|
||||
milestone?: string;
|
||||
slice?: string;
|
||||
task?: string;
|
||||
unitType?: string;
|
||||
}
|
||||
|
||||
export interface SelfFeedbackEntry {
|
||||
kind: string;
|
||||
severity: SelfFeedbackSeverity;
|
||||
summary: string;
|
||||
evidence?: string;
|
||||
suggestedFix?: string;
|
||||
/**
|
||||
* Optional reporter-written list of conditions that must be true before the
|
||||
* entry can be considered resolved. When present, a resolver should cite
|
||||
* which criteria they satisfied via markResolved's `criteriaMet` arg.
|
||||
* Without this, "resolved" only means "someone called markResolved" —
|
||||
* which is exactly the half-fix risk the channel is meant to surface.
|
||||
*/
|
||||
acceptanceCriteria?: string;
|
||||
occurredIn?: SelfFeedbackOccurredIn;
|
||||
source: "agent" | "detector";
|
||||
}
|
||||
|
||||
/**
|
||||
* Structured evidence cited when an entry is marked resolved. One of the
|
||||
* non-reason fields should be present so the resolution is traceable.
|
||||
* - `kind: "auto-version-bump"` — auto-resolved by session_start when sf
|
||||
* was bumped past the recorded version. No human action.
|
||||
* - `kind: "agent-fix"` — agent landed a code fix. Cite commitSha and
|
||||
* optionally testPath / summaryNarrative.
|
||||
* - `kind: "human-clear"` — operator manually cleared via doctor or by
|
||||
* editing BACKLOG.md. reason should explain why.
|
||||
* - `kind: "promoted-to-requirement"` — entry was promoted to a REQUIREMENTS
|
||||
* row by the threshold-promotion sweeper.
|
||||
*/
|
||||
export type ResolutionEvidence =
|
||||
| {
|
||||
kind: "auto-version-bump";
|
||||
fromVersion: string;
|
||||
toVersion: string;
|
||||
}
|
||||
| {
|
||||
kind: "agent-fix";
|
||||
commitSha?: string;
|
||||
testPath?: string;
|
||||
summaryNarrative?: string;
|
||||
}
|
||||
| { kind: "human-clear" }
|
||||
| { kind: "promoted-to-requirement"; requirementId: string };
|
||||
|
||||
export interface PersistedSelfFeedbackEntry extends SelfFeedbackEntry {
|
||||
id: string;
|
||||
ts: string;
|
||||
basePath: string;
|
||||
repoIdentity: "forge" | "external";
|
||||
sfVersion: string;
|
||||
blocking: boolean;
|
||||
resolvedAt?: string;
|
||||
resolvedReason?: string;
|
||||
resolvedBySfVersion?: string;
|
||||
resolvedEvidence?: ResolutionEvidence;
|
||||
/** IDs from `acceptanceCriteria` (or hashed lines) the resolver claimed satisfied. Only relevant when the entry had acceptanceCriteria set. */
|
||||
resolvedCriteriaMet?: string[];
|
||||
}
|
||||
|
||||
export interface RecordResult {
|
||||
entry: PersistedSelfFeedbackEntry;
|
||||
/** True when callers should treat the originating unit as blocked. */
|
||||
blocking: boolean;
|
||||
}
|
||||
|
||||
// ─── Identity & version helpers ────────────────────────────────────────────
|
||||
|
||||
function isForgeRepo(basePath: string): boolean {
|
||||
try {
|
||||
const pkgPath = join(basePath, "package.json");
|
||||
if (!existsSync(pkgPath)) return false;
|
||||
const pkg = JSON.parse(readFileSync(pkgPath, "utf-8"));
|
||||
return pkg?.name === "singularity-forge";
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
function getCurrentSfVersion(): string {
|
||||
if (process.env.SF_VERSION && process.env.SF_VERSION !== "0.0.0") {
|
||||
return process.env.SF_VERSION;
|
||||
}
|
||||
return "unknown";
|
||||
}
|
||||
|
||||
function deriveBlocking(severity: SelfFeedbackSeverity): boolean {
|
||||
return severity === "high" || severity === "critical";
|
||||
}
|
||||
|
||||
function newId(): string {
|
||||
const ts = Date.now().toString(36);
|
||||
const rnd = Math.random().toString(36).slice(2, 8);
|
||||
return `sf-${ts}-${rnd}`;
|
||||
}
|
||||
|
||||
// ─── Path helpers ──────────────────────────────────────────────────────────
|
||||
|
||||
function projectJsonlPath(basePath: string): string {
|
||||
return join(basePath, ".sf", "self-feedback.jsonl");
|
||||
}
|
||||
|
||||
function projectMarkdownPath(basePath: string): string {
|
||||
return join(basePath, ".sf", "BACKLOG.md");
|
||||
}
|
||||
|
||||
function ensureDir(path: string): void {
|
||||
const dir = dirname(path);
|
||||
if (!existsSync(dir)) mkdirSync(dir, { recursive: true });
|
||||
}
|
||||
|
||||
// ─── Writers ───────────────────────────────────────────────────────────────
|
||||
|
||||
function appendJsonl(path: string, entry: PersistedSelfFeedbackEntry): void {
|
||||
ensureDir(path);
|
||||
appendFileSync(path, `${JSON.stringify(entry)}\n`, "utf-8");
|
||||
}
|
||||
|
||||
function appendBacklogRow(
|
||||
basePath: string,
|
||||
entry: PersistedSelfFeedbackEntry,
|
||||
): void {
|
||||
const path = projectMarkdownPath(basePath);
|
||||
ensureDir(path);
|
||||
if (!existsSync(path)) writeFileSync(path, BACKLOG_HEADER, "utf-8");
|
||||
const unit = formatUnitCell(entry.occurredIn);
|
||||
const summary = escapeCell(entry.summary);
|
||||
const blocking = entry.blocking ? "yes" : "no";
|
||||
const row =
|
||||
`| ${entry.ts} | ${entry.kind} | ${entry.severity} | ${blocking} | ${entry.sfVersion} | ${unit} | ${summary} |\n`;
|
||||
appendFileSync(path, row, "utf-8");
|
||||
if (entry.evidence || entry.suggestedFix) {
|
||||
const detail =
|
||||
`\n<details><summary>${entry.id} — ${entry.kind}</summary>\n\n` +
|
||||
(entry.evidence
|
||||
? `**Evidence:**\n\n\`\`\`\n${entry.evidence}\n\`\`\`\n\n`
|
||||
: "") +
|
||||
(entry.suggestedFix
|
||||
? `**Suggested fix:** ${entry.suggestedFix}\n\n`
|
||||
: "") +
|
||||
`</details>\n`;
|
||||
appendFileSync(path, detail, "utf-8");
|
||||
}
|
||||
}
|
||||
|
||||
function formatUnitCell(occurred?: SelfFeedbackOccurredIn): string {
|
||||
if (!occurred) return "—";
|
||||
const parts = [occurred.milestone, occurred.slice, occurred.task].filter(
|
||||
Boolean,
|
||||
);
|
||||
if (parts.length === 0) return occurred.unitType ?? "—";
|
||||
const path = parts.join("/");
|
||||
return occurred.unitType ? `${occurred.unitType} ${path}` : path;
|
||||
}
|
||||
|
||||
function escapeCell(text: string): string {
|
||||
return text.replace(/\|/g, "\\|").replace(/\n/g, " ").trim();
|
||||
}
|
||||
|
||||
// ─── Auto-fill from active unit ───────────────────────────────────────────
|
||||
|
||||
function readActiveUnit(basePath: string): SelfFeedbackOccurredIn | undefined {
|
||||
try {
|
||||
const lockPath = join(basePath, ".sf", "auto.lock");
|
||||
if (!existsSync(lockPath)) return undefined;
|
||||
const lock = JSON.parse(readFileSync(lockPath, "utf-8"));
|
||||
const id: string | undefined = lock?.unitId;
|
||||
if (!id) return { unitType: lock?.unitType };
|
||||
const [milestone, slice, task] = id.split("/");
|
||||
return { milestone, slice, task, unitType: lock?.unitType };
|
||||
} catch {
|
||||
return undefined;
|
||||
}
|
||||
}
|
||||
|
||||
// ─── Public API ────────────────────────────────────────────────────────────
|
||||
|
||||
/**
|
||||
* Record a self-feedback entry. Non-fatal — write errors are swallowed and
|
||||
* a null result is returned so callers can branch on success without try/catch.
|
||||
*/
|
||||
export function recordSelfFeedback(
|
||||
entry: SelfFeedbackEntry,
|
||||
basePath: string = process.cwd(),
|
||||
): RecordResult | null {
|
||||
try {
|
||||
const occurredIn = entry.occurredIn ?? readActiveUnit(basePath);
|
||||
const persisted: PersistedSelfFeedbackEntry = {
|
||||
...entry,
|
||||
occurredIn,
|
||||
id: newId(),
|
||||
ts: new Date().toISOString(),
|
||||
basePath,
|
||||
repoIdentity: isForgeRepo(basePath) ? "forge" : "external",
|
||||
sfVersion: getCurrentSfVersion(),
|
||||
blocking: deriveBlocking(entry.severity),
|
||||
};
|
||||
if (persisted.repoIdentity === "forge") {
|
||||
appendJsonl(projectJsonlPath(basePath), persisted);
|
||||
appendBacklogRow(basePath, persisted);
|
||||
} else {
|
||||
appendJsonl(UPSTREAM_LOG, persisted);
|
||||
}
|
||||
return { entry: persisted, blocking: persisted.blocking };
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Read all entries from the appropriate channel for `basePath`.
|
||||
* Reads only the jsonl source-of-truth; the markdown is purely human-facing.
|
||||
*/
|
||||
export function readAllSelfFeedback(
|
||||
basePath: string = process.cwd(),
|
||||
): PersistedSelfFeedbackEntry[] {
|
||||
const path = isForgeRepo(basePath)
|
||||
? projectJsonlPath(basePath)
|
||||
: UPSTREAM_LOG;
|
||||
try {
|
||||
if (!existsSync(path)) return [];
|
||||
const out: PersistedSelfFeedbackEntry[] = [];
|
||||
for (const line of readFileSync(path, "utf-8").split("\n")) {
|
||||
if (!line.trim()) continue;
|
||||
try {
|
||||
out.push(JSON.parse(line) as PersistedSelfFeedbackEntry);
|
||||
} catch {
|
||||
/* skip malformed lines */
|
||||
}
|
||||
}
|
||||
return out;
|
||||
} catch {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Return blocking entries that have not been resolved.
|
||||
* Used by auto's session-start hook to drive retry-on-bump.
|
||||
*/
|
||||
export function getBlockedEntries(
|
||||
basePath: string = process.cwd(),
|
||||
): PersistedSelfFeedbackEntry[] {
|
||||
return readAllSelfFeedback(basePath).filter(
|
||||
(e) => e.blocking && !e.resolvedAt,
|
||||
);
|
||||
}
|
||||
|
||||
export interface ResolutionInput {
|
||||
reason: string;
|
||||
evidence: ResolutionEvidence;
|
||||
criteriaMet?: string[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Mark an entry as resolved. Rewrites the jsonl source-of-truth in place
|
||||
* (entries are append-only otherwise; resolution is the one mutation we
|
||||
* support so blocking entries don't trigger re-queue forever).
|
||||
*
|
||||
* Resolution requires structured `evidence` so the fix is traceable:
|
||||
* - `agent-fix` should cite a commit SHA or test path
|
||||
* - `auto-version-bump` is for session_start's automatic resolver
|
||||
* - `human-clear` is the catch-all for operator interventions
|
||||
* - `promoted-to-requirement` is for the threshold-promotion sweeper
|
||||
*
|
||||
* If the entry has `acceptanceCriteria`, callers SHOULD pass `criteriaMet`
|
||||
* naming which criteria were satisfied. (Not enforced — entries without
|
||||
* acceptanceCriteria are common during the bootstrap of this channel.)
|
||||
*
|
||||
* The corresponding BACKLOG.md row is *not* mutated — markdown is human-
|
||||
* authored space; humans can strike-through resolved rows or trim them.
|
||||
*/
|
||||
export function markResolved(
|
||||
entryId: string,
|
||||
resolution: ResolutionInput,
|
||||
basePath: string = process.cwd(),
|
||||
): boolean {
|
||||
const path = isForgeRepo(basePath)
|
||||
? projectJsonlPath(basePath)
|
||||
: UPSTREAM_LOG;
|
||||
try {
|
||||
if (!existsSync(path)) return false;
|
||||
const lines = readFileSync(path, "utf-8").split("\n");
|
||||
const out: string[] = [];
|
||||
let mutated = false;
|
||||
for (const line of lines) {
|
||||
if (!line.trim()) {
|
||||
out.push(line);
|
||||
continue;
|
||||
}
|
||||
try {
|
||||
const e = JSON.parse(line) as PersistedSelfFeedbackEntry;
|
||||
if (e.id === entryId && !e.resolvedAt) {
|
||||
e.resolvedAt = new Date().toISOString();
|
||||
e.resolvedReason = resolution.reason;
|
||||
e.resolvedBySfVersion = getCurrentSfVersion();
|
||||
e.resolvedEvidence = resolution.evidence;
|
||||
if (resolution.criteriaMet) {
|
||||
e.resolvedCriteriaMet = resolution.criteriaMet;
|
||||
}
|
||||
mutated = true;
|
||||
out.push(JSON.stringify(e));
|
||||
} else {
|
||||
out.push(line);
|
||||
}
|
||||
} catch {
|
||||
out.push(line);
|
||||
}
|
||||
}
|
||||
if (mutated) {
|
||||
writeFileSync(path, out.join("\n"), "utf-8");
|
||||
}
|
||||
return mutated;
|
||||
} catch {
|
||||
return false;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Compare two semver strings. Returns positive if a > b, 0 if equal, negative
|
||||
* if a < b. Tolerant of pre-release / non-numeric segments by falling back
|
||||
* to lexicographic compare for those.
|
||||
*/
|
||||
function compareSemver(a: string, b: string): number {
|
||||
const partsA = a.split(".");
|
||||
const partsB = b.split(".");
|
||||
for (let i = 0; i < Math.max(partsA.length, partsB.length); i++) {
|
||||
const pa = partsA[i] ?? "0";
|
||||
const pb = partsB[i] ?? "0";
|
||||
const na = Number.parseInt(pa, 10);
|
||||
const nb = Number.parseInt(pb, 10);
|
||||
if (Number.isNaN(na) || Number.isNaN(nb)) {
|
||||
if (pa === pb) continue;
|
||||
return pa < pb ? -1 : 1;
|
||||
}
|
||||
if (na !== nb) return na - nb;
|
||||
}
|
||||
return 0;
|
||||
}
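A few worked comparisons, assuming the semantics above (the helper is module-private, so these calls only make sense inside this file):

```ts
// Numeric segments compare numerically; missing segments default to "0";
// non-numeric segments fall back to plain string comparison.
compareSemver("1.10.0", "1.9.2"); // > 0, since 10 > 9 numerically
compareSemver("1.4", "1.4.0"); // 0, missing patch defaults to "0"
compareSemver("1.4.beta", "1.4.alpha"); // > 0, lexicographic fallback on "beta" vs "alpha"
```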
|
||||
|
||||
/**
|
||||
* For each blocked entry, decide whether the running sf is newer than the
|
||||
* one that recorded the entry. If yes, the dispatcher should retry the
|
||||
* entry's originating unit; if no, leave it blocked.
|
||||
*
|
||||
* Returns the entries split by retry-eligibility for the dispatcher to act on.
|
||||
*/
|
||||
export interface BlockedTriage {
|
||||
retry: PersistedSelfFeedbackEntry[];
|
||||
stillBlocked: PersistedSelfFeedbackEntry[];
|
||||
}
|
||||
|
||||
export function triageBlockedEntries(
|
||||
basePath: string = process.cwd(),
|
||||
): BlockedTriage {
|
||||
const current = getCurrentSfVersion();
|
||||
const retry: PersistedSelfFeedbackEntry[] = [];
|
||||
const stillBlocked: PersistedSelfFeedbackEntry[] = [];
|
||||
for (const e of getBlockedEntries(basePath)) {
|
||||
if (current === "unknown" || e.sfVersion === "unknown") {
|
||||
stillBlocked.push(e);
|
||||
continue;
|
||||
}
|
||||
if (compareSemver(current, e.sfVersion) > 0) {
|
||||
retry.push(e);
|
||||
} else {
|
||||
stillBlocked.push(e);
|
||||
}
|
||||
}
|
||||
return { retry, stillBlocked };
|
||||
}
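On the resolving side, the shape sketched in the `markResolved` doc comment looks roughly like this when an agent lands a fix for a blocking entry (the id, SHA, and test path here are invented for illustration):

```ts
// Hypothetical resolver call: structured evidence plus the acceptance criteria met.
markResolved(
  "sf-lx3k9a-4fq2z1",
  {
    reason: "plan-quality now rejects bare grep -c predicates",
    evidence: {
      kind: "agent-fix",
      commitSha: "3f9c2ab",
      testPath: "src/resources/extensions/sf/tests/plan-quality.test.ts",
    },
    criteriaMet: ["1", "2"],
  },
  process.cwd(),
);
```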
|
||||
|
|
@ -9,8 +9,8 @@ import {
|
|||
clearRunawayGuardState,
|
||||
collectWorktreeFingerprint,
|
||||
evaluateRunawayGuard,
|
||||
resetRunawayGuardState,
|
||||
type RunawayGuardConfig,
|
||||
resetRunawayGuardState,
|
||||
} from "../auto-runaway-guard.ts";
|
||||
|
||||
function config(overrides: Partial<RunawayGuardConfig> = {}): RunawayGuardConfig {
|
||||
|
|
@ -106,6 +106,19 @@ test("runaway guard pauses after diagnostic turns when budget keeps growing", ()
|
|||
assert.equal(paused.action, "pause");
|
||||
if (paused.action !== "pause") throw new Error("expected pause");
|
||||
assert.match(paused.reason, /budget kept growing/);
|
||||
assert.equal(paused.metadata.reason, paused.reason);
|
||||
assert.equal(paused.metadata.unitType, "research-slice");
|
||||
assert.equal(paused.metadata.unitId, "S04");
|
||||
assert.equal(paused.metadata.diagnosticTurns, 2);
|
||||
assert.equal(paused.metadata.warningsSent, 2);
|
||||
assert.equal(paused.metadata.metrics.toolCalls, 100);
|
||||
assert.equal(paused.metadata.lastWarningMetrics.toolCalls, 80);
|
||||
assert.deepEqual(paused.metadata.thresholds.toolCallWarning, 60);
|
||||
assert.ok(
|
||||
paused.metadata.thresholdReasons.some((reason) =>
|
||||
reason.includes("tool calls"),
|
||||
),
|
||||
);
|
||||
});
|
||||
|
||||
test("runaway guard catches changed-file churn relative to unit start", () => {
|
||||
|
|
|
|||
|
|
@ -5,6 +5,7 @@ import * as fs from "node:fs";
|
|||
import * as os from "node:os";
|
||||
import * as path from "node:path";
|
||||
import { afterEach, beforeEach, describe, test } from "node:test";
|
||||
import { parse as parseYaml } from "yaml";
|
||||
import {
|
||||
closeDatabase,
|
||||
insertMilestone,
|
||||
|
|
@ -280,4 +281,67 @@ describe("handleCompleteSlice with coerced string arrays (#3565)", () => {
|
|||
assert.match(summary, /Tests pass/);
|
||||
}
|
||||
});
|
||||
|
||||
test("handler rejects unsafe path segments before writing artifacts", async () => {
|
||||
const params = makeValidSliceParams();
|
||||
params.sliceId = "../outside";
|
||||
|
||||
const result = await handleCompleteSlice(params, basePath);
|
||||
assert.ok("error" in result, "unsafe sliceId should be rejected");
|
||||
if ("error" in result) {
|
||||
assert.match(result.error, /safe path segment/);
|
||||
}
|
||||
assert.equal(
|
||||
fs.existsSync(path.join(basePath, ".sf", "milestones", "M001", "outside")),
|
||||
false,
|
||||
"rejected unsafe ID should not write outside slice path",
|
||||
);
|
||||
});
|
||||
|
||||
test("handler rejects empty required content", async () => {
|
||||
const params = makeValidSliceParams();
|
||||
params.uatContent = " ";
|
||||
|
||||
const result = await handleCompleteSlice(params, basePath);
|
||||
assert.ok("error" in result, "empty UAT content should be rejected");
|
||||
if ("error" in result) {
|
||||
assert.match(result.error, /uatContent must be a non-empty string/);
|
||||
}
|
||||
});
|
||||
|
||||
test("handler rejects leaked JSON field syntax in markdown fields", async () => {
|
||||
const params = makeValidSliceParams();
|
||||
params.narrative = '"verification": "pretend pass"';
|
||||
|
||||
const result = await handleCompleteSlice(params, basePath);
|
||||
assert.ok("error" in result, "leaked JSON fields should be rejected");
|
||||
if ("error" in result) {
|
||||
assert.match(result.error, /leaked JSON fields/);
|
||||
}
|
||||
});
|
||||
|
||||
test("handler renders YAML-hostile frontmatter values as parseable YAML", async () => {
|
||||
const params = makeValidSliceParams();
|
||||
params.keyFiles = ["src/foo:bar.ts", "---", "path # comment"];
|
||||
params.provides = ["api: v1", "- not a nested list"];
|
||||
params.requires = [{ slice: "S00", provides: "base: infra" }];
|
||||
|
||||
const result = await handleCompleteSlice(params, basePath);
|
||||
assert.ok(!("error" in result), "handler should succeed");
|
||||
if ("error" in result) return;
|
||||
|
||||
const summary = fs.readFileSync(result.summaryPath, "utf-8");
|
||||
const frontmatter = summary.match(/^---\n([\s\S]*?)\n---\n/);
|
||||
assert.ok(frontmatter, "summary should include YAML frontmatter");
|
||||
const parsed = parseYaml(frontmatter[1]) as Record<string, unknown>;
|
||||
assert.deepEqual(parsed["key_files"], [
|
||||
"src/foo:bar.ts",
|
||||
"---",
|
||||
"path # comment",
|
||||
]);
|
||||
assert.deepEqual(parsed["provides"], ["api: v1", "- not a nested list"]);
|
||||
assert.deepEqual(parsed["requires"], [
|
||||
{ slice: "S00", provides: "base: infra" },
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -144,7 +144,7 @@ console.log("\n=== complete-task: schema v5 migration ===");
|
|||
const versionRow = adapter
|
||||
.prepare("SELECT MAX(version) as v FROM schema_version")
|
||||
.get();
|
||||
assertEq(versionRow?.["v"], 20, "schema version should be 20");
|
||||
assertEq(versionRow?.["v"], 21, "schema version should be 21");
|
||||
|
||||
// Verify all 4 new tables exist
|
||||
const tables = adapter
|
||||
|
|
@ -900,6 +900,81 @@ console.log(
|
|||
cleanup(dbPath);
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
// complete-task: semantic validation hardening
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
console.log("\n=== complete-task: semantic validation hardening ===");
|
||||
{
|
||||
const dbPath = tempDbPath();
|
||||
openDatabase(dbPath);
|
||||
const { basePath } = createTempProject();
|
||||
|
||||
insertMilestone({ id: "M001", title: "Test Milestone" });
|
||||
insertSlice({ id: "S01", milestoneId: "M001", title: "Test Slice" });
|
||||
|
||||
const traversal = await handleCompleteTask(
|
||||
{ ...makeValidParams(), taskId: "../outside" },
|
||||
basePath,
|
||||
);
|
||||
assertTrue("error" in traversal, "path-traversal taskId should be rejected");
|
||||
if ("error" in traversal) {
|
||||
assertMatch(traversal.error, /safe path segment/, "safe segment error");
|
||||
}
|
||||
assertTrue(
|
||||
!fs.existsSync(
|
||||
path.join(
|
||||
basePath,
|
||||
".sf",
|
||||
"milestones",
|
||||
"M001",
|
||||
"slices",
|
||||
"S01",
|
||||
"outside-SUMMARY.md",
|
||||
),
|
||||
),
|
||||
"rejected traversal should not write summary",
|
||||
);
|
||||
|
||||
const emptyVerification = await handleCompleteTask(
|
||||
{ ...makeValidParams(), verification: " " },
|
||||
basePath,
|
||||
);
|
||||
assertTrue(
|
||||
"error" in emptyVerification,
|
||||
"empty required verification should be rejected",
|
||||
);
|
||||
if ("error" in emptyVerification) {
|
||||
assertMatch(
|
||||
emptyVerification.error,
|
||||
/verification must be a non-empty string/,
|
||||
"empty verification error",
|
||||
);
|
||||
}
|
||||
|
||||
const leakedJson = await handleCompleteTask(
|
||||
{ ...makeValidParams(), narrative: '"verification": "fake"' },
|
||||
basePath,
|
||||
);
|
||||
assertTrue("error" in leakedJson, "leaked JSON fields should be rejected");
|
||||
if ("error" in leakedJson) {
|
||||
assertMatch(leakedJson.error, /leaked JSON fields/, "leaked field error");
|
||||
}
|
||||
|
||||
const escapedNewline = await handleCompleteTask(
|
||||
{ ...makeValidParams(), narrative: "Line one\\nLine two" },
|
||||
basePath,
|
||||
);
|
||||
assertTrue(!("error" in escapedNewline), "escaped newlines should normalize");
|
||||
if (!("error" in escapedNewline)) {
|
||||
const summary = fs.readFileSync(escapedNewline.summaryPath, "utf-8");
|
||||
assertMatch(summary, /Line one\nLine two/, "escaped newline normalized");
|
||||
}
|
||||
|
||||
cleanupDir(basePath);
|
||||
cleanup(dbPath);
|
||||
}
|
||||
|
||||
// ═══════════════════════════════════════════════════════════════════════════
|
||||
|
||||
report();
|
||||
|
|
|
|||
|
|
@ -101,3 +101,59 @@ test("validateContent still warns for zero-slice roadmaps", () => {
|
|||
);
|
||||
});
|
||||
});
|
||||
|
||||
test("validateContent reports markdownlint structural warnings", () => {
|
||||
withRoadmap(
|
||||
[
|
||||
"# M001: Test",
|
||||
"### Skipped Heading Level",
|
||||
"Text.",
|
||||
"",
|
||||
"## Slice Overview",
|
||||
"| ID | Slice | Risk | Depends | Done | After this |",
|
||||
"|----|-------|------|---------|------|------------|",
|
||||
"| S01 | First slice | low | - | [ ] | a real slice exists. |",
|
||||
"",
|
||||
].join("\n"),
|
||||
(path) => {
|
||||
const violations = validateContent("plan-milestone", path);
|
||||
assert.ok(
|
||||
violations.some((v) =>
|
||||
v.reason.includes("Markdown lint MD001/heading-increment"),
|
||||
),
|
||||
);
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
test("validateContent reports literal escaped newline corruption", () => {
|
||||
withRoadmap(
|
||||
roadmapWithSlices([
|
||||
"| S01 | First slice | low | - | [ ] | line one\\nline two |",
|
||||
]),
|
||||
(path) => {
|
||||
const violations = validateContent("plan-milestone", path);
|
||||
assert.ok(
|
||||
violations.some((v) =>
|
||||
v.reason.includes("literal escaped newline text"),
|
||||
),
|
||||
);
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
test("validateContent reports leaked JSON field syntax in artifacts", () => {
|
||||
withRoadmap(
|
||||
roadmapWithSlices([
|
||||
'| S01 | First slice | low | - | [ ] | proof", "integrationClosure": "bad" |',
|
||||
]),
|
||||
(path) => {
|
||||
const violations = validateContent("plan-milestone", path);
|
||||
assert.ok(
|
||||
violations.some((v) =>
|
||||
v.reason.includes("leaked JSON field syntax"),
|
||||
),
|
||||
);
|
||||
},
|
||||
);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -109,6 +109,20 @@ function createExecutingFixture(base: string): void {
|
|||
**Goal:** Implement something.
|
||||
**Demo:** It works.
|
||||
|
||||
## Adversarial Review
|
||||
|
||||
### Partner Review
|
||||
|
||||
The query fixture has enough planning context to expose the active task.
|
||||
|
||||
### Combatant Review
|
||||
|
||||
Without review content the state machine should keep the slice in planning.
|
||||
|
||||
### Architect Review
|
||||
|
||||
The fixture mirrors the persisted slice-plan contract used by normal planning.
|
||||
|
||||
## Tasks
|
||||
|
||||
- [ ] **T01: First Task** — Do the first thing
|
||||
|
|
@ -141,6 +155,7 @@ describe("headless query", () => {
|
|||
const snap = result.data as QuerySnapshot;
|
||||
|
||||
assert.equal(result.exitCode, 0);
|
||||
assert.equal(snap.schemaVersion, 1);
|
||||
// state
|
||||
assert.equal(snap.state.phase, "executing");
|
||||
assert.equal(snap.state.activeMilestone!.id, "M001");
|
||||
|
|
|
|||
|
|
@ -251,6 +251,62 @@ test("handlePlanMilestone rejects invalid payloads", async () => {
|
|||
}
|
||||
});
|
||||
|
||||
test("handlePlanMilestone rejects leaked JSON fields in nested planning text", async () => {
|
||||
const base = makeTmpBase();
|
||||
const dbPath = join(base, ".sf", "sf.db");
|
||||
openDatabase(dbPath);
|
||||
|
||||
try {
|
||||
const params = validParams();
|
||||
const result = await handlePlanMilestone(
|
||||
{
|
||||
...params,
|
||||
slices: [
|
||||
{
|
||||
...params.slices![0]!,
|
||||
proofLevel:
|
||||
'Contract proof.", "integrationClosure": "leaked field"',
|
||||
},
|
||||
],
|
||||
},
|
||||
base,
|
||||
);
|
||||
assert.ok("error" in result);
|
||||
assert.match(
|
||||
result.error,
|
||||
/validation failed: slices\[0\]\.proofLevel appears to contain leaked JSON fields/,
|
||||
);
|
||||
} finally {
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("handlePlanMilestone normalizes escaped newlines in planning arrays", async () => {
|
||||
const base = makeTmpBase();
|
||||
const dbPath = join(base, ".sf", "sf.db");
|
||||
openDatabase(dbPath);
|
||||
|
||||
try {
|
||||
const result = await handlePlanMilestone(
|
||||
{
|
||||
...validParams(),
|
||||
successCriteria: ["First criterion\\nSecond criterion"],
|
||||
},
|
||||
base,
|
||||
);
|
||||
assert.ok(
|
||||
!("error" in result),
|
||||
`unexpected error: ${"error" in result ? result.error : ""}`,
|
||||
);
|
||||
const milestone = getMilestone("M001");
|
||||
assert.deepEqual(milestone?.success_criteria, [
|
||||
"First criterion\nSecond criterion",
|
||||
]);
|
||||
} finally {
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("handlePlanMilestone scaffolds common milestone slices from templateId", async () => {
|
||||
const base = makeTmpBase();
|
||||
const dbPath = join(base, ".sf", "sf.db");
|
||||
|
|
|
|||
|
|
@ -268,6 +268,60 @@ test("handlePlanSlice rejects invalid payloads", async () => {
|
|||
}
|
||||
});
|
||||
|
||||
test("handlePlanSlice normalizes escaped newlines before rendering", async () => {
|
||||
const base = makeTmpBase();
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
|
||||
try {
|
||||
seedParentSlice();
|
||||
const result = await handlePlanSlice(
|
||||
{
|
||||
...validParams(),
|
||||
successCriteria: "First criterion\\nSecond criterion",
|
||||
},
|
||||
base,
|
||||
);
|
||||
assert.ok(
|
||||
!("error" in result),
|
||||
`unexpected error: ${"error" in result ? result.error : ""}`,
|
||||
);
|
||||
|
||||
const renderedPlan = readFileSync(
|
||||
join(base, ".sf", "milestones", "M001", "slices", "S02", "S02-PLAN.md"),
|
||||
"utf-8",
|
||||
);
|
||||
assert.match(renderedPlan, /- First criterion\n- Second criterion/);
|
||||
assert.doesNotMatch(renderedPlan, /\\n/);
|
||||
} finally {
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("handlePlanSlice rejects leaked JSON fields inside markdown text fields", async () => {
|
||||
const base = makeTmpBase();
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
|
||||
try {
|
||||
seedParentSlice();
|
||||
const result = await handlePlanSlice(
|
||||
{
|
||||
...validParams(),
|
||||
proofLevel:
|
||||
'Contract test passes.", "integrationClosure": "This belongs in another field"',
|
||||
},
|
||||
base,
|
||||
);
|
||||
|
||||
assert.ok("error" in result);
|
||||
assert.match(
|
||||
result.error,
|
||||
/validation failed: proofLevel appears to contain leaked JSON fields/,
|
||||
);
|
||||
} finally {
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("handlePlanSlice rejects missing, null, and empty planningMeeting with explicit guidance", async () => {
|
||||
const base = makeTmpBase();
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
|
|
|
|||
|
|
@ -130,6 +130,67 @@ test("handlePlanTask rejects invalid payloads", async () => {
|
|||
}
|
||||
});
|
||||
|
||||
test("handlePlanTask rejects leaked JSON fields in markdown text", async () => {
|
||||
const base = makeTmpBase();
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
|
||||
try {
|
||||
seedParent();
|
||||
const result = await handlePlanTask(
|
||||
{
|
||||
...validParams(),
|
||||
description:
|
||||
'Implement this.", "verify": "this belongs in another field"',
|
||||
},
|
||||
base,
|
||||
);
|
||||
assert.ok("error" in result);
|
||||
assert.match(
|
||||
result.error,
|
||||
/validation failed: description appears to contain leaked JSON fields/,
|
||||
);
|
||||
} finally {
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("handlePlanTask normalizes escaped newlines in markdown text", async () => {
|
||||
const base = makeTmpBase();
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
|
||||
try {
|
||||
seedParent();
|
||||
const result = await handlePlanTask(
|
||||
{
|
||||
...validParams(),
|
||||
description: "First step\\nSecond step",
|
||||
},
|
||||
base,
|
||||
);
|
||||
assert.ok(
|
||||
!("error" in result),
|
||||
`unexpected error: ${"error" in result ? result.error : ""}`,
|
||||
);
|
||||
const taskPlan = readFileSync(
|
||||
join(
|
||||
base,
|
||||
".sf",
|
||||
"milestones",
|
||||
"M001",
|
||||
"slices",
|
||||
"S02",
|
||||
"tasks",
|
||||
"T02-PLAN.md",
|
||||
),
|
||||
"utf-8",
|
||||
);
|
||||
assert.match(taskPlan, /First step\nSecond step/);
|
||||
assert.doesNotMatch(taskPlan, /\\n/);
|
||||
} finally {
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("handlePlanTask rejects missing parent slice", async () => {
|
||||
const base = makeTmpBase();
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
|
|
|
|||
|
|
@ -344,6 +344,18 @@ test("plan-milestone prompt references DB-backed planning tool and explicitly fo
|
|||
);
|
||||
});
|
||||
|
||||
test("plan-milestone prompts require a real subagent vision meeting", () => {
|
||||
for (const name of ["plan-milestone", "guided-plan-milestone"] as const) {
|
||||
const prompt = readPrompt(name);
|
||||
assert.match(prompt, /Vision Alignment Meeting/);
|
||||
assert.match(prompt, /subagent/);
|
||||
assert.match(prompt, /mode: "debate"/);
|
||||
assert.match(prompt, /rounds: 2/);
|
||||
assert.match(prompt, /Do \*\*not\*\* merely simulate every participant/i);
|
||||
assert.match(prompt, /degraded fallback/i);
|
||||
}
|
||||
});
|
||||
|
||||
test("guided-plan-milestone prompt references DB-backed planning tool and explicitly forbids manual roadmap writes", () => {
|
||||
const prompt = readPrompt("guided-plan-milestone");
|
||||
assert.match(prompt, /sf_plan_milestone/);
|
||||
|
|
|
|||
|
|
@ -144,6 +144,58 @@ test("provider_model_allow: OpenCode defaults to free models while OpenCode Go s
|
|||
);
|
||||
});
|
||||
|
||||
test("provider_model_allow: hides Xiaomi token-plan regional aliases", () => {
|
||||
const models = [
|
||||
{ provider: "xiaomi", id: "mimo-v2-pro" },
|
||||
{ provider: "xiaomi-token-plan-ams", id: "mimo-v2-pro" },
|
||||
{ provider: "xiaomi-token-plan-cn", id: "mimo-v2-pro" },
|
||||
{ provider: "xiaomi-token-plan-sgp", id: "mimo-v2-pro" },
|
||||
{ provider: "opencode-go", id: "mimo-v2-pro" },
|
||||
];
|
||||
|
||||
const filtered = filterModelsByProviderModelAllow(models, undefined);
|
||||
|
||||
assert.deepEqual(
|
||||
filtered.map((m) => `${m.provider}/${m.id}`),
|
||||
["xiaomi/mimo-v2-pro", "opencode-go/mimo-v2-pro"],
|
||||
);
|
||||
});
|
||||
|
||||
test("provider_model_allow: hides Claude Code from normal selection", () => {
|
||||
const models = [
|
||||
{ provider: "claude-code", id: "sonnet" },
|
||||
{ provider: "kimi-coding", id: "kimi-for-coding" },
|
||||
];
|
||||
|
||||
const filtered = filterModelsByProviderModelAllow(models, undefined);
|
||||
|
||||
assert.deepEqual(
|
||||
filtered.map((m) => `${m.provider}/${m.id}`),
|
||||
["kimi-coding/kimi-for-coding"],
|
||||
);
|
||||
});
|
||||
|
||||
test("provider_model_allow: hides Mistral non-selection endpoints", () => {
|
||||
const models = [
|
||||
{ provider: "mistral", id: "mistral-large-latest" },
|
||||
{ provider: "mistral", id: "codestral-latest" },
|
||||
{ provider: "mistral", id: "mistral-embed" },
|
||||
{ provider: "mistral", id: "mistral-ocr-latest" },
|
||||
{ provider: "mistral", id: "voxtral-mini-tts-latest" },
|
||||
{ provider: "mistral", id: "ft:codestral-latest:abc" },
|
||||
];
|
||||
|
||||
const filtered = filterModelsByProviderModelAllow(models, undefined);
|
||||
|
||||
assert.deepEqual(
|
||||
filtered.map((m) => `${m.provider}/${m.id}`),
|
||||
[
|
||||
"mistral/mistral-large-latest",
|
||||
"mistral/codestral-latest",
|
||||
],
|
||||
);
|
||||
});
|
||||
|
||||
test("provider_model_block: blocks matching models even when provider is otherwise unrestricted", () => {
|
||||
const models = [
|
||||
{ provider: "minimax", id: "MiniMax-M2.7" },
|
||||
|
|
|
|||
|
|
@ -64,6 +64,60 @@ console.log("\n=== runtime record write/read/update ===");
|
|||
"wrapup-warning-sent",
|
||||
"updated phase readable",
|
||||
);
|
||||
|
||||
const paused = writeUnitRuntimeRecord(
|
||||
base,
|
||||
"execute-task",
|
||||
"M100/S02/T09",
|
||||
1000,
|
||||
{
|
||||
phase: "paused",
|
||||
runawayGuardPause: {
|
||||
reason: "Runaway guard paused execute-task M100/S02/T09",
|
||||
pausedAt: 3000,
|
||||
unitType: "execute-task",
|
||||
unitId: "M100/S02/T09",
|
||||
diagnosticTurns: 2,
|
||||
warningsSent: 2,
|
||||
thresholdReasons: ["80 tool calls (warning 60)"],
|
||||
metrics: {
|
||||
toolCalls: 100,
|
||||
sessionTokens: 1_000_000,
|
||||
elapsedMs: 240_000,
|
||||
changedFiles: 0,
|
||||
worktreeChangedSinceStart: false,
|
||||
topTools: { read: 70, bash: 30 },
|
||||
},
|
||||
lastWarningMetrics: {
|
||||
toolCalls: 80,
|
||||
sessionTokens: 750_000,
|
||||
elapsedMs: 180_000,
|
||||
},
|
||||
thresholds: {
|
||||
toolCallWarning: 60,
|
||||
tokenWarning: 1_000_000,
|
||||
elapsedMs: 1_200_000,
|
||||
changedFilesWarning: 75,
|
||||
minIntervalMs: 120_000,
|
||||
},
|
||||
},
|
||||
},
|
||||
);
|
||||
assert.equal(
|
||||
paused.runawayGuardPause?.metrics.toolCalls,
|
||||
100,
|
||||
"runaway metadata written",
|
||||
);
|
||||
const loadedPaused = readUnitRuntimeRecord(
|
||||
base,
|
||||
"execute-task",
|
||||
"M100/S02/T09",
|
||||
);
|
||||
assert.equal(
|
||||
loadedPaused?.runawayGuardPause?.reason,
|
||||
"Runaway guard paused execute-task M100/S02/T09",
|
||||
"runaway metadata persisted",
|
||||
);
|
||||
}
|
||||
|
||||
console.log("\n=== execute-task durability inspection ===");
|
||||
|
|
|
|||
|
|
@ -1412,3 +1412,104 @@ describe("verification-gate: integration — gate + evidence JSON", () => {
|
|||
assert.equal(json.auditWarnings[0].fixAvailable, true);
|
||||
});
|
||||
});
|
||||
|
||||
// ─── Real Package.json Script Tests (T03) ───────────────────────────────────
|
||||
|
||||
describe("verification-gate: real package.json scripts", () => {
|
||||
test("discoverCommands finds lint and test from real package.json", () => {
|
||||
const result = discoverCommands({ cwd: process.cwd() });
|
||||
// The real package.json has "lint" and "test" scripts
|
||||
assert.ok(
|
||||
result.commands.includes("npm run lint"),
|
||||
"should discover npm run lint",
|
||||
);
|
||||
assert.ok(
|
||||
result.commands.includes("npm run test"),
|
||||
"should discover npm run test",
|
||||
);
|
||||
assert.equal(result.source, "package-json");
|
||||
});
|
||||
|
||||
test("real package.json gate: lint fails, test times out → gate fails", () => {
|
||||
const result = runVerificationGate({
|
||||
cwd: process.cwd(),
|
||||
commandTimeoutMs: 5_000,
|
||||
});
|
||||
assert.equal(result.discoverySource, "package-json");
|
||||
assert.equal(result.passed, false, "gate should fail because lint fails");
|
||||
assert.equal(result.checks.length, 2, "should run lint and test");
|
||||
|
||||
const lintCheck = result.checks.find((c) => c.command === "npm run lint");
|
||||
const testCheck = result.checks.find((c) => c.command === "npm run test");
|
||||
|
||||
assert.ok(lintCheck, "lint check should exist");
|
||||
assert.ok(testCheck, "test check should exist");
|
||||
assert.notEqual(lintCheck.exitCode, 0, "lint should fail (biome errors exist)");
|
||||
assert.ok(lintCheck.durationMs >= 0, "lint should have duration");
|
||||
// test may time out (exit code non-zero) or exit 127; either is fine
|
||||
assert.notEqual(testCheck.exitCode, 0, "test should fail or time out");
|
||||
assert.ok(testCheck.durationMs >= 0, "test should have duration");
|
||||
});
|
||||
|
||||
test("real typecheck:extensions passes → gate passes", () => {
|
||||
const result = runVerificationGate({
|
||||
cwd: process.cwd(),
|
||||
preferenceCommands: ["npm run typecheck:extensions"],
|
||||
commandTimeoutMs: 30_000,
|
||||
});
|
||||
assert.equal(result.passed, true, "typecheck:extensions should pass");
|
||||
assert.equal(result.checks.length, 1);
|
||||
assert.equal(result.checks[0].command, "npm run typecheck:extensions");
|
||||
assert.equal(result.checks[0].exitCode, 0);
|
||||
assert.ok(result.checks[0].durationMs >= 0);
|
||||
});
|
||||
|
||||
test("real lint fails → gate fails with exit code 1", () => {
|
||||
const result = runVerificationGate({
|
||||
cwd: process.cwd(),
|
||||
preferenceCommands: ["npm run lint"],
|
||||
commandTimeoutMs: 10_000,
|
||||
});
|
||||
assert.equal(result.passed, false, "lint should fail");
|
||||
assert.equal(result.checks.length, 1);
|
||||
assert.equal(result.checks[0].command, "npm run lint");
|
||||
assert.equal(result.checks[0].exitCode, 1, "lint should exit 1");
|
||||
assert.ok(
|
||||
result.checks[0].stdout.includes("Found") ||
|
||||
result.checks[0].stderr.includes("error"),
|
||||
"should have error output",
|
||||
);
|
||||
});
|
||||
|
||||
test("mixed real commands: typecheck passes, lint fails → gate fails", () => {
|
||||
const result = runVerificationGate({
|
||||
cwd: process.cwd(),
|
||||
preferenceCommands: ["npm run typecheck:extensions", "npm run lint"],
|
||||
commandTimeoutMs: 30_000,
|
||||
});
|
||||
assert.equal(result.passed, false, "gate should fail because lint fails");
|
||||
assert.equal(result.checks.length, 2);
|
||||
|
||||
const typeCheck = result.checks.find(
|
||||
(c) => c.command === "npm run typecheck:extensions",
|
||||
);
|
||||
const lintCheck = result.checks.find((c) => c.command === "npm run lint");
|
||||
|
||||
assert.ok(typeCheck, "typecheck check should exist");
|
||||
assert.ok(lintCheck, "lint check should exist");
|
||||
assert.equal(typeCheck.exitCode, 0, "typecheck should pass");
|
||||
assert.equal(lintCheck.exitCode, 1, "lint should fail");
|
||||
});
|
||||
|
||||
test("preference commands override real package.json discovery", () => {
|
||||
const result = runVerificationGate({
|
||||
cwd: process.cwd(),
|
||||
preferenceCommands: ["echo override"],
|
||||
});
|
||||
assert.equal(result.passed, true);
|
||||
assert.equal(result.discoverySource, "preference");
|
||||
assert.deepStrictEqual(result.checks.map((c) => c.command), [
|
||||
"echo override",
|
||||
]);
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -138,6 +138,17 @@ test("workflow-projections: renderPlanContent includes ## Tasks section", () =>
|
|||
assert.ok(content.includes("## Tasks"));
|
||||
});
|
||||
|
||||
test("workflow-projections: renderPlanContent includes top-level verification from task commands", () => {
|
||||
const content = renderPlanContent(makeSlice(), [
|
||||
makeTask({ verify: "npm test src/middleware/auth.test.ts" }),
|
||||
makeTask({ id: "T02", verify: "npm run typecheck" }),
|
||||
]);
|
||||
|
||||
assert.ok(content.includes("## Verification"));
|
||||
assert.ok(content.includes("- npm test src/middleware/auth.test.ts"));
|
||||
assert.ok(content.includes("- npm run typecheck"));
|
||||
});
|
||||
|
||||
test("workflow-projections: renderPlanContent includes adversarial review and planning meeting sections", () => {
|
||||
const slice = makeSlice({
|
||||
adversarial_partner:
|
||||
|
|
|
|||
|
|
@ -29,6 +29,11 @@ import { invalidateStateCache } from "../state.js";
|
|||
import { isClosedStatus } from "../status-guards.js";
|
||||
import type { CompleteSliceParams } from "../types.js";
|
||||
import { checkOwnership, sliceUnitKey } from "../unit-ownership.js";
|
||||
import {
|
||||
normalizePlanningText,
|
||||
normalizeRequiredPlanningText,
|
||||
validateSafePathSegment,
|
||||
} from "../validation.js";
|
||||
import { appendEvent } from "../workflow-events.js";
|
||||
import { logError, logWarning } from "../workflow-logger.js";
|
||||
import { writeManifest } from "../workflow-manifest.js";
|
||||
|
|
@ -51,6 +56,146 @@ function errorMessage(error: unknown): string {
|
|||
return error instanceof Error ? error.message : String(error);
|
||||
}
|
||||
|
||||
function yamlScalar(value: string): string {
|
||||
if (/^[A-Za-z0-9_.-]+$/.test(value)) return value;
|
||||
return JSON.stringify(value);
|
||||
}
|
||||
|
||||
function yamlList(values: string[], empty: string): string {
|
||||
return values.length > 0
|
||||
? values.map((value) => ` - ${yamlScalar(value)}`).join("\n")
|
||||
: empty;
|
||||
}
|
||||
|
||||
function normalizeStringArray(value: unknown, field: string): string[] {
|
||||
const items = Array.isArray(value)
|
||||
? value
|
||||
: typeof value === "string" && value.trim()
|
||||
? value
|
||||
.split(/\n/)
|
||||
.map((line) => line.replace(/^[\s\-*•]+/, "").trim())
|
||||
.filter(Boolean)
|
||||
: [];
|
||||
return items.map((item, index) =>
|
||||
normalizePlanningText(String(item), `${field}[${index}]`),
|
||||
);
|
||||
}
|
||||
|
||||
function normalizeObjectArray<T extends Record<string, string>>(
|
||||
value: unknown,
|
||||
field: string,
|
||||
shape: Record<keyof T, string>,
|
||||
): T[] {
|
||||
if (!Array.isArray(value)) return [];
|
||||
return value.map((item, index) => {
|
||||
if (!item || typeof item !== "object" || Array.isArray(item)) {
|
||||
throw new Error(`${field}[${index}] must be an object`);
|
||||
}
|
||||
const record = item as Record<string, unknown>;
|
||||
const next = {} as T;
|
||||
for (const [key, label] of Object.entries(shape) as Array<
|
||||
[keyof T, string]
|
||||
>) {
|
||||
next[key] = normalizePlanningText(
|
||||
String(record[String(key)] ?? ""),
|
||||
`${field}[${index}].${label}`,
|
||||
) as T[typeof key];
|
||||
}
|
||||
return next as T;
|
||||
});
|
||||
}
|
||||
|
||||
function normalizeCompleteSliceParams(
|
||||
params: CompleteSliceParams,
|
||||
): CompleteSliceParams {
|
||||
return {
|
||||
...params,
|
||||
sliceId: validateSafePathSegment(params.sliceId, "sliceId"),
|
||||
milestoneId: validateSafePathSegment(params.milestoneId, "milestoneId"),
|
||||
sliceTitle: normalizeRequiredPlanningText(params.sliceTitle, "sliceTitle"),
|
||||
oneLiner: normalizeRequiredPlanningText(params.oneLiner, "oneLiner"),
|
||||
narrative: normalizeRequiredPlanningText(params.narrative, "narrative"),
|
||||
verification: normalizeRequiredPlanningText(
|
||||
params.verification,
|
||||
"verification",
|
||||
),
|
||||
uatContent: normalizeRequiredPlanningText(params.uatContent, "uatContent"),
|
||||
keyFiles: normalizeStringArray(params.keyFiles, "keyFiles"),
|
||||
keyDecisions: normalizeStringArray(params.keyDecisions, "keyDecisions"),
|
||||
patternsEstablished: normalizeStringArray(
|
||||
params.patternsEstablished,
|
||||
"patternsEstablished",
|
||||
),
|
||||
observabilitySurfaces: normalizeStringArray(
|
||||
params.observabilitySurfaces,
|
||||
"observabilitySurfaces",
|
||||
),
|
||||
provides: normalizeStringArray(params.provides, "provides"),
|
||||
affects: normalizeStringArray(params.affects, "affects"),
|
||||
drillDownPaths: normalizeStringArray(
|
||||
params.drillDownPaths,
|
||||
"drillDownPaths",
|
||||
),
|
||||
requirementsSurfaced: normalizeStringArray(
|
||||
params.requirementsSurfaced,
|
||||
"requirementsSurfaced",
|
||||
),
|
||||
deviations:
|
||||
params.deviations === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.deviations, "deviations"),
|
||||
knownLimitations:
|
||||
params.knownLimitations === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.knownLimitations, "knownLimitations"),
|
||||
followUps:
|
||||
params.followUps === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.followUps, "followUps"),
|
||||
operationalReadiness:
|
||||
params.operationalReadiness === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(
|
||||
params.operationalReadiness,
|
||||
"operationalReadiness",
|
||||
),
|
||||
requirementsAdvanced: normalizeObjectArray<{ id: string; how: string }>(
|
||||
params.requirementsAdvanced,
|
||||
"requirementsAdvanced",
|
||||
{ id: "id", how: "how" },
|
||||
),
|
||||
requirementsValidated: normalizeObjectArray<{
|
||||
id: string;
|
||||
proof: string;
|
||||
}>(params.requirementsValidated, "requirementsValidated", {
|
||||
id: "id",
|
||||
proof: "proof",
|
||||
}),
|
||||
requirementsInvalidated: normalizeObjectArray<{
|
||||
id: string;
|
||||
what: string;
|
||||
}>(params.requirementsInvalidated, "requirementsInvalidated", {
|
||||
id: "id",
|
||||
what: "what",
|
||||
}),
|
||||
filesModified: normalizeObjectArray<{
|
||||
path: string;
|
||||
description: string;
|
||||
}>(params.filesModified, "filesModified", {
|
||||
path: "path",
|
||||
description: "description",
|
||||
}),
|
||||
requires: normalizeObjectArray<{ slice: string; provides: string }>(
|
||||
params.requires,
|
||||
"requires",
|
||||
{
|
||||
slice: "slice",
|
||||
provides: "provides",
|
||||
},
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
async function writeMarkdownBeforeDb(
|
||||
filePath: string,
|
||||
content: string,
|
||||
|
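The YAML helpers added in this hunk quote any scalar that is not a plain identifier-like token, so model-authored titles with colons or spaces cannot break the summary frontmatter, and list fields accept either arrays or newline-delimited bullet text. A minimal sketch of the intended behavior; these helpers are module-private to complete-slice.ts and the sample values are illustrative only:

// yamlScalar: a bare token stays bare, anything else is JSON-quoted.
yamlScalar("S01-auth-middleware");      // -> S01-auth-middleware
yamlScalar("adds: colon and spaces");   // -> "adds: colon and spaces"

// yamlList: one "- item" line per value, or the provided empty placeholder.
yamlList(["S01", "S02"], " - (none)");  // -> " - S01\n - S02" (one item per line)
yamlList([], " - (none)");              // -> " - (none)"

// normalizeStringArray: accepts an array or a newline-delimited bullet list
// and strips leading bullet markers before per-item normalization.
normalizeStringArray("- first decision\n- second decision", "keyDecisions");
// -> ["first decision", "second decision"]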
|
@ -106,44 +251,40 @@ function renderSliceSummaryMarkdown(params: CompleteSliceParams): string {
|
|||
const filesModified = params.filesModified ?? [];
|
||||
|
||||
const providesYaml =
|
||||
provides.length > 0
|
||||
? provides.map((p) => ` - ${p}`).join("\n")
|
||||
: " - (none)";
|
||||
provides.length > 0 ? yamlList(provides, " - (none)") : " - (none)";
|
||||
|
||||
const requiresYaml =
|
||||
requires.length > 0
|
||||
? requires
|
||||
.map((r) => ` - slice: ${r.slice}\n provides: ${r.provides}`)
|
||||
.map(
|
||||
(r) =>
|
||||
` - slice: ${yamlScalar(r.slice)}\n provides: ${yamlScalar(r.provides)}`,
|
||||
)
|
||||
.join("\n")
|
||||
: " []";
|
||||
|
||||
const affectsYaml =
|
||||
affects.length > 0 ? affects.map((a) => ` - ${a}`).join("\n") : " []";
|
||||
const affectsYaml = affects.length > 0 ? yamlList(affects, " []") : " []";
|
||||
|
||||
const keyFilesYaml =
|
||||
keyFiles.length > 0
|
||||
? keyFiles.map((f) => ` - ${f}`).join("\n")
|
||||
: " - (none)";
|
||||
keyFiles.length > 0 ? yamlList(keyFiles, " - (none)") : " - (none)";
|
||||
|
||||
const keyDecisionsYaml =
|
||||
keyDecisions.length > 0
|
||||
? keyDecisions.map((d) => ` - ${d}`).join("\n")
|
||||
? yamlList(keyDecisions, " - (none)")
|
||||
: " - (none)";
|
||||
|
||||
const patternsYaml =
|
||||
patternsEstablished.length > 0
|
||||
? patternsEstablished.map((p) => ` - ${p}`).join("\n")
|
||||
? yamlList(patternsEstablished, " - (none)")
|
||||
: " - (none)";
|
||||
|
||||
const observabilityYaml =
|
||||
observabilitySurfaces.length > 0
|
||||
? observabilitySurfaces.map((o) => ` - ${o}`).join("\n")
|
||||
? yamlList(observabilitySurfaces, " - none")
|
||||
: " - none";
|
||||
|
||||
const drillDownYaml =
|
||||
drillDownPaths.length > 0
|
||||
? drillDownPaths.map((d) => ` - ${d}`).join("\n")
|
||||
: " []";
|
||||
drillDownPaths.length > 0 ? yamlList(drillDownPaths, " []") : " []";
|
||||
|
||||
// Requirements sections
|
||||
const reqAdvanced =
|
||||
|
|
@ -274,9 +415,16 @@ ${params.uatContent}
|
|||
* 7. Invalidate caches
|
||||
*/
|
||||
export async function handleCompleteSlice(
|
||||
params: CompleteSliceParams,
|
||||
paramsInput: CompleteSliceParams,
|
||||
basePath: string,
|
||||
): Promise<CompleteSliceResult | { error: string }> {
|
||||
let params: CompleteSliceParams;
|
||||
try {
|
||||
params = normalizeCompleteSliceParams(paramsInput);
|
||||
} catch (error) {
|
||||
return { error: errorMessage(error) };
|
||||
}
|
||||
|
||||
// ── Validate required fields ────────────────────────────────────────────
|
||||
if (
|
||||
!params.sliceId ||
|
||||
|
|
|
|||
|
|
@ -30,6 +30,11 @@ import { invalidateStateCache } from "../state.js";
|
|||
import { isClosedStatus } from "../status-guards.js";
|
||||
import type { CompleteTaskParams } from "../types.js";
|
||||
import { checkOwnership, taskUnitKey } from "../unit-ownership.js";
|
||||
import {
|
||||
normalizePlanningText,
|
||||
normalizeRequiredPlanningText,
|
||||
validateSafePathSegment,
|
||||
} from "../validation.js";
|
||||
import { appendEvent } from "../workflow-events.js";
|
||||
import { logError, logWarning } from "../workflow-logger.js";
|
||||
import { writeManifest } from "../workflow-manifest.js";
|
||||
|
|
@ -72,7 +77,7 @@ function taskGateFieldForId(
|
|||
* Normalize a list parameter that may arrive as a string (newline-delimited
|
||||
* bullet list from the LLM) into a string array (#3361).
|
||||
*/
|
||||
function normalizeListParam(value: unknown): string[] {
|
||||
function normalizeListParam(value: unknown, field = "list"): string[] {
|
||||
if (Array.isArray(value)) return value.map(String);
|
||||
if (typeof value === "string" && value.trim()) {
|
||||
return value
|
||||
|
|
@ -83,6 +88,98 @@ function normalizeListParam(value: unknown): string[] {
|
|||
return [];
|
||||
}
|
||||
|
||||
function normalizeStringListParam(value: unknown, field: string): string[] {
|
||||
return normalizeListParam(value, field).map((item, index) =>
|
||||
normalizePlanningText(item, `${field}[${index}]`),
|
||||
);
|
||||
}
|
||||
|
||||
function normalizeVerificationEvidence(
|
||||
value: unknown,
|
||||
): CompleteTaskParams["verificationEvidence"] {
|
||||
if (!Array.isArray(value)) return [];
|
||||
return value.map((entry, index) => {
|
||||
if (typeof entry === "string") {
|
||||
return {
|
||||
command: normalizePlanningText(
|
||||
entry,
|
||||
`verificationEvidence[${index}]`,
|
||||
),
|
||||
exitCode: -1,
|
||||
verdict: "unknown (coerced from string)",
|
||||
durationMs: 0,
|
||||
};
|
||||
}
|
||||
if (!entry || typeof entry !== "object") {
|
||||
throw new Error(`verificationEvidence[${index}] must be an object or string`);
|
||||
}
|
||||
const record = entry as Record<string, unknown>;
|
||||
const exitCode =
|
||||
typeof record["exitCode"] === "number" && Number.isFinite(record["exitCode"])
|
||||
? record["exitCode"]
|
||||
: -1;
|
||||
const durationMs =
|
||||
typeof record["durationMs"] === "number" &&
|
||||
Number.isFinite(record["durationMs"])
|
||||
? record["durationMs"]
|
||||
: 0;
|
||||
return {
|
||||
command: normalizePlanningText(
|
||||
String(record["command"] ?? ""),
|
||||
`verificationEvidence[${index}].command`,
|
||||
),
|
||||
exitCode,
|
||||
verdict: normalizePlanningText(
|
||||
String(record["verdict"] ?? "unknown"),
|
||||
`verificationEvidence[${index}].verdict`,
|
||||
),
|
||||
durationMs,
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
function normalizeCompleteTaskParams(
|
||||
params: CompleteTaskParams,
|
||||
): CompleteTaskParams {
|
||||
return {
|
||||
...params,
|
||||
taskId: validateSafePathSegment(params.taskId, "taskId"),
|
||||
sliceId: validateSafePathSegment(params.sliceId, "sliceId"),
|
||||
milestoneId: validateSafePathSegment(params.milestoneId, "milestoneId"),
|
||||
oneLiner: normalizeRequiredPlanningText(params.oneLiner, "oneLiner"),
|
||||
narrative: normalizeRequiredPlanningText(params.narrative, "narrative"),
|
||||
verification: normalizeRequiredPlanningText(
|
||||
params.verification,
|
||||
"verification",
|
||||
),
|
||||
keyFiles: normalizeStringListParam(params.keyFiles, "keyFiles"),
|
||||
keyDecisions: normalizeStringListParam(params.keyDecisions, "keyDecisions"),
|
||||
deviations:
|
||||
params.deviations === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.deviations, "deviations"),
|
||||
knownIssues:
|
||||
params.knownIssues === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.knownIssues, "knownIssues"),
|
||||
failureModes:
|
||||
params.failureModes === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.failureModes, "failureModes"),
|
||||
loadProfile:
|
||||
params.loadProfile === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.loadProfile, "loadProfile"),
|
||||
negativeTests:
|
||||
params.negativeTests === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.negativeTests, "negativeTests"),
|
||||
verificationEvidence: normalizeVerificationEvidence(
|
||||
params.verificationEvidence,
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
async function ensureWritableParent(filePath: string): Promise<void> {
|
||||
const parentDir = dirname(filePath);
|
||||
await fs.mkdir(parentDir, { recursive: true });
|
||||
|
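normalizeVerificationEvidence above accepts either structured evidence objects or bare command strings from the model. A brief illustration of the coercion; the helper is private to complete-task.ts and these inputs are made up:

// A bare string becomes a placeholder evidence entry.
normalizeVerificationEvidence(["npm run lint"]);
// -> [{ command: "npm run lint", exitCode: -1,
//       verdict: "unknown (coerced from string)", durationMs: 0 }]

// A structured entry keeps finite numeric fields and normalizes its text.
normalizeVerificationEvidence([
  { command: "npm test", exitCode: 0, verdict: "pass", durationMs: 1200 },
]);
// -> [{ command: "npm test", exitCode: 0, verdict: "pass", durationMs: 1200 }]

// Non-finite or missing numbers fall back to -1 / 0 rather than propagating NaN.
normalizeVerificationEvidence([{ command: "npm test", exitCode: Number.NaN }]);
// -> [{ command: "npm test", exitCode: -1, verdict: "unknown", durationMs: 0 }]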
|
@ -155,9 +252,16 @@ function paramsToTaskRow(
|
|||
* 6. Invalidate caches
|
||||
*/
|
||||
export async function handleCompleteTask(
|
||||
params: CompleteTaskParams,
|
||||
paramsInput: CompleteTaskParams,
|
||||
basePath: string,
|
||||
): Promise<CompleteTaskResult | { error: string }> {
|
||||
let params: CompleteTaskParams;
|
||||
try {
|
||||
params = normalizeCompleteTaskParams(paramsInput);
|
||||
} catch (error) {
|
||||
return { error: errorMessage(error) };
|
||||
}
|
||||
|
||||
// ── Validate required fields ────────────────────────────────────────────
|
||||
if (
|
||||
!params.taskId ||
|
||||
|
|
|
|||
|
|
@ -16,7 +16,11 @@ import {
|
|||
} from "../sf-db.js";
|
||||
import { invalidateStateCache } from "../state.js";
|
||||
import { isClosedStatus } from "../status-guards.js";
|
||||
import { isNonEmptyString, validateStringArray } from "../validation.js";
|
||||
import {
|
||||
isNonEmptyString,
|
||||
normalizePlanningText,
|
||||
normalizePlanningTextArray,
|
||||
} from "../validation.js";
|
||||
import { appendEvent } from "../workflow-events.js";
|
||||
import { logWarning } from "../workflow-logger.js";
|
||||
import { writeManifest } from "../workflow-manifest.js";
|
||||
|
|
@ -98,7 +102,13 @@ function validateRiskEntries(
|
|||
`keyRisks[${index}] must include non-empty risk and whyItMatters`,
|
||||
);
|
||||
}
|
||||
return { risk, whyItMatters };
|
||||
return {
|
||||
risk: normalizePlanningText(risk, `keyRisks[${index}].risk`),
|
||||
whyItMatters: normalizePlanningText(
|
||||
whyItMatters,
|
||||
`keyRisks[${index}].whyItMatters`,
|
||||
),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -127,7 +137,20 @@ function validateProofStrategy(value: unknown): Array<{
|
|||
`proofStrategy[${index}] must include non-empty riskOrUnknown, retireIn, and whatWillBeProven`,
|
||||
);
|
||||
}
|
||||
return { riskOrUnknown, retireIn, whatWillBeProven };
|
||||
return {
|
||||
riskOrUnknown: normalizePlanningText(
|
||||
riskOrUnknown,
|
||||
`proofStrategy[${index}].riskOrUnknown`,
|
||||
),
|
||||
retireIn: normalizePlanningText(
|
||||
retireIn,
|
||||
`proofStrategy[${index}].retireIn`,
|
||||
),
|
||||
whatWillBeProven: normalizePlanningText(
|
||||
whatWillBeProven,
|
||||
`proofStrategy[${index}].whatWillBeProven`,
|
||||
),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -190,20 +213,82 @@ function validateSlices(value: unknown): PlanMilestoneSliceInput[] {
|
|||
);
|
||||
|
||||
return {
|
||||
sliceId,
|
||||
title,
|
||||
risk,
|
||||
depends,
|
||||
demo,
|
||||
goal,
|
||||
sliceId: normalizePlanningText(sliceId, `slices[${index}].sliceId`),
|
||||
title: normalizePlanningText(title, `slices[${index}].title`),
|
||||
risk: normalizePlanningText(risk, `slices[${index}].risk`),
|
||||
depends: depends.map((item, depIndex) =>
|
||||
normalizePlanningText(item, `slices[${index}].depends[${depIndex}]`),
|
||||
),
|
||||
demo: normalizePlanningText(demo, `slices[${index}].demo`),
|
||||
goal: normalizePlanningText(goal, `slices[${index}].goal`),
|
||||
successCriteria: normalizePlanningText(
|
||||
successCriteria,
|
||||
`slices[${index}].successCriteria`,
|
||||
),
|
||||
proofLevel: normalizePlanningText(
|
||||
proofLevel,
|
||||
`slices[${index}].proofLevel`,
|
||||
),
|
||||
integrationClosure: normalizePlanningText(
|
||||
integrationClosure,
|
||||
`slices[${index}].integrationClosure`,
|
||||
),
|
||||
observabilityImpact: normalizePlanningText(
|
||||
observabilityImpact,
|
||||
`slices[${index}].observabilityImpact`,
|
||||
),
|
||||
};
|
||||
});
|
||||
}
|
||||
|
||||
function normalizeVisionMeeting(
|
||||
meeting: VisionAlignmentMeetingRecord,
|
||||
): VisionAlignmentMeetingRecord {
|
||||
return {
|
||||
trigger: normalizePlanningText(meeting.trigger, "visionMeeting.trigger"),
|
||||
pm: normalizePlanningText(meeting.pm, "visionMeeting.pm"),
|
||||
userAdvocate: normalizePlanningText(
|
||||
meeting.userAdvocate,
|
||||
"visionMeeting.userAdvocate",
|
||||
),
|
||||
customerPanel: normalizePlanningText(
|
||||
meeting.customerPanel,
|
||||
"visionMeeting.customerPanel",
|
||||
),
|
||||
business: normalizePlanningText(meeting.business, "visionMeeting.business"),
|
||||
researcher: normalizePlanningText(
|
||||
meeting.researcher,
|
||||
"visionMeeting.researcher",
|
||||
),
|
||||
deliveryLead: normalizePlanningText(
|
||||
meeting.deliveryLead,
|
||||
"visionMeeting.deliveryLead",
|
||||
),
|
||||
partner: normalizePlanningText(meeting.partner, "visionMeeting.partner"),
|
||||
combatant: normalizePlanningText(
|
||||
meeting.combatant,
|
||||
"visionMeeting.combatant",
|
||||
),
|
||||
architect: normalizePlanningText(
|
||||
meeting.architect,
|
||||
"visionMeeting.architect",
|
||||
),
|
||||
moderator: normalizePlanningText(
|
||||
meeting.moderator,
|
||||
"visionMeeting.moderator",
|
||||
),
|
||||
weightedSynthesis: normalizePlanningText(
|
||||
meeting.weightedSynthesis,
|
||||
"visionMeeting.weightedSynthesis",
|
||||
),
|
||||
confidenceByArea: normalizePlanningText(
|
||||
meeting.confidenceByArea,
|
||||
"visionMeeting.confidenceByArea",
|
||||
),
|
||||
recommendedRoute: meeting.recommendedRoute,
|
||||
};
|
||||
}
|
||||
|
||||
function validateParams(params: PlanMilestoneParams): PlanMilestoneParams {
|
||||
if (!isNonEmptyString(params?.milestoneId))
|
||||
throw new Error("milestoneId is required");
|
||||
|
|
@ -224,28 +309,49 @@ function validateParams(params: PlanMilestoneParams): PlanMilestoneParams {
|
|||
|
||||
return {
|
||||
...params,
|
||||
milestoneId: normalizePlanningText(params.milestoneId, "milestoneId"),
|
||||
title: normalizePlanningText(params.title, "title"),
|
||||
vision: normalizePlanningText(params.vision, "vision"),
|
||||
dependsOn: params.dependsOn
|
||||
? validateStringArray(params.dependsOn, "dependsOn")
|
||||
? normalizePlanningTextArray(params.dependsOn, "dependsOn")
|
||||
: [],
|
||||
// Apply defaults for optional enrichment fields (#2771)
|
||||
successCriteria: params.successCriteria
|
||||
? validateStringArray(params.successCriteria, "successCriteria")
|
||||
? normalizePlanningTextArray(params.successCriteria, "successCriteria")
|
||||
: [],
|
||||
keyRisks: params.keyRisks ? validateRiskEntries(params.keyRisks) : [],
|
||||
proofStrategy: params.proofStrategy
|
||||
? validateProofStrategy(params.proofStrategy)
|
||||
: [],
|
||||
verificationContract: params.verificationContract ?? "",
|
||||
verificationIntegration: params.verificationIntegration ?? "",
|
||||
verificationOperational: params.verificationOperational ?? "",
|
||||
verificationUat: params.verificationUat ?? "",
|
||||
verificationContract: normalizePlanningText(
|
||||
params.verificationContract ?? "",
|
||||
"verificationContract",
|
||||
),
|
||||
verificationIntegration: normalizePlanningText(
|
||||
params.verificationIntegration ?? "",
|
||||
"verificationIntegration",
|
||||
),
|
||||
verificationOperational: normalizePlanningText(
|
||||
params.verificationOperational ?? "",
|
||||
"verificationOperational",
|
||||
),
|
||||
verificationUat: normalizePlanningText(
|
||||
params.verificationUat ?? "",
|
||||
"verificationUat",
|
||||
),
|
||||
definitionOfDone: params.definitionOfDone
|
||||
? validateStringArray(params.definitionOfDone, "definitionOfDone")
|
||||
? normalizePlanningTextArray(params.definitionOfDone, "definitionOfDone")
|
||||
: [],
|
||||
requirementCoverage: params.requirementCoverage ?? "Not provided.",
|
||||
boundaryMapMarkdown: params.boundaryMapMarkdown ?? "Not provided.",
|
||||
requirementCoverage: normalizePlanningText(
|
||||
params.requirementCoverage ?? "Not provided.",
|
||||
"requirementCoverage",
|
||||
),
|
||||
boundaryMapMarkdown: normalizePlanningText(
|
||||
params.boundaryMapMarkdown ?? "Not provided.",
|
||||
"boundaryMapMarkdown",
|
||||
),
|
||||
visionMeeting: hasStructuredVisionAlignmentMeeting(params.visionMeeting)
|
||||
? params.visionMeeting
|
||||
? normalizeVisionMeeting(params.visionMeeting)
|
||||
: undefined,
|
||||
slices: validateSlices(slicesInput),
|
||||
};
|
||||
|
|
|
|||
|
|
@ -18,7 +18,7 @@ import {
|
|||
import { invalidateStateCache } from "../state.js";
|
||||
import { isClosedStatus } from "../status-guards.js";
|
||||
import type { GateId } from "../types.js";
|
||||
import { isNonEmptyString } from "../validation.js";
|
||||
import { isNonEmptyString, normalizePlanningText } from "../validation.js";
|
||||
import { appendEvent } from "../workflow-events.js";
|
||||
import { logWarning } from "../workflow-logger.js";
|
||||
import { writeManifest } from "../workflow-manifest.js";
|
||||
|
|
@ -136,16 +136,24 @@ function validateTasks(value: unknown): PlanSliceTaskInput[] {
|
|||
}
|
||||
|
||||
return {
|
||||
taskId,
|
||||
title,
|
||||
taskId: normalizePlanningText(taskId, `tasks[${index}].taskId`),
|
||||
title: normalizePlanningText(title, `tasks[${index}].title`),
|
||||
description: normalizePlanningText(
|
||||
description,
|
||||
estimate,
|
||||
`tasks[${index}].description`,
|
||||
),
|
||||
estimate: normalizePlanningText(estimate, `tasks[${index}].estimate`),
|
||||
files,
|
||||
verify,
|
||||
verify: normalizePlanningText(verify, `tasks[${index}].verify`),
|
||||
inputs,
|
||||
expectedOutput,
|
||||
observabilityImpact:
|
||||
typeof observabilityImpact === "string" ? observabilityImpact : "",
|
||||
typeof observabilityImpact === "string"
|
||||
? normalizePlanningText(
|
||||
observabilityImpact,
|
||||
`tasks[${index}].observabilityImpact`,
|
||||
)
|
||||
: "",
|
||||
};
|
||||
});
|
||||
}
|
||||
|
|
@ -164,10 +172,23 @@ function validateParams(params: PlanSliceParams): PlanSliceParams {
|
|||
return {
|
||||
...params,
|
||||
// Apply defaults for optional enrichment fields (#2771)
|
||||
successCriteria: params.successCriteria ?? "Not provided.",
|
||||
proofLevel: params.proofLevel ?? "Not provided.",
|
||||
integrationClosure: params.integrationClosure ?? "Not provided.",
|
||||
observabilityImpact: params.observabilityImpact ?? "Not provided.",
|
||||
goal: normalizePlanningText(params.goal, "goal"),
|
||||
successCriteria: normalizePlanningText(
|
||||
params.successCriteria ?? "Not provided.",
|
||||
"successCriteria",
|
||||
),
|
||||
proofLevel: normalizePlanningText(
|
||||
params.proofLevel ?? "Not provided.",
|
||||
"proofLevel",
|
||||
),
|
||||
integrationClosure: normalizePlanningText(
|
||||
params.integrationClosure ?? "Not provided.",
|
||||
"integrationClosure",
|
||||
),
|
||||
observabilityImpact: normalizePlanningText(
|
||||
params.observabilityImpact ?? "Not provided.",
|
||||
"observabilityImpact",
|
||||
),
|
||||
adversarialReview: hasCompleteAdversarialReview(params.adversarialReview)
|
||||
? {
|
||||
partner: params.adversarialReview!.partner!.trim(),
|
||||
|
|
@ -180,27 +201,65 @@ function validateParams(params: PlanSliceParams): PlanSliceParams {
|
|||
architect: "Missing architect review.",
|
||||
},
|
||||
planningMeeting: {
|
||||
trigger: planningMeeting.trigger.trim(),
|
||||
pm: planningMeeting.pm.trim(),
|
||||
trigger: normalizePlanningText(planningMeeting.trigger, "planningMeeting.trigger"),
|
||||
pm: normalizePlanningText(planningMeeting.pm, "planningMeeting.pm"),
|
||||
...(isNonEmptyString(planningMeeting.userAdvocate)
|
||||
? { userAdvocate: planningMeeting.userAdvocate.trim() }
|
||||
? {
|
||||
userAdvocate: normalizePlanningText(
|
||||
planningMeeting.userAdvocate,
|
||||
"planningMeeting.userAdvocate",
|
||||
),
|
||||
}
|
||||
: {}),
|
||||
...(isNonEmptyString(planningMeeting.customerPanel)
|
||||
? { customerPanel: planningMeeting.customerPanel.trim() }
|
||||
? {
|
||||
customerPanel: normalizePlanningText(
|
||||
planningMeeting.customerPanel,
|
||||
"planningMeeting.customerPanel",
|
||||
),
|
||||
}
|
||||
: {}),
|
||||
...(isNonEmptyString(planningMeeting.business)
|
||||
? { business: planningMeeting.business.trim() }
|
||||
? {
|
||||
business: normalizePlanningText(
|
||||
planningMeeting.business,
|
||||
"planningMeeting.business",
|
||||
),
|
||||
}
|
||||
: {}),
|
||||
researcher: planningMeeting.researcher.trim(),
|
||||
researcher: normalizePlanningText(
|
||||
planningMeeting.researcher,
|
||||
"planningMeeting.researcher",
|
||||
),
|
||||
...(isNonEmptyString(planningMeeting.deliveryLead)
|
||||
? { deliveryLead: planningMeeting.deliveryLead.trim() }
|
||||
? {
|
||||
deliveryLead: normalizePlanningText(
|
||||
planningMeeting.deliveryLead,
|
||||
"planningMeeting.deliveryLead",
|
||||
),
|
||||
}
|
||||
: {}),
|
||||
partner: planningMeeting.partner.trim(),
|
||||
combatant: planningMeeting.combatant.trim(),
|
||||
architect: planningMeeting.architect.trim(),
|
||||
moderator: planningMeeting.moderator.trim(),
|
||||
partner: normalizePlanningText(
|
||||
planningMeeting.partner,
|
||||
"planningMeeting.partner",
|
||||
),
|
||||
combatant: normalizePlanningText(
|
||||
planningMeeting.combatant,
|
||||
"planningMeeting.combatant",
|
||||
),
|
||||
architect: normalizePlanningText(
|
||||
planningMeeting.architect,
|
||||
"planningMeeting.architect",
|
||||
),
|
||||
moderator: normalizePlanningText(
|
||||
planningMeeting.moderator,
|
||||
"planningMeeting.moderator",
|
||||
),
|
||||
recommendedRoute: planningMeeting.recommendedRoute,
|
||||
confidenceSummary: planningMeeting.confidenceSummary.trim(),
|
||||
confidenceSummary: normalizePlanningText(
|
||||
planningMeeting.confidenceSummary,
|
||||
"planningMeeting.confidenceSummary",
|
||||
),
|
||||
},
|
||||
tasks: validateTasks(params.tasks),
|
||||
};
|
||||
|
|
|
|||
|
|
@ -9,7 +9,11 @@ import {
|
|||
} from "../sf-db.js";
|
||||
import { invalidateStateCache } from "../state.js";
|
||||
import { isClosedStatus } from "../status-guards.js";
|
||||
import { isNonEmptyString, validateStringArray } from "../validation.js";
|
||||
import {
|
||||
isNonEmptyString,
|
||||
normalizePlanningText,
|
||||
normalizePlanningTextArray,
|
||||
} from "../validation.js";
|
||||
import { appendEvent } from "../workflow-events.js";
|
||||
import { logWarning } from "../workflow-logger.js";
|
||||
import { writeManifest } from "../workflow-manifest.js";
|
||||
|
|
@ -64,12 +68,26 @@ function validateParams(params: PlanTaskParams): PlanTaskParams {
|
|||
|
||||
return {
|
||||
...params,
|
||||
files: validateStringArray(params.files, "files"),
|
||||
inputs: validateStringArray(params.inputs, "inputs"),
|
||||
expectedOutput: validateStringArray(
|
||||
milestoneId: normalizePlanningText(params.milestoneId, "milestoneId"),
|
||||
sliceId: normalizePlanningText(params.sliceId, "sliceId"),
|
||||
taskId: normalizePlanningText(params.taskId, "taskId"),
|
||||
title: normalizePlanningText(params.title, "title"),
|
||||
description: normalizePlanningText(params.description, "description"),
|
||||
estimate: normalizePlanningText(params.estimate, "estimate"),
|
||||
verify: normalizePlanningText(params.verify, "verify"),
|
||||
files: normalizePlanningTextArray(params.files, "files"),
|
||||
inputs: normalizePlanningTextArray(params.inputs, "inputs"),
|
||||
expectedOutput: normalizePlanningTextArray(
|
||||
params.expectedOutput,
|
||||
"expectedOutput",
|
||||
),
|
||||
observabilityImpact:
|
||||
params.observabilityImpact === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(
|
||||
params.observabilityImpact,
|
||||
"observabilityImpact",
|
||||
),
|
||||
};
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -19,7 +19,11 @@ import {
|
|||
} from "../sf-db.js";
|
||||
import { invalidateStateCache } from "../state.js";
|
||||
import { isClosedStatus } from "../status-guards.js";
|
||||
import { isNonEmptyString } from "../validation.js";
|
||||
import {
|
||||
isNonEmptyString,
|
||||
normalizePlanningText,
|
||||
normalizePlanningTextArray,
|
||||
} from "../validation.js";
|
||||
import { appendEvent } from "../workflow-events.js";
|
||||
import { logWarning } from "../workflow-logger.js";
|
||||
import { writeManifest } from "../workflow-manifest.js";
|
||||
|
|
@ -96,7 +100,91 @@ function validateParams(params: ReplanSliceParams): ReplanSliceParams {
|
|||
throw new Error(`updatedTasks[${i}].title is required`);
|
||||
}
|
||||
|
||||
return params;
|
||||
return {
|
||||
...params,
|
||||
milestoneId: normalizePlanningText(params.milestoneId, "milestoneId"),
|
||||
sliceId: normalizePlanningText(params.sliceId, "sliceId"),
|
||||
blockerTaskId: normalizePlanningText(
|
||||
params.blockerTaskId,
|
||||
"blockerTaskId",
|
||||
),
|
||||
blockerDescription: normalizePlanningText(
|
||||
params.blockerDescription,
|
||||
"blockerDescription",
|
||||
),
|
||||
whatChanged: normalizePlanningText(params.whatChanged, "whatChanged"),
|
||||
goal:
|
||||
params.goal === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.goal, "goal"),
|
||||
successCriteria:
|
||||
params.successCriteria === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.successCriteria, "successCriteria"),
|
||||
proofLevel:
|
||||
params.proofLevel === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.proofLevel, "proofLevel"),
|
||||
integrationClosure:
|
||||
params.integrationClosure === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(params.integrationClosure, "integrationClosure"),
|
||||
observabilityImpact:
|
||||
params.observabilityImpact === undefined
|
||||
? undefined
|
||||
: normalizePlanningText(
|
||||
params.observabilityImpact,
|
||||
"observabilityImpact",
|
||||
),
|
||||
removedTaskIds: normalizePlanningTextArray(
|
||||
params.removedTaskIds,
|
||||
"removedTaskIds",
|
||||
),
|
||||
updatedTasks: params.updatedTasks.map((task, index) => ({
|
||||
...task,
|
||||
taskId: normalizePlanningText(
|
||||
task.taskId,
|
||||
`updatedTasks[${index}].taskId`,
|
||||
),
|
||||
title: normalizePlanningText(task.title, `updatedTasks[${index}].title`),
|
||||
description:
|
||||
task.description === undefined
|
||||
? ""
|
||||
: normalizePlanningText(
|
||||
task.description,
|
||||
`updatedTasks[${index}].description`,
|
||||
),
|
||||
estimate:
|
||||
task.estimate === undefined
|
||||
? ""
|
||||
: normalizePlanningText(
|
||||
task.estimate,
|
||||
`updatedTasks[${index}].estimate`,
|
||||
),
|
||||
files: task.files
|
||||
? normalizePlanningTextArray(task.files, `updatedTasks[${index}].files`)
|
||||
: [],
|
||||
verify:
|
||||
task.verify === undefined
|
||||
? ""
|
||||
: normalizePlanningText(
|
||||
task.verify,
|
||||
`updatedTasks[${index}].verify`,
|
||||
),
|
||||
inputs: task.inputs
|
||||
? normalizePlanningTextArray(
|
||||
task.inputs,
|
||||
`updatedTasks[${index}].inputs`,
|
||||
)
|
||||
: [],
|
||||
expectedOutput: task.expectedOutput
|
||||
? normalizePlanningTextArray(
|
||||
task.expectedOutput,
|
||||
`updatedTasks[${index}].expectedOutput`,
|
||||
)
|
||||
: [],
|
||||
})),
|
||||
};
|
||||
}
|
||||
|
||||
export async function handleReplanSlice(
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ import {
|
|||
writeFileSync,
|
||||
} from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import type { RunawayGuardPauseMetadata } from "./auto-runaway-guard.js";
|
||||
import {
|
||||
countMustHavesMentionedInSummary,
|
||||
loadFile,
|
||||
|
|
@ -58,6 +59,7 @@ export interface AutoUnitRuntimeRecord {
|
|||
recovery?: ExecuteTaskRecoveryStatus;
|
||||
recoveryAttempts?: number;
|
||||
lastRecoveryReason?: "idle" | "hard";
|
||||
runawayGuardPause?: RunawayGuardPauseMetadata;
|
||||
}
|
||||
|
||||
function runtimeDir(basePath: string): string {
|
||||
|
|
@ -123,6 +125,7 @@ export function writeUnitRuntimeRecord(
|
|||
recovery: updates.recovery ?? prev?.recovery,
|
||||
recoveryAttempts: updates.recoveryAttempts ?? prev?.recoveryAttempts ?? 0,
|
||||
lastRecoveryReason: updates.lastRecoveryReason ?? prev?.lastRecoveryReason,
|
||||
runawayGuardPause: updates.runawayGuardPause ?? prev?.runawayGuardPause,
|
||||
};
|
||||
writeFileSync(path, JSON.stringify(next, null, 2) + "\n", "utf-8");
|
||||
_runtimeCache.set(path, next);
|
||||
|
|
|
|||
|
|
@ -7,6 +7,56 @@ export function isNonEmptyString(value: unknown): value is string {
|
|||
return typeof value === "string" && value.trim().length > 0;
|
||||
}
|
||||
|
||||
const LEAKED_JSON_FIELD_RE =
|
||||
/(^|[,{]\s*)"(?:successCriteria|proofLevel|integrationClosure|observabilityImpact|planningMeeting|visionMeeting|tasks|updatedTasks|removedTaskIds|verify|expectedOutput|oneLiner|narrative|verification|uatContent|keyFiles|keyDecisions|deviations|knownIssues|knownLimitations|followUps|operationalReadiness)"\s*:|",\s*"[A-Za-z][A-Za-z0-9_]*"\s*:/;
|
||||
|
||||
/**
|
||||
* Normalize freeform model-authored markdown fields before persistence.
|
||||
* Tool schemas prove the outer type, but models can still pass JSON-ish debris
|
||||
* or escaped newlines inside a valid string.
|
||||
*/
|
||||
export function normalizePlanningText(value: string, field: string): string {
|
||||
const normalized = value.replace(/\\n/g, "\n").trim();
|
||||
if (LEAKED_JSON_FIELD_RE.test(normalized)) {
|
||||
throw new Error(`${field} appears to contain leaked JSON fields`);
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
export function normalizeRequiredPlanningText(
|
||||
value: string,
|
||||
field: string,
|
||||
): string {
|
||||
const normalized = normalizePlanningText(value, field);
|
||||
if (normalized.length === 0) {
|
||||
throw new Error(`${field} must be a non-empty string`);
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
export function validateSafePathSegment(value: string, field: string): string {
|
||||
const normalized = normalizeRequiredPlanningText(value, field);
|
||||
if (
|
||||
normalized === "." ||
|
||||
normalized === ".." ||
|
||||
normalized.includes("/") ||
|
||||
normalized.includes("\\") ||
|
||||
/[\0\r\n]/.test(normalized)
|
||||
) {
|
||||
throw new Error(`${field} must be a safe path segment`);
|
||||
}
|
||||
return normalized;
|
||||
}
|
||||
|
||||
export function normalizePlanningTextArray(
|
||||
value: unknown,
|
||||
field: string,
|
||||
): string[] {
|
||||
return validateStringArray(value, field).map((item, index) =>
|
||||
normalizePlanningText(item, `${field}[${index}]`),
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Validate that `value` is an array of non-empty strings.
|
||||
* Throws with a message referencing `field` on failure.
|
||||
|
|
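A short usage sketch of the validation helpers exported above; the relative import path mirrors the handler imports shown earlier in this diff and the sample values are illustrative:

import {
  normalizePlanningText,
  normalizeRequiredPlanningText,
  validateSafePathSegment,
} from "../validation.js";

// Literal "\n" sequences become real newlines and the result is trimmed.
normalizePlanningText("Line one\\nLine two", "narrative");
// -> "Line one\nLine two"

// JSON-ish debris inside an otherwise valid string is rejected.
normalizePlanningText('proof.", "integrationClosure": "bad"', "proofLevel");
// throws: proofLevel appears to contain leaked JSON fields

// Required fields must be non-empty after normalization.
normalizeRequiredPlanningText("   ", "verification");
// throws: verification must be a non-empty string

// IDs that feed file paths must be single safe segments.
validateSafePathSegment("../outside", "taskId");
// throws: taskId must be a safe path segment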
|
|||
|
|
@ -174,6 +174,18 @@ export function renderPlanContent(
|
|||
lines.push("");
|
||||
}
|
||||
|
||||
const verificationCommands = taskRows
|
||||
.map((task) => task.verify?.trim())
|
||||
.filter((verify): verify is string => Boolean(verify));
|
||||
if (verificationCommands.length > 0) {
|
||||
lines.push("## Verification");
|
||||
lines.push("");
|
||||
for (const command of verificationCommands) {
|
||||
lines.push(command.startsWith("-") ? command : `- ${command}`);
|
||||
}
|
||||
lines.push("");
|
||||
}
|
||||
|
||||
lines.push("## Tasks");
|
||||
|
||||
for (const task of taskRows) {
|
||||
|
|
|
|||
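With the change above, a plan whose tasks carry verify commands gains a dedicated Verification section ahead of the task list. A sketch of the expected fragment, using the commands from the workflow-projections test earlier in this diff:

const expectedFragment = [
  "## Verification",
  "",
  "- npm test src/middleware/auth.test.ts",
  "- npm run typecheck",
  "",
  "## Tasks",
].join("\n");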
|
|
@ -291,6 +291,7 @@ test("HeadlessJsonResult satisfies expected shape", () => {
|
|||
// Type-level assertion: construct a valid object and verify it compiles.
|
||||
// At runtime, verify all required keys exist.
|
||||
const result: HeadlessJsonResult = {
|
||||
schemaVersion: 1,
|
||||
status: "success",
|
||||
exitCode: 0,
|
||||
duration: 12345,
|
||||
|
|
@ -305,6 +306,7 @@ test("HeadlessJsonResult satisfies expected shape", () => {
|
|||
events: 42,
|
||||
};
|
||||
assert.equal(result.status, "success");
|
||||
assert.equal(result.schemaVersion, 1);
|
||||
assert.equal(result.exitCode, 0);
|
||||
assert.equal(typeof result.duration, "number");
|
||||
assert.ok(result.cost);
|
||||
|
|
@ -319,6 +321,7 @@ test("HeadlessJsonResult satisfies expected shape", () => {
|
|||
|
||||
test("HeadlessJsonResult accepts optional fields", () => {
|
||||
const result: HeadlessJsonResult = {
|
||||
schemaVersion: 1,
|
||||
status: "blocked",
|
||||
exitCode: 10,
|
||||
sessionId: "sess-abc",
|
||||
|
|
|
|||
|
|
@ -198,6 +198,7 @@ import {
|
|||
isBlockedNotification,
|
||||
isInteractiveHeadlessTool,
|
||||
isPauseNotification,
|
||||
isMilestoneReadyText,
|
||||
isTerminalNotification,
|
||||
mapStatusToExitCode,
|
||||
shouldArmHeadlessIdleTimeout,
|
||||
|
|
@ -292,6 +293,15 @@ test("isAutoResumeScheduledNotification detects provider auto-resume notices", (
|
|||
);
|
||||
});
|
||||
|
||||
test("isMilestoneReadyText detects ready marker in assistant text stream", () => {
|
||||
assert.equal(isMilestoneReadyText("Milestone M007 ready."), true);
|
||||
assert.equal(isMilestoneReadyText("**Milestone M007 ready.**"), true);
|
||||
assert.equal(
|
||||
isMilestoneReadyText("Planning complete, but the milestone is not ready yet."),
|
||||
false,
|
||||
);
|
||||
});
|
||||
|
||||
test("isPauseNotification detects pause banners separately from auto-resume notices", () => {
|
||||
assert.equal(
|
||||
isPauseNotification({
|
||||
|
|
|
|||
|
|
@ -212,6 +212,7 @@ test("headless --output-format json emits a single HeadlessJsonResult on stdout"
|
|||
}
|
||||
|
||||
// Assert HeadlessJsonResult shape
|
||||
assert.equal(parsed.schemaVersion, 1, "result should have schemaVersion 1");
|
||||
assert.equal(
|
||||
typeof parsed.status,
|
||||
"string",
|
||||
|
|
|
|||
|
|
@ -102,6 +102,12 @@ describe("parseCliArgs — list flags and accumulators", () => {
|
|||
assert.equal(flags.listModels, true);
|
||||
assert.equal(flags.print, true);
|
||||
});
|
||||
|
||||
test("--discover enables live model-list verification", () => {
|
||||
const flags = parse("--discover", "--list-models", "zai");
|
||||
assert.equal(flags.discover, true);
|
||||
assert.equal(flags.listModels, "zai");
|
||||
});
|
||||
});
|
||||
|
||||
describe("parseCliArgs — web mode flags", () => {
|
||||
|
|
|
|||