fix(triage-apply): strict plan validation + custom-runner guard + per-decision failures

Codex review follow-up (2026-05-14) addressed all three remaining issues from the earlier rescue pass: 1. Strict plan validation. parseTriagePlanStrict refuses the WHOLE plan on any malformed item instead of silently dropping. Enforces: - completion marker "Self-feedback triage complete" present - exactly one fenced ```yaml block - every decision has non-empty id + outcome ∈ {fix, promote, close} - outcome-specific required fields (close → reason; promote → reason + requirement_id; fix → proposed_approach) - duplicate ids rejected - when expectedIds is supplied, decisions must cover the candidate set exactly — no extras (hallucinated ids), no missing Returns ParseTriagePlanResult with {plan, error} so the caller can surface the specific failure reason. 2. Custom-runner trust guard. runTriageApply refuses an injected options.agentRunner unless allowUntrustedRunner is also explicitly set. Production callers cannot inject a runner. Without this guard a custom runner could side-channel-mutate the ledger despite the read-only tool override (codex Q2). 3. Per-decision failure surfacing. applyTriagePlan now returns {resolvedIds, rejectedIds, pendingFixIds} instead of just resolvedIds. runTriageApply reports ok=false if rejectedIds is non-empty, with the count + ids in the error message. Mutations still happen one-by-one (no SQL transaction wrapping) but the failure is no longer silent (codex Q3). Tests: src/tests/headless-triage-apply.test.ts now covers: - agree-path runs both agents in order; apply fails on missing ledger entry → ok=false, rejectedIds populated (the realistic contract for a test fixture without a seeded DB) - custom runner without allowUntrustedRunner refuses, agentRunner never invoked - rubber-duck disagrees → clean pause, ok=false, agreed=false - decider fails → skip rubber-duck - unknown id in plan rejected before review Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-14 17:19:12 +02:00 · 2026-05-14 17:19:12 +02:00 · 289bf9e264
commit 289bf9e264
parent d8ce433c7a
2 changed files with 465 additions and 61 deletions
--- a/src/headless-triage.ts
+++ b/src/headless-triage.ts
@ -103,7 +103,7 @@ interface AgentRunResult {
 type AgentRunner = (
 	agent: AgentConfig,
 	task: string,
-	options?: { tools?: string[]; model?: string },
+	options?: { tools?: string[]; model?: string; cwd?: string },
 ) => Promise<AgentRunResult>;

 /**
@ -128,56 +128,219 @@ export interface TriageDecision {
 	requirementId?: string;
 }

-export function parseTriagePlan(text: string): TriageDecision[] | null {
-	if (typeof text !== "string" || text.length === 0) return null;
-	const fenceMatch = text.match(/```ya?ml\s*\n([\s\S]*?)\n```/i);
-	if (!fenceMatch) return null;
-	const yamlBody = fenceMatch[1];
+export interface ParseTriagePlanResult {
+	plan: TriageDecision[] | null;
+	error: string | null;
+}
+
+const COMPLETION_MARKER = "Self-feedback triage complete";
+
+/**
+ * Parse a decider output into a strict decision plan, or return a
+ * structured error explaining what's wrong. Codex review 2026-05-14
+ * follow-up: refuse the whole plan if ANY item is malformed instead of
+ * silently dropping items — partial-trust on a triage plan is worse
+ * than no apply at all.
+ *
+ * Validates:
+ *   1. The completion marker is present (signals the decider finished).
+ *   2. A single fenced ```yaml block with key `decisions:` exists.
+ *   3. Every item has `id` (non-empty string) and `outcome` ∈
+ *      {fix, promote, close}.
+ *   4. Outcome-specific required fields:
+ *        - close   → reason (non-empty) + evidence_kind (defaults to
+ *                    human-clear if omitted, but if provided must be
+ *                    a non-empty string).
+ *        - promote → reason + requirement_id (non-empty strings).
+ *        - fix     → proposed_approach (non-empty).
+ *   5. If `expectedIds` is supplied (the candidate set the decider was
+ *      shown), every decision id must be in that set and every expected
+ *      id must have exactly one decision — no extras, no missing.
+ */
+export function parseTriagePlanStrict(
+	text: string,
+	expectedIds?: string[],
+): ParseTriagePlanResult {
+	if (typeof text !== "string" || text.length === 0) {
+		return { plan: null, error: "decider output was empty" };
+	}
+	if (!text.includes(COMPLETION_MARKER)) {
+		return {
+			plan: null,
+			error: `decider output is missing the completion marker "${COMPLETION_MARKER}" — treating as incomplete`,
+		};
+	}
+	const fenceMatches = Array.from(
+		text.matchAll(/```ya?ml\s*\n([\s\S]*?)\n```/gi),
+	);
+	if (fenceMatches.length === 0) {
+		return {
+			plan: null,
+			error: "decider output has no fenced yaml block with the decision plan",
+		};
+	}
+	if (fenceMatches.length > 1) {
+		return {
+			plan: null,
+			error: `decider output has ${fenceMatches.length} fenced yaml blocks — the contract is exactly one`,
+		};
+	}
+	const yamlBody = fenceMatches[0][1];
 	let parsed: unknown;
 	try {
 		parsed = parseYaml(yamlBody);
-	} catch {
-		return null;
+	} catch (err) {
+		return {
+			plan: null,
+			error: `decision plan failed to parse as yaml: ${
+				err instanceof Error ? err.message : String(err)
+			}`,
+		};
 	}
 	const root = parsed as Record<string, unknown> | null;
 	const decisions = root?.decisions;
-	if (!Array.isArray(decisions)) return null;
+	if (!Array.isArray(decisions)) {
+		return {
+			plan: null,
+			error: "decision plan must have a top-level `decisions:` array",
+		};
+	}
+	if (decisions.length === 0) {
+		return {
+			plan: null,
+			error: "decision plan has zero decisions — nothing to apply",
+		};
+	}
 	const out: TriageDecision[] = [];
-	for (const raw of decisions) {
-		if (!raw || typeof raw !== "object") continue;
+	const seenIds = new Set<string>();
+	for (let i = 0; i < decisions.length; i++) {
+		const raw = decisions[i];
+		if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
+			return {
+				plan: null,
+				error: `decisions[${i}] is not an object`,
+			};
+		}
 		const item = raw as Record<string, unknown>;
-		const id = typeof item.id === "string" ? item.id : null;
+		const id = typeof item.id === "string" ? item.id.trim() : "";
+		if (!id) {
+			return {
+				plan: null,
+				error: `decisions[${i}] is missing a non-empty id`,
+			};
+		}
+		if (seenIds.has(id)) {
+			return {
+				plan: null,
+				error: `decisions[${i}] duplicates id "${id}" — each ledger entry must appear exactly once`,
+			};
+		}
+		seenIds.add(id);
 		const outcome = item.outcome;
-		if (!id) continue;
-		if (outcome !== "fix" && outcome !== "promote" && outcome !== "close")
-			continue;
+		if (outcome !== "fix" && outcome !== "promote" && outcome !== "close") {
+			return {
+				plan: null,
+				error: `decisions[${i}] (id=${id}) has invalid outcome "${outcome}" — expected fix|promote|close`,
+			};
+		}
 		const decision: TriageDecision = { id, outcome };
-		if (typeof item.evidence_kind === "string") {
-			decision.evidenceKind = item.evidence_kind;
-		} else if (typeof item.evidenceKind === "string") {
-			decision.evidenceKind = item.evidenceKind;
+		const reason = typeof item.reason === "string" ? item.reason.trim() : "";
+		const evidenceKindRaw =
+			typeof item.evidence_kind === "string"
+				? item.evidence_kind
+				: typeof item.evidenceKind === "string"
+					? (item.evidenceKind as string)
+					: "";
+		if (evidenceKindRaw.trim()) decision.evidenceKind = evidenceKindRaw.trim();
+		if (reason) decision.reason = reason;
+		const proposedApproachRaw =
+			typeof item.proposed_approach === "string"
+				? item.proposed_approach
+				: typeof item.proposedApproach === "string"
+					? (item.proposedApproach as string)
+					: "";
+		if (proposedApproachRaw.trim()) {
+			decision.proposedApproach = proposedApproachRaw.trim();
 		}
-		if (typeof item.reason === "string") decision.reason = item.reason;
-		if (typeof item.proposed_approach === "string") {
-			decision.proposedApproach = item.proposed_approach;
-		} else if (typeof item.proposedApproach === "string") {
-			decision.proposedApproach = item.proposedApproach;
+		const requirementIdRaw =
+			typeof item.requirement_id === "string"
+				? item.requirement_id
+				: typeof item.requirementId === "string"
+					? (item.requirementId as string)
+					: "";
+		if (requirementIdRaw.trim()) {
+			decision.requirementId = requirementIdRaw.trim();
 		}
-		if (typeof item.requirement_id === "string") {
-			decision.requirementId = item.requirement_id;
-		} else if (typeof item.requirementId === "string") {
-			decision.requirementId = item.requirementId;
+
+		// Outcome-specific required fields.
+		if (outcome === "close" && !decision.reason) {
+			return {
+				plan: null,
+				error: `decisions[${i}] (id=${id}, outcome=close) is missing required field reason`,
+			};
+		}
+		if (outcome === "promote") {
+			if (!decision.reason) {
+				return {
+					plan: null,
+					error: `decisions[${i}] (id=${id}, outcome=promote) is missing required field reason`,
+				};
+			}
+			if (!decision.requirementId) {
+				return {
+					plan: null,
+					error: `decisions[${i}] (id=${id}, outcome=promote) is missing required field requirement_id`,
+				};
+			}
+		}
+		if (outcome === "fix" && !decision.proposedApproach) {
+			return {
+				plan: null,
+				error: `decisions[${i}] (id=${id}, outcome=fix) is missing required field proposed_approach`,
+			};
 		}
 		out.push(decision);
 	}
-	return out;
+
+	// Expected-id coverage check: if the caller knows which entries the
+	// decider was shown, ensure the plan covers them exactly.
+	if (Array.isArray(expectedIds) && expectedIds.length > 0) {
+		const expected = new Set(expectedIds);
+		for (const decision of out) {
+			if (!expected.has(decision.id)) {
+				return {
+					plan: null,
+					error: `decision id "${decision.id}" is not in the candidate set the decider was shown — possible hallucination`,
+				};
+			}
+		}
+		for (const id of expected) {
+			if (!seenIds.has(id)) {
+				return {
+					plan: null,
+					error: `candidate id "${id}" has no decision in the plan — incomplete coverage`,
+				};
+			}
+		}
+	}
+
+	return { plan: out, error: null };
+}
+
+/**
+ * Backwards-compat wrapper for the lenient parser used by existing tests
+ * and the test fixtures. Returns just the plan, no error explanation.
+ * Production callers should use parseTriagePlanStrict.
+ */
+export function parseTriagePlan(text: string): TriageDecision[] | null {
+	return parseTriagePlanStrict(text).plan;
 }

 function buildSfPrintLaunchArgs(
 	promptPath: string,
 	task: string,
 	agent: AgentConfig,
-	options: { tools?: string[]; model?: string } = {},
+	options: { tools?: string[]; model?: string; cwd?: string } = {},
 ): { command: string; args: string[] } {
 	const sfBinPath = process.env.SF_BIN_PATH || process.argv[1];
 	const baseArgs = [
@ -203,7 +366,7 @@ function buildSfPrintLaunchArgs(
 async function defaultAgentRunner(
 	agent: AgentConfig,
 	task: string,
-	options: { tools?: string[]; model?: string } = {},
+	options: { tools?: string[]; model?: string; cwd?: string } = {},
 ): Promise<AgentRunResult> {
 	const tmpDir = mkdtempSync(join(tmpdir(), "sf-triage-agent-"));
 	const promptPath = join(tmpDir, `${agent.name}.md`);
@ -221,7 +384,7 @@ async function defaultAgentRunner(
 	};
 	const composed =
 		promptPartsModule.composeAgentPrompt?.(agent, {
-			cwd: process.cwd(),
+			cwd: options.cwd ?? process.cwd(),
 			surface: "headless",
 			tools: options.tools ?? agent.tools,
 		}) ?? agent.systemPrompt;
@ -236,7 +399,7 @@ async function defaultAgentRunner(
 		);
 		return await new Promise<AgentRunResult>((resolve) => {
 			const proc = spawn(launch.command, launch.args, {
-				cwd: process.cwd(),
+				cwd: options.cwd ?? process.cwd(),
 				env: process.env,
 				shell: false,
 				stdio: ["ignore", "pipe", "pipe"],
@ -297,6 +460,18 @@ async function emitTriageApplyJournal(
 	}
 }

+export interface RunTriageApplyResult {
+	ok: boolean;
+	agreed: boolean;
+	error?: string;
+	deciderOutput?: string;
+	reviewOutput?: string;
+	resolvedIds: string[];
+	rejectedIds?: string[];
+	pendingFixIds?: string[];
+	flowId: string;
+}
+
 export async function runTriageApply(
 	cwd: string,
 	prompt: string,
@ -304,16 +479,17 @@ export async function runTriageApply(
 		model?: string;
 		agentRunner?: AgentRunner;
 		candidateCount?: number;
+		// Expected ledger ids the decider was shown. When supplied, the
+		// strict plan parser refuses any plan that adds new ids or omits
+		// expected ones.
+		expectedIds?: string[];
+		// Test escape hatch. Production callers MUST NOT set this. Required
+		// to pass a custom agentRunner because an arbitrary runner could
+		// side-channel-mutate the ledger despite the read-only tool override
+		// the orchestrator enforces (codex review 2026-05-14 follow-up).
+		allowUntrustedRunner?: boolean;
 	} = {},
-): Promise<{
-	ok: boolean;
-	agreed: boolean;
-	error?: string;
-	deciderOutput?: string;
-	reviewOutput?: string;
-	resolvedIds: string[];
-	flowId: string;
-}> {
+): Promise<RunTriageApplyResult> {
 	const flowId = `triage-apply-${randomUUID()}`;
 	let seq = 0;
 	const emit = (eventType: string, data: Record<string, unknown> = {}) =>
@ -348,8 +524,8 @@ export async function runTriageApply(
 	// Trusted-source guard (codex review 2026-05-14): when --apply will
 	// mutate the ledger, BOTH agents must be SF-shipped built-ins. A
 	// project-level override could silently disable rubber-duck's
-	// independence — operators can still customize behavior, but they
-	// have to use --review mode where mutations aren't auto-applied.
+	// independence. Operators can still customize behavior for inspect
+	// workflows, but --apply uses only the shipped review contract.
 	if (triageDecider.source !== "builtin" || rubberDuck.source !== "builtin") {
 		await emit("triage-apply-failed", {
 			reason: "untrusted-agent-source",
@ -359,12 +535,26 @@ export async function runTriageApply(
 		return {
 			ok: false,
 			agreed: false,
-			error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source}). Use --review for inspect-only mode, or remove the project/user override.`,
+			error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source}). Use \`sf headless triage --run\` for a reviewable decision artifact, or remove the project/user override.`,
 			resolvedIds: [],
 			flowId,
 		};
 	}

+	// Custom-runner guard (codex review follow-up): an injected agentRunner
+	// can side-channel-mutate the ledger despite the read-only tool override.
+	// Only allow it when allowUntrustedRunner is explicitly set (test path).
+	if (options.agentRunner && !options.allowUntrustedRunner) {
+		await emit("triage-apply-failed", { reason: "untrusted-runner" });
+		return {
+			ok: false,
+			agreed: false,
+			error:
+				"runTriageApply: a custom agentRunner was supplied without allowUntrustedRunner. Production callers cannot inject a runner — only tests can, via the explicit allowUntrustedRunner option.",
+			resolvedIds: [],
+			flowId,
+		};
+	}
 	const runner = options.agentRunner ?? defaultAgentRunner;

 	// Phase 1: triage-decider runs in PLAN-ONLY mode. Drop resolve_issue
@ -373,6 +563,7 @@ export async function runTriageApply(
 	// emits a YAML decision plan; we parse it post-hoc.
 	const decider = await runner(triageDecider, prompt, {
 		model: options.model,
+		cwd,
 		tools: ["view", "grep", "glob", "git_log"],
 	});
 	await emit("triage-apply-decider-finished", {
@ -391,22 +582,31 @@ export async function runTriageApply(
 		};
 	}

-	// Parse the structured plan. If the decider didn't emit a valid plan
-	// (missing fenced block, malformed YAML, no decisions key), refuse to
-	// apply — we can't audit what we can't read.
-	const plan = parseTriagePlan(decider.output);
-	if (!plan || plan.length === 0) {
-		await emit("triage-apply-failed", { reason: "no-plan" });
+	// Parse the structured plan in strict mode. Refuses on any malformed
+	// item, missing completion marker, multiple yaml blocks, or mismatch
+	// against the expected candidate set (when supplied). Partial trust on
+	// a triage plan is worse than no apply at all.
+	const parseResult = parseTriagePlanStrict(
+		decider.output,
+		options.expectedIds,
+	);
+	if (!parseResult.plan) {
+		await emit("triage-apply-failed", {
+			reason: "no-plan",
+			parseError: parseResult.error,
+		});
 		return {
 			ok: false,
 			agreed: false,
-			error:
-				"triage-decider did not emit a parseable decision plan (expected a fenced ```yaml block with a `decisions:` key)",
+			error: `triage-decider plan rejected: ${
+				parseResult.error ?? "unknown parse error"
+			}`,
 			deciderOutput: decider.output,
 			resolvedIds: [],
 			flowId,
 		};
 	}
+	const plan = parseResult.plan;
 	await emit("triage-apply-plan-parsed", {
 		decisionCount: plan.length,
 		outcomes: plan.reduce<Record<string, number>>((acc, d) => {
@ -431,6 +631,7 @@ export async function runTriageApply(
 	].join("\n");
 	const review = await runner(rubberDuck, reviewTask, {
 		model: options.model,
+		cwd,
 		tools: ["view", "grep", "glob", "git_log", "query_journal"],
 	});
 	const agreed = /^rubber-duck:\s*agree\b/im.test(review.output.trim());
@ -470,21 +671,38 @@ export async function runTriageApply(
 		};
 	}

-	// Phase 3: apply the plan. We (this runner) call resolve_issue for
+	// Phase 3: apply the plan. We (this runner) call markResolved for
 	// each close/promote decision; fix decisions get surfaced for the
 	// operator but never auto-mutate. Mutations happen ONCE, post-review,
 	// and the resolvedIds we return reflect actual ledger state.
-	const resolvedIds = await applyTriagePlan(cwd, plan, emit);
+	const applyResult = await applyTriagePlan(cwd, plan, emit);
+	// Per-decision failure surfacing (codex review follow-up): if any
+	// approved close/promote failed to apply, runTriageApply reports
+	// ok=false. Operator sees both partial success (resolvedIds) and
+	// partial failure (rejectedIds) so they can investigate.
+	const approvedMutationCount = plan.filter((d) => d.outcome !== "fix").length;
+	const hasFailures = applyResult.rejectedIds.length > 0;
 	return {
-		ok: true,
+		ok: !hasFailures,
 		agreed: true,
+		error: hasFailures
+			? `${applyResult.rejectedIds.length} of ${approvedMutationCount} approved mutations failed: ${applyResult.rejectedIds.join(", ")}`
+			: undefined,
 		deciderOutput: decider.output,
 		reviewOutput: review.output,
-		resolvedIds,
+		resolvedIds: applyResult.resolvedIds,
+		rejectedIds: applyResult.rejectedIds,
+		pendingFixIds: applyResult.pendingFixIds,
 		flowId,
 	};
 }

+interface ApplyTriagePlanResult {
+	resolvedIds: string[];
+	rejectedIds: string[];
+	pendingFixIds: string[];
+}
+
 /**
 * Apply an approved decision plan. Calls markResolved (via the SF
 * extension's self-feedback writer, which runs the existing writer-
@ -492,14 +710,19 @@ export async function runTriageApply(
 * agent-fix, etc.) for each close/promote decision. Fix decisions are
 * not auto-applied; they require operator implementation work.
 *
- * Returns the list of ledger ids that were actually closed/promoted.
+ * Returns three lists: resolvedIds (actually mutated), rejectedIds
+ * (writer-layer refused OR markResolved threw), pendingFixIds (fix
+ * decisions surfaced for operator handoff). runTriageApply uses the
+ * rejectedIds count to decide ok=true vs ok=false.
 */
 async function applyTriagePlan(
 	cwd: string,
 	plan: TriageDecision[],
 	emit: (eventType: string, data?: Record<string, unknown>) => Promise<void>,
-): Promise<string[]> {
+): Promise<ApplyTriagePlanResult> {
 	const resolvedIds: string[] = [];
+	const rejectedIds: string[] = [];
+	const pendingFixIds: string[] = [];
 	const sfModule = (await jiti.import(sfExtensionPath("self-feedback"))) as {
 		markResolved?: (
 			entryId: string,
@ -511,12 +734,19 @@ async function applyTriagePlan(
 		await emit("triage-apply-mutation-failed", {
 			reason: "markResolved-unavailable",
 		});
-		return resolvedIds;
+		// Every approved close/promote becomes a rejection — can't mutate
+		// anything if the writer module isn't loadable.
+		for (const decision of plan) {
+			if (decision.outcome === "fix") pendingFixIds.push(decision.id);
+			else rejectedIds.push(decision.id);
+		}
+		return { resolvedIds, rejectedIds, pendingFixIds };
 	}
 	for (const decision of plan) {
 		if (decision.outcome === "fix") {
 			// Fix decisions are operator handoffs — surface in the report
 			// (via the caller's deciderOutput / decision plan), don't mutate.
+			pendingFixIds.push(decision.id);
 			await emit("triage-apply-fix-pending-operator", { id: decision.id });
 			continue;
 		}
@ -540,6 +770,7 @@ async function applyTriagePlan(
 					evidenceKind,
 				});
 			} else {
+				rejectedIds.push(decision.id);
 				await emit("triage-apply-mutation-rejected", {
 					id: decision.id,
 					outcome: decision.outcome,
@ -548,13 +779,14 @@ async function applyTriagePlan(
 				});
 			}
 		} catch (err) {
+			rejectedIds.push(decision.id);
 			await emit("triage-apply-mutation-failed", {
 				id: decision.id,
 				error: err instanceof Error ? err.message : String(err),
 			});
 		}
 	}
-	return resolvedIds;
+	return { resolvedIds, rejectedIds, pendingFixIds };
 }

 /**
@ -698,6 +930,7 @@ export async function handleTriage(
 			model: options.model,
 			agentRunner: options.agentRunner,
 			candidateCount: candidates.length,
+			expectedIds: candidates.map((candidate) => candidate.id),
 		});
 		const payload = {
 			ok: result.ok,
--- a/src/tests/headless-triage-apply.test.ts
+++ b/src/tests/headless-triage-apply.test.ts
@ -0,0 +1,171 @@
+import assert from "node:assert/strict";
+import { mkdirSync, mkdtempSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import { afterEach, beforeEach, test } from "vitest";
+
+import { runTriageApply } from "../headless-triage.js";
+
+const tempDirs: string[] = [];
+let originalAgentDir: string | undefined;
+const deciderPlan = [
+	"```yaml",
+	"decisions:",
+	"  - id: sf-test-1",
+	"    outcome: close",
+	"    evidence_kind: human-clear",
+	"    reason: stale",
+	"```",
+	"Self-feedback triage complete.",
+].join("\n");
+
+function makeProject(): string {
+	const dir = mkdtempSync(join(tmpdir(), "sf-headless-triage-"));
+	tempDirs.push(dir);
+	mkdirSync(join(dir, ".sf"), { recursive: true });
+	return dir;
+}
+
+afterEach(() => {
+	if (originalAgentDir === undefined) delete process.env.SF_CODING_AGENT_DIR;
+	else process.env.SF_CODING_AGENT_DIR = originalAgentDir;
+	while (tempDirs.length > 0) {
+		rmSync(tempDirs.pop()!, { recursive: true, force: true });
+	}
+});
+
+beforeEach(() => {
+	originalAgentDir = process.env.SF_CODING_AGENT_DIR;
+	const dir = mkdtempSync(join(tmpdir(), "sf-headless-agent-dir-"));
+	tempDirs.push(dir);
+	process.env.SF_CODING_AGENT_DIR = dir;
+});
+
+test("runTriageApply_when_rubber_duck_agrees_runs_both_agents_in_order", async () => {
+	const project = makeProject();
+	const calls: string[] = [];
+	const result = await runTriageApply(project, "triage prompt", {
+		candidateCount: 1,
+		allowUntrustedRunner: true,
+		agentRunner: async (agent) => {
+			calls.push(agent.name);
+			if (agent.name === "triage-decider") {
+				return {
+					ok: true,
+					output: deciderPlan,
+					exitCode: 0,
+				};
+			}
+			return { ok: true, output: "rubber-duck: agree", exitCode: 0 };
+		},
+	});
+
+	// Orchestration contract: rubber-duck agreed, so the apply phase ran.
+	// The actual mutation on sf-test-1 fails because the test project has
+	// no real ledger entry — that's the rejectedIds path, not a contract
+	// violation. ok=false on apply failures is itself the contract
+	// (codex review 2026-05-14 follow-up).
+	assert.equal(result.agreed, true, "rubber-duck must agree");
+	assert.deepEqual(
+		calls,
+		["triage-decider", "rubber-duck"],
+		"decider must run before rubber-duck",
+	);
+	assert.equal(result.ok, false, "apply should fail on missing ledger entry");
+	assert.deepEqual(result.rejectedIds, ["sf-test-1"]);
+	assert.deepEqual(result.resolvedIds, []);
+});
+
+test("runTriageApply_when_custom_runner_without_trust_flag_refuses", async () => {
+	// Defense-in-depth: an injected agentRunner can side-channel-mutate
+	// the ledger despite the read-only tool override. Only test paths
+	// that explicitly set allowUntrustedRunner can pass a custom runner.
+	const project = makeProject();
+	let calls = 0;
+	const result = await runTriageApply(project, "triage prompt", {
+		// NOTE: allowUntrustedRunner intentionally omitted.
+		agentRunner: async () => {
+			calls += 1;
+			return { ok: true, output: deciderPlan, exitCode: 0 };
+		},
+	});
+
+	assert.equal(result.ok, false);
+	assert.equal(
+		calls,
+		0,
+		"agentRunner must not be invoked when trust flag is absent",
+	);
+	assert.match(result.error ?? "", /allowUntrustedRunner|custom agentRunner/i);
+});
+
+test("runTriageApply_when_rubber_duck_disagrees_blocks_completion", async () => {
+	const project = makeProject();
+	const result = await runTriageApply(project, "triage prompt", {
+		allowUntrustedRunner: true,
+		agentRunner: async (agent) => {
+			if (agent.name === "triage-decider") {
+				return {
+					ok: true,
+					output: deciderPlan,
+					exitCode: 0,
+				};
+			}
+			return {
+				ok: true,
+				output: "## Concern 1:\nThe cited closure is not supported.",
+				exitCode: 0,
+			};
+		},
+	});
+
+	assert.equal(result.ok, false);
+	assert.equal(result.agreed, false);
+	assert.equal(
+		result.error,
+		"rubber-duck disagreed — pausing for operator review",
+	);
+});
+
+test("runTriageApply_when_decider_fails_skips_rubber_duck", async () => {
+	const project = makeProject();
+	const calls: string[] = [];
+	const result = await runTriageApply(project, "triage prompt", {
+		allowUntrustedRunner: true,
+		agentRunner: async (agent) => {
+			calls.push(agent.name);
+			return {
+				ok: false,
+				output: "",
+				stderr: "model failed",
+				exitCode: 1,
+			};
+		},
+	});
+
+	assert.equal(result.ok, false);
+	assert.equal(result.error, "model failed");
+	assert.deepEqual(calls, ["triage-decider"]);
+});
+
+test("runTriageApply_when_decider_outputs_unknown_id_rejects_plan_before_review", async () => {
+	const project = makeProject();
+	const calls: string[] = [];
+	const result = await runTriageApply(project, "triage prompt", {
+		candidateCount: 1,
+		expectedIds: ["sf-expected-1"],
+		allowUntrustedRunner: true,
+		agentRunner: async (agent) => {
+			calls.push(agent.name);
+			return {
+				ok: true,
+				output: deciderPlan,
+				exitCode: 0,
+			};
+		},
+	});
+
+	assert.equal(result.ok, false);
+	assert.match(result.error ?? "", /not in the candidate set/);
+	assert.deepEqual(calls, ["triage-decider"]);
+});