diff --git a/src/headless-triage.ts b/src/headless-triage.ts index ddffe0cad..d1bf8a01d 100644 --- a/src/headless-triage.ts +++ b/src/headless-triage.ts @@ -103,7 +103,7 @@ interface AgentRunResult { type AgentRunner = ( agent: AgentConfig, task: string, - options?: { tools?: string[]; model?: string }, + options?: { tools?: string[]; model?: string; cwd?: string }, ) => Promise; /** @@ -128,56 +128,219 @@ export interface TriageDecision { requirementId?: string; } -export function parseTriagePlan(text: string): TriageDecision[] | null { - if (typeof text !== "string" || text.length === 0) return null; - const fenceMatch = text.match(/```ya?ml\s*\n([\s\S]*?)\n```/i); - if (!fenceMatch) return null; - const yamlBody = fenceMatch[1]; +export interface ParseTriagePlanResult { + plan: TriageDecision[] | null; + error: string | null; +} + +const COMPLETION_MARKER = "Self-feedback triage complete"; + +/** + * Parse a decider output into a strict decision plan, or return a + * structured error explaining what's wrong. Codex review 2026-05-14 + * follow-up: refuse the whole plan if ANY item is malformed instead of + * silently dropping items — partial-trust on a triage plan is worse + * than no apply at all. + * + * Validates: + * 1. The completion marker is present (signals the decider finished). + * 2. A single fenced ```yaml block with key `decisions:` exists. + * 3. Every item has `id` (non-empty string) and `outcome` ∈ + * {fix, promote, close}. + * 4. Outcome-specific required fields: + * - close → reason (non-empty) + evidence_kind (defaults to + * human-clear if omitted, but if provided must be + * a non-empty string). + * - promote → reason + requirement_id (non-empty strings). + * - fix → proposed_approach (non-empty). + * 5. If `expectedIds` is supplied (the candidate set the decider was + * shown), every decision id must be in that set and every expected + * id must have exactly one decision — no extras, no missing. + */ +export function parseTriagePlanStrict( + text: string, + expectedIds?: string[], +): ParseTriagePlanResult { + if (typeof text !== "string" || text.length === 0) { + return { plan: null, error: "decider output was empty" }; + } + if (!text.includes(COMPLETION_MARKER)) { + return { + plan: null, + error: `decider output is missing the completion marker "${COMPLETION_MARKER}" — treating as incomplete`, + }; + } + const fenceMatches = Array.from( + text.matchAll(/```ya?ml\s*\n([\s\S]*?)\n```/gi), + ); + if (fenceMatches.length === 0) { + return { + plan: null, + error: "decider output has no fenced yaml block with the decision plan", + }; + } + if (fenceMatches.length > 1) { + return { + plan: null, + error: `decider output has ${fenceMatches.length} fenced yaml blocks — the contract is exactly one`, + }; + } + const yamlBody = fenceMatches[0][1]; let parsed: unknown; try { parsed = parseYaml(yamlBody); - } catch { - return null; + } catch (err) { + return { + plan: null, + error: `decision plan failed to parse as yaml: ${ + err instanceof Error ? err.message : String(err) + }`, + }; } const root = parsed as Record | null; const decisions = root?.decisions; - if (!Array.isArray(decisions)) return null; + if (!Array.isArray(decisions)) { + return { + plan: null, + error: "decision plan must have a top-level `decisions:` array", + }; + } + if (decisions.length === 0) { + return { + plan: null, + error: "decision plan has zero decisions — nothing to apply", + }; + } const out: TriageDecision[] = []; - for (const raw of decisions) { - if (!raw || typeof raw !== "object") continue; + const seenIds = new Set(); + for (let i = 0; i < decisions.length; i++) { + const raw = decisions[i]; + if (!raw || typeof raw !== "object" || Array.isArray(raw)) { + return { + plan: null, + error: `decisions[${i}] is not an object`, + }; + } const item = raw as Record; - const id = typeof item.id === "string" ? item.id : null; + const id = typeof item.id === "string" ? item.id.trim() : ""; + if (!id) { + return { + plan: null, + error: `decisions[${i}] is missing a non-empty id`, + }; + } + if (seenIds.has(id)) { + return { + plan: null, + error: `decisions[${i}] duplicates id "${id}" — each ledger entry must appear exactly once`, + }; + } + seenIds.add(id); const outcome = item.outcome; - if (!id) continue; - if (outcome !== "fix" && outcome !== "promote" && outcome !== "close") - continue; + if (outcome !== "fix" && outcome !== "promote" && outcome !== "close") { + return { + plan: null, + error: `decisions[${i}] (id=${id}) has invalid outcome "${outcome}" — expected fix|promote|close`, + }; + } const decision: TriageDecision = { id, outcome }; - if (typeof item.evidence_kind === "string") { - decision.evidenceKind = item.evidence_kind; - } else if (typeof item.evidenceKind === "string") { - decision.evidenceKind = item.evidenceKind; + const reason = typeof item.reason === "string" ? item.reason.trim() : ""; + const evidenceKindRaw = + typeof item.evidence_kind === "string" + ? item.evidence_kind + : typeof item.evidenceKind === "string" + ? (item.evidenceKind as string) + : ""; + if (evidenceKindRaw.trim()) decision.evidenceKind = evidenceKindRaw.trim(); + if (reason) decision.reason = reason; + const proposedApproachRaw = + typeof item.proposed_approach === "string" + ? item.proposed_approach + : typeof item.proposedApproach === "string" + ? (item.proposedApproach as string) + : ""; + if (proposedApproachRaw.trim()) { + decision.proposedApproach = proposedApproachRaw.trim(); } - if (typeof item.reason === "string") decision.reason = item.reason; - if (typeof item.proposed_approach === "string") { - decision.proposedApproach = item.proposed_approach; - } else if (typeof item.proposedApproach === "string") { - decision.proposedApproach = item.proposedApproach; + const requirementIdRaw = + typeof item.requirement_id === "string" + ? item.requirement_id + : typeof item.requirementId === "string" + ? (item.requirementId as string) + : ""; + if (requirementIdRaw.trim()) { + decision.requirementId = requirementIdRaw.trim(); } - if (typeof item.requirement_id === "string") { - decision.requirementId = item.requirement_id; - } else if (typeof item.requirementId === "string") { - decision.requirementId = item.requirementId; + + // Outcome-specific required fields. + if (outcome === "close" && !decision.reason) { + return { + plan: null, + error: `decisions[${i}] (id=${id}, outcome=close) is missing required field reason`, + }; + } + if (outcome === "promote") { + if (!decision.reason) { + return { + plan: null, + error: `decisions[${i}] (id=${id}, outcome=promote) is missing required field reason`, + }; + } + if (!decision.requirementId) { + return { + plan: null, + error: `decisions[${i}] (id=${id}, outcome=promote) is missing required field requirement_id`, + }; + } + } + if (outcome === "fix" && !decision.proposedApproach) { + return { + plan: null, + error: `decisions[${i}] (id=${id}, outcome=fix) is missing required field proposed_approach`, + }; } out.push(decision); } - return out; + + // Expected-id coverage check: if the caller knows which entries the + // decider was shown, ensure the plan covers them exactly. + if (Array.isArray(expectedIds) && expectedIds.length > 0) { + const expected = new Set(expectedIds); + for (const decision of out) { + if (!expected.has(decision.id)) { + return { + plan: null, + error: `decision id "${decision.id}" is not in the candidate set the decider was shown — possible hallucination`, + }; + } + } + for (const id of expected) { + if (!seenIds.has(id)) { + return { + plan: null, + error: `candidate id "${id}" has no decision in the plan — incomplete coverage`, + }; + } + } + } + + return { plan: out, error: null }; +} + +/** + * Backwards-compat wrapper for the lenient parser used by existing tests + * and the test fixtures. Returns just the plan, no error explanation. + * Production callers should use parseTriagePlanStrict. + */ +export function parseTriagePlan(text: string): TriageDecision[] | null { + return parseTriagePlanStrict(text).plan; } function buildSfPrintLaunchArgs( promptPath: string, task: string, agent: AgentConfig, - options: { tools?: string[]; model?: string } = {}, + options: { tools?: string[]; model?: string; cwd?: string } = {}, ): { command: string; args: string[] } { const sfBinPath = process.env.SF_BIN_PATH || process.argv[1]; const baseArgs = [ @@ -203,7 +366,7 @@ function buildSfPrintLaunchArgs( async function defaultAgentRunner( agent: AgentConfig, task: string, - options: { tools?: string[]; model?: string } = {}, + options: { tools?: string[]; model?: string; cwd?: string } = {}, ): Promise { const tmpDir = mkdtempSync(join(tmpdir(), "sf-triage-agent-")); const promptPath = join(tmpDir, `${agent.name}.md`); @@ -221,7 +384,7 @@ async function defaultAgentRunner( }; const composed = promptPartsModule.composeAgentPrompt?.(agent, { - cwd: process.cwd(), + cwd: options.cwd ?? process.cwd(), surface: "headless", tools: options.tools ?? agent.tools, }) ?? agent.systemPrompt; @@ -236,7 +399,7 @@ async function defaultAgentRunner( ); return await new Promise((resolve) => { const proc = spawn(launch.command, launch.args, { - cwd: process.cwd(), + cwd: options.cwd ?? process.cwd(), env: process.env, shell: false, stdio: ["ignore", "pipe", "pipe"], @@ -297,6 +460,18 @@ async function emitTriageApplyJournal( } } +export interface RunTriageApplyResult { + ok: boolean; + agreed: boolean; + error?: string; + deciderOutput?: string; + reviewOutput?: string; + resolvedIds: string[]; + rejectedIds?: string[]; + pendingFixIds?: string[]; + flowId: string; +} + export async function runTriageApply( cwd: string, prompt: string, @@ -304,16 +479,17 @@ export async function runTriageApply( model?: string; agentRunner?: AgentRunner; candidateCount?: number; + // Expected ledger ids the decider was shown. When supplied, the + // strict plan parser refuses any plan that adds new ids or omits + // expected ones. + expectedIds?: string[]; + // Test escape hatch. Production callers MUST NOT set this. Required + // to pass a custom agentRunner because an arbitrary runner could + // side-channel-mutate the ledger despite the read-only tool override + // the orchestrator enforces (codex review 2026-05-14 follow-up). + allowUntrustedRunner?: boolean; } = {}, -): Promise<{ - ok: boolean; - agreed: boolean; - error?: string; - deciderOutput?: string; - reviewOutput?: string; - resolvedIds: string[]; - flowId: string; -}> { +): Promise { const flowId = `triage-apply-${randomUUID()}`; let seq = 0; const emit = (eventType: string, data: Record = {}) => @@ -348,8 +524,8 @@ export async function runTriageApply( // Trusted-source guard (codex review 2026-05-14): when --apply will // mutate the ledger, BOTH agents must be SF-shipped built-ins. A // project-level override could silently disable rubber-duck's - // independence — operators can still customize behavior, but they - // have to use --review mode where mutations aren't auto-applied. + // independence. Operators can still customize behavior for inspect + // workflows, but --apply uses only the shipped review contract. if (triageDecider.source !== "builtin" || rubberDuck.source !== "builtin") { await emit("triage-apply-failed", { reason: "untrusted-agent-source", @@ -359,12 +535,26 @@ export async function runTriageApply( return { ok: false, agreed: false, - error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source}). Use --review for inspect-only mode, or remove the project/user override.`, + error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source}). Use \`sf headless triage --run\` for a reviewable decision artifact, or remove the project/user override.`, resolvedIds: [], flowId, }; } + // Custom-runner guard (codex review follow-up): an injected agentRunner + // can side-channel-mutate the ledger despite the read-only tool override. + // Only allow it when allowUntrustedRunner is explicitly set (test path). + if (options.agentRunner && !options.allowUntrustedRunner) { + await emit("triage-apply-failed", { reason: "untrusted-runner" }); + return { + ok: false, + agreed: false, + error: + "runTriageApply: a custom agentRunner was supplied without allowUntrustedRunner. Production callers cannot inject a runner — only tests can, via the explicit allowUntrustedRunner option.", + resolvedIds: [], + flowId, + }; + } const runner = options.agentRunner ?? defaultAgentRunner; // Phase 1: triage-decider runs in PLAN-ONLY mode. Drop resolve_issue @@ -373,6 +563,7 @@ export async function runTriageApply( // emits a YAML decision plan; we parse it post-hoc. const decider = await runner(triageDecider, prompt, { model: options.model, + cwd, tools: ["view", "grep", "glob", "git_log"], }); await emit("triage-apply-decider-finished", { @@ -391,22 +582,31 @@ export async function runTriageApply( }; } - // Parse the structured plan. If the decider didn't emit a valid plan - // (missing fenced block, malformed YAML, no decisions key), refuse to - // apply — we can't audit what we can't read. - const plan = parseTriagePlan(decider.output); - if (!plan || plan.length === 0) { - await emit("triage-apply-failed", { reason: "no-plan" }); + // Parse the structured plan in strict mode. Refuses on any malformed + // item, missing completion marker, multiple yaml blocks, or mismatch + // against the expected candidate set (when supplied). Partial trust on + // a triage plan is worse than no apply at all. + const parseResult = parseTriagePlanStrict( + decider.output, + options.expectedIds, + ); + if (!parseResult.plan) { + await emit("triage-apply-failed", { + reason: "no-plan", + parseError: parseResult.error, + }); return { ok: false, agreed: false, - error: - "triage-decider did not emit a parseable decision plan (expected a fenced ```yaml block with a `decisions:` key)", + error: `triage-decider plan rejected: ${ + parseResult.error ?? "unknown parse error" + }`, deciderOutput: decider.output, resolvedIds: [], flowId, }; } + const plan = parseResult.plan; await emit("triage-apply-plan-parsed", { decisionCount: plan.length, outcomes: plan.reduce>((acc, d) => { @@ -431,6 +631,7 @@ export async function runTriageApply( ].join("\n"); const review = await runner(rubberDuck, reviewTask, { model: options.model, + cwd, tools: ["view", "grep", "glob", "git_log", "query_journal"], }); const agreed = /^rubber-duck:\s*agree\b/im.test(review.output.trim()); @@ -470,21 +671,38 @@ export async function runTriageApply( }; } - // Phase 3: apply the plan. We (this runner) call resolve_issue for + // Phase 3: apply the plan. We (this runner) call markResolved for // each close/promote decision; fix decisions get surfaced for the // operator but never auto-mutate. Mutations happen ONCE, post-review, // and the resolvedIds we return reflect actual ledger state. - const resolvedIds = await applyTriagePlan(cwd, plan, emit); + const applyResult = await applyTriagePlan(cwd, plan, emit); + // Per-decision failure surfacing (codex review follow-up): if any + // approved close/promote failed to apply, runTriageApply reports + // ok=false. Operator sees both partial success (resolvedIds) and + // partial failure (rejectedIds) so they can investigate. + const approvedMutationCount = plan.filter((d) => d.outcome !== "fix").length; + const hasFailures = applyResult.rejectedIds.length > 0; return { - ok: true, + ok: !hasFailures, agreed: true, + error: hasFailures + ? `${applyResult.rejectedIds.length} of ${approvedMutationCount} approved mutations failed: ${applyResult.rejectedIds.join(", ")}` + : undefined, deciderOutput: decider.output, reviewOutput: review.output, - resolvedIds, + resolvedIds: applyResult.resolvedIds, + rejectedIds: applyResult.rejectedIds, + pendingFixIds: applyResult.pendingFixIds, flowId, }; } +interface ApplyTriagePlanResult { + resolvedIds: string[]; + rejectedIds: string[]; + pendingFixIds: string[]; +} + /** * Apply an approved decision plan. Calls markResolved (via the SF * extension's self-feedback writer, which runs the existing writer- @@ -492,14 +710,19 @@ export async function runTriageApply( * agent-fix, etc.) for each close/promote decision. Fix decisions are * not auto-applied; they require operator implementation work. * - * Returns the list of ledger ids that were actually closed/promoted. + * Returns three lists: resolvedIds (actually mutated), rejectedIds + * (writer-layer refused OR markResolved threw), pendingFixIds (fix + * decisions surfaced for operator handoff). runTriageApply uses the + * rejectedIds count to decide ok=true vs ok=false. */ async function applyTriagePlan( cwd: string, plan: TriageDecision[], emit: (eventType: string, data?: Record) => Promise, -): Promise { +): Promise { const resolvedIds: string[] = []; + const rejectedIds: string[] = []; + const pendingFixIds: string[] = []; const sfModule = (await jiti.import(sfExtensionPath("self-feedback"))) as { markResolved?: ( entryId: string, @@ -511,12 +734,19 @@ async function applyTriagePlan( await emit("triage-apply-mutation-failed", { reason: "markResolved-unavailable", }); - return resolvedIds; + // Every approved close/promote becomes a rejection — can't mutate + // anything if the writer module isn't loadable. + for (const decision of plan) { + if (decision.outcome === "fix") pendingFixIds.push(decision.id); + else rejectedIds.push(decision.id); + } + return { resolvedIds, rejectedIds, pendingFixIds }; } for (const decision of plan) { if (decision.outcome === "fix") { // Fix decisions are operator handoffs — surface in the report // (via the caller's deciderOutput / decision plan), don't mutate. + pendingFixIds.push(decision.id); await emit("triage-apply-fix-pending-operator", { id: decision.id }); continue; } @@ -540,6 +770,7 @@ async function applyTriagePlan( evidenceKind, }); } else { + rejectedIds.push(decision.id); await emit("triage-apply-mutation-rejected", { id: decision.id, outcome: decision.outcome, @@ -548,13 +779,14 @@ async function applyTriagePlan( }); } } catch (err) { + rejectedIds.push(decision.id); await emit("triage-apply-mutation-failed", { id: decision.id, error: err instanceof Error ? err.message : String(err), }); } } - return resolvedIds; + return { resolvedIds, rejectedIds, pendingFixIds }; } /** @@ -698,6 +930,7 @@ export async function handleTriage( model: options.model, agentRunner: options.agentRunner, candidateCount: candidates.length, + expectedIds: candidates.map((candidate) => candidate.id), }); const payload = { ok: result.ok, diff --git a/src/tests/headless-triage-apply.test.ts b/src/tests/headless-triage-apply.test.ts new file mode 100644 index 000000000..30531d7af --- /dev/null +++ b/src/tests/headless-triage-apply.test.ts @@ -0,0 +1,171 @@ +import assert from "node:assert/strict"; +import { mkdirSync, mkdtempSync, rmSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, beforeEach, test } from "vitest"; + +import { runTriageApply } from "../headless-triage.js"; + +const tempDirs: string[] = []; +let originalAgentDir: string | undefined; +const deciderPlan = [ + "```yaml", + "decisions:", + " - id: sf-test-1", + " outcome: close", + " evidence_kind: human-clear", + " reason: stale", + "```", + "Self-feedback triage complete.", +].join("\n"); + +function makeProject(): string { + const dir = mkdtempSync(join(tmpdir(), "sf-headless-triage-")); + tempDirs.push(dir); + mkdirSync(join(dir, ".sf"), { recursive: true }); + return dir; +} + +afterEach(() => { + if (originalAgentDir === undefined) delete process.env.SF_CODING_AGENT_DIR; + else process.env.SF_CODING_AGENT_DIR = originalAgentDir; + while (tempDirs.length > 0) { + rmSync(tempDirs.pop()!, { recursive: true, force: true }); + } +}); + +beforeEach(() => { + originalAgentDir = process.env.SF_CODING_AGENT_DIR; + const dir = mkdtempSync(join(tmpdir(), "sf-headless-agent-dir-")); + tempDirs.push(dir); + process.env.SF_CODING_AGENT_DIR = dir; +}); + +test("runTriageApply_when_rubber_duck_agrees_runs_both_agents_in_order", async () => { + const project = makeProject(); + const calls: string[] = []; + const result = await runTriageApply(project, "triage prompt", { + candidateCount: 1, + allowUntrustedRunner: true, + agentRunner: async (agent) => { + calls.push(agent.name); + if (agent.name === "triage-decider") { + return { + ok: true, + output: deciderPlan, + exitCode: 0, + }; + } + return { ok: true, output: "rubber-duck: agree", exitCode: 0 }; + }, + }); + + // Orchestration contract: rubber-duck agreed, so the apply phase ran. + // The actual mutation on sf-test-1 fails because the test project has + // no real ledger entry — that's the rejectedIds path, not a contract + // violation. ok=false on apply failures is itself the contract + // (codex review 2026-05-14 follow-up). + assert.equal(result.agreed, true, "rubber-duck must agree"); + assert.deepEqual( + calls, + ["triage-decider", "rubber-duck"], + "decider must run before rubber-duck", + ); + assert.equal(result.ok, false, "apply should fail on missing ledger entry"); + assert.deepEqual(result.rejectedIds, ["sf-test-1"]); + assert.deepEqual(result.resolvedIds, []); +}); + +test("runTriageApply_when_custom_runner_without_trust_flag_refuses", async () => { + // Defense-in-depth: an injected agentRunner can side-channel-mutate + // the ledger despite the read-only tool override. Only test paths + // that explicitly set allowUntrustedRunner can pass a custom runner. + const project = makeProject(); + let calls = 0; + const result = await runTriageApply(project, "triage prompt", { + // NOTE: allowUntrustedRunner intentionally omitted. + agentRunner: async () => { + calls += 1; + return { ok: true, output: deciderPlan, exitCode: 0 }; + }, + }); + + assert.equal(result.ok, false); + assert.equal( + calls, + 0, + "agentRunner must not be invoked when trust flag is absent", + ); + assert.match(result.error ?? "", /allowUntrustedRunner|custom agentRunner/i); +}); + +test("runTriageApply_when_rubber_duck_disagrees_blocks_completion", async () => { + const project = makeProject(); + const result = await runTriageApply(project, "triage prompt", { + allowUntrustedRunner: true, + agentRunner: async (agent) => { + if (agent.name === "triage-decider") { + return { + ok: true, + output: deciderPlan, + exitCode: 0, + }; + } + return { + ok: true, + output: "## Concern 1:\nThe cited closure is not supported.", + exitCode: 0, + }; + }, + }); + + assert.equal(result.ok, false); + assert.equal(result.agreed, false); + assert.equal( + result.error, + "rubber-duck disagreed — pausing for operator review", + ); +}); + +test("runTriageApply_when_decider_fails_skips_rubber_duck", async () => { + const project = makeProject(); + const calls: string[] = []; + const result = await runTriageApply(project, "triage prompt", { + allowUntrustedRunner: true, + agentRunner: async (agent) => { + calls.push(agent.name); + return { + ok: false, + output: "", + stderr: "model failed", + exitCode: 1, + }; + }, + }); + + assert.equal(result.ok, false); + assert.equal(result.error, "model failed"); + assert.deepEqual(calls, ["triage-decider"]); +}); + +test("runTriageApply_when_decider_outputs_unknown_id_rejects_plan_before_review", async () => { + const project = makeProject(); + const calls: string[] = []; + const result = await runTriageApply(project, "triage prompt", { + candidateCount: 1, + expectedIds: ["sf-expected-1"], + allowUntrustedRunner: true, + agentRunner: async (agent) => { + calls.push(agent.name); + return { + ok: true, + output: deciderPlan, + exitCode: 0, + }; + }, + }); + + assert.equal(result.ok, false); + assert.match(result.error ?? "", /not in the candidate set/); + assert.deepEqual(calls, ["triage-decider"]); +});