fix(triage-apply): strict plan validation + custom-runner guard + per-decision failures
Codex review follow-up (2026-05-14) addressed all three remaining
issues from the earlier rescue pass:
1. Strict plan validation. parseTriagePlanStrict refuses the WHOLE
plan on any malformed item instead of silently dropping. Enforces:
- completion marker "Self-feedback triage complete" present
- exactly one fenced ```yaml block
- every decision has non-empty id + outcome ∈ {fix, promote, close}
- outcome-specific required fields (close → reason; promote →
reason + requirement_id; fix → proposed_approach)
- duplicate ids rejected
- when expectedIds is supplied, decisions must cover the candidate
set exactly — no extras (hallucinated ids), no missing
Returns ParseTriagePlanResult with {plan, error} so the caller can
surface the specific failure reason.
2. Custom-runner trust guard. runTriageApply refuses an injected
options.agentRunner unless allowUntrustedRunner is also explicitly
set. Production callers cannot inject a runner. Without this guard
a custom runner could side-channel-mutate the ledger despite the
read-only tool override (codex Q2).
3. Per-decision failure surfacing. applyTriagePlan now returns
{resolvedIds, rejectedIds, pendingFixIds} instead of just
resolvedIds. runTriageApply reports ok=false if rejectedIds is
non-empty, with the count + ids in the error message. Mutations
still happen one-by-one (no SQL transaction wrapping) but the
failure is no longer silent (codex Q3).
Tests: src/tests/headless-triage-apply.test.ts now covers:
- agree-path runs both agents in order; apply fails on missing
ledger entry → ok=false, rejectedIds populated (the realistic
contract for a test fixture without a seeded DB)
- custom runner without allowUntrustedRunner refuses, agentRunner
never invoked
- rubber-duck disagrees → clean pause, ok=false, agreed=false
- decider fails → skip rubber-duck
- unknown id in plan rejected before review
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
d8ce433c7a
commit
289bf9e264
2 changed files with 465 additions and 61 deletions
|
|
@ -103,7 +103,7 @@ interface AgentRunResult {
|
|||
type AgentRunner = (
|
||||
agent: AgentConfig,
|
||||
task: string,
|
||||
options?: { tools?: string[]; model?: string },
|
||||
options?: { tools?: string[]; model?: string; cwd?: string },
|
||||
) => Promise<AgentRunResult>;
|
||||
|
||||
/**
|
||||
|
|
@ -128,56 +128,219 @@ export interface TriageDecision {
|
|||
requirementId?: string;
|
||||
}
|
||||
|
||||
export function parseTriagePlan(text: string): TriageDecision[] | null {
|
||||
if (typeof text !== "string" || text.length === 0) return null;
|
||||
const fenceMatch = text.match(/```ya?ml\s*\n([\s\S]*?)\n```/i);
|
||||
if (!fenceMatch) return null;
|
||||
const yamlBody = fenceMatch[1];
|
||||
export interface ParseTriagePlanResult {
|
||||
plan: TriageDecision[] | null;
|
||||
error: string | null;
|
||||
}
|
||||
|
||||
const COMPLETION_MARKER = "Self-feedback triage complete";
|
||||
|
||||
/**
|
||||
* Parse a decider output into a strict decision plan, or return a
|
||||
* structured error explaining what's wrong. Codex review 2026-05-14
|
||||
* follow-up: refuse the whole plan if ANY item is malformed instead of
|
||||
* silently dropping items — partial-trust on a triage plan is worse
|
||||
* than no apply at all.
|
||||
*
|
||||
* Validates:
|
||||
* 1. The completion marker is present (signals the decider finished).
|
||||
* 2. A single fenced ```yaml block with key `decisions:` exists.
|
||||
* 3. Every item has `id` (non-empty string) and `outcome` ∈
|
||||
* {fix, promote, close}.
|
||||
* 4. Outcome-specific required fields:
|
||||
* - close → reason (non-empty) + evidence_kind (defaults to
|
||||
* human-clear if omitted, but if provided must be
|
||||
* a non-empty string).
|
||||
* - promote → reason + requirement_id (non-empty strings).
|
||||
* - fix → proposed_approach (non-empty).
|
||||
* 5. If `expectedIds` is supplied (the candidate set the decider was
|
||||
* shown), every decision id must be in that set and every expected
|
||||
* id must have exactly one decision — no extras, no missing.
|
||||
*/
|
||||
export function parseTriagePlanStrict(
|
||||
text: string,
|
||||
expectedIds?: string[],
|
||||
): ParseTriagePlanResult {
|
||||
if (typeof text !== "string" || text.length === 0) {
|
||||
return { plan: null, error: "decider output was empty" };
|
||||
}
|
||||
if (!text.includes(COMPLETION_MARKER)) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decider output is missing the completion marker "${COMPLETION_MARKER}" — treating as incomplete`,
|
||||
};
|
||||
}
|
||||
const fenceMatches = Array.from(
|
||||
text.matchAll(/```ya?ml\s*\n([\s\S]*?)\n```/gi),
|
||||
);
|
||||
if (fenceMatches.length === 0) {
|
||||
return {
|
||||
plan: null,
|
||||
error: "decider output has no fenced yaml block with the decision plan",
|
||||
};
|
||||
}
|
||||
if (fenceMatches.length > 1) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decider output has ${fenceMatches.length} fenced yaml blocks — the contract is exactly one`,
|
||||
};
|
||||
}
|
||||
const yamlBody = fenceMatches[0][1];
|
||||
let parsed: unknown;
|
||||
try {
|
||||
parsed = parseYaml(yamlBody);
|
||||
} catch {
|
||||
return null;
|
||||
} catch (err) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decision plan failed to parse as yaml: ${
|
||||
err instanceof Error ? err.message : String(err)
|
||||
}`,
|
||||
};
|
||||
}
|
||||
const root = parsed as Record<string, unknown> | null;
|
||||
const decisions = root?.decisions;
|
||||
if (!Array.isArray(decisions)) return null;
|
||||
if (!Array.isArray(decisions)) {
|
||||
return {
|
||||
plan: null,
|
||||
error: "decision plan must have a top-level `decisions:` array",
|
||||
};
|
||||
}
|
||||
if (decisions.length === 0) {
|
||||
return {
|
||||
plan: null,
|
||||
error: "decision plan has zero decisions — nothing to apply",
|
||||
};
|
||||
}
|
||||
const out: TriageDecision[] = [];
|
||||
for (const raw of decisions) {
|
||||
if (!raw || typeof raw !== "object") continue;
|
||||
const seenIds = new Set<string>();
|
||||
for (let i = 0; i < decisions.length; i++) {
|
||||
const raw = decisions[i];
|
||||
if (!raw || typeof raw !== "object" || Array.isArray(raw)) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decisions[${i}] is not an object`,
|
||||
};
|
||||
}
|
||||
const item = raw as Record<string, unknown>;
|
||||
const id = typeof item.id === "string" ? item.id : null;
|
||||
const id = typeof item.id === "string" ? item.id.trim() : "";
|
||||
if (!id) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decisions[${i}] is missing a non-empty id`,
|
||||
};
|
||||
}
|
||||
if (seenIds.has(id)) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decisions[${i}] duplicates id "${id}" — each ledger entry must appear exactly once`,
|
||||
};
|
||||
}
|
||||
seenIds.add(id);
|
||||
const outcome = item.outcome;
|
||||
if (!id) continue;
|
||||
if (outcome !== "fix" && outcome !== "promote" && outcome !== "close")
|
||||
continue;
|
||||
if (outcome !== "fix" && outcome !== "promote" && outcome !== "close") {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decisions[${i}] (id=${id}) has invalid outcome "${outcome}" — expected fix|promote|close`,
|
||||
};
|
||||
}
|
||||
const decision: TriageDecision = { id, outcome };
|
||||
if (typeof item.evidence_kind === "string") {
|
||||
decision.evidenceKind = item.evidence_kind;
|
||||
} else if (typeof item.evidenceKind === "string") {
|
||||
decision.evidenceKind = item.evidenceKind;
|
||||
const reason = typeof item.reason === "string" ? item.reason.trim() : "";
|
||||
const evidenceKindRaw =
|
||||
typeof item.evidence_kind === "string"
|
||||
? item.evidence_kind
|
||||
: typeof item.evidenceKind === "string"
|
||||
? (item.evidenceKind as string)
|
||||
: "";
|
||||
if (evidenceKindRaw.trim()) decision.evidenceKind = evidenceKindRaw.trim();
|
||||
if (reason) decision.reason = reason;
|
||||
const proposedApproachRaw =
|
||||
typeof item.proposed_approach === "string"
|
||||
? item.proposed_approach
|
||||
: typeof item.proposedApproach === "string"
|
||||
? (item.proposedApproach as string)
|
||||
: "";
|
||||
if (proposedApproachRaw.trim()) {
|
||||
decision.proposedApproach = proposedApproachRaw.trim();
|
||||
}
|
||||
if (typeof item.reason === "string") decision.reason = item.reason;
|
||||
if (typeof item.proposed_approach === "string") {
|
||||
decision.proposedApproach = item.proposed_approach;
|
||||
} else if (typeof item.proposedApproach === "string") {
|
||||
decision.proposedApproach = item.proposedApproach;
|
||||
const requirementIdRaw =
|
||||
typeof item.requirement_id === "string"
|
||||
? item.requirement_id
|
||||
: typeof item.requirementId === "string"
|
||||
? (item.requirementId as string)
|
||||
: "";
|
||||
if (requirementIdRaw.trim()) {
|
||||
decision.requirementId = requirementIdRaw.trim();
|
||||
}
|
||||
if (typeof item.requirement_id === "string") {
|
||||
decision.requirementId = item.requirement_id;
|
||||
} else if (typeof item.requirementId === "string") {
|
||||
decision.requirementId = item.requirementId;
|
||||
|
||||
// Outcome-specific required fields.
|
||||
if (outcome === "close" && !decision.reason) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decisions[${i}] (id=${id}, outcome=close) is missing required field reason`,
|
||||
};
|
||||
}
|
||||
if (outcome === "promote") {
|
||||
if (!decision.reason) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decisions[${i}] (id=${id}, outcome=promote) is missing required field reason`,
|
||||
};
|
||||
}
|
||||
if (!decision.requirementId) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decisions[${i}] (id=${id}, outcome=promote) is missing required field requirement_id`,
|
||||
};
|
||||
}
|
||||
}
|
||||
if (outcome === "fix" && !decision.proposedApproach) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decisions[${i}] (id=${id}, outcome=fix) is missing required field proposed_approach`,
|
||||
};
|
||||
}
|
||||
out.push(decision);
|
||||
}
|
||||
return out;
|
||||
|
||||
// Expected-id coverage check: if the caller knows which entries the
|
||||
// decider was shown, ensure the plan covers them exactly.
|
||||
if (Array.isArray(expectedIds) && expectedIds.length > 0) {
|
||||
const expected = new Set(expectedIds);
|
||||
for (const decision of out) {
|
||||
if (!expected.has(decision.id)) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `decision id "${decision.id}" is not in the candidate set the decider was shown — possible hallucination`,
|
||||
};
|
||||
}
|
||||
}
|
||||
for (const id of expected) {
|
||||
if (!seenIds.has(id)) {
|
||||
return {
|
||||
plan: null,
|
||||
error: `candidate id "${id}" has no decision in the plan — incomplete coverage`,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return { plan: out, error: null };
|
||||
}
|
||||
|
||||
/**
|
||||
* Backwards-compat wrapper for the lenient parser used by existing tests
|
||||
* and the test fixtures. Returns just the plan, no error explanation.
|
||||
* Production callers should use parseTriagePlanStrict.
|
||||
*/
|
||||
export function parseTriagePlan(text: string): TriageDecision[] | null {
|
||||
return parseTriagePlanStrict(text).plan;
|
||||
}
|
||||
|
||||
function buildSfPrintLaunchArgs(
|
||||
promptPath: string,
|
||||
task: string,
|
||||
agent: AgentConfig,
|
||||
options: { tools?: string[]; model?: string } = {},
|
||||
options: { tools?: string[]; model?: string; cwd?: string } = {},
|
||||
): { command: string; args: string[] } {
|
||||
const sfBinPath = process.env.SF_BIN_PATH || process.argv[1];
|
||||
const baseArgs = [
|
||||
|
|
@ -203,7 +366,7 @@ function buildSfPrintLaunchArgs(
|
|||
async function defaultAgentRunner(
|
||||
agent: AgentConfig,
|
||||
task: string,
|
||||
options: { tools?: string[]; model?: string } = {},
|
||||
options: { tools?: string[]; model?: string; cwd?: string } = {},
|
||||
): Promise<AgentRunResult> {
|
||||
const tmpDir = mkdtempSync(join(tmpdir(), "sf-triage-agent-"));
|
||||
const promptPath = join(tmpDir, `${agent.name}.md`);
|
||||
|
|
@ -221,7 +384,7 @@ async function defaultAgentRunner(
|
|||
};
|
||||
const composed =
|
||||
promptPartsModule.composeAgentPrompt?.(agent, {
|
||||
cwd: process.cwd(),
|
||||
cwd: options.cwd ?? process.cwd(),
|
||||
surface: "headless",
|
||||
tools: options.tools ?? agent.tools,
|
||||
}) ?? agent.systemPrompt;
|
||||
|
|
@ -236,7 +399,7 @@ async function defaultAgentRunner(
|
|||
);
|
||||
return await new Promise<AgentRunResult>((resolve) => {
|
||||
const proc = spawn(launch.command, launch.args, {
|
||||
cwd: process.cwd(),
|
||||
cwd: options.cwd ?? process.cwd(),
|
||||
env: process.env,
|
||||
shell: false,
|
||||
stdio: ["ignore", "pipe", "pipe"],
|
||||
|
|
@ -297,6 +460,18 @@ async function emitTriageApplyJournal(
|
|||
}
|
||||
}
|
||||
|
||||
export interface RunTriageApplyResult {
|
||||
ok: boolean;
|
||||
agreed: boolean;
|
||||
error?: string;
|
||||
deciderOutput?: string;
|
||||
reviewOutput?: string;
|
||||
resolvedIds: string[];
|
||||
rejectedIds?: string[];
|
||||
pendingFixIds?: string[];
|
||||
flowId: string;
|
||||
}
|
||||
|
||||
export async function runTriageApply(
|
||||
cwd: string,
|
||||
prompt: string,
|
||||
|
|
@ -304,16 +479,17 @@ export async function runTriageApply(
|
|||
model?: string;
|
||||
agentRunner?: AgentRunner;
|
||||
candidateCount?: number;
|
||||
// Expected ledger ids the decider was shown. When supplied, the
|
||||
// strict plan parser refuses any plan that adds new ids or omits
|
||||
// expected ones.
|
||||
expectedIds?: string[];
|
||||
// Test escape hatch. Production callers MUST NOT set this. Required
|
||||
// to pass a custom agentRunner because an arbitrary runner could
|
||||
// side-channel-mutate the ledger despite the read-only tool override
|
||||
// the orchestrator enforces (codex review 2026-05-14 follow-up).
|
||||
allowUntrustedRunner?: boolean;
|
||||
} = {},
|
||||
): Promise<{
|
||||
ok: boolean;
|
||||
agreed: boolean;
|
||||
error?: string;
|
||||
deciderOutput?: string;
|
||||
reviewOutput?: string;
|
||||
resolvedIds: string[];
|
||||
flowId: string;
|
||||
}> {
|
||||
): Promise<RunTriageApplyResult> {
|
||||
const flowId = `triage-apply-${randomUUID()}`;
|
||||
let seq = 0;
|
||||
const emit = (eventType: string, data: Record<string, unknown> = {}) =>
|
||||
|
|
@ -348,8 +524,8 @@ export async function runTriageApply(
|
|||
// Trusted-source guard (codex review 2026-05-14): when --apply will
|
||||
// mutate the ledger, BOTH agents must be SF-shipped built-ins. A
|
||||
// project-level override could silently disable rubber-duck's
|
||||
// independence — operators can still customize behavior, but they
|
||||
// have to use --review mode where mutations aren't auto-applied.
|
||||
// independence. Operators can still customize behavior for inspect
|
||||
// workflows, but --apply uses only the shipped review contract.
|
||||
if (triageDecider.source !== "builtin" || rubberDuck.source !== "builtin") {
|
||||
await emit("triage-apply-failed", {
|
||||
reason: "untrusted-agent-source",
|
||||
|
|
@ -359,12 +535,26 @@ export async function runTriageApply(
|
|||
return {
|
||||
ok: false,
|
||||
agreed: false,
|
||||
error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source}). Use --review for inspect-only mode, or remove the project/user override.`,
|
||||
error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source}). Use \`sf headless triage --run\` for a reviewable decision artifact, or remove the project/user override.`,
|
||||
resolvedIds: [],
|
||||
flowId,
|
||||
};
|
||||
}
|
||||
|
||||
// Custom-runner guard (codex review follow-up): an injected agentRunner
|
||||
// can side-channel-mutate the ledger despite the read-only tool override.
|
||||
// Only allow it when allowUntrustedRunner is explicitly set (test path).
|
||||
if (options.agentRunner && !options.allowUntrustedRunner) {
|
||||
await emit("triage-apply-failed", { reason: "untrusted-runner" });
|
||||
return {
|
||||
ok: false,
|
||||
agreed: false,
|
||||
error:
|
||||
"runTriageApply: a custom agentRunner was supplied without allowUntrustedRunner. Production callers cannot inject a runner — only tests can, via the explicit allowUntrustedRunner option.",
|
||||
resolvedIds: [],
|
||||
flowId,
|
||||
};
|
||||
}
|
||||
const runner = options.agentRunner ?? defaultAgentRunner;
|
||||
|
||||
// Phase 1: triage-decider runs in PLAN-ONLY mode. Drop resolve_issue
|
||||
|
|
@ -373,6 +563,7 @@ export async function runTriageApply(
|
|||
// emits a YAML decision plan; we parse it post-hoc.
|
||||
const decider = await runner(triageDecider, prompt, {
|
||||
model: options.model,
|
||||
cwd,
|
||||
tools: ["view", "grep", "glob", "git_log"],
|
||||
});
|
||||
await emit("triage-apply-decider-finished", {
|
||||
|
|
@ -391,22 +582,31 @@ export async function runTriageApply(
|
|||
};
|
||||
}
|
||||
|
||||
// Parse the structured plan. If the decider didn't emit a valid plan
|
||||
// (missing fenced block, malformed YAML, no decisions key), refuse to
|
||||
// apply — we can't audit what we can't read.
|
||||
const plan = parseTriagePlan(decider.output);
|
||||
if (!plan || plan.length === 0) {
|
||||
await emit("triage-apply-failed", { reason: "no-plan" });
|
||||
// Parse the structured plan in strict mode. Refuses on any malformed
|
||||
// item, missing completion marker, multiple yaml blocks, or mismatch
|
||||
// against the expected candidate set (when supplied). Partial trust on
|
||||
// a triage plan is worse than no apply at all.
|
||||
const parseResult = parseTriagePlanStrict(
|
||||
decider.output,
|
||||
options.expectedIds,
|
||||
);
|
||||
if (!parseResult.plan) {
|
||||
await emit("triage-apply-failed", {
|
||||
reason: "no-plan",
|
||||
parseError: parseResult.error,
|
||||
});
|
||||
return {
|
||||
ok: false,
|
||||
agreed: false,
|
||||
error:
|
||||
"triage-decider did not emit a parseable decision plan (expected a fenced ```yaml block with a `decisions:` key)",
|
||||
error: `triage-decider plan rejected: ${
|
||||
parseResult.error ?? "unknown parse error"
|
||||
}`,
|
||||
deciderOutput: decider.output,
|
||||
resolvedIds: [],
|
||||
flowId,
|
||||
};
|
||||
}
|
||||
const plan = parseResult.plan;
|
||||
await emit("triage-apply-plan-parsed", {
|
||||
decisionCount: plan.length,
|
||||
outcomes: plan.reduce<Record<string, number>>((acc, d) => {
|
||||
|
|
@ -431,6 +631,7 @@ export async function runTriageApply(
|
|||
].join("\n");
|
||||
const review = await runner(rubberDuck, reviewTask, {
|
||||
model: options.model,
|
||||
cwd,
|
||||
tools: ["view", "grep", "glob", "git_log", "query_journal"],
|
||||
});
|
||||
const agreed = /^rubber-duck:\s*agree\b/im.test(review.output.trim());
|
||||
|
|
@ -470,21 +671,38 @@ export async function runTriageApply(
|
|||
};
|
||||
}
|
||||
|
||||
// Phase 3: apply the plan. We (this runner) call resolve_issue for
|
||||
// Phase 3: apply the plan. We (this runner) call markResolved for
|
||||
// each close/promote decision; fix decisions get surfaced for the
|
||||
// operator but never auto-mutate. Mutations happen ONCE, post-review,
|
||||
// and the resolvedIds we return reflect actual ledger state.
|
||||
const resolvedIds = await applyTriagePlan(cwd, plan, emit);
|
||||
const applyResult = await applyTriagePlan(cwd, plan, emit);
|
||||
// Per-decision failure surfacing (codex review follow-up): if any
|
||||
// approved close/promote failed to apply, runTriageApply reports
|
||||
// ok=false. Operator sees both partial success (resolvedIds) and
|
||||
// partial failure (rejectedIds) so they can investigate.
|
||||
const approvedMutationCount = plan.filter((d) => d.outcome !== "fix").length;
|
||||
const hasFailures = applyResult.rejectedIds.length > 0;
|
||||
return {
|
||||
ok: true,
|
||||
ok: !hasFailures,
|
||||
agreed: true,
|
||||
error: hasFailures
|
||||
? `${applyResult.rejectedIds.length} of ${approvedMutationCount} approved mutations failed: ${applyResult.rejectedIds.join(", ")}`
|
||||
: undefined,
|
||||
deciderOutput: decider.output,
|
||||
reviewOutput: review.output,
|
||||
resolvedIds,
|
||||
resolvedIds: applyResult.resolvedIds,
|
||||
rejectedIds: applyResult.rejectedIds,
|
||||
pendingFixIds: applyResult.pendingFixIds,
|
||||
flowId,
|
||||
};
|
||||
}
|
||||
|
||||
interface ApplyTriagePlanResult {
|
||||
resolvedIds: string[];
|
||||
rejectedIds: string[];
|
||||
pendingFixIds: string[];
|
||||
}
|
||||
|
||||
/**
|
||||
* Apply an approved decision plan. Calls markResolved (via the SF
|
||||
* extension's self-feedback writer, which runs the existing writer-
|
||||
|
|
@ -492,14 +710,19 @@ export async function runTriageApply(
|
|||
* agent-fix, etc.) for each close/promote decision. Fix decisions are
|
||||
* not auto-applied; they require operator implementation work.
|
||||
*
|
||||
* Returns the list of ledger ids that were actually closed/promoted.
|
||||
* Returns three lists: resolvedIds (actually mutated), rejectedIds
|
||||
* (writer-layer refused OR markResolved threw), pendingFixIds (fix
|
||||
* decisions surfaced for operator handoff). runTriageApply uses the
|
||||
* rejectedIds count to decide ok=true vs ok=false.
|
||||
*/
|
||||
async function applyTriagePlan(
|
||||
cwd: string,
|
||||
plan: TriageDecision[],
|
||||
emit: (eventType: string, data?: Record<string, unknown>) => Promise<void>,
|
||||
): Promise<string[]> {
|
||||
): Promise<ApplyTriagePlanResult> {
|
||||
const resolvedIds: string[] = [];
|
||||
const rejectedIds: string[] = [];
|
||||
const pendingFixIds: string[] = [];
|
||||
const sfModule = (await jiti.import(sfExtensionPath("self-feedback"))) as {
|
||||
markResolved?: (
|
||||
entryId: string,
|
||||
|
|
@ -511,12 +734,19 @@ async function applyTriagePlan(
|
|||
await emit("triage-apply-mutation-failed", {
|
||||
reason: "markResolved-unavailable",
|
||||
});
|
||||
return resolvedIds;
|
||||
// Every approved close/promote becomes a rejection — can't mutate
|
||||
// anything if the writer module isn't loadable.
|
||||
for (const decision of plan) {
|
||||
if (decision.outcome === "fix") pendingFixIds.push(decision.id);
|
||||
else rejectedIds.push(decision.id);
|
||||
}
|
||||
return { resolvedIds, rejectedIds, pendingFixIds };
|
||||
}
|
||||
for (const decision of plan) {
|
||||
if (decision.outcome === "fix") {
|
||||
// Fix decisions are operator handoffs — surface in the report
|
||||
// (via the caller's deciderOutput / decision plan), don't mutate.
|
||||
pendingFixIds.push(decision.id);
|
||||
await emit("triage-apply-fix-pending-operator", { id: decision.id });
|
||||
continue;
|
||||
}
|
||||
|
|
@ -540,6 +770,7 @@ async function applyTriagePlan(
|
|||
evidenceKind,
|
||||
});
|
||||
} else {
|
||||
rejectedIds.push(decision.id);
|
||||
await emit("triage-apply-mutation-rejected", {
|
||||
id: decision.id,
|
||||
outcome: decision.outcome,
|
||||
|
|
@ -548,13 +779,14 @@ async function applyTriagePlan(
|
|||
});
|
||||
}
|
||||
} catch (err) {
|
||||
rejectedIds.push(decision.id);
|
||||
await emit("triage-apply-mutation-failed", {
|
||||
id: decision.id,
|
||||
error: err instanceof Error ? err.message : String(err),
|
||||
});
|
||||
}
|
||||
}
|
||||
return resolvedIds;
|
||||
return { resolvedIds, rejectedIds, pendingFixIds };
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@ -698,6 +930,7 @@ export async function handleTriage(
|
|||
model: options.model,
|
||||
agentRunner: options.agentRunner,
|
||||
candidateCount: candidates.length,
|
||||
expectedIds: candidates.map((candidate) => candidate.id),
|
||||
});
|
||||
const payload = {
|
||||
ok: result.ok,
|
||||
|
|
|
|||
171
src/tests/headless-triage-apply.test.ts
Normal file
171
src/tests/headless-triage-apply.test.ts
Normal file
|
|
@ -0,0 +1,171 @@
|
|||
import assert from "node:assert/strict";
|
||||
import { mkdirSync, mkdtempSync, rmSync } from "node:fs";
|
||||
import { tmpdir } from "node:os";
|
||||
import { join } from "node:path";
|
||||
import { afterEach, beforeEach, test } from "vitest";
|
||||
|
||||
import { runTriageApply } from "../headless-triage.js";
|
||||
|
||||
const tempDirs: string[] = [];
|
||||
let originalAgentDir: string | undefined;
|
||||
const deciderPlan = [
|
||||
"```yaml",
|
||||
"decisions:",
|
||||
" - id: sf-test-1",
|
||||
" outcome: close",
|
||||
" evidence_kind: human-clear",
|
||||
" reason: stale",
|
||||
"```",
|
||||
"Self-feedback triage complete.",
|
||||
].join("\n");
|
||||
|
||||
function makeProject(): string {
|
||||
const dir = mkdtempSync(join(tmpdir(), "sf-headless-triage-"));
|
||||
tempDirs.push(dir);
|
||||
mkdirSync(join(dir, ".sf"), { recursive: true });
|
||||
return dir;
|
||||
}
|
||||
|
||||
afterEach(() => {
|
||||
if (originalAgentDir === undefined) delete process.env.SF_CODING_AGENT_DIR;
|
||||
else process.env.SF_CODING_AGENT_DIR = originalAgentDir;
|
||||
while (tempDirs.length > 0) {
|
||||
rmSync(tempDirs.pop()!, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
beforeEach(() => {
|
||||
originalAgentDir = process.env.SF_CODING_AGENT_DIR;
|
||||
const dir = mkdtempSync(join(tmpdir(), "sf-headless-agent-dir-"));
|
||||
tempDirs.push(dir);
|
||||
process.env.SF_CODING_AGENT_DIR = dir;
|
||||
});
|
||||
|
||||
test("runTriageApply_when_rubber_duck_agrees_runs_both_agents_in_order", async () => {
|
||||
const project = makeProject();
|
||||
const calls: string[] = [];
|
||||
const result = await runTriageApply(project, "triage prompt", {
|
||||
candidateCount: 1,
|
||||
allowUntrustedRunner: true,
|
||||
agentRunner: async (agent) => {
|
||||
calls.push(agent.name);
|
||||
if (agent.name === "triage-decider") {
|
||||
return {
|
||||
ok: true,
|
||||
output: deciderPlan,
|
||||
exitCode: 0,
|
||||
};
|
||||
}
|
||||
return { ok: true, output: "rubber-duck: agree", exitCode: 0 };
|
||||
},
|
||||
});
|
||||
|
||||
// Orchestration contract: rubber-duck agreed, so the apply phase ran.
|
||||
// The actual mutation on sf-test-1 fails because the test project has
|
||||
// no real ledger entry — that's the rejectedIds path, not a contract
|
||||
// violation. ok=false on apply failures is itself the contract
|
||||
// (codex review 2026-05-14 follow-up).
|
||||
assert.equal(result.agreed, true, "rubber-duck must agree");
|
||||
assert.deepEqual(
|
||||
calls,
|
||||
["triage-decider", "rubber-duck"],
|
||||
"decider must run before rubber-duck",
|
||||
);
|
||||
assert.equal(result.ok, false, "apply should fail on missing ledger entry");
|
||||
assert.deepEqual(result.rejectedIds, ["sf-test-1"]);
|
||||
assert.deepEqual(result.resolvedIds, []);
|
||||
});
|
||||
|
||||
test("runTriageApply_when_custom_runner_without_trust_flag_refuses", async () => {
|
||||
// Defense-in-depth: an injected agentRunner can side-channel-mutate
|
||||
// the ledger despite the read-only tool override. Only test paths
|
||||
// that explicitly set allowUntrustedRunner can pass a custom runner.
|
||||
const project = makeProject();
|
||||
let calls = 0;
|
||||
const result = await runTriageApply(project, "triage prompt", {
|
||||
// NOTE: allowUntrustedRunner intentionally omitted.
|
||||
agentRunner: async () => {
|
||||
calls += 1;
|
||||
return { ok: true, output: deciderPlan, exitCode: 0 };
|
||||
},
|
||||
});
|
||||
|
||||
assert.equal(result.ok, false);
|
||||
assert.equal(
|
||||
calls,
|
||||
0,
|
||||
"agentRunner must not be invoked when trust flag is absent",
|
||||
);
|
||||
assert.match(result.error ?? "", /allowUntrustedRunner|custom agentRunner/i);
|
||||
});
|
||||
|
||||
test("runTriageApply_when_rubber_duck_disagrees_blocks_completion", async () => {
|
||||
const project = makeProject();
|
||||
const result = await runTriageApply(project, "triage prompt", {
|
||||
allowUntrustedRunner: true,
|
||||
agentRunner: async (agent) => {
|
||||
if (agent.name === "triage-decider") {
|
||||
return {
|
||||
ok: true,
|
||||
output: deciderPlan,
|
||||
exitCode: 0,
|
||||
};
|
||||
}
|
||||
return {
|
||||
ok: true,
|
||||
output: "## Concern 1:\nThe cited closure is not supported.",
|
||||
exitCode: 0,
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
assert.equal(result.ok, false);
|
||||
assert.equal(result.agreed, false);
|
||||
assert.equal(
|
||||
result.error,
|
||||
"rubber-duck disagreed — pausing for operator review",
|
||||
);
|
||||
});
|
||||
|
||||
test("runTriageApply_when_decider_fails_skips_rubber_duck", async () => {
|
||||
const project = makeProject();
|
||||
const calls: string[] = [];
|
||||
const result = await runTriageApply(project, "triage prompt", {
|
||||
allowUntrustedRunner: true,
|
||||
agentRunner: async (agent) => {
|
||||
calls.push(agent.name);
|
||||
return {
|
||||
ok: false,
|
||||
output: "",
|
||||
stderr: "model failed",
|
||||
exitCode: 1,
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
assert.equal(result.ok, false);
|
||||
assert.equal(result.error, "model failed");
|
||||
assert.deepEqual(calls, ["triage-decider"]);
|
||||
});
|
||||
|
||||
test("runTriageApply_when_decider_outputs_unknown_id_rejects_plan_before_review", async () => {
|
||||
const project = makeProject();
|
||||
const calls: string[] = [];
|
||||
const result = await runTriageApply(project, "triage prompt", {
|
||||
candidateCount: 1,
|
||||
expectedIds: ["sf-expected-1"],
|
||||
allowUntrustedRunner: true,
|
||||
agentRunner: async (agent) => {
|
||||
calls.push(agent.name);
|
||||
return {
|
||||
ok: true,
|
||||
output: deciderPlan,
|
||||
exitCode: 0,
|
||||
};
|
||||
},
|
||||
});
|
||||
|
||||
assert.equal(result.ok, false);
|
||||
assert.match(result.error ?? "", /not in the candidate set/);
|
||||
assert.deepEqual(calls, ["triage-decider"]);
|
||||
});
|
||||
Loading…
Add table
Reference in a new issue