refactor: rename review gate agent

This commit is contained in:
Mikael Hugo 2026-05-14 19:43:01 +02:00
parent 62fbc5d57b
commit 18aa257ede
22 changed files with 80 additions and 114 deletions

View file

@ -1,8 +1,3 @@
---
name: sf-autonomous-runtime
description: Explains SF autonomous loop, UOK gates, installed-runtime drift, and recovery paths.
---
# SF Autonomous Runtime
## Context

View file

@ -1,13 +1,8 @@
---
name: sf-command-surface
description: Use when changing SF slash commands, browser command parity, or headless command dispatch.
---
# SF Command Surface
## When to Use
This skill applies when:
This reference applies when:
- Adding or modifying SF slash commands (`/mode`, `/tasks`, `/skills`, etc.)
- Changing command handlers in `src/resources/extensions/sf/commands/handlers/`
- Updating command catalog descriptions

View file

@ -1,8 +1,3 @@
---
name: sf-harness
description: Use when changing SF harness behavior, generated evidence, verification loops, or repo-native harness boundaries.
---
# SF Harness
Use this only for SF harness changes in this repository.

View file

@ -1,8 +1,3 @@
---
name: sf-operating-model
description: Use when changing SF operating-model vocabulary, surfaces, protocols, output formats, run control, or permission-profile boundaries.
---
# SF Operating Model
Use this when editing SF operating-model docs or code paths.

View file

@ -1,8 +1,3 @@
---
name: sf-planning
description: Use when changing SF milestone, slice, task, backlog, promotion, or spec-first planning behavior.
---
# SF Planning
Use this for SF planning-state changes in this repository.

View file

@ -1,8 +1,3 @@
---
name: sf-state
description: Use when changing SF state ownership, DB-first runtime state, generated artifacts, or .sf/.agents boundary rules.
---
# SF State
Use this for SF state-boundary work in this repository.

View file

@ -42,7 +42,6 @@ const HIDDEN_OR_ALIAS_SUBCOMMANDS = new Set([
"auto",
"footer-config", // alias for /statusline
"h",
"review-code", // alias for /rubber-duck
"stop", // platform-intercepted via BASE_RUNTIME_COMMANDS — never reaches SF handler
"undo-turn", // alias for /rewind
"wt",

View file

@ -5,7 +5,7 @@ const src = resolve(
__dirname,
"..",
"packages",
"pi-coding-agent",
"coding-agent",
"dist",
"core",
"export-html",

View file

@ -5,7 +5,7 @@ const src = resolve(
__dirname,
"..",
"packages",
"pi-coding-agent",
"coding-agent",
"dist",
"modes",
"interactive",

View file

@ -39,10 +39,9 @@ const jiti = createJiti(import.meta.filename, {
debug: false,
});
const agentExtensionsDir = join(getSfEnv().agentDir, "extensions", "sf");
const useAgentDir = existsSync(join(agentExtensionsDir, "state.js"));
function sfExtensionPath(moduleName: string): string {
const agentExtensionsDir = join(getSfEnv().agentDir, "extensions", "sf");
const useAgentDir = existsSync(join(agentExtensionsDir, "state.js"));
if (useAgentDir) return join(agentExtensionsDir, `${moduleName}.js`);
const tsPath = resolveBundledSourceResource(
import.meta.url,
@ -114,8 +113,8 @@ type AgentRunner = (
*
* Why a structured plan instead of letting the decider call resolve_issue
* directly: codex review 2026-05-14 flagged that the original sequential
* design (decider rubber-duck) let the decider mutate state during its
* own turn, before rubber-duck ever saw the decisions. This parser pulls
* design (decider review-code) let the decider mutate state during its
* own turn, before review-code ever saw the decisions. This parser pulls
* the proposed actions out of the decider's text so they can be reviewed
* BEFORE any resolve_issue call.
*/
@ -647,11 +646,11 @@ export async function runTriageApply(
};
const agents = agentsModule.discoverAgents?.(cwd, "both").agents ?? [];
const triageDecider = agents.find((agent) => agent.name === "triage-decider");
const rubberDuck = agents.find((agent) => agent.name === "rubber-duck");
if (!triageDecider || !rubberDuck) {
const reviewCode = agents.find((agent) => agent.name === "review-code");
if (!triageDecider || !reviewCode) {
const missing = [
triageDecider ? null : "triage-decider",
rubberDuck ? null : "rubber-duck",
reviewCode ? null : "review-code",
]
.filter(Boolean)
.join(", ");
@ -677,25 +676,25 @@ export async function runTriageApply(
}
// Trusted-source guard (codex review 2026-05-14): when --apply will
// mutate the ledger, BOTH agents must be SF-shipped built-ins. A
// project-level override could silently disable rubber-duck's
// project-level override could silently disable review-code's
// independence. Operators can still customize behavior for inspect
// workflows, but --apply uses only the shipped review contract.
if (triageDecider.source !== "builtin" || rubberDuck.source !== "builtin") {
const rationale = `non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source})`;
if (triageDecider.source !== "builtin" || reviewCode.source !== "builtin") {
const rationale = `non-builtin agents (triage-decider=${triageDecider.source}, review-code=${reviewCode.source})`;
const gateFailure = await emitRequiredTriageGate("trusted-agent-source-gate", "fail", rationale, {
triageDeciderSource: triageDecider.source,
rubberDuckSource: rubberDuck.source,
reviewCodeSource: reviewCode.source,
});
if (gateFailure) return gateFailure;
await emit("triage-apply-failed", {
reason: "untrusted-agent-source",
triageDeciderSource: triageDecider.source,
rubberDuckSource: rubberDuck.source,
reviewCodeSource: reviewCode.source,
});
return {
ok: false,
agreed: false,
error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source}). Use \`sf headless triage --run\` for a reviewable decision artifact, or remove the project/user override.`,
error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, review-code=${reviewCode.source}). Use \`sf headless triage --run\` for a reviewable decision artifact, or remove the project/user override.`,
resolvedIds: [],
flowId,
};
@ -703,7 +702,7 @@ export async function runTriageApply(
const trustGateFailure = await emitRequiredTriageGate(
"trusted-agent-source-gate",
"pass",
"both triage-decider and rubber-duck are SF-shipped built-ins",
"both triage-decider and review-code are SF-shipped built-ins",
);
if (trustGateFailure) return trustGateFailure;
@ -804,12 +803,12 @@ export async function runTriageApply(
}, {}),
});
// Phase 2: rubber-duck reviews the plan with read-only tools. The
// Phase 2: review-code reviews the plan with read-only tools. The
// review task explicitly hands the plan as the artifact under
// scrutiny — the reviewer's job is to spot bad calls before they land.
const reviewTask = [
"Review this self-feedback triage decision PLAN. The plan has NOT yet been applied — your verdict gates whether any resolve_issue mutation runs.",
'Return "rubber-duck: agree" only if every decision in the plan is safe and coherent against the current code/ledger state.',
'Return "review-code: agree" only if every decision in the plan is safe and coherent against the current code/ledger state.',
"On disagreement, name each concerning decision explicitly so the operator (or a follow-up apply pass) can pull just those entries out and proceed with the rest.",
"",
"## Original triage prompt (the ledger entries the decider saw)",
@ -818,16 +817,16 @@ export async function runTriageApply(
"## triage-decider output (includes the plan as a fenced yaml block)",
decider.output,
].join("\n");
const review = await runner(rubberDuck, reviewTask, {
const review = await runner(reviewCode, reviewTask, {
model: options.model,
cwd,
tools: ["view", "grep", "glob", "git_log", "query_journal"],
});
const agreed = /^rubber-duck:\s*agree\b/im.test(review.output.trim());
const agreed = /^review-code:\s*agree\b/im.test(review.output.trim());
await emit(
agreed
? "triage-apply-rubber-duck-agree"
: "triage-apply-rubber-duck-disagree",
? "triage-apply-review-code-agree"
: "triage-apply-review-code-disagree",
{
ok: review.ok,
exitCode: review.exitCode ?? null,
@ -837,15 +836,15 @@ export async function runTriageApply(
const gateFailure = await emitRequiredTriageGate(
"triage-apply-review-gate",
"manual-attention",
"rubber-duck subagent failed to complete; review pending operator",
"review-code subagent failed to complete; review pending operator",
{ exitCode: review.exitCode ?? null },
);
if (gateFailure) return gateFailure;
await emit("triage-apply-failed", { reason: "rubber-duck-failed" });
await emit("triage-apply-failed", { reason: "review-code-failed" });
return {
ok: false,
agreed: false,
error: review.stderr || "rubber-duck failed",
error: review.stderr || "review-code failed",
deciderOutput: decider.output,
reviewOutput: review.output,
resolvedIds: [],
@ -859,13 +858,13 @@ export async function runTriageApply(
const gateFailure = await emitRequiredTriageGate(
"triage-apply-review-gate",
"fail",
"rubber-duck disagreed with the proposed plan; no mutations applied",
"review-code disagreed with the proposed plan; no mutations applied",
);
if (gateFailure) return gateFailure;
return {
ok: false,
agreed: false,
error: "rubber-duck disagreed — pausing for operator review",
error: "review-code disagreed — pausing for operator review",
deciderOutput: decider.output,
reviewOutput: review.output,
resolvedIds: [],
@ -875,7 +874,7 @@ export async function runTriageApply(
const reviewGateFailure = await emitRequiredTriageGate(
"triage-apply-review-gate",
"pass",
"rubber-duck agreed with the proposed plan; apply phase proceeds",
"review-code agreed with the proposed plan; apply phase proceeds",
);
if (reviewGateFailure) return reviewGateFailure;
@ -1162,7 +1161,7 @@ export async function handleTriage(
if (options.apply) {
process.stderr.write(
"[triage] applying via triage-decider -> rubber-duck (this can take a few minutes)…\n",
"[triage] applying via triage-decider -> review-code (this can take a few minutes)…\n",
);
const result = await runTriageApply(cwd, prompt, {
model: options.model,
@ -1183,7 +1182,7 @@ export async function handleTriage(
process.stdout.write(`${JSON.stringify(payload)}\n`);
} else if (result.ok) {
process.stdout.write(
`Triage apply complete: rubber-duck agreed (${result.resolvedIds.length} resolved)\n`,
`Triage apply complete: review-code agreed (${result.resolvedIds.length} resolved)\n`,
);
if (result.resolvedIds.length > 0) {
process.stdout.write(`Resolved: ${result.resolvedIds.join(", ")}\n`);

View file

@ -225,7 +225,7 @@ const SUBCOMMAND_HELP: Record<string, string> = {
" query Machine snapshot: JSON state + next dispatch + costs (no LLM)",
" usage Live LLM-provider usage snapshot (today: gemini-cli tier + per-model quota)",
" reflect Assemble reflection corpus + render prompt for cross-corpus pattern analysis (--json for raw, --run to dispatch to gemini-cli, --model <id> to override)",
" triage Render canonical self-feedback triage prompt (--list/--json inspect, --run writes decisions, --apply runs triage-decider -> rubber-duck)",
" triage Render canonical self-feedback triage prompt (--list/--json inspect, --run writes decisions, --apply runs triage-decider -> review-code)",
"",
"new-milestone flags:",
" --context <path> Path to spec/PRD file (use '-' for stdin)",
@ -259,7 +259,7 @@ const SUBCOMMAND_HELP: Record<string, string> = {
" sf headless triage --list Self-feedback queue digest (impact↓ effort↑ ts↑)",
" sf headless triage | sf-some-model Pipe triage prompt to any model",
" sf headless triage --run Dispatch triage to default model + write decisions",
" sf headless triage --apply Apply via triage-decider, then gate with rubber-duck",
" sf headless triage --apply Apply via triage-decider, then gate with review-code",
" sf headless reflect Render reflection prompt for piping",
" sf headless reflect --run Dispatch reflection + write report",
"",

View file

@ -1,5 +1,5 @@
---
name: rubber-duck
name: review-code
description: Constructive pre-implementation critic — catches design flaws, missing edge cases, and gaps before code is written
model: sonnet
tools: read, grep, find, ls, bash

View file

@ -1,11 +1,11 @@
name: rubber-duck
displayName: Rubber Duck Agent
name: review-code
displayName: Review Code Agent
description: >
A constructive critic and second opinion. Reviews proposals, designs,
decision matrices, or implementations and surfaces ONLY weak points that
matter — bugs, logic errors, missed constraints, decisions that don't
match the ledger state. The default consumer is the triage --apply
pipeline (rubber-duck reviews triage decisions before they auto-apply),
pipeline (review-code reviews triage decisions before they auto-apply),
but operators can invoke directly for any artifact review.
tools: "*"
promptParts:
@ -13,7 +13,7 @@ promptParts:
- toolInstructions
- parallelToolCalling
prompt: |
You are SF's rubber-duck agent: a disciplined devil's advocate. You
You are SF's review-code agent: a disciplined devil's advocate. You
review artifacts and surface only issues that genuinely matter.
Your guiding principle: a flagged finding should feel like a $20 bill
@ -53,7 +53,7 @@ prompt: |
## Output contract
- On AGREE: a single line — "rubber-duck: agree" — followed optionally
- On AGREE: a single line — "review-code: agree" — followed optionally
by one short paragraph of confirming reasoning.
- On DISAGREE: list each concern as a separate `## Concern N:` section
with: (a) what's wrong, (b) the evidence (file path, line, commit

View file

@ -3,7 +3,7 @@ displayName: Self-Feedback Triage Decider
description: >
Reads the open self-feedback queue and proposes a decision plan
(Fix, Promote, or Close per entry). PLAN-ONLY: this agent does NOT
call resolve_issue directly. The plan is reviewed by rubber-duck;
call resolve_issue directly. The plan is reviewed by review-code;
only on agreement does the triage --apply runner execute the plan.
This separation exists so a bad decision can be blocked before any
mutation lands (see sf-mp5lnlbc-ty5fec / codex review 2026-05-14).
@ -22,7 +22,7 @@ prompt: |
forever is the failure mode.
**PLAN-ONLY MODE:** You do not call resolve_issue. You produce a
structured decision plan. A separate review stage (rubber-duck) will
structured decision plan. A separate review stage (review-code) will
read your plan and either approve or block it before any mutation
happens. This separation is intentional: it prevents bad decisions
from landing without a second opinion.
@ -57,7 +57,7 @@ prompt: |
- You have read-only investigation: view, grep, glob, git_log. You
do NOT have resolve_issue, edit, write, or bash. Mutation is the
apply runner's job, after rubber-duck reviews your plan.
apply runner's job, after review-code reviews your plan.
## Output contract

View file

@ -300,8 +300,8 @@ export const TOP_LEVEL_SUBCOMMANDS = [
desc: "Prevent system sleep during long runs (caffeinate / systemd-inhibit)",
},
{
cmd: "rubber-duck",
desc: "Dispatch a rubber-duck subagent for constructive pre-implementation review (alias: review-code)",
cmd: "review-code",
desc: "Dispatch a review-code subagent for constructive pre-implementation review",
},
{
cmd: "delegate",

View file

@ -511,13 +511,11 @@ Examples:
return true;
}
if (
trimmed === "rubber-duck" ||
trimmed.startsWith("rubber-duck ") ||
trimmed === "review-code" ||
trimmed.startsWith("review-code ")
) {
const input = trimmed.replace(/^(?:rubber-duck|review-code)\s*/, "").trim();
await handleRubberDuckCommand(input, ctx, pi);
const input = trimmed.replace(/^review-code\s*/, "").trim();
await handleReviewCodeCommand(input, ctx, pi);
return true;
}
if (trimmed === "delegate" || trimmed.startsWith("delegate ")) {
@ -610,9 +608,9 @@ async function handleKeepAlive(args, ctx) {
}
}
// ─── /rubber-duck ────────────────────────────────────────────────────────────
// ─── /review-code ────────────────────────────────────────────────────────────
async function handleRubberDuckCommand(topic, ctx, _pi) {
async function handleReviewCodeCommand(topic, ctx, _pi) {
const { execSync } = await import("node:child_process");
const root = projectRoot();
@ -640,20 +638,20 @@ async function handleRubberDuckCommand(topic, ctx, _pi) {
const focus = topic ? `Focus on: ${topic}\n\n` : "";
const reviewPrompt =
`Dispatch a \`rubber-duck\` subagent to review the current plan or changes before proceeding. ` +
`Use the \`subagent\` tool with \`agent: "rubber-duck"\`.\n\n` +
`Dispatch a \`review-code\` subagent to review the current plan or changes before proceeding. ` +
`Use the \`subagent\` tool with \`agent: "review-code"\`.\n\n` +
`${focus}` +
`Ask the rubber-duck agent to identify blocking issues, non-blocking issues, and suggestions. ` +
`Ask the review-code agent to identify blocking issues, non-blocking issues, and suggestions. ` +
`After the subagent returns, summarise the verdict and any blocking findings in one short paragraph. ` +
`Do not proceed with implementation until the user acknowledges blocking findings.` +
diff;
ctx.ui.notify("Dispatching rubber-duck review…", "info");
ctx.ui.notify("Dispatching review-code review…", "info");
try {
await ctx.sendMessage?.(reviewPrompt);
} catch {
ctx.ui.notify(
"Could not dispatch rubber-duck. Try: subagent agent=rubber-duck task='review current changes'",
"Could not dispatch review-code. Try: subagent agent=review-code task='review current changes'",
"warning",
);
}

View file

@ -145,7 +145,7 @@
"reset-slice",
"rethink",
"rewind",
"rubber-duck",
"review-code",
"run-hook",
"scaffold",
"scan",

View file

@ -36,7 +36,7 @@ Then:
0. Narrate step transitions, key implementation decisions, and verification outcomes as you work. Keep it terse — one line between tool-call clusters, not between every call — but write complete sentences in user-facing prose, not shorthand notes or scratchpad fragments.
0a. **Batch independent tool calls in parallel.** When the next step needs to read or grep multiple files/paths that don't depend on each other's results, issue them in a single tool-call message (multiple tool uses in one assistant turn) rather than one-at-a-time. Examples: reading the handler + the test file + the schema file to triangulate a bug; greping for two unrelated symbols. Sequential tool calls are only correct when each call's input genuinely depends on the previous call's output. Talking-then-doing is also dead weight — if the next action is unambiguous, just take it; describe what you found in the result, not what you plan to look at.
0b. **Swarm opportunity check.** Before implementation, decide whether this task can be split into a 2-3 worker same-model swarm. Swarm only if the shards have disjoint file/directory ownership, no shared-interface or lockfile edits, shard-local verification, and clear wall-clock savings. If it passes, dispatch `subagent({ tasks: [...] })` with explicit write scopes, expected output files, and verification per worker; then inspect `git status --short`, synthesize results, resolve conflicts, and run final task verification yourself. If it does not pass, continue single-agent execution without ceremony.
0c. **Rubber-duck check (non-trivial tasks only).** If the task touches more than two files, introduces a new abstraction, changes an API boundary, or has a non-obvious failure mode — dispatch a `rubber-duck` subagent with the task plan and any relevant existing code as context. Summarise its verdict in one line. If it returns a **Blocking** finding, address it before writing code. Skip this step for simple edits, test fixes, or renaming tasks.
0c. **Review-code check (non-trivial tasks only).** If the task touches more than two files, introduces a new abstraction, changes an API boundary, or has a non-obvious failure mode — dispatch a `review-code` subagent with the task plan and any relevant existing code as context. Summarise its verdict in one line. If it returns a **Blocking** finding, address it before writing code. Skip this step for simple edits, test fixes, or renaming tasks.
1. {{skillActivation}} Follow any activated skills before writing code. If no skills match this task, skip this step.
2. **Verify file existence before editing.** The task plan references specific files. Before reading or editing any file mentioned in the plan, confirm it exists with `ls`, `find`, or `existsSync`. If a referenced file does NOT exist, stop immediately — do not attempt to create it based on the plan's description of what "should" be there. The file may have been deleted, renamed, or moved. Escalate as `blocker_discovered: true` with a clear description of which file is missing and what the plan expected to find. This prevents phantom work on stale file paths.
3. Execute the steps in the inlined task plan, adapting minor local mismatches when the surrounding code differs from the planner's snapshot

View file

@ -192,7 +192,7 @@ export function discoverAgents(cwd, scope) {
const userDir = path.join(getAgentDir(), "agents");
const projectAgentsDir = findNearestProjectAgentsDir(cwd);
// Built-in agents ship with SF and have lowest precedence — user/project
// definitions with the same name override them. Used for rubber-duck +
// definitions with the same name override them. Used for review-code +
// triage-decider as the foundation for SF's self-driven triage pipeline.
const builtinAgents = scope === "project" ? [] : loadBuiltinAgents();
const userAgents =

View file

@ -113,33 +113,33 @@ test("discoverAgents_when_yaml_and_markdown_share_name_prefers_later_project_ent
assert.match(duplicates[0].systemPrompt, /YAML body/);
});
test("discoverAgents_when_scope_both_includes_builtin_rubber_duck_and_triage_decider", () => {
test("discoverAgents_when_scope_both_includes_builtin_review_code_and_triage_decider", () => {
// Built-in agents ship with SF (src/resources/extensions/sf/agents/) and
// must be discoverable without operator setup. They're the foundation for
// SF's self-driven triage pipeline (sf-mp5lnlbc-ty5fec). Isolate from the
// real ~/.sf/agent/agents/ so the test doesn't conflict with the
// operator's personal rubber-duck.md if present.
// operator's personal review-code.md if present.
const isolatedAgentDir = mkdtempSync(join(tmpdir(), "sf-agent-dir-"));
const originalEnv = process.env.SF_CODING_AGENT_DIR;
process.env.SF_CODING_AGENT_DIR = isolatedAgentDir;
try {
const project = makeProject();
const { agents } = discoverAgents(project, "both");
const rubberDuck = agents.find((a) => a.name === "rubber-duck");
const reviewCode = agents.find((a) => a.name === "review-code");
const triageDecider = agents.find((a) => a.name === "triage-decider");
assert.ok(
rubberDuck,
"rubber-duck builtin agent must be discovered in scope=both",
reviewCode,
"review-code builtin agent must be discovered in scope=both",
);
assert.equal(rubberDuck.source, "builtin");
assert.match(rubberDuck.description, /constructive critic/i);
assert.equal(reviewCode.source, "builtin");
assert.match(reviewCode.description, /constructive critic/i);
assert.ok(
triageDecider,
"triage-decider builtin agent must be discovered in scope=both",
);
assert.equal(triageDecider.source, "builtin");
// triage-decider is plan-only; the apply runner owns mutation after
// rubber-duck agrees.
// review-code agrees.
assert.ok(
!triageDecider.tools?.includes("resolve_issue"),
"triage-decider must not declare resolve_issue tool access",
@ -152,7 +152,7 @@ test("discoverAgents_when_scope_both_includes_builtin_rubber_duck_and_triage_dec
test("discoverAgents_when_project_overrides_builtin_name_project_wins", () => {
// Project-level definitions must shadow built-ins with the same name —
// operators need to be able to customize rubber-duck/triage-decider
// operators need to be able to customize review-code/triage-decider
// behavior per project without forking the SF distribution.
const isolatedAgentDir = mkdtempSync(join(tmpdir(), "sf-agent-dir-"));
const originalEnv = process.env.SF_CODING_AGENT_DIR;
@ -160,17 +160,17 @@ test("discoverAgents_when_project_overrides_builtin_name_project_wins", () => {
try {
const project = makeProject();
writeFileSync(
join(project, ".sf", "agents", "rubber-duck.agent.yaml"),
join(project, ".sf", "agents", "review-code.agent.yaml"),
[
"name: rubber-duck",
"description: Project-level override of rubber-duck.",
"name: review-code",
"description: Project-level override of review-code.",
"prompt: |",
" This is the project override.",
"",
].join("\n"),
);
const { agents } = discoverAgents(project, "both");
const matches = agents.filter((a) => a.name === "rubber-duck");
const matches = agents.filter((a) => a.name === "review-code");
assert.equal(matches.length, 1);
assert.equal(matches[0].source, "project");
assert.match(matches[0].description, /Project-level override/);

View file

@ -41,7 +41,7 @@ beforeEach(() => {
process.env.SF_CODING_AGENT_DIR = dir;
});
test("runTriageApply_when_rubber_duck_agrees_runs_both_agents_in_order", async () => {
test("runTriageApply_when_review_code_agrees_runs_both_agents_in_order", async () => {
const project = makeProject();
const calls: string[] = [];
const result = await runTriageApply(project, "triage prompt", {
@ -56,20 +56,20 @@ test("runTriageApply_when_rubber_duck_agrees_runs_both_agents_in_order", async (
exitCode: 0,
};
}
return { ok: true, output: "rubber-duck: agree", exitCode: 0 };
return { ok: true, output: "review-code: agree", exitCode: 0 };
},
});
// Orchestration contract: rubber-duck agreed, so the apply phase ran.
// Orchestration contract: review-code agreed, so the apply phase ran.
// The actual mutation on sf-test-1 fails because the test project has
// no real ledger entry — that's the rejectedIds path, not a contract
// violation. ok=false on apply failures is itself the contract
// (codex review 2026-05-14 follow-up).
assert.equal(result.agreed, true, "rubber-duck must agree");
assert.equal(result.agreed, true, "review-code must agree");
assert.deepEqual(
calls,
["triage-decider", "rubber-duck"],
"decider must run before rubber-duck",
["triage-decider", "review-code"],
"decider must run before review-code",
);
assert.equal(result.ok, false, "apply should fail on missing ledger entry");
assert.deepEqual(result.rejectedIds, ["sf-test-1"]);
@ -99,7 +99,7 @@ test("runTriageApply_when_custom_runner_without_trust_flag_refuses", async () =>
assert.match(result.error ?? "", /allowUntrustedRunner|custom agentRunner/i);
});
test("runTriageApply_when_rubber_duck_disagrees_blocks_completion", async () => {
test("runTriageApply_when_review_code_disagrees_blocks_completion", async () => {
const project = makeProject();
const result = await runTriageApply(project, "triage prompt", {
allowUntrustedRunner: true,
@ -123,11 +123,11 @@ test("runTriageApply_when_rubber_duck_disagrees_blocks_completion", async () =>
assert.equal(result.agreed, false);
assert.equal(
result.error,
"rubber-duck disagreed — pausing for operator review",
"review-code disagreed — pausing for operator review",
);
});
test("runTriageApply_when_decider_fails_skips_rubber_duck", async () => {
test("runTriageApply_when_decider_fails_skips_review_code", async () => {
const project = makeProject();
const calls: string[] = [];
const result = await runTriageApply(project, "triage prompt", {

View file

@ -4,7 +4,7 @@
* metadata for the three triage gates.
*
* Test contract: each decision point (trusted-source check, plan-validation,
* rubber-duck review) writes exactly one gate_run trace event with
* review-code review) writes exactly one gate_run trace event with
* surface=headless, runControl=supervised, permissionProfile=high, traceId=
* the flowId. The outcome reflects the decision (pass/fail/manual-attention).
*/
@ -108,7 +108,7 @@ describe("runTriageApply emits gate_run trace events with schema-v2 metadata", (
if (agent.name === "triage-decider") {
return { ok: true, output: deciderPlan, exitCode: 0 };
}
return { ok: true, output: "rubber-duck: agree", exitCode: 0 };
return { ok: true, output: "review-code: agree", exitCode: 0 };
},
});