feat: enforce purpose-first adversarial review

This commit is contained in:
Mikael Hugo 2026-05-17 18:00:09 +02:00
parent acafee06e2
commit c2f101734f
25 changed files with 858 additions and 86 deletions

View file

@ -54,9 +54,14 @@ Required sections for non-trivial exports:
- **First line** — what it returns / does, present tense.
- **Purpose:** — why it exists; the value it protects.
- **Consumer:** — who calls it in production. No consumer = symbol shouldn't exist yet.
- **What tests should verify:** — for non-trivial behavior, name the consumer-visible contract the tests must prove. This is not a list of mocks or implementation steps.
A bare `/** Helper. */` is a code smell. Either write the purpose or delete the symbol.
Purpose text is a contract, not decoration. If implementation and purpose
diverge, fix the code or rewrite the purpose with a new failing contract test
first.
### Module-level JSDoc
```js

View file

@ -73,6 +73,10 @@ If any field is missing: `BLOCKED: purpose unclear — [which field is missing]`
Treat the contract as a **falsifiable hypothesis**: name the evidence that would prove it wrong before implementation locks in. A contract without a falsifier is half a contract.
**Purpose outranks implementation momentum.** If code, tests, docs, or agent
memory disagree, the purpose/consumer/contract chain wins. Fix the artifact that
drifted; do not reinterpret the purpose to excuse the implementation.
## Workflow (mapped to sf's phase machine)
### Research phase — name the problem
@ -99,8 +103,11 @@ Clarify highest-impact unknowns first: behaviour, acceptance criteria, data inva
For non-trivial contracts, pressure-test before locking the plan via the [`advisory-partner`](../src/resources/extensions/sf/skills/advisory-partner/SKILL.md) skill — this is sf's adversarial review surface, already wired into the Q3/Q4 gates and `validate-milestone`. It runs with the **validation** model, distinct from the planning/execution model — that's the point.
1. **Advocate pass** — strengthen the best version of the contract.
2. **Challenger pass** — attack assumptions AND propose an alternative. A challenger anchored to the advocate's framing is not adversarial.
3. **Falsifier (required gate, blocks Plan→Execute):** `FALSIFIER: this contract is wrong if [specific observable condition].` Generic falsifiers ("wrong if it doesn't work") are process failures.
2. **Partner pass** — improve the plan as a collaborator: sharpen the purpose,
name the real consumer, remove unnecessary scope, and identify the smallest
proof that creates value.
3. **Challenger pass** — attack assumptions AND propose an alternative. A challenger anchored to the advocate's framing is not adversarial.
4. **Falsifier (required gate, blocks Plan→Execute):** `FALSIFIER: this contract is wrong if [specific observable condition].` Generic falsifiers ("wrong if it doesn't work") are process failures.
**Find the devil and find the experts:**
@ -179,10 +186,14 @@ After completing a step, state confidence as a number `0.01.0` and a one-line
| Purpose & consumer | 0.95 | Run an adversarial review wave (advisory-partner Q3/Q5). |
| Contract test | 0.90 | Adversarial review wave. |
| Implementation | 0.95 | Add a specialist reviewer for the touched boundary (e.g. provider/transport/security). |
| Final evidence | 0.97 | Full adversarial: advocate + challenger + specialist. |
| Final evidence | 0.98 | Full adversarial: advocate + partner + challenger + specialist. |
Skip the gate for trivial steps (typo fix, exhaustive matches with full coverage). The gate earns its keep on I/O boundaries, async loading, protocol integration, and anything touching real backends or models.
Below `0.98`, do not ship non-trivial autonomous behaviour. Persist the gap as
an adversarial finding, write the missing contract test, or explicitly downgrade
the scope before claiming completion.
LLM confidence numbers are poorly calibrated in absolute terms — the *relative* signal matters. If you write 0.7, you know you're guessing. Act on that.
## Tests Find Gaps

View file

@ -77,6 +77,14 @@ export function resolveExpectedArtifactPath(unitType, unitId, base) {
const dir = resolveMilestonePath(base, mid);
return dir ? join(dir, buildMilestoneFileName(mid, "VALIDATION")) : null;
}
case "challenge": {
if (sid) {
const dir = resolveSlicePath(base, mid, sid);
return dir ? join(dir, buildSliceFileName(sid, "CHALLENGE")) : null;
}
const dir = resolveMilestonePath(base, mid);
return dir ? join(dir, buildMilestoneFileName(mid, "CHALLENGE")) : null;
}
case "complete-milestone": {
const dir = resolveMilestonePath(base, mid);
return dir ? join(dir, buildMilestoneFileName(mid, "SUMMARY")) : null;
@ -130,6 +138,10 @@ export function diagnoseExpectedArtifact(unitType, unitId, base) {
return `${relSliceFile(base, mid, sid, "ASSESSMENT")} (UAT assessment result)`;
case "validate-milestone":
return `${relMilestoneFile(base, mid, "VALIDATION")} (milestone validation report)`;
case "challenge":
return sid
? `${relSliceFile(base, mid, sid, "CHALLENGE")} (adversarial slice challenge)`
: `${relMilestoneFile(base, mid, "CHALLENGE")} (adversarial milestone challenge)`;
case "complete-milestone":
return `${relMilestoneFile(base, mid, "SUMMARY")} (milestone summary)`;
default:

View file

@ -484,9 +484,9 @@ export async function selectAndApplyModel(
);
}
}
// ── Lineage-diverse filter (adversary / reviewer roles) ───────────────────
// When the dispatching role is "adversary" or "reviewer" and its constraint
// set includes "lineage-diverse-from-worker", remove candidates that share
// ── Lineage-diverse filter (review roles) ──────────────────────────────────
// When the dispatching role carries "lineage-diverse-from-worker", remove
// candidates that share
// root-vendor lineage with the worker model to prevent rubber-stamp reviews.
// Fail-open: if filtering would leave zero candidates, log a warning and keep
// the full eligible set so the dispatch is never completely blocked.
@ -499,7 +499,9 @@ export async function selectAndApplyModel(
{
const normalizedRole = String(role ?? "").toLowerCase();
const isLineageRole =
normalizedRole === "adversary" || normalizedRole === "reviewer";
normalizedRole === "adversary" ||
normalizedRole === "reviewer" ||
normalizedRole === "validator";
if (isLineageRole) {
const roleConstraints =
DEFAULT_MODEL_ROLE_CONSTRAINTS[normalizedRole] ?? [];
@ -783,7 +785,7 @@ export async function selectAndApplyModel(
}
// Enforce lineage-diverse-from-worker: skip candidates whose root vendor
// matches the worker's. lineageAllowedKeys is populated only when the
// constraint applies (adversary / reviewer with a known worker model id).
// constraint applies for a review role with a known worker model id.
if (lineageAllowedKeys) {
const key = `${model.provider.toLowerCase()}/${model.id.toLowerCase()}`;
if (!lineageAllowedKeys.has(key)) {

View file

@ -3689,6 +3689,7 @@ export async function buildChallengePrompt(
challengeTarget,
challengeMode,
base,
options = {},
) {
const resolveArtifact = async (key) => {
switch (key) {
@ -3710,6 +3711,9 @@ export async function buildChallengePrompt(
milestoneTitle: midTitle,
challengeTarget: challengeTarget ?? "milestone",
challengeMode: challengeMode ?? "red-team",
saveSummaryScope: options.sliceId
? `milestone_id: ${mid}, slice_id: ${options.sliceId}, artifact_type: "CHALLENGE"`
: `milestone_id: ${mid}, artifact_type: "CHALLENGE"`,
inlinedContext,
skillActivation: buildSkillActivationBlock({
base,

View file

@ -60,6 +60,26 @@ import { logError, logWarning } from "./workflow-logger.js";
// Re-export so existing consumers of auto-recovery.ts keep working.
export { diagnoseExpectedArtifact, resolveExpectedArtifactPath };
// ─── Artifact Resolution & Verification ───────────────────────────────────────
/**
* Return true when a challenge report contains the minimum review lenses.
*
* Purpose: prevent one challenge verdict from becoming "truth"; SF requires
* self-review, partner review, and adversary review evidence before a completed
* slice or milestone is considered challenged.
*
* Consumer: challenge artifact verification and auto-dispatch backfill guards.
*/
export function hasMinimumChallengeRounds(content) {
const text = String(content ?? "");
return (
/self(?:[-\s]+review|[-\s]+critique|[-\s]+check)/i.test(text) &&
/partner(?:[-\s]+review|[-\s]+round|[-\s]+pass|[-\s]+check)?/i.test(text) &&
/(adversary|adversarial|combatant|red[-\s]*team)(?:[-\s]+review|[-\s]+round|[-\s]+pass|[-\s]+check)?/i.test(
text,
)
);
}
/**
* Check whether a milestone produced implementation artifacts (non-`.sf/` files)
* in the git history. Uses `git log --name-only` to inspect all commits on the
@ -320,6 +340,14 @@ export function verifyExpectedArtifact(unitType, unitId, base) {
const validationContent = readFileSync(absPath, "utf-8");
if (!isValidationTerminal(validationContent)) return false;
}
if (unitType === "challenge") {
const challengeContent = readFileSync(absPath, "utf-8");
if (!/Overall Verdict/i.test(challengeContent)) return false;
if (!/\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i.test(challengeContent)) {
return false;
}
if (!hasMinimumChallengeRounds(challengeContent)) return false;
}
if (unitType === "plan-milestone" || unitType === "roadmap-meeting") {
try {
const roadmapContent = readFileSync(absPath, "utf-8");

View file

@ -393,13 +393,13 @@ export function registerDbTools(pi) {
name: "save_summary",
label: "Save Summary",
description:
"Save a research, summary, context, or assessment artifact to disk with an auto-computed path. " +
"Call this to persist planning or research output (e.g. a research brief, context doc, or summary) for a milestone, slice, or task.",
"Save a research, summary, context, assessment, challenge, or ops artifact to disk with an auto-computed path. " +
"Call this to persist planning, research, adversarial review, or operational output for a milestone, slice, or task.",
promptSnippet:
"Save a planning artifact (SUMMARY/RESEARCH/CONTEXT/ASSESSMENT) to disk",
"Save a planning artifact (SUMMARY/RESEARCH/CONTEXT/ASSESSMENT/CHALLENGE) to disk",
promptGuidelines: [
"milestone_id is required; slice_id and task_id are optional and determine the file path.",
"artifact_type must be one of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT.",
"artifact_type must be one of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT, CHALLENGE, DEPLOY, ROLLBACK, SMOKE, RELEASE.",
"Use CONTEXT-DRAFT for incremental saves; use CONTEXT only for the final milestone context after verification.",
],
parameters: Type.Object({
@ -412,7 +412,7 @@ export function registerDbTools(pi) {
),
artifact_type: Type.String({
description:
"One of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT",
"One of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT, CHALLENGE, DEPLOY, ROLLBACK, SMOKE, RELEASE",
}),
content: Type.String({
description: "The full markdown content of the artifact",

View file

@ -166,6 +166,11 @@ export const KNOWN_UNIT_TYPES = [
"run-uat",
"complete-milestone",
"validate-milestone",
"challenge",
"deploy",
"smoke-production",
"release",
"rollback",
"rewrite-docs",
"discuss-milestone",
"discuss-slice",

View file

@ -18,6 +18,19 @@ You are executing SF autonomous mode.
You are the adversary agent. Your job is NOT to complete work — it is to rigorously attack the correctness, safety, and assumptions of what has been built or planned. You represent the failure modes the product team has not considered.
Challenge output is evidence, not truth. A challenge report is not acceptable
until it contains all three review rounds:
1. **Self-review round** — the original model's best internal critique of its
own likely blind spots and weak assumptions.
2. **Partner round** — the constructive partner view: strengthen the purpose,
consumer, smallest valuable proof, and remediation shape.
3. **Adversary round** — the hostile/combatant view: break the claim, find
counterexamples, and name blocking risks.
If any round is missing, the challenge is incomplete even if it has an Overall
Verdict.
### Challenge rules
1. **Be ruthless but specific.** Every finding must include: what breaks, under what condition, and what the impact is.
@ -36,6 +49,19 @@ Based on `{{challengeMode}}`:
### Output format
Start with these required sections:
```
## Self-review Round
<best critique from the model that produced or would defend the work>
## Partner Round
<constructive partner review: purpose, consumer, minimal remediation>
## Adversary Round
<hostile/combatant review: failure modes and counterexamples>
```
For each finding:
```
@ -54,11 +80,25 @@ After all findings, provide an **Overall Verdict**:
- `NEEDS-REMEDIATION` — one or more critical/high findings must be addressed before deploy.
- `ADVISORY` — findings are low severity; proceed with awareness.
Call `save_summary` with `milestone_id: {{milestoneId}}`, `artifact_type: "CHALLENGE"`, and the full challenge report as content.
Call `save_summary` with `{{saveSummaryScope}}`, and the full challenge report as content.
### Report sf-internal observations
If you observe sf-the-tool friction during this unit, file via `report_issue`.
If the target is already closed and you find real remediation work, do **not**
reopen or edit the closed slice/milestone yourself. Closed history stays closed.
Instead:
- File each critical/high finding with `report_issue` using
`kind: "adversarial-finding"`, the finding severity, concrete evidence, and
falsifiable acceptance criteria.
- Recommend a new remediation slice/task in the finding. The planner/triage
path owns opening new work from that evidence.
- Use `NEEDS-REMEDIATION` only when the finding should block the current
milestone/deploy path. Use `ADVISORY` for follow-up work that can be queued
without blocking.
If you observe other sf-the-tool friction during this unit, file via
`report_issue`.
When done, say: "Challenge {{milestoneId}} complete — verdict: <verdict>."

View file

@ -1,6 +1,6 @@
Plan milestone {{milestoneId}} ("{{milestoneTitle}}"). Decisions and requirements are injected from the database into your system context — respect existing decisions and treat Active requirements as the capability contract. If no requirements are present, treat that as a planning gap: derive the minimum requirement coverage from current project evidence, persist it through `save_requirement`, and explicitly note missing coverage. Use the **Roadmap** output template below to shape the milestone planning payload you send to `plan_milestone`. Start the `vision` field with the milestone purpose before implementation detail, include the structured `productResearch` payload when the work is product-facing, workflow-facing, developer-experience, or market-positioning, and make each slice `goal` state the slice purpose before mechanics. If the milestone changes how SF is driven, observed, integrated, or automated, keep the axes separate in the roadmap: surface (TUI/CLI/web/editor/machine), protocol (ACP/RPC/stdio/HTTP/wire), output format (text/json/stream-json), run control (manual/assisted/autonomous), and permission profile (restricted/normal/trusted/unrestricted). Call `plan_milestone` to persist the milestone planning fields and render `{{milestoneId}}-ROADMAP.md` from DB state. Do **not** write `{{milestoneId}}-ROADMAP.md`, `ROADMAP.md`, or other planning artifacts manually. If planning produces structural decisions, call `save_decision`. {{skillActivation}} Fill the Horizontal Checklist section with cross-cutting concerns considered during planning (requirements re-read, decisions re-evaluated, graceful shutdown, revenue paths, auth boundary, shared resources, reconnection). Omit for trivial milestones.
Before calling `plan_milestone`, run a bounded **Vision Alignment Meeting** for the milestone and roadmap as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is allowed to be broader and more nuanced than slice planning. Include at least these participant lenses:
Before calling `plan_milestone`, run a bounded **Vision Alignment Meeting** for the milestone and roadmap as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. Do not set a `model` override for Partner, Combatant, or validation-style reviewers; SF's reviewer/adversary routing must choose a lineage-diverse model family from the worker whenever possible. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is allowed to be broader and more nuanced than slice planning. Include at least these participant lenses:
- Product Manager
- User Advocate
- Customer Panel

View file

@ -51,7 +51,7 @@ If milestone research exists (inlined above), trust those findings and skip redu
Narrate your decomposition reasoning - why you're grouping work this way, what risks are driving the order, what verification strategy you're choosing and why. Use complete sentences rather than planner shorthand or fragmentary notes.
Before you persist the roadmap, run a bounded **Vision Alignment Meeting** as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses:
Before you persist the roadmap, run a bounded **Vision Alignment Meeting** as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. Do not set a `model` override for Partner, Combatant, or validation-style reviewers; SF's reviewer/adversary routing must choose a lineage-diverse model family from the worker whenever possible. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses:
Each role has a **purpose** (why it exists in the ceremony) and a **depth contract** (what a meaningful contribution must contain). A contribution that restates the roadmap title without evidence, specific risks, or concrete subsystem references is not meaningful — it is a rubber stamp.

View file

@ -23,7 +23,7 @@ All relevant context has been preloaded below — the roadmap, all slice summari
### Step 1 — Dispatch Parallel Reviewers
Call `subagent` with `tasks: [...]` containing ALL THREE reviewers simultaneously.
Use `agent: "reviewer"` for every validation reviewer. Do not use `code`, `coder`, or `worker` here — this is review/validation work, not implementation.
Use `agent: "reviewer"` for every validation reviewer. Do not use `code`, `coder`, or `worker` here — this is review/validation work, not implementation. Do not set a `model` override; SF's reviewer/validator routing should choose a model family that is lineage-diverse from the worker when possible.
**Pass `parentTrace`** at the batch level. For milestone validation, the
"parent" is the slice work that built this milestone — pass a one-line-per-

View file

@ -67,7 +67,20 @@ If you can't fill this in, the plan is incomplete.
---
### 2. Five Challenger Questions
### 2. Partner Pass
Before attacking the plan, improve it as a partner:
- Restate the purpose in one sentence.
- Name the real production consumer.
- Remove scope that does not serve that purpose.
- Identify the smallest proof point that gives the consumer value.
If this pass cannot name purpose and consumer, stop with `RECONSIDER`.
---
### 3. Five Challenger Questions
Answer each from the artifact — not from general knowledge:
@ -88,7 +101,7 @@ Write the objection. Then answer it. If the answer is weak, that's a flag.
---
### 3. Trap Scan
### 4. Trap Scan
| Trap | Warning Sign |
|------|-------------|
@ -100,7 +113,7 @@ Write the objection. Then answer it. If the answer is weak, that's a flag.
---
### 4. Verdict
### 5. Verdict
```
ADVISORY VERDICT: [PROCEED / PROCEED WITH CAVEAT / RECONSIDER]

View file

@ -121,10 +121,14 @@ After each major step, state confidence as `0.01.0` plus a one-line reason.
| Purpose & consumer | 0.95 | Run `advisory-partner` Q3 (abuse) + Q5 (strongest objection). |
| Contract test | 0.90 | Adversarial review wave. |
| Implementation | 0.95 | Add a specialist reviewer for the boundary touched. |
| Evidence | 0.97 | Full advocate + challenger + specialist. |
| Evidence | 0.98 | Full advocate + partner + challenger + specialist. |
LLM confidence is poorly calibrated in absolute terms — the relative signal matters. If you write 0.7, you're guessing. Act on it.
Below `0.98`, non-trivial autonomous behaviour does not ship. Persist the gap
as an adversarial finding, write the missing contract test, or explicitly shrink
scope before claiming completion.
### 8. Verify Consumer and Record Evidence
- Re-run `rg` for the changed symbol; confirm a real caller still depends on it.

View file

@ -50,6 +50,7 @@ import {
mergeDeltaPatches,
readIsolationMode,
} from "./isolation.js";
import { resolveLineageDiverseSubagentModel } from "./lineage-routing.js";
import { composeAgentPrompt } from "./prompt-parts.js";
import { registerWorker, updateWorker } from "./worker-registry.js";
@ -371,6 +372,25 @@ function validateSubagentInvocationAgainstInheritance(
}
return { ok: true };
}
function resolveSubagentModelOverride({
agents,
agentName,
task,
requestedModel,
parentModel,
availableModels,
}) {
const { agent, effectiveName } = resolveAgentByName(agents, agentName);
return resolveLineageDiverseSubagentModel({
agentName: effectiveName ?? agentName,
task,
requestedModel,
agentModel: agent?.model,
parentModel,
availableModels,
});
}
async function executeSubagentInvocation({
defaultCwd,
agents,
@ -383,6 +403,8 @@ async function executeSubagentInvocation({
cmuxSplitsEnabled,
useIsolation,
inheritanceEnvelope,
parentModel,
availableModels,
}) {
const makeDetails = (mode) => (results) => ({
mode,
@ -430,7 +452,14 @@ async function executeSubagentInvocation({
signal,
chainUpdate,
makeDetails("chain"),
step.model ?? params.model,
resolveSubagentModelOverride({
agents,
agentName: step.agent,
task: taskForStep,
requestedModel: step.model ?? params.model,
parentModel,
availableModels,
}),
inheritanceEnvelope,
);
results.push(result);
@ -582,7 +611,14 @@ async function executeSubagentInvocation({
async (t, index) => {
const resultIndex = (round - 1) * batchTasks.length + index;
const prompt = buildDebatePrompt(t, round, transcript);
const taskModelOverride = t.model ?? params.model;
const taskModelOverride = resolveSubagentModelOverride({
agents,
agentName: t.agent,
task: prompt,
requestedModel: t.model ?? params.model,
parentModel,
availableModels,
});
const result = cmuxSplitsEnabled
? await runSingleAgentInCmuxSplit(
cmuxClient,
@ -749,11 +785,18 @@ async function executeSubagentInvocation({
batchSize,
batchId,
);
const taskModelOverride = t.model ?? params.model;
const taskWithTrace = composeTaskWithParentTrace(
t.task,
t.parentTrace ?? params.parentTrace,
);
const taskModelOverride = resolveSubagentModelOverride({
agents,
agentName: t.agent,
task: taskWithTrace,
requestedModel: t.model ?? params.model,
parentModel,
availableModels,
});
const runTask = () =>
cmuxSplitsEnabled
? runSingleAgentInCmuxSplit(
@ -861,7 +904,14 @@ async function executeSubagentInvocation({
signal,
onUpdate,
makeDetails("single"),
params.model,
resolveSubagentModelOverride({
agents,
agentName: params.agent,
task: singleTaskWithTrace,
requestedModel: params.model,
parentModel,
availableModels,
}),
inheritanceEnvelope,
)
: await runSingleAgent(
@ -874,7 +924,14 @@ async function executeSubagentInvocation({
signal,
onUpdate,
makeDetails("single"),
params.model,
resolveSubagentModelOverride({
agents,
agentName: params.agent,
task: singleTaskWithTrace,
requestedModel: params.model,
parentModel,
availableModels,
}),
inheritanceEnvelope,
);
if (isolation) {
@ -1994,6 +2051,10 @@ export default function (pi) {
});
const cmuxClient = CmuxClient.fromPreferences(effectivePreferences);
const cmuxSplitsEnabled = cmuxClient.getConfig().splits;
const parentModel = ctx.model
? { provider: ctx.model.provider, id: ctx.model.id }
: null;
const availableModels = ctx.modelRegistry?.getAvailable?.() ?? [];
// Resolve isolation mode
const isolationMode = readIsolationMode();
const useIsolation = Boolean(params.isolated) && isolationMode !== "none";
@ -2116,6 +2177,8 @@ export default function (pi) {
cmuxSplitsEnabled,
useIsolation,
inheritanceEnvelope,
parentModel,
availableModels,
}),
}
: null;
@ -2138,6 +2201,8 @@ export default function (pi) {
cmuxSplitsEnabled,
useIsolation,
inheritanceEnvelope,
parentModel,
availableModels,
}).then((result) => {
// Append agent turn BEFORE deliverResult fires the notification.
if (jobId && isSingleMode) {
@ -2176,6 +2241,8 @@ export default function (pi) {
cmuxSplitsEnabled,
useIsolation,
inheritanceEnvelope,
parentModel,
availableModels,
});
},
renderCall(args, theme) {

View file

@ -0,0 +1,65 @@
import { isSameRootVendor } from "../uok/model-role-policy.js";
const REVIEW_AGENT_RE =
/\b(reviewer|review-code|sf-reviewer|partner|advocate|adversary|combatant|critic|validator)\b/i;
const REVIEW_TASK_RE =
/\b(partner|advocate|reviewer|review|adversary|adversarial|combatant|challenger|critic|validation|validator)\b/i;
function modelKey(model) {
if (!model) return undefined;
if (typeof model === "string") return model;
if (model.provider && model.id) return `${model.provider}/${model.id}`;
return model.id;
}
/**
* Return true when a subagent request is review-like and needs lineage diversity.
*
* Purpose: keep partner/advisory/adversarial lanes from sharing the worker's
* blind spots just because they are all launched through the generic
* `subagent` tool.
*
* Consumer: subagent/index.js before launching single, chain, parallel, or
* debate subagents.
*/
export function isLineageDiverseReviewSubagent(agentName, task) {
return (
REVIEW_AGENT_RE.test(String(agentName ?? "")) ||
REVIEW_TASK_RE.test(String(task ?? ""))
);
}
/**
* Resolve a subagent model override that avoids the parent worker lineage.
*
* Purpose: enforce R113 at launch time: partner, reviewer, validator, and
* combatant/adversary subagents should use a different model family from the
* parent worker when an alternative is available. If no alternative exists,
* fail open and keep the requested model rather than blocking review entirely.
*
* Consumer: subagent/index.js model override resolution.
*/
export function resolveLineageDiverseSubagentModel({
agentName,
task,
requestedModel,
agentModel,
parentModel,
availableModels = [],
}) {
const requested = requestedModel ?? agentModel;
if (!isLineageDiverseReviewSubagent(agentName, task)) return requestedModel;
const parentKey = modelKey(parentModel);
if (!parentKey) return requestedModel;
const requestedKey = modelKey(requested);
if (requestedKey && !isSameRootVendor(requestedKey, parentKey)) {
return requestedModel;
}
const candidates = availableModels.filter((model) => {
const key = modelKey(model);
return key && !isSameRootVendor(key, parentKey);
});
const selected = candidates[0];
const selectedKey = modelKey(selected);
return selectedKey ?? requestedModel;
}

View file

@ -16,8 +16,10 @@ const parallelPromptMock = vi.hoisted(() =>
);
vi.mock("../auto-prompts.js", () => ({
buildChallengePrompt: vi.fn(async (...args) => `challenge ${args.join(" ")}`),
buildCompleteMilestonePrompt: vi.fn(async () => "complete milestone"),
buildCompleteSlicePrompt: vi.fn(async () => "complete slice"),
buildDeployPrompt: vi.fn(async () => "deploy"),
buildDiscussMilestonePrompt: vi.fn(async () => "discuss milestone"),
buildDiscussProjectPrompt: vi.fn(async () => "discuss project"),
buildDiscussRequirementsPrompt: vi.fn(async () => "discuss requirements"),
@ -33,14 +35,22 @@ vi.mock("../auto-prompts.js", () => ({
buildResearchProjectPrompt: vi.fn(async () => "research project"),
buildResearchMilestonePrompt: vi.fn(async () => "research milestone"),
buildResearchSlicePrompt: vi.fn(async () => "research slice"),
buildReleasePrompt: vi.fn(async () => "release"),
buildRewriteDocsPrompt: vi.fn(async () => "rewrite docs"),
buildRollbackPrompt: vi.fn(async () => "rollback"),
buildRunUatPrompt: vi.fn(async () => "run uat"),
buildSmokeProductionPrompt: vi.fn(async () => "smoke production"),
buildValidateMilestonePrompt: vi.fn(async () => "validate milestone"),
buildWorkflowPreferencesPrompt: vi.fn(async () => "workflow preferences"),
checkNeedsReassessment: vi.fn(async () => null),
checkNeedsRunUat: vi.fn(async () => null),
}));
vi.mock("../preferences-models.js", () => ({
_initPrefsLoader: vi.fn(),
resolveModelWithFallbacksForUnit: vi.fn(() => ({ primary: "mock-model" })),
}));
import {
closeDatabase,
insertAssessment,
@ -326,4 +336,235 @@ describe("resolveDispatch canonical milestone plan", () => {
cleanup(base);
}
});
test("completed_slice_without_challenge_dispatches_slice_challenge_by_default", async () => {
const base = makeTempDir("sf-dispatch-slice-challenge-");
try {
mkdirSync(join(base, ".sf"), { recursive: true });
openDatabase(join(base, ".sf", "sf.db"));
insertMilestone({
id: "M901",
title: "Purpose challenge gate",
status: "active",
});
insertSlice({
milestoneId: "M901",
id: "S01",
title: "Completed work",
status: "complete",
sequence: 1,
});
const result = await resolveDispatch({
state: { phase: "completing-milestone" },
mid: "M901",
midTitle: "Purpose challenge gate",
basePath: base,
prefs: { phases: {} },
session: {},
pipelineVariant: "standard",
});
expect(result).toMatchObject({
action: "dispatch",
unitType: "challenge",
unitId: "M901/S01/challenge",
});
expect(result.prompt).toContain("slice S01");
} finally {
closeDatabase();
cleanup(base);
}
});
test("completing_milestone_after_slice_challenges_dispatches_milestone_challenge_before_completion", async () => {
const base = makeTempDir("sf-dispatch-milestone-challenge-");
try {
mkdirSync(join(base, ".sf"), { recursive: true });
openDatabase(join(base, ".sf", "sf.db"));
insertMilestone({
id: "M902",
title: "Milestone challenge gate",
status: "active",
});
insertSlice({
milestoneId: "M902",
id: "S01",
title: "Completed work",
status: "complete",
sequence: 1,
});
insertAssessment({
path: ".sf/milestones/M902/slices/S01/S01-CHALLENGE.md",
milestoneId: "M902",
sliceId: "S01",
scope: "slice-challenge:S01",
status: "pass",
fullContent:
"## Self-review Round\nok\n## Partner Round\nok\n## Adversary Round\nok\n## Overall Verdict\n- `PASS`\n",
});
insertAssessment({
path: ".sf/milestones/M902/M902-VALIDATION.md",
milestoneId: "M902",
scope: "milestone-validation",
status: "pass",
fullContent: "verdict: pass\nOperational: MET\n",
});
const result = await resolveDispatch({
state: { phase: "completing-milestone" },
mid: "M902",
midTitle: "Milestone challenge gate",
basePath: base,
prefs: { phases: {} },
session: {},
pipelineVariant: "standard",
});
expect(result).toMatchObject({
action: "dispatch",
unitType: "challenge",
unitId: "M902",
});
expect(result.prompt).toContain("milestone");
} finally {
closeDatabase();
cleanup(base);
}
});
test("completed_slice_when_challenge_file_has_no_verdict_dispatches_challenge_again", async () => {
const base = makeTempDir("sf-dispatch-invalid-challenge-");
try {
mkdirSync(join(base, ".sf", "milestones", "M904", "slices", "S01"), {
recursive: true,
});
writeFileSync(
join(
base,
".sf",
"milestones",
"M904",
"slices",
"S01",
"S01-CHALLENGE.md",
),
"# partial challenge without verdict\n",
);
openDatabase(join(base, ".sf", "sf.db"));
insertMilestone({
id: "M904",
title: "Invalid challenge backfill",
status: "active",
});
insertSlice({
milestoneId: "M904",
id: "S01",
title: "Completed work",
status: "complete",
sequence: 1,
});
const result = await resolveDispatch({
state: { phase: "planning" },
mid: "M904",
midTitle: "Invalid challenge backfill",
basePath: base,
prefs: { phases: {} },
session: {},
pipelineVariant: "standard",
});
expect(result).toMatchObject({
action: "dispatch",
unitType: "challenge",
unitId: "M904/S01/challenge",
});
} finally {
closeDatabase();
cleanup(base);
}
});
test("skipped_slice_without_challenge_does_not_backfill_challenge", async () => {
const base = makeTempDir("sf-dispatch-skipped-challenge-");
try {
mkdirSync(join(base, ".sf"), { recursive: true });
openDatabase(join(base, ".sf", "sf.db"));
insertMilestone({
id: "M905",
title: "Skipped slice challenge",
status: "active",
});
insertSlice({
milestoneId: "M905",
id: "S01",
title: "Skipped work",
status: "skipped",
sequence: 1,
});
const result = await resolveDispatch({
state: { phase: "planning" },
mid: "M905",
midTitle: "Skipped slice challenge",
basePath: base,
prefs: { phases: {} },
session: {},
pipelineVariant: "standard",
});
expect(result?.unitType).not.toBe("challenge");
} finally {
closeDatabase();
cleanup(base);
}
});
test("completing_milestone_when_adversarial_review_disabled_keeps_completion_path", async () => {
const base = makeTempDir("sf-dispatch-no-challenge-");
try {
mkdirSync(join(base, ".sf"), { recursive: true });
openDatabase(join(base, ".sf", "sf.db"));
insertMilestone({
id: "M903",
title: "Explicitly disabled challenge",
status: "active",
});
insertSlice({
milestoneId: "M903",
id: "S01",
title: "Completed work",
status: "complete",
sequence: 1,
});
insertAssessment({
path: ".sf/milestones/M903/M903-VALIDATION.md",
milestoneId: "M903",
scope: "milestone-validation",
status: "pass",
fullContent: "verdict: pass\nOperational: MET\n",
});
const result = await resolveDispatch({
state: { phase: "completing-milestone" },
mid: "M903",
midTitle: "Explicitly disabled challenge",
basePath: base,
prefs: { phases: {}, deploy: { adversarial_review: false } },
session: {},
pipelineVariant: "standard",
});
expect(result).toMatchObject({
action: "dispatch",
unitType: "complete-milestone",
unitId: "M903",
prompt: "complete milestone",
});
} finally {
closeDatabase();
cleanup(base);
}
});
});

View file

@ -11,8 +11,10 @@ import { join } from "node:path";
import { afterEach, test } from "vitest";
import { dispatchDirectPhase } from "../auto-direct-dispatch.js";
import { verifyExpectedArtifact } from "../auto-recovery.js";
import { executeSummarySave } from "../tools/workflow-tool-executors.js";
import {
closeDatabase,
getAssessmentByScope,
insertMilestone,
insertSlice,
openDatabase,
@ -130,3 +132,80 @@ test("dispatchDirectPhase_when_db_has_no_completed_slices_ignores_stale_roadmap_
},
]);
});
test("verifyExpectedArtifact_challenge_requires_overall_verdict", () => {
const project = makeProject();
const milestoneDir = join(project, ".sf", "milestones", "M990");
writeFileSync(join(milestoneDir, "M990-CHALLENGE.md"), "# Challenge\n");
assert.equal(verifyExpectedArtifact("challenge", "M990", project), false);
writeFileSync(
join(milestoneDir, "M990-CHALLENGE.md"),
["# Challenge", "", "## Overall Verdict", "- `PASS`", ""].join("\n"),
);
assert.equal(verifyExpectedArtifact("challenge", "M990", project), false);
writeFileSync(
join(milestoneDir, "M990-CHALLENGE.md"),
[
"# Challenge",
"",
"## Self-review Round",
"Internal critique.",
"",
"## Partner Round",
"Constructive critique.",
"",
"## Adversary Round",
"Hostile critique.",
"",
"## Overall Verdict",
"- `PASS`",
"",
].join("\n"),
);
assert.equal(verifyExpectedArtifact("challenge", "M990", project), true);
});
test("executeSummarySave_challenge_persists_assessment_scope", async () => {
const project = makeProject();
const result = await executeSummarySave(
{
milestone_id: "M990",
slice_id: "S01",
artifact_type: "CHALLENGE",
content: [
"# Challenge",
"",
"## Self-review Round",
"Internal critique.",
"",
"## Partner Round",
"Constructive critique.",
"",
"## Adversary Round",
"Hostile critique.",
"",
"## Finding 1: none",
"",
"## Overall Verdict",
"- `ADVISORY`",
"",
].join("\n"),
},
project,
);
assert.equal(result.isError, undefined);
const assessment = getAssessmentByScope("M990", "slice-challenge:S01");
assert.equal(assessment?.status, "advisory");
assert.equal(
assessment?.path,
".sf/milestones/M990/slices/S01/S01-CHALLENGE.md",
);
assert.equal(
verifyExpectedArtifact("challenge", "M990/S01/challenge", project),
true,
);
});

View file

@ -333,23 +333,23 @@ describe("selectAndApplyModel lineage-diverse wiring", () => {
);
});
// ── Non-lineage roles are not filtered ───────────────────────────────────────
test("validator_role_is_not_filtered", async () => {
// ── Validator review is lineage-diverse too ──────────────────────────────────
test("validator_explicit_workerModelId_filters_matching_vendor", async () => {
const candidates = [KIMI, ANTHROPIC, GOOGLE];
// validator's constraints are strict-json + review, NO lineage-diverse-from-worker
// → it should be able to pick kimi even when workerModelId is kimi
// We just assert no throw and that the call completes normally.
let threw = false;
try {
await dispatchFor({
candidates,
role: "validator",
workerModelId: "kimi-k2.6",
});
} catch {
threw = true;
}
assert.equal(threw, false, "validator dispatch should not throw");
const applied = await dispatchFor({
candidates,
role: "validator",
workerModelId: "kimi-k2.6",
});
if (applied === null) return;
const isMoonshot =
applied.provider === "kimi-coding" ||
applied.id.toLowerCase().startsWith("kimi-");
assert.equal(
isMoonshot,
false,
`validator should not get kimi when worker was kimi, got: ${applied.provider}/${applied.id}`,
);
});
// ── modelRoleForUnitType maps challenge → adversary ──────────────────────────
@ -360,6 +360,10 @@ describe("selectAndApplyModel lineage-diverse wiring", () => {
"../uok/model-route-evidence.js"
);
assert.equal(modelRoleForUnitType("challenge"), "adversary");
assert.equal(modelRoleForUnitType("combatant-review"), "reviewer");
assert.equal(modelRoleForUnitType("combatant"), "adversary");
assert.equal(modelRoleForUnitType("partner"), "reviewer");
assert.equal(modelRoleForUnitType("advocate"), "reviewer");
assert.equal(modelRoleForUnitType("review-slice"), "reviewer");
assert.equal(modelRoleForUnitType("validate-milestone"), "validator");
assert.equal(modelRoleForUnitType("execute-task"), "worker");

View file

@ -88,7 +88,11 @@ describe("model role policy", () => {
validator: {
role: "validator",
mode: "auto",
constraints: ["strict-json", "review"],
constraints: [
"strict-json",
"review",
"lineage-diverse-from-worker",
],
},
reviewer: {
role: "reviewer",
@ -140,6 +144,14 @@ describe("model role policy", () => {
);
});
test("validator_default_constraints_include_lineage_diverse_from_worker", () => {
const policy = normalizeRolePolicy("validator");
assert.ok(
policy.constraints.includes("lineage-diverse-from-worker"),
`expected lineage-diverse-from-worker in validator defaults, got: ${policy.constraints.join(", ")}`,
);
});
test("validateRolePolicy_adversary_with_lineage_diverse_from_worker_is_valid", () => {
const result = validateRolePolicy("adversary", {
constraints: ["lineage-diverse-from-worker"],

View file

@ -0,0 +1,78 @@
import assert from "node:assert/strict";
import { test } from "vitest";
import {
isLineageDiverseReviewSubagent,
resolveLineageDiverseSubagentModel,
} from "../subagent/lineage-routing.js";
const availableModels = [
{ provider: "kimi-coding", id: "kimi-k2.6" },
{ provider: "anthropic", id: "claude-sonnet-4-5" },
{ provider: "google-gemini-cli", id: "gemini-3-pro" },
];
test("isLineageDiverseReviewSubagent_detects_partner_and_combatant_lanes", () => {
assert.equal(isLineageDiverseReviewSubagent("reviewer", "audit"), true);
assert.equal(
isLineageDiverseReviewSubagent("reviewer", "Partner: strengthen plan"),
true,
);
assert.equal(
isLineageDiverseReviewSubagent("reviewer", "Combatant: break plan"),
true,
);
assert.equal(isLineageDiverseReviewSubagent("scout", "find files"), false);
});
test("resolveLineageDiverseSubagentModel_when_review_inherits_worker_family_selects_other_family", () => {
const model = resolveLineageDiverseSubagentModel({
agentName: "reviewer",
task: "Partner review",
requestedModel: undefined,
agentModel: undefined,
parentModel: { provider: "kimi-coding", id: "kimi-k2.6" },
availableModels,
});
assert.equal(model, "anthropic/claude-sonnet-4-5");
});
test("resolveLineageDiverseSubagentModel_when_explicit_same_family_selects_other_family", () => {
const model = resolveLineageDiverseSubagentModel({
agentName: "reviewer",
task: "Combatant review",
requestedModel: "kimi-coding/kimi-k2.6",
agentModel: undefined,
parentModel: { provider: "kimi-coding", id: "kimi-k2.6" },
availableModels,
});
assert.equal(model, "anthropic/claude-sonnet-4-5");
});
test("resolveLineageDiverseSubagentModel_when_non_review_keeps_requested_model", () => {
const model = resolveLineageDiverseSubagentModel({
agentName: "scout",
task: "Find files",
requestedModel: "kimi-coding/kimi-k2.6",
agentModel: undefined,
parentModel: { provider: "kimi-coding", id: "kimi-k2.6" },
availableModels,
});
assert.equal(model, "kimi-coding/kimi-k2.6");
});
test("resolveLineageDiverseSubagentModel_when_no_alternative_fails_open", () => {
const model = resolveLineageDiverseSubagentModel({
agentName: "reviewer",
task: "Partner review",
requestedModel: "kimi-coding/kimi-k2.6",
agentModel: undefined,
parentModel: { provider: "kimi-coding", id: "kimi-k2.6" },
availableModels: [{ provider: "kimi-coding", id: "kimi-k2.6" }],
});
assert.equal(model, "kimi-coding/kimi-k2.6");
});

View file

@ -12,6 +12,7 @@ import {
getMilestone,
getSliceStatusSummary,
getSliceTaskCounts,
insertAssessment,
readTransaction,
saveGateResult,
setSliceUatVerdict,
@ -34,6 +35,11 @@ export const SUPPORTED_SUMMARY_ARTIFACT_TYPES = [
"CONTEXT",
"ASSESSMENT",
"CONTEXT-DRAFT",
"CHALLENGE",
"DEPLOY",
"ROLLBACK",
"SMOKE",
"RELEASE",
];
export function isSupportedSummaryArtifactType(artifactType) {
return SUPPORTED_SUMMARY_ARTIFACT_TYPES.includes(artifactType);
@ -132,6 +138,20 @@ export async function executeSummarySave(params, basePath = process.cwd()) {
},
basePath,
);
if (params.artifact_type === "CHALLENGE") {
const verdict = extractChallengeVerdict(params.content);
insertAssessment({
path: `.sf/${relativePath}`,
milestoneId: params.milestone_id,
sliceId: params.slice_id ?? null,
taskId: params.task_id ?? null,
status: verdict,
scope: params.slice_id
? `slice-challenge:${params.slice_id}`
: "milestone-challenge",
fullContent: params.content,
});
}
// Persist UAT verdict to DB when an ASSESSMENT is saved at slice scope.
// This makes checkNeedsRunUat DB-only — no file fallback needed.
if (
@ -194,6 +214,16 @@ export async function executeSummarySave(params, basePath = process.cwd()) {
};
}
}
function extractChallengeVerdict(content) {
const text = String(content ?? "");
const explicit = text.match(
/Overall Verdict[\s\S]{0,200}?\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i,
);
if (explicit) return explicit[1].toLowerCase();
const fallback = text.match(/\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i);
return fallback ? fallback[1].toLowerCase() : "unknown";
}
export async function executeTaskComplete(params, basePath = process.cwd()) {
const dbAvailable = await ensureDbOpen(basePath);
if (!dbAvailable) {

View file

@ -39,7 +39,12 @@ import {
buildValidateMilestonePrompt,
buildWorkflowPreferencesPrompt,
} from "../auto-prompts.js";
import { hasImplementationArtifacts } from "../auto-recovery.js";
import {
hasMinimumChallengeRounds,
hasImplementationArtifacts,
verifyExpectedArtifact,
} from "../auto-recovery.js";
import { resolveExpectedArtifactPath } from "../auto-artifact-paths.js";
import { getCanonicalMilestonePlan } from "../canonical-milestone-plan.js";
import { resolveDeepProjectSetupState } from "../deep-project-setup-policy.js";
import { resolveEscalation } from "../escalation.js";
@ -79,6 +84,7 @@ import { createScheduleStore } from "../schedule/schedule-store.js";
import {
getMilestone,
getMilestoneSlices,
getAssessmentByScope,
getMilestoneValidationAssessment,
getPendingGates,
getRuntimeCounter,
@ -208,6 +214,66 @@ async function readMilestoneValidationForDispatch(basePath, mid) {
path: validationFile,
};
}
function adversarialReviewEnabled(prefs) {
return (
prefs?.deploy?.adversarial_review !== false &&
prefs?.phases?.skip_adversarial_review !== true
);
}
function challengeAssessmentScope(sliceId) {
return sliceId ? `slice-challenge:${sliceId}` : "milestone-challenge";
}
function challengeUnitId(milestoneId, sliceId) {
return sliceId ? `${milestoneId}/${sliceId}/challenge` : milestoneId;
}
function challengeAlreadyRecorded(basePath, milestoneId, sliceId = null) {
const expectedPath = resolveExpectedArtifactPath(
"challenge",
challengeUnitId(milestoneId, sliceId),
basePath,
);
if (
expectedPath &&
existsSync(expectedPath) &&
verifyExpectedArtifact(
"challenge",
challengeUnitId(milestoneId, sliceId),
basePath,
)
) {
return true;
}
if (!isDbAvailable()) return false;
const assessment = getAssessmentByScope(
milestoneId,
challengeAssessmentScope(sliceId),
);
if (!assessment) return false;
const status = String(assessment.status ?? "")
.trim()
.toLowerCase();
if (["", "unknown"].includes(status)) return false;
return hasMinimumChallengeRounds(assessment.full_content);
}
function findCompletedSliceNeedingChallenge(basePath, milestoneId) {
if (!isDbAvailable()) return null;
return getMilestoneSlices(milestoneId)
.filter((slice) => slice.status === "complete" || slice.status === "done")
.sort(
(a, b) =>
(a.sequence ?? Number.MAX_SAFE_INTEGER) -
(b.sequence ?? Number.MAX_SAFE_INTEGER) || a.id.localeCompare(b.id),
)
.find(
(slice) => !challengeAlreadyRecorded(basePath, milestoneId, slice.id),
);
}
function canonicalPlanStop(mid, plan) {
return {
action: "stop",
@ -887,6 +953,27 @@ export const DISPATCH_RULES = [
return null;
},
},
{
name: "completed-slice → challenge",
match: async ({ mid, midTitle, basePath, prefs }) => {
if (!adversarialReviewEnabled(prefs)) return null;
const slice = findCompletedSliceNeedingChallenge(basePath, mid);
if (!slice) return null;
return {
action: "dispatch",
unitType: "challenge",
unitId: challengeUnitId(mid, slice.id),
prompt: await buildChallengePrompt(
mid,
midTitle,
`slice ${slice.id}`,
prefs?.deploy?.adversarial_mode ?? "red-team",
basePath,
{ sliceId: slice.id },
),
};
},
},
{
name: "reassess-roadmap (post-completion)",
match: async ({ state, mid, midTitle, basePath, prefs }) => {
@ -1613,7 +1700,7 @@ export const DISPATCH_RULES = [
},
{
name: "completing-milestone → complete-milestone",
match: async ({ state, mid, midTitle, basePath }) => {
match: async ({ state, mid, midTitle, basePath, prefs }) => {
if (state.phase !== "completing-milestone") return null;
// Safety guard (#2675): completion is only automatic after a pass verdict.
// Non-pass terminal verdicts are still terminal for validation loops, but
@ -1799,6 +1886,23 @@ export const DISPATCH_RULES = [
} catch {
// Non-fatal advisory
}
if (
adversarialReviewEnabled(prefs) &&
!challengeAlreadyRecorded(basePath, mid)
) {
return {
action: "dispatch",
unitType: "challenge",
unitId: challengeUnitId(mid),
prompt: await buildChallengePrompt(
mid,
midTitle,
"milestone",
prefs?.deploy?.adversarial_mode ?? "red-team",
basePath,
),
};
}
return {
action: "dispatch",
unitType: "complete-milestone",
@ -1807,38 +1911,6 @@ export const DISPATCH_RULES = [
};
},
},
{
name: "completing-milestone → challenge",
match: async ({ state, mid, midTitle, basePath, prefs }) => {
if (state.phase !== "completing-milestone") return null;
if (!prefs?.deploy?.adversarial_review) return null;
// Only trigger if no challenge assessment exists for this milestone yet
try {
const { getDatabase } = await import("./sf-db.js");
const db = getDatabase(basePath);
const row = db
.prepare(
"SELECT id FROM assessments WHERE milestone_id = ? AND artifact_type = 'CHALLENGE' LIMIT 1",
)
.get(mid);
if (row) return null;
} catch {
return null;
}
return {
action: "dispatch",
unitType: "challenge",
unitId: `challenge-${mid}`,
prompt: await buildChallengePrompt(
mid,
midTitle,
"milestone",
prefs?.deploy?.adversarial_mode ?? "red-team",
basePath,
),
};
},
},
{
name: "completing-milestone → release",
match: async ({ state, mid, midTitle, basePath, prefs }) => {

View file

@ -38,13 +38,9 @@ export const SUPPORTED_MODEL_ROLES = Object.freeze([
* role's resolved model). Prevents "looks fine to me" rubber-stamp by
* lineage-twins. When the worker model is unknown or maps to "unknown" vendor,
* this constraint is treated as a no-op (fail-open rather than blocking all
* candidates). Enforcement is best-effort at the model-selection layer.
*
* TODO(lineage-diverse-from-worker): wire handler in auto-model-selection.js
* selectAndApplyModel after the policy gate resolves `routingEligibleModels`,
* call resolveWorkerLineage(resolvedWorkerModelId) and filter candidates with
* !isSameRootVendor(candidate.id, workerVendor). See rootVendorFor() in this
* file for the canonical vendor mapping.
* candidates). Enforcement happens in auto-model-selection.js before final
* route resolution. See rootVendorFor() in this file for the canonical vendor
* mapping.
*/
export const SUPPORTED_MODEL_ROLE_CONSTRAINTS = Object.freeze([
"coding",
@ -79,7 +75,7 @@ const CONCRETE_ROUTE_KEYS = new Set([
export const DEFAULT_MODEL_ROLE_CONSTRAINTS = Object.freeze({
orchestrator: Object.freeze(["long-context", "strict-json"]),
worker: Object.freeze(["coding"]),
validator: Object.freeze(["strict-json", "review"]),
validator: Object.freeze(["strict-json", "review", "lineage-diverse-from-worker"]),
reviewer: Object.freeze(["review", "lineage-diverse-from-worker"]),
adversary: Object.freeze(["review", "lineage-diverse-from-worker"]),
});

View file

@ -23,9 +23,13 @@ export function modelRoleForUnitType(unitType) {
if (normalized.includes("review") || normalized.includes("scrutiny")) {
return "reviewer";
}
if (normalized.includes("partner") || normalized.includes("advocate")) {
return "reviewer";
}
if (
normalized.includes("adversar") ||
normalized.includes("challenge") ||
normalized.includes("combatant") ||
normalized.includes("red-team") ||
normalized.includes("redteam")
) {