From c2f101734f4ed7d8c1467bc8572884cf5218839d Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sun, 17 May 2026 18:00:09 +0200 Subject: [PATCH] feat: enforce purpose-first adversarial review --- STYLEGUIDE.md | 5 + docs/SPEC_FIRST_TDD.md | 17 +- .../extensions/sf/auto-artifact-paths.js | 12 + .../extensions/sf/auto-model-selection.js | 12 +- src/resources/extensions/sf/auto-prompts.js | 4 + src/resources/extensions/sf/auto-recovery.js | 28 ++ .../extensions/sf/bootstrap/db-tools.js | 10 +- .../extensions/sf/preferences-types.js | 5 + .../extensions/sf/prompts/challenge.md | 44 +++- .../sf/prompts/guided-plan-milestone.md | 2 +- .../extensions/sf/prompts/plan-milestone.md | 2 +- .../sf/prompts/validate-milestone.md | 2 +- .../sf/skills/advisory-partner/SKILL.md | 19 +- .../sf/skills/spec-first-tdd/SKILL.md | 6 +- src/resources/extensions/sf/subagent/index.js | 77 +++++- .../extensions/sf/subagent/lineage-routing.js | 65 +++++ .../auto-dispatch-canonical-plan.test.mjs | 241 ++++++++++++++++++ .../db-driven-recovery-dispatch.test.mjs | 79 ++++++ .../sf/tests/lineage-diverse-filter.test.mjs | 36 +-- .../sf/tests/model-role-policy.test.mjs | 14 +- .../tests/subagent-lineage-routing.test.mjs | 78 ++++++ .../sf/tools/workflow-tool-executors.js | 30 +++ .../extensions/sf/uok/auto-dispatch.js | 140 +++++++--- .../extensions/sf/uok/model-role-policy.js | 12 +- .../extensions/sf/uok/model-route-evidence.js | 4 + 25 files changed, 858 insertions(+), 86 deletions(-) create mode 100644 src/resources/extensions/sf/subagent/lineage-routing.js create mode 100644 src/resources/extensions/sf/tests/subagent-lineage-routing.test.mjs diff --git a/STYLEGUIDE.md b/STYLEGUIDE.md index b0ad1a834..a4846c413 100644 --- a/STYLEGUIDE.md +++ b/STYLEGUIDE.md @@ -54,9 +54,14 @@ Required sections for non-trivial exports: - **First line** — what it returns / does, present tense. - **Purpose:** — why it exists; the value it protects. - **Consumer:** — who calls it in production. No consumer = symbol shouldn't exist yet. +- **What tests should verify:** — for non-trivial behavior, name the consumer-visible contract the tests must prove. This is not a list of mocks or implementation steps. A bare `/** Helper. */` is a code smell. Either write the purpose or delete the symbol. +Purpose text is a contract, not decoration. If implementation and purpose +diverge, fix the code or rewrite the purpose with a new failing contract test +first. + ### Module-level JSDoc ```js diff --git a/docs/SPEC_FIRST_TDD.md b/docs/SPEC_FIRST_TDD.md index 81b28e426..3304eb4f0 100644 --- a/docs/SPEC_FIRST_TDD.md +++ b/docs/SPEC_FIRST_TDD.md @@ -73,6 +73,10 @@ If any field is missing: `BLOCKED: purpose unclear — [which field is missing]` Treat the contract as a **falsifiable hypothesis**: name the evidence that would prove it wrong before implementation locks in. A contract without a falsifier is half a contract. +**Purpose outranks implementation momentum.** If code, tests, docs, or agent +memory disagree, the purpose/consumer/contract chain wins. Fix the artifact that +drifted; do not reinterpret the purpose to excuse the implementation. + ## Workflow (mapped to sf's phase machine) ### Research phase — name the problem @@ -99,8 +103,11 @@ Clarify highest-impact unknowns first: behaviour, acceptance criteria, data inva For non-trivial contracts, pressure-test before locking the plan via the [`advisory-partner`](../src/resources/extensions/sf/skills/advisory-partner/SKILL.md) skill — this is sf's adversarial review surface, already wired into the Q3/Q4 gates and `validate-milestone`. It runs with the **validation** model, distinct from the planning/execution model — that's the point. 1. **Advocate pass** — strengthen the best version of the contract. -2. **Challenger pass** — attack assumptions AND propose an alternative. A challenger anchored to the advocate's framing is not adversarial. -3. **Falsifier (required gate, blocks Plan→Execute):** `FALSIFIER: this contract is wrong if [specific observable condition].` Generic falsifiers ("wrong if it doesn't work") are process failures. +2. **Partner pass** — improve the plan as a collaborator: sharpen the purpose, + name the real consumer, remove unnecessary scope, and identify the smallest + proof that creates value. +3. **Challenger pass** — attack assumptions AND propose an alternative. A challenger anchored to the advocate's framing is not adversarial. +4. **Falsifier (required gate, blocks Plan→Execute):** `FALSIFIER: this contract is wrong if [specific observable condition].` Generic falsifiers ("wrong if it doesn't work") are process failures. **Find the devil and find the experts:** @@ -179,10 +186,14 @@ After completing a step, state confidence as a number `0.0–1.0` and a one-line | Purpose & consumer | 0.95 | Run an adversarial review wave (advisory-partner Q3/Q5). | | Contract test | 0.90 | Adversarial review wave. | | Implementation | 0.95 | Add a specialist reviewer for the touched boundary (e.g. provider/transport/security). | -| Final evidence | 0.97 | Full adversarial: advocate + challenger + specialist. | +| Final evidence | 0.98 | Full adversarial: advocate + partner + challenger + specialist. | Skip the gate for trivial steps (typo fix, exhaustive matches with full coverage). The gate earns its keep on I/O boundaries, async loading, protocol integration, and anything touching real backends or models. +Below `0.98`, do not ship non-trivial autonomous behaviour. Persist the gap as +an adversarial finding, write the missing contract test, or explicitly downgrade +the scope before claiming completion. + LLM confidence numbers are poorly calibrated in absolute terms — the *relative* signal matters. If you write 0.7, you know you're guessing. Act on that. ## Tests Find Gaps diff --git a/src/resources/extensions/sf/auto-artifact-paths.js b/src/resources/extensions/sf/auto-artifact-paths.js index a677818ad..5676d2598 100644 --- a/src/resources/extensions/sf/auto-artifact-paths.js +++ b/src/resources/extensions/sf/auto-artifact-paths.js @@ -77,6 +77,14 @@ export function resolveExpectedArtifactPath(unitType, unitId, base) { const dir = resolveMilestonePath(base, mid); return dir ? join(dir, buildMilestoneFileName(mid, "VALIDATION")) : null; } + case "challenge": { + if (sid) { + const dir = resolveSlicePath(base, mid, sid); + return dir ? join(dir, buildSliceFileName(sid, "CHALLENGE")) : null; + } + const dir = resolveMilestonePath(base, mid); + return dir ? join(dir, buildMilestoneFileName(mid, "CHALLENGE")) : null; + } case "complete-milestone": { const dir = resolveMilestonePath(base, mid); return dir ? join(dir, buildMilestoneFileName(mid, "SUMMARY")) : null; @@ -130,6 +138,10 @@ export function diagnoseExpectedArtifact(unitType, unitId, base) { return `${relSliceFile(base, mid, sid, "ASSESSMENT")} (UAT assessment result)`; case "validate-milestone": return `${relMilestoneFile(base, mid, "VALIDATION")} (milestone validation report)`; + case "challenge": + return sid + ? `${relSliceFile(base, mid, sid, "CHALLENGE")} (adversarial slice challenge)` + : `${relMilestoneFile(base, mid, "CHALLENGE")} (adversarial milestone challenge)`; case "complete-milestone": return `${relMilestoneFile(base, mid, "SUMMARY")} (milestone summary)`; default: diff --git a/src/resources/extensions/sf/auto-model-selection.js b/src/resources/extensions/sf/auto-model-selection.js index f44a9a31c..2f71ab4f5 100644 --- a/src/resources/extensions/sf/auto-model-selection.js +++ b/src/resources/extensions/sf/auto-model-selection.js @@ -484,9 +484,9 @@ export async function selectAndApplyModel( ); } } - // ── Lineage-diverse filter (adversary / reviewer roles) ─────────────────── - // When the dispatching role is "adversary" or "reviewer" and its constraint - // set includes "lineage-diverse-from-worker", remove candidates that share + // ── Lineage-diverse filter (review roles) ────────────────────────────────── + // When the dispatching role carries "lineage-diverse-from-worker", remove + // candidates that share // root-vendor lineage with the worker model to prevent rubber-stamp reviews. // Fail-open: if filtering would leave zero candidates, log a warning and keep // the full eligible set so the dispatch is never completely blocked. @@ -499,7 +499,9 @@ export async function selectAndApplyModel( { const normalizedRole = String(role ?? "").toLowerCase(); const isLineageRole = - normalizedRole === "adversary" || normalizedRole === "reviewer"; + normalizedRole === "adversary" || + normalizedRole === "reviewer" || + normalizedRole === "validator"; if (isLineageRole) { const roleConstraints = DEFAULT_MODEL_ROLE_CONSTRAINTS[normalizedRole] ?? []; @@ -783,7 +785,7 @@ export async function selectAndApplyModel( } // Enforce lineage-diverse-from-worker: skip candidates whose root vendor // matches the worker's. lineageAllowedKeys is populated only when the - // constraint applies (adversary / reviewer with a known worker model id). + // constraint applies for a review role with a known worker model id. if (lineageAllowedKeys) { const key = `${model.provider.toLowerCase()}/${model.id.toLowerCase()}`; if (!lineageAllowedKeys.has(key)) { diff --git a/src/resources/extensions/sf/auto-prompts.js b/src/resources/extensions/sf/auto-prompts.js index 71e8121cf..f88c53bea 100644 --- a/src/resources/extensions/sf/auto-prompts.js +++ b/src/resources/extensions/sf/auto-prompts.js @@ -3689,6 +3689,7 @@ export async function buildChallengePrompt( challengeTarget, challengeMode, base, + options = {}, ) { const resolveArtifact = async (key) => { switch (key) { @@ -3710,6 +3711,9 @@ export async function buildChallengePrompt( milestoneTitle: midTitle, challengeTarget: challengeTarget ?? "milestone", challengeMode: challengeMode ?? "red-team", + saveSummaryScope: options.sliceId + ? `milestone_id: ${mid}, slice_id: ${options.sliceId}, artifact_type: "CHALLENGE"` + : `milestone_id: ${mid}, artifact_type: "CHALLENGE"`, inlinedContext, skillActivation: buildSkillActivationBlock({ base, diff --git a/src/resources/extensions/sf/auto-recovery.js b/src/resources/extensions/sf/auto-recovery.js index c68966b19..b352d90f3 100644 --- a/src/resources/extensions/sf/auto-recovery.js +++ b/src/resources/extensions/sf/auto-recovery.js @@ -60,6 +60,26 @@ import { logError, logWarning } from "./workflow-logger.js"; // Re-export so existing consumers of auto-recovery.ts keep working. export { diagnoseExpectedArtifact, resolveExpectedArtifactPath }; // ─── Artifact Resolution & Verification ─────────────────────────────────────── +/** + * Return true when a challenge report contains the minimum review lenses. + * + * Purpose: prevent one challenge verdict from becoming "truth"; SF requires + * self-review, partner review, and adversary review evidence before a completed + * slice or milestone is considered challenged. + * + * Consumer: challenge artifact verification and auto-dispatch backfill guards. + */ +export function hasMinimumChallengeRounds(content) { + const text = String(content ?? ""); + return ( + /self(?:[-\s]+review|[-\s]+critique|[-\s]+check)/i.test(text) && + /partner(?:[-\s]+review|[-\s]+round|[-\s]+pass|[-\s]+check)?/i.test(text) && + /(adversary|adversarial|combatant|red[-\s]*team)(?:[-\s]+review|[-\s]+round|[-\s]+pass|[-\s]+check)?/i.test( + text, + ) + ); +} + /** * Check whether a milestone produced implementation artifacts (non-`.sf/` files) * in the git history. Uses `git log --name-only` to inspect all commits on the @@ -320,6 +340,14 @@ export function verifyExpectedArtifact(unitType, unitId, base) { const validationContent = readFileSync(absPath, "utf-8"); if (!isValidationTerminal(validationContent)) return false; } + if (unitType === "challenge") { + const challengeContent = readFileSync(absPath, "utf-8"); + if (!/Overall Verdict/i.test(challengeContent)) return false; + if (!/\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i.test(challengeContent)) { + return false; + } + if (!hasMinimumChallengeRounds(challengeContent)) return false; + } if (unitType === "plan-milestone" || unitType === "roadmap-meeting") { try { const roadmapContent = readFileSync(absPath, "utf-8"); diff --git a/src/resources/extensions/sf/bootstrap/db-tools.js b/src/resources/extensions/sf/bootstrap/db-tools.js index 17633bb4e..be00c5980 100644 --- a/src/resources/extensions/sf/bootstrap/db-tools.js +++ b/src/resources/extensions/sf/bootstrap/db-tools.js @@ -393,13 +393,13 @@ export function registerDbTools(pi) { name: "save_summary", label: "Save Summary", description: - "Save a research, summary, context, or assessment artifact to disk with an auto-computed path. " + - "Call this to persist planning or research output (e.g. a research brief, context doc, or summary) for a milestone, slice, or task.", + "Save a research, summary, context, assessment, challenge, or ops artifact to disk with an auto-computed path. " + + "Call this to persist planning, research, adversarial review, or operational output for a milestone, slice, or task.", promptSnippet: - "Save a planning artifact (SUMMARY/RESEARCH/CONTEXT/ASSESSMENT) to disk", + "Save a planning artifact (SUMMARY/RESEARCH/CONTEXT/ASSESSMENT/CHALLENGE) to disk", promptGuidelines: [ "milestone_id is required; slice_id and task_id are optional and determine the file path.", - "artifact_type must be one of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT.", + "artifact_type must be one of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT, CHALLENGE, DEPLOY, ROLLBACK, SMOKE, RELEASE.", "Use CONTEXT-DRAFT for incremental saves; use CONTEXT only for the final milestone context after verification.", ], parameters: Type.Object({ @@ -412,7 +412,7 @@ export function registerDbTools(pi) { ), artifact_type: Type.String({ description: - "One of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT", + "One of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT, CHALLENGE, DEPLOY, ROLLBACK, SMOKE, RELEASE", }), content: Type.String({ description: "The full markdown content of the artifact", diff --git a/src/resources/extensions/sf/preferences-types.js b/src/resources/extensions/sf/preferences-types.js index 13a02c216..1e5fdeb03 100644 --- a/src/resources/extensions/sf/preferences-types.js +++ b/src/resources/extensions/sf/preferences-types.js @@ -166,6 +166,11 @@ export const KNOWN_UNIT_TYPES = [ "run-uat", "complete-milestone", "validate-milestone", + "challenge", + "deploy", + "smoke-production", + "release", + "rollback", "rewrite-docs", "discuss-milestone", "discuss-slice", diff --git a/src/resources/extensions/sf/prompts/challenge.md b/src/resources/extensions/sf/prompts/challenge.md index 9b40c5887..d5893f3b8 100644 --- a/src/resources/extensions/sf/prompts/challenge.md +++ b/src/resources/extensions/sf/prompts/challenge.md @@ -18,6 +18,19 @@ You are executing SF autonomous mode. You are the adversary agent. Your job is NOT to complete work — it is to rigorously attack the correctness, safety, and assumptions of what has been built or planned. You represent the failure modes the product team has not considered. +Challenge output is evidence, not truth. A challenge report is not acceptable +until it contains all three review rounds: + +1. **Self-review round** — the original model's best internal critique of its + own likely blind spots and weak assumptions. +2. **Partner round** — the constructive partner view: strengthen the purpose, + consumer, smallest valuable proof, and remediation shape. +3. **Adversary round** — the hostile/combatant view: break the claim, find + counterexamples, and name blocking risks. + +If any round is missing, the challenge is incomplete even if it has an Overall +Verdict. + ### Challenge rules 1. **Be ruthless but specific.** Every finding must include: what breaks, under what condition, and what the impact is. @@ -36,6 +49,19 @@ Based on `{{challengeMode}}`: ### Output format +Start with these required sections: + +``` +## Self-review Round + + +## Partner Round + + +## Adversary Round + +``` + For each finding: ``` @@ -54,11 +80,25 @@ After all findings, provide an **Overall Verdict**: - `NEEDS-REMEDIATION` — one or more critical/high findings must be addressed before deploy. - `ADVISORY` — findings are low severity; proceed with awareness. -Call `save_summary` with `milestone_id: {{milestoneId}}`, `artifact_type: "CHALLENGE"`, and the full challenge report as content. +Call `save_summary` with `{{saveSummaryScope}}`, and the full challenge report as content. ### Report sf-internal observations -If you observe sf-the-tool friction during this unit, file via `report_issue`. +If the target is already closed and you find real remediation work, do **not** +reopen or edit the closed slice/milestone yourself. Closed history stays closed. +Instead: + +- File each critical/high finding with `report_issue` using + `kind: "adversarial-finding"`, the finding severity, concrete evidence, and + falsifiable acceptance criteria. +- Recommend a new remediation slice/task in the finding. The planner/triage + path owns opening new work from that evidence. +- Use `NEEDS-REMEDIATION` only when the finding should block the current + milestone/deploy path. Use `ADVISORY` for follow-up work that can be queued + without blocking. + +If you observe other sf-the-tool friction during this unit, file via +`report_issue`. When done, say: "Challenge {{milestoneId}} complete — verdict: ." diff --git a/src/resources/extensions/sf/prompts/guided-plan-milestone.md b/src/resources/extensions/sf/prompts/guided-plan-milestone.md index 4e2751e2e..7a0d3794d 100644 --- a/src/resources/extensions/sf/prompts/guided-plan-milestone.md +++ b/src/resources/extensions/sf/prompts/guided-plan-milestone.md @@ -1,6 +1,6 @@ Plan milestone {{milestoneId}} ("{{milestoneTitle}}"). Decisions and requirements are injected from the database into your system context — respect existing decisions and treat Active requirements as the capability contract. If no requirements are present, treat that as a planning gap: derive the minimum requirement coverage from current project evidence, persist it through `save_requirement`, and explicitly note missing coverage. Use the **Roadmap** output template below to shape the milestone planning payload you send to `plan_milestone`. Start the `vision` field with the milestone purpose before implementation detail, include the structured `productResearch` payload when the work is product-facing, workflow-facing, developer-experience, or market-positioning, and make each slice `goal` state the slice purpose before mechanics. If the milestone changes how SF is driven, observed, integrated, or automated, keep the axes separate in the roadmap: surface (TUI/CLI/web/editor/machine), protocol (ACP/RPC/stdio/HTTP/wire), output format (text/json/stream-json), run control (manual/assisted/autonomous), and permission profile (restricted/normal/trusted/unrestricted). Call `plan_milestone` to persist the milestone planning fields and render `{{milestoneId}}-ROADMAP.md` from DB state. Do **not** write `{{milestoneId}}-ROADMAP.md`, `ROADMAP.md`, or other planning artifacts manually. If planning produces structural decisions, call `save_decision`. {{skillActivation}} Fill the Horizontal Checklist section with cross-cutting concerns considered during planning (requirements re-read, decisions re-evaluated, graceful shutdown, revenue paths, auth boundary, shared resources, reconnection). Omit for trivial milestones. -Before calling `plan_milestone`, run a bounded **Vision Alignment Meeting** for the milestone and roadmap as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is allowed to be broader and more nuanced than slice planning. Include at least these participant lenses: +Before calling `plan_milestone`, run a bounded **Vision Alignment Meeting** for the milestone and roadmap as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. Do not set a `model` override for Partner, Combatant, or validation-style reviewers; SF's reviewer/adversary routing must choose a lineage-diverse model family from the worker whenever possible. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is allowed to be broader and more nuanced than slice planning. Include at least these participant lenses: - Product Manager - User Advocate - Customer Panel diff --git a/src/resources/extensions/sf/prompts/plan-milestone.md b/src/resources/extensions/sf/prompts/plan-milestone.md index 530ad206a..78d1d7afc 100644 --- a/src/resources/extensions/sf/prompts/plan-milestone.md +++ b/src/resources/extensions/sf/prompts/plan-milestone.md @@ -51,7 +51,7 @@ If milestone research exists (inlined above), trust those findings and skip redu Narrate your decomposition reasoning - why you're grouping work this way, what risks are driving the order, what verification strategy you're choosing and why. Use complete sentences rather than planner shorthand or fragmentary notes. -Before you persist the roadmap, run a bounded **Vision Alignment Meeting** as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses: +Before you persist the roadmap, run a bounded **Vision Alignment Meeting** as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. Do not set a `model` override for Partner, Combatant, or validation-style reviewers; SF's reviewer/adversary routing must choose a lineage-diverse model family from the worker whenever possible. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses: Each role has a **purpose** (why it exists in the ceremony) and a **depth contract** (what a meaningful contribution must contain). A contribution that restates the roadmap title without evidence, specific risks, or concrete subsystem references is not meaningful — it is a rubber stamp. diff --git a/src/resources/extensions/sf/prompts/validate-milestone.md b/src/resources/extensions/sf/prompts/validate-milestone.md index 5deb5e242..c30bce732 100644 --- a/src/resources/extensions/sf/prompts/validate-milestone.md +++ b/src/resources/extensions/sf/prompts/validate-milestone.md @@ -23,7 +23,7 @@ All relevant context has been preloaded below — the roadmap, all slice summari ### Step 1 — Dispatch Parallel Reviewers Call `subagent` with `tasks: [...]` containing ALL THREE reviewers simultaneously. -Use `agent: "reviewer"` for every validation reviewer. Do not use `code`, `coder`, or `worker` here — this is review/validation work, not implementation. +Use `agent: "reviewer"` for every validation reviewer. Do not use `code`, `coder`, or `worker` here — this is review/validation work, not implementation. Do not set a `model` override; SF's reviewer/validator routing should choose a model family that is lineage-diverse from the worker when possible. **Pass `parentTrace`** at the batch level. For milestone validation, the "parent" is the slice work that built this milestone — pass a one-line-per- diff --git a/src/resources/extensions/sf/skills/advisory-partner/SKILL.md b/src/resources/extensions/sf/skills/advisory-partner/SKILL.md index 0531151eb..f43a08ca8 100644 --- a/src/resources/extensions/sf/skills/advisory-partner/SKILL.md +++ b/src/resources/extensions/sf/skills/advisory-partner/SKILL.md @@ -67,7 +67,20 @@ If you can't fill this in, the plan is incomplete. --- -### 2. Five Challenger Questions +### 2. Partner Pass + +Before attacking the plan, improve it as a partner: + +- Restate the purpose in one sentence. +- Name the real production consumer. +- Remove scope that does not serve that purpose. +- Identify the smallest proof point that gives the consumer value. + +If this pass cannot name purpose and consumer, stop with `RECONSIDER`. + +--- + +### 3. Five Challenger Questions Answer each from the artifact — not from general knowledge: @@ -88,7 +101,7 @@ Write the objection. Then answer it. If the answer is weak, that's a flag. --- -### 3. Trap Scan +### 4. Trap Scan | Trap | Warning Sign | |------|-------------| @@ -100,7 +113,7 @@ Write the objection. Then answer it. If the answer is weak, that's a flag. --- -### 4. Verdict +### 5. Verdict ``` ADVISORY VERDICT: [PROCEED / PROCEED WITH CAVEAT / RECONSIDER] diff --git a/src/resources/extensions/sf/skills/spec-first-tdd/SKILL.md b/src/resources/extensions/sf/skills/spec-first-tdd/SKILL.md index 5f2be3bc6..563bf3249 100644 --- a/src/resources/extensions/sf/skills/spec-first-tdd/SKILL.md +++ b/src/resources/extensions/sf/skills/spec-first-tdd/SKILL.md @@ -121,10 +121,14 @@ After each major step, state confidence as `0.0–1.0` plus a one-line reason. | Purpose & consumer | 0.95 | Run `advisory-partner` Q3 (abuse) + Q5 (strongest objection). | | Contract test | 0.90 | Adversarial review wave. | | Implementation | 0.95 | Add a specialist reviewer for the boundary touched. | -| Evidence | 0.97 | Full advocate + challenger + specialist. | +| Evidence | 0.98 | Full advocate + partner + challenger + specialist. | LLM confidence is poorly calibrated in absolute terms — the relative signal matters. If you write 0.7, you're guessing. Act on it. +Below `0.98`, non-trivial autonomous behaviour does not ship. Persist the gap +as an adversarial finding, write the missing contract test, or explicitly shrink +scope before claiming completion. + ### 8. Verify Consumer and Record Evidence - Re-run `rg` for the changed symbol; confirm a real caller still depends on it. diff --git a/src/resources/extensions/sf/subagent/index.js b/src/resources/extensions/sf/subagent/index.js index c6d4795d4..12beb7d78 100644 --- a/src/resources/extensions/sf/subagent/index.js +++ b/src/resources/extensions/sf/subagent/index.js @@ -50,6 +50,7 @@ import { mergeDeltaPatches, readIsolationMode, } from "./isolation.js"; +import { resolveLineageDiverseSubagentModel } from "./lineage-routing.js"; import { composeAgentPrompt } from "./prompt-parts.js"; import { registerWorker, updateWorker } from "./worker-registry.js"; @@ -371,6 +372,25 @@ function validateSubagentInvocationAgainstInheritance( } return { ok: true }; } + +function resolveSubagentModelOverride({ + agents, + agentName, + task, + requestedModel, + parentModel, + availableModels, +}) { + const { agent, effectiveName } = resolveAgentByName(agents, agentName); + return resolveLineageDiverseSubagentModel({ + agentName: effectiveName ?? agentName, + task, + requestedModel, + agentModel: agent?.model, + parentModel, + availableModels, + }); +} async function executeSubagentInvocation({ defaultCwd, agents, @@ -383,6 +403,8 @@ async function executeSubagentInvocation({ cmuxSplitsEnabled, useIsolation, inheritanceEnvelope, + parentModel, + availableModels, }) { const makeDetails = (mode) => (results) => ({ mode, @@ -430,7 +452,14 @@ async function executeSubagentInvocation({ signal, chainUpdate, makeDetails("chain"), - step.model ?? params.model, + resolveSubagentModelOverride({ + agents, + agentName: step.agent, + task: taskForStep, + requestedModel: step.model ?? params.model, + parentModel, + availableModels, + }), inheritanceEnvelope, ); results.push(result); @@ -582,7 +611,14 @@ async function executeSubagentInvocation({ async (t, index) => { const resultIndex = (round - 1) * batchTasks.length + index; const prompt = buildDebatePrompt(t, round, transcript); - const taskModelOverride = t.model ?? params.model; + const taskModelOverride = resolveSubagentModelOverride({ + agents, + agentName: t.agent, + task: prompt, + requestedModel: t.model ?? params.model, + parentModel, + availableModels, + }); const result = cmuxSplitsEnabled ? await runSingleAgentInCmuxSplit( cmuxClient, @@ -749,11 +785,18 @@ async function executeSubagentInvocation({ batchSize, batchId, ); - const taskModelOverride = t.model ?? params.model; const taskWithTrace = composeTaskWithParentTrace( t.task, t.parentTrace ?? params.parentTrace, ); + const taskModelOverride = resolveSubagentModelOverride({ + agents, + agentName: t.agent, + task: taskWithTrace, + requestedModel: t.model ?? params.model, + parentModel, + availableModels, + }); const runTask = () => cmuxSplitsEnabled ? runSingleAgentInCmuxSplit( @@ -861,7 +904,14 @@ async function executeSubagentInvocation({ signal, onUpdate, makeDetails("single"), - params.model, + resolveSubagentModelOverride({ + agents, + agentName: params.agent, + task: singleTaskWithTrace, + requestedModel: params.model, + parentModel, + availableModels, + }), inheritanceEnvelope, ) : await runSingleAgent( @@ -874,7 +924,14 @@ async function executeSubagentInvocation({ signal, onUpdate, makeDetails("single"), - params.model, + resolveSubagentModelOverride({ + agents, + agentName: params.agent, + task: singleTaskWithTrace, + requestedModel: params.model, + parentModel, + availableModels, + }), inheritanceEnvelope, ); if (isolation) { @@ -1994,6 +2051,10 @@ export default function (pi) { }); const cmuxClient = CmuxClient.fromPreferences(effectivePreferences); const cmuxSplitsEnabled = cmuxClient.getConfig().splits; + const parentModel = ctx.model + ? { provider: ctx.model.provider, id: ctx.model.id } + : null; + const availableModels = ctx.modelRegistry?.getAvailable?.() ?? []; // Resolve isolation mode const isolationMode = readIsolationMode(); const useIsolation = Boolean(params.isolated) && isolationMode !== "none"; @@ -2116,6 +2177,8 @@ export default function (pi) { cmuxSplitsEnabled, useIsolation, inheritanceEnvelope, + parentModel, + availableModels, }), } : null; @@ -2138,6 +2201,8 @@ export default function (pi) { cmuxSplitsEnabled, useIsolation, inheritanceEnvelope, + parentModel, + availableModels, }).then((result) => { // Append agent turn BEFORE deliverResult fires the notification. if (jobId && isSingleMode) { @@ -2176,6 +2241,8 @@ export default function (pi) { cmuxSplitsEnabled, useIsolation, inheritanceEnvelope, + parentModel, + availableModels, }); }, renderCall(args, theme) { diff --git a/src/resources/extensions/sf/subagent/lineage-routing.js b/src/resources/extensions/sf/subagent/lineage-routing.js new file mode 100644 index 000000000..83947f745 --- /dev/null +++ b/src/resources/extensions/sf/subagent/lineage-routing.js @@ -0,0 +1,65 @@ +import { isSameRootVendor } from "../uok/model-role-policy.js"; + +const REVIEW_AGENT_RE = + /\b(reviewer|review-code|sf-reviewer|partner|advocate|adversary|combatant|critic|validator)\b/i; +const REVIEW_TASK_RE = + /\b(partner|advocate|reviewer|review|adversary|adversarial|combatant|challenger|critic|validation|validator)\b/i; + +function modelKey(model) { + if (!model) return undefined; + if (typeof model === "string") return model; + if (model.provider && model.id) return `${model.provider}/${model.id}`; + return model.id; +} + +/** + * Return true when a subagent request is review-like and needs lineage diversity. + * + * Purpose: keep partner/advisory/adversarial lanes from sharing the worker's + * blind spots just because they are all launched through the generic + * `subagent` tool. + * + * Consumer: subagent/index.js before launching single, chain, parallel, or + * debate subagents. + */ +export function isLineageDiverseReviewSubagent(agentName, task) { + return ( + REVIEW_AGENT_RE.test(String(agentName ?? "")) || + REVIEW_TASK_RE.test(String(task ?? "")) + ); +} + +/** + * Resolve a subagent model override that avoids the parent worker lineage. + * + * Purpose: enforce R113 at launch time: partner, reviewer, validator, and + * combatant/adversary subagents should use a different model family from the + * parent worker when an alternative is available. If no alternative exists, + * fail open and keep the requested model rather than blocking review entirely. + * + * Consumer: subagent/index.js model override resolution. + */ +export function resolveLineageDiverseSubagentModel({ + agentName, + task, + requestedModel, + agentModel, + parentModel, + availableModels = [], +}) { + const requested = requestedModel ?? agentModel; + if (!isLineageDiverseReviewSubagent(agentName, task)) return requestedModel; + const parentKey = modelKey(parentModel); + if (!parentKey) return requestedModel; + const requestedKey = modelKey(requested); + if (requestedKey && !isSameRootVendor(requestedKey, parentKey)) { + return requestedModel; + } + const candidates = availableModels.filter((model) => { + const key = modelKey(model); + return key && !isSameRootVendor(key, parentKey); + }); + const selected = candidates[0]; + const selectedKey = modelKey(selected); + return selectedKey ?? requestedModel; +} diff --git a/src/resources/extensions/sf/tests/auto-dispatch-canonical-plan.test.mjs b/src/resources/extensions/sf/tests/auto-dispatch-canonical-plan.test.mjs index 3a841cfc2..069bb7a42 100644 --- a/src/resources/extensions/sf/tests/auto-dispatch-canonical-plan.test.mjs +++ b/src/resources/extensions/sf/tests/auto-dispatch-canonical-plan.test.mjs @@ -16,8 +16,10 @@ const parallelPromptMock = vi.hoisted(() => ); vi.mock("../auto-prompts.js", () => ({ + buildChallengePrompt: vi.fn(async (...args) => `challenge ${args.join(" ")}`), buildCompleteMilestonePrompt: vi.fn(async () => "complete milestone"), buildCompleteSlicePrompt: vi.fn(async () => "complete slice"), + buildDeployPrompt: vi.fn(async () => "deploy"), buildDiscussMilestonePrompt: vi.fn(async () => "discuss milestone"), buildDiscussProjectPrompt: vi.fn(async () => "discuss project"), buildDiscussRequirementsPrompt: vi.fn(async () => "discuss requirements"), @@ -33,14 +35,22 @@ vi.mock("../auto-prompts.js", () => ({ buildResearchProjectPrompt: vi.fn(async () => "research project"), buildResearchMilestonePrompt: vi.fn(async () => "research milestone"), buildResearchSlicePrompt: vi.fn(async () => "research slice"), + buildReleasePrompt: vi.fn(async () => "release"), buildRewriteDocsPrompt: vi.fn(async () => "rewrite docs"), + buildRollbackPrompt: vi.fn(async () => "rollback"), buildRunUatPrompt: vi.fn(async () => "run uat"), + buildSmokeProductionPrompt: vi.fn(async () => "smoke production"), buildValidateMilestonePrompt: vi.fn(async () => "validate milestone"), buildWorkflowPreferencesPrompt: vi.fn(async () => "workflow preferences"), checkNeedsReassessment: vi.fn(async () => null), checkNeedsRunUat: vi.fn(async () => null), })); +vi.mock("../preferences-models.js", () => ({ + _initPrefsLoader: vi.fn(), + resolveModelWithFallbacksForUnit: vi.fn(() => ({ primary: "mock-model" })), +})); + import { closeDatabase, insertAssessment, @@ -326,4 +336,235 @@ describe("resolveDispatch canonical milestone plan", () => { cleanup(base); } }); + + test("completed_slice_without_challenge_dispatches_slice_challenge_by_default", async () => { + const base = makeTempDir("sf-dispatch-slice-challenge-"); + try { + mkdirSync(join(base, ".sf"), { recursive: true }); + openDatabase(join(base, ".sf", "sf.db")); + insertMilestone({ + id: "M901", + title: "Purpose challenge gate", + status: "active", + }); + insertSlice({ + milestoneId: "M901", + id: "S01", + title: "Completed work", + status: "complete", + sequence: 1, + }); + + const result = await resolveDispatch({ + state: { phase: "completing-milestone" }, + mid: "M901", + midTitle: "Purpose challenge gate", + basePath: base, + prefs: { phases: {} }, + session: {}, + pipelineVariant: "standard", + }); + + expect(result).toMatchObject({ + action: "dispatch", + unitType: "challenge", + unitId: "M901/S01/challenge", + }); + expect(result.prompt).toContain("slice S01"); + } finally { + closeDatabase(); + cleanup(base); + } + }); + + test("completing_milestone_after_slice_challenges_dispatches_milestone_challenge_before_completion", async () => { + const base = makeTempDir("sf-dispatch-milestone-challenge-"); + try { + mkdirSync(join(base, ".sf"), { recursive: true }); + openDatabase(join(base, ".sf", "sf.db")); + insertMilestone({ + id: "M902", + title: "Milestone challenge gate", + status: "active", + }); + insertSlice({ + milestoneId: "M902", + id: "S01", + title: "Completed work", + status: "complete", + sequence: 1, + }); + insertAssessment({ + path: ".sf/milestones/M902/slices/S01/S01-CHALLENGE.md", + milestoneId: "M902", + sliceId: "S01", + scope: "slice-challenge:S01", + status: "pass", + fullContent: + "## Self-review Round\nok\n## Partner Round\nok\n## Adversary Round\nok\n## Overall Verdict\n- `PASS`\n", + }); + insertAssessment({ + path: ".sf/milestones/M902/M902-VALIDATION.md", + milestoneId: "M902", + scope: "milestone-validation", + status: "pass", + fullContent: "verdict: pass\nOperational: MET\n", + }); + + const result = await resolveDispatch({ + state: { phase: "completing-milestone" }, + mid: "M902", + midTitle: "Milestone challenge gate", + basePath: base, + prefs: { phases: {} }, + session: {}, + pipelineVariant: "standard", + }); + + expect(result).toMatchObject({ + action: "dispatch", + unitType: "challenge", + unitId: "M902", + }); + expect(result.prompt).toContain("milestone"); + } finally { + closeDatabase(); + cleanup(base); + } + }); + + test("completed_slice_when_challenge_file_has_no_verdict_dispatches_challenge_again", async () => { + const base = makeTempDir("sf-dispatch-invalid-challenge-"); + try { + mkdirSync(join(base, ".sf", "milestones", "M904", "slices", "S01"), { + recursive: true, + }); + writeFileSync( + join( + base, + ".sf", + "milestones", + "M904", + "slices", + "S01", + "S01-CHALLENGE.md", + ), + "# partial challenge without verdict\n", + ); + openDatabase(join(base, ".sf", "sf.db")); + insertMilestone({ + id: "M904", + title: "Invalid challenge backfill", + status: "active", + }); + insertSlice({ + milestoneId: "M904", + id: "S01", + title: "Completed work", + status: "complete", + sequence: 1, + }); + + const result = await resolveDispatch({ + state: { phase: "planning" }, + mid: "M904", + midTitle: "Invalid challenge backfill", + basePath: base, + prefs: { phases: {} }, + session: {}, + pipelineVariant: "standard", + }); + + expect(result).toMatchObject({ + action: "dispatch", + unitType: "challenge", + unitId: "M904/S01/challenge", + }); + } finally { + closeDatabase(); + cleanup(base); + } + }); + + test("skipped_slice_without_challenge_does_not_backfill_challenge", async () => { + const base = makeTempDir("sf-dispatch-skipped-challenge-"); + try { + mkdirSync(join(base, ".sf"), { recursive: true }); + openDatabase(join(base, ".sf", "sf.db")); + insertMilestone({ + id: "M905", + title: "Skipped slice challenge", + status: "active", + }); + insertSlice({ + milestoneId: "M905", + id: "S01", + title: "Skipped work", + status: "skipped", + sequence: 1, + }); + + const result = await resolveDispatch({ + state: { phase: "planning" }, + mid: "M905", + midTitle: "Skipped slice challenge", + basePath: base, + prefs: { phases: {} }, + session: {}, + pipelineVariant: "standard", + }); + + expect(result?.unitType).not.toBe("challenge"); + } finally { + closeDatabase(); + cleanup(base); + } + }); + + test("completing_milestone_when_adversarial_review_disabled_keeps_completion_path", async () => { + const base = makeTempDir("sf-dispatch-no-challenge-"); + try { + mkdirSync(join(base, ".sf"), { recursive: true }); + openDatabase(join(base, ".sf", "sf.db")); + insertMilestone({ + id: "M903", + title: "Explicitly disabled challenge", + status: "active", + }); + insertSlice({ + milestoneId: "M903", + id: "S01", + title: "Completed work", + status: "complete", + sequence: 1, + }); + insertAssessment({ + path: ".sf/milestones/M903/M903-VALIDATION.md", + milestoneId: "M903", + scope: "milestone-validation", + status: "pass", + fullContent: "verdict: pass\nOperational: MET\n", + }); + + const result = await resolveDispatch({ + state: { phase: "completing-milestone" }, + mid: "M903", + midTitle: "Explicitly disabled challenge", + basePath: base, + prefs: { phases: {}, deploy: { adversarial_review: false } }, + session: {}, + pipelineVariant: "standard", + }); + + expect(result).toMatchObject({ + action: "dispatch", + unitType: "complete-milestone", + unitId: "M903", + prompt: "complete milestone", + }); + } finally { + closeDatabase(); + cleanup(base); + } + }); }); diff --git a/src/resources/extensions/sf/tests/db-driven-recovery-dispatch.test.mjs b/src/resources/extensions/sf/tests/db-driven-recovery-dispatch.test.mjs index 624345398..8daaa5f9f 100644 --- a/src/resources/extensions/sf/tests/db-driven-recovery-dispatch.test.mjs +++ b/src/resources/extensions/sf/tests/db-driven-recovery-dispatch.test.mjs @@ -11,8 +11,10 @@ import { join } from "node:path"; import { afterEach, test } from "vitest"; import { dispatchDirectPhase } from "../auto-direct-dispatch.js"; import { verifyExpectedArtifact } from "../auto-recovery.js"; +import { executeSummarySave } from "../tools/workflow-tool-executors.js"; import { closeDatabase, + getAssessmentByScope, insertMilestone, insertSlice, openDatabase, @@ -130,3 +132,80 @@ test("dispatchDirectPhase_when_db_has_no_completed_slices_ignores_stale_roadmap_ }, ]); }); + +test("verifyExpectedArtifact_challenge_requires_overall_verdict", () => { + const project = makeProject(); + const milestoneDir = join(project, ".sf", "milestones", "M990"); + writeFileSync(join(milestoneDir, "M990-CHALLENGE.md"), "# Challenge\n"); + assert.equal(verifyExpectedArtifact("challenge", "M990", project), false); + + writeFileSync( + join(milestoneDir, "M990-CHALLENGE.md"), + ["# Challenge", "", "## Overall Verdict", "- `PASS`", ""].join("\n"), + ); + assert.equal(verifyExpectedArtifact("challenge", "M990", project), false); + + writeFileSync( + join(milestoneDir, "M990-CHALLENGE.md"), + [ + "# Challenge", + "", + "## Self-review Round", + "Internal critique.", + "", + "## Partner Round", + "Constructive critique.", + "", + "## Adversary Round", + "Hostile critique.", + "", + "## Overall Verdict", + "- `PASS`", + "", + ].join("\n"), + ); + assert.equal(verifyExpectedArtifact("challenge", "M990", project), true); +}); + +test("executeSummarySave_challenge_persists_assessment_scope", async () => { + const project = makeProject(); + + const result = await executeSummarySave( + { + milestone_id: "M990", + slice_id: "S01", + artifact_type: "CHALLENGE", + content: [ + "# Challenge", + "", + "## Self-review Round", + "Internal critique.", + "", + "## Partner Round", + "Constructive critique.", + "", + "## Adversary Round", + "Hostile critique.", + "", + "## Finding 1: none", + "", + "## Overall Verdict", + "- `ADVISORY`", + "", + ].join("\n"), + }, + project, + ); + + assert.equal(result.isError, undefined); + const assessment = getAssessmentByScope("M990", "slice-challenge:S01"); + assert.equal(assessment?.status, "advisory"); + assert.equal( + assessment?.path, + ".sf/milestones/M990/slices/S01/S01-CHALLENGE.md", + ); + assert.equal( + verifyExpectedArtifact("challenge", "M990/S01/challenge", project), + true, + ); +}); diff --git a/src/resources/extensions/sf/tests/lineage-diverse-filter.test.mjs b/src/resources/extensions/sf/tests/lineage-diverse-filter.test.mjs index 49cebd5e2..5ef78ac88 100644 --- a/src/resources/extensions/sf/tests/lineage-diverse-filter.test.mjs +++ b/src/resources/extensions/sf/tests/lineage-diverse-filter.test.mjs @@ -333,23 +333,23 @@ describe("selectAndApplyModel lineage-diverse wiring", () => { ); }); - // ── Non-lineage roles are not filtered ─────────────────────────────────────── - test("validator_role_is_not_filtered", async () => { + // ── Validator review is lineage-diverse too ────────────────────────────────── + test("validator_explicit_workerModelId_filters_matching_vendor", async () => { const candidates = [KIMI, ANTHROPIC, GOOGLE]; - // validator's constraints are strict-json + review, NO lineage-diverse-from-worker - // → it should be able to pick kimi even when workerModelId is kimi - // We just assert no throw and that the call completes normally. - let threw = false; - try { - await dispatchFor({ - candidates, - role: "validator", - workerModelId: "kimi-k2.6", - }); - } catch { - threw = true; - } - assert.equal(threw, false, "validator dispatch should not throw"); + const applied = await dispatchFor({ + candidates, + role: "validator", + workerModelId: "kimi-k2.6", + }); + if (applied === null) return; + const isMoonshot = + applied.provider === "kimi-coding" || + applied.id.toLowerCase().startsWith("kimi-"); + assert.equal( + isMoonshot, + false, + `validator should not get kimi when worker was kimi, got: ${applied.provider}/${applied.id}`, + ); }); // ── modelRoleForUnitType maps challenge → adversary ────────────────────────── @@ -360,6 +360,10 @@ describe("selectAndApplyModel lineage-diverse wiring", () => { "../uok/model-route-evidence.js" ); assert.equal(modelRoleForUnitType("challenge"), "adversary"); + assert.equal(modelRoleForUnitType("combatant-review"), "reviewer"); + assert.equal(modelRoleForUnitType("combatant"), "adversary"); + assert.equal(modelRoleForUnitType("partner"), "reviewer"); + assert.equal(modelRoleForUnitType("advocate"), "reviewer"); assert.equal(modelRoleForUnitType("review-slice"), "reviewer"); assert.equal(modelRoleForUnitType("validate-milestone"), "validator"); assert.equal(modelRoleForUnitType("execute-task"), "worker"); diff --git a/src/resources/extensions/sf/tests/model-role-policy.test.mjs b/src/resources/extensions/sf/tests/model-role-policy.test.mjs index ee7f02af3..62c0bc0fd 100644 --- a/src/resources/extensions/sf/tests/model-role-policy.test.mjs +++ b/src/resources/extensions/sf/tests/model-role-policy.test.mjs @@ -88,7 +88,11 @@ describe("model role policy", () => { validator: { role: "validator", mode: "auto", - constraints: ["strict-json", "review"], + constraints: [ + "strict-json", + "review", + "lineage-diverse-from-worker", + ], }, reviewer: { role: "reviewer", @@ -140,6 +144,14 @@ describe("model role policy", () => { ); }); + test("validator_default_constraints_include_lineage_diverse_from_worker", () => { + const policy = normalizeRolePolicy("validator"); + assert.ok( + policy.constraints.includes("lineage-diverse-from-worker"), + `expected lineage-diverse-from-worker in validator defaults, got: ${policy.constraints.join(", ")}`, + ); + }); + test("validateRolePolicy_adversary_with_lineage_diverse_from_worker_is_valid", () => { const result = validateRolePolicy("adversary", { constraints: ["lineage-diverse-from-worker"], diff --git a/src/resources/extensions/sf/tests/subagent-lineage-routing.test.mjs b/src/resources/extensions/sf/tests/subagent-lineage-routing.test.mjs new file mode 100644 index 000000000..7cb60db6f --- /dev/null +++ b/src/resources/extensions/sf/tests/subagent-lineage-routing.test.mjs @@ -0,0 +1,78 @@ +import assert from "node:assert/strict"; +import { test } from "vitest"; + +import { + isLineageDiverseReviewSubagent, + resolveLineageDiverseSubagentModel, +} from "../subagent/lineage-routing.js"; + +const availableModels = [ + { provider: "kimi-coding", id: "kimi-k2.6" }, + { provider: "anthropic", id: "claude-sonnet-4-5" }, + { provider: "google-gemini-cli", id: "gemini-3-pro" }, +]; + +test("isLineageDiverseReviewSubagent_detects_partner_and_combatant_lanes", () => { + assert.equal(isLineageDiverseReviewSubagent("reviewer", "audit"), true); + assert.equal( + isLineageDiverseReviewSubagent("reviewer", "Partner: strengthen plan"), + true, + ); + assert.equal( + isLineageDiverseReviewSubagent("reviewer", "Combatant: break plan"), + true, + ); + assert.equal(isLineageDiverseReviewSubagent("scout", "find files"), false); +}); + +test("resolveLineageDiverseSubagentModel_when_review_inherits_worker_family_selects_other_family", () => { + const model = resolveLineageDiverseSubagentModel({ + agentName: "reviewer", + task: "Partner review", + requestedModel: undefined, + agentModel: undefined, + parentModel: { provider: "kimi-coding", id: "kimi-k2.6" }, + availableModels, + }); + + assert.equal(model, "anthropic/claude-sonnet-4-5"); +}); + +test("resolveLineageDiverseSubagentModel_when_explicit_same_family_selects_other_family", () => { + const model = resolveLineageDiverseSubagentModel({ + agentName: "reviewer", + task: "Combatant review", + requestedModel: "kimi-coding/kimi-k2.6", + agentModel: undefined, + parentModel: { provider: "kimi-coding", id: "kimi-k2.6" }, + availableModels, + }); + + assert.equal(model, "anthropic/claude-sonnet-4-5"); +}); + +test("resolveLineageDiverseSubagentModel_when_non_review_keeps_requested_model", () => { + const model = resolveLineageDiverseSubagentModel({ + agentName: "scout", + task: "Find files", + requestedModel: "kimi-coding/kimi-k2.6", + agentModel: undefined, + parentModel: { provider: "kimi-coding", id: "kimi-k2.6" }, + availableModels, + }); + + assert.equal(model, "kimi-coding/kimi-k2.6"); +}); + +test("resolveLineageDiverseSubagentModel_when_no_alternative_fails_open", () => { + const model = resolveLineageDiverseSubagentModel({ + agentName: "reviewer", + task: "Partner review", + requestedModel: "kimi-coding/kimi-k2.6", + agentModel: undefined, + parentModel: { provider: "kimi-coding", id: "kimi-k2.6" }, + availableModels: [{ provider: "kimi-coding", id: "kimi-k2.6" }], + }); + + assert.equal(model, "kimi-coding/kimi-k2.6"); +}); diff --git a/src/resources/extensions/sf/tools/workflow-tool-executors.js b/src/resources/extensions/sf/tools/workflow-tool-executors.js index b0b8f715e..ebf83f9dc 100644 --- a/src/resources/extensions/sf/tools/workflow-tool-executors.js +++ b/src/resources/extensions/sf/tools/workflow-tool-executors.js @@ -12,6 +12,7 @@ import { getMilestone, getSliceStatusSummary, getSliceTaskCounts, + insertAssessment, readTransaction, saveGateResult, setSliceUatVerdict, @@ -34,6 +35,11 @@ export const SUPPORTED_SUMMARY_ARTIFACT_TYPES = [ "CONTEXT", "ASSESSMENT", "CONTEXT-DRAFT", + "CHALLENGE", + "DEPLOY", + "ROLLBACK", + "SMOKE", + "RELEASE", ]; export function isSupportedSummaryArtifactType(artifactType) { return SUPPORTED_SUMMARY_ARTIFACT_TYPES.includes(artifactType); @@ -132,6 +138,20 @@ export async function executeSummarySave(params, basePath = process.cwd()) { }, basePath, ); + if (params.artifact_type === "CHALLENGE") { + const verdict = extractChallengeVerdict(params.content); + insertAssessment({ + path: `.sf/${relativePath}`, + milestoneId: params.milestone_id, + sliceId: params.slice_id ?? null, + taskId: params.task_id ?? null, + status: verdict, + scope: params.slice_id + ? `slice-challenge:${params.slice_id}` + : "milestone-challenge", + fullContent: params.content, + }); + } // Persist UAT verdict to DB when an ASSESSMENT is saved at slice scope. // This makes checkNeedsRunUat DB-only — no file fallback needed. if ( @@ -194,6 +214,16 @@ export async function executeSummarySave(params, basePath = process.cwd()) { }; } } + +function extractChallengeVerdict(content) { + const text = String(content ?? ""); + const explicit = text.match( + /Overall Verdict[\s\S]{0,200}?\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i, + ); + if (explicit) return explicit[1].toLowerCase(); + const fallback = text.match(/\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i); + return fallback ? fallback[1].toLowerCase() : "unknown"; +} export async function executeTaskComplete(params, basePath = process.cwd()) { const dbAvailable = await ensureDbOpen(basePath); if (!dbAvailable) { diff --git a/src/resources/extensions/sf/uok/auto-dispatch.js b/src/resources/extensions/sf/uok/auto-dispatch.js index 36495d89d..268600df5 100644 --- a/src/resources/extensions/sf/uok/auto-dispatch.js +++ b/src/resources/extensions/sf/uok/auto-dispatch.js @@ -39,7 +39,12 @@ import { buildValidateMilestonePrompt, buildWorkflowPreferencesPrompt, } from "../auto-prompts.js"; -import { hasImplementationArtifacts } from "../auto-recovery.js"; +import { + hasMinimumChallengeRounds, + hasImplementationArtifacts, + verifyExpectedArtifact, +} from "../auto-recovery.js"; +import { resolveExpectedArtifactPath } from "../auto-artifact-paths.js"; import { getCanonicalMilestonePlan } from "../canonical-milestone-plan.js"; import { resolveDeepProjectSetupState } from "../deep-project-setup-policy.js"; import { resolveEscalation } from "../escalation.js"; @@ -79,6 +84,7 @@ import { createScheduleStore } from "../schedule/schedule-store.js"; import { getMilestone, getMilestoneSlices, + getAssessmentByScope, getMilestoneValidationAssessment, getPendingGates, getRuntimeCounter, @@ -208,6 +214,66 @@ async function readMilestoneValidationForDispatch(basePath, mid) { path: validationFile, }; } + +function adversarialReviewEnabled(prefs) { + return ( + prefs?.deploy?.adversarial_review !== false && + prefs?.phases?.skip_adversarial_review !== true + ); +} + +function challengeAssessmentScope(sliceId) { + return sliceId ? `slice-challenge:${sliceId}` : "milestone-challenge"; +} + +function challengeUnitId(milestoneId, sliceId) { + return sliceId ? `${milestoneId}/${sliceId}/challenge` : milestoneId; +} + +function challengeAlreadyRecorded(basePath, milestoneId, sliceId = null) { + const expectedPath = resolveExpectedArtifactPath( + "challenge", + challengeUnitId(milestoneId, sliceId), + basePath, + ); + if ( + expectedPath && + existsSync(expectedPath) && + verifyExpectedArtifact( + "challenge", + challengeUnitId(milestoneId, sliceId), + basePath, + ) + ) { + return true; + } + if (!isDbAvailable()) return false; + const assessment = getAssessmentByScope( + milestoneId, + challengeAssessmentScope(sliceId), + ); + if (!assessment) return false; + const status = String(assessment.status ?? "") + .trim() + .toLowerCase(); + if (["", "unknown"].includes(status)) return false; + return hasMinimumChallengeRounds(assessment.full_content); +} + +function findCompletedSliceNeedingChallenge(basePath, milestoneId) { + if (!isDbAvailable()) return null; + return getMilestoneSlices(milestoneId) + .filter((slice) => slice.status === "complete" || slice.status === "done") + .sort( + (a, b) => + (a.sequence ?? Number.MAX_SAFE_INTEGER) - + (b.sequence ?? Number.MAX_SAFE_INTEGER) || a.id.localeCompare(b.id), + ) + .find( + (slice) => !challengeAlreadyRecorded(basePath, milestoneId, slice.id), + ); +} + function canonicalPlanStop(mid, plan) { return { action: "stop", @@ -887,6 +953,27 @@ export const DISPATCH_RULES = [ return null; }, }, + { + name: "completed-slice → challenge", + match: async ({ mid, midTitle, basePath, prefs }) => { + if (!adversarialReviewEnabled(prefs)) return null; + const slice = findCompletedSliceNeedingChallenge(basePath, mid); + if (!slice) return null; + return { + action: "dispatch", + unitType: "challenge", + unitId: challengeUnitId(mid, slice.id), + prompt: await buildChallengePrompt( + mid, + midTitle, + `slice ${slice.id}`, + prefs?.deploy?.adversarial_mode ?? "red-team", + basePath, + { sliceId: slice.id }, + ), + }; + }, + }, { name: "reassess-roadmap (post-completion)", match: async ({ state, mid, midTitle, basePath, prefs }) => { @@ -1613,7 +1700,7 @@ export const DISPATCH_RULES = [ }, { name: "completing-milestone → complete-milestone", - match: async ({ state, mid, midTitle, basePath }) => { + match: async ({ state, mid, midTitle, basePath, prefs }) => { if (state.phase !== "completing-milestone") return null; // Safety guard (#2675): completion is only automatic after a pass verdict. // Non-pass terminal verdicts are still terminal for validation loops, but @@ -1799,6 +1886,23 @@ export const DISPATCH_RULES = [ } catch { // Non-fatal advisory } + if ( + adversarialReviewEnabled(prefs) && + !challengeAlreadyRecorded(basePath, mid) + ) { + return { + action: "dispatch", + unitType: "challenge", + unitId: challengeUnitId(mid), + prompt: await buildChallengePrompt( + mid, + midTitle, + "milestone", + prefs?.deploy?.adversarial_mode ?? "red-team", + basePath, + ), + }; + } return { action: "dispatch", unitType: "complete-milestone", @@ -1807,38 +1911,6 @@ export const DISPATCH_RULES = [ }; }, }, - { - name: "completing-milestone → challenge", - match: async ({ state, mid, midTitle, basePath, prefs }) => { - if (state.phase !== "completing-milestone") return null; - if (!prefs?.deploy?.adversarial_review) return null; - // Only trigger if no challenge assessment exists for this milestone yet - try { - const { getDatabase } = await import("./sf-db.js"); - const db = getDatabase(basePath); - const row = db - .prepare( - "SELECT id FROM assessments WHERE milestone_id = ? AND artifact_type = 'CHALLENGE' LIMIT 1", - ) - .get(mid); - if (row) return null; - } catch { - return null; - } - return { - action: "dispatch", - unitType: "challenge", - unitId: `challenge-${mid}`, - prompt: await buildChallengePrompt( - mid, - midTitle, - "milestone", - prefs?.deploy?.adversarial_mode ?? "red-team", - basePath, - ), - }; - }, - }, { name: "completing-milestone → release", match: async ({ state, mid, midTitle, basePath, prefs }) => { diff --git a/src/resources/extensions/sf/uok/model-role-policy.js b/src/resources/extensions/sf/uok/model-role-policy.js index a21a92ecb..22470afb7 100644 --- a/src/resources/extensions/sf/uok/model-role-policy.js +++ b/src/resources/extensions/sf/uok/model-role-policy.js @@ -38,13 +38,9 @@ export const SUPPORTED_MODEL_ROLES = Object.freeze([ * role's resolved model). Prevents "looks fine to me" rubber-stamp by * lineage-twins. When the worker model is unknown or maps to "unknown" vendor, * this constraint is treated as a no-op (fail-open rather than blocking all - * candidates). Enforcement is best-effort at the model-selection layer. - * - * TODO(lineage-diverse-from-worker): wire handler in auto-model-selection.js - * selectAndApplyModel — after the policy gate resolves `routingEligibleModels`, - * call resolveWorkerLineage(resolvedWorkerModelId) and filter candidates with - * !isSameRootVendor(candidate.id, workerVendor). See rootVendorFor() in this - * file for the canonical vendor mapping. + * candidates). Enforcement happens in auto-model-selection.js before final + * route resolution. See rootVendorFor() in this file for the canonical vendor + * mapping. */ export const SUPPORTED_MODEL_ROLE_CONSTRAINTS = Object.freeze([ "coding", @@ -79,7 +75,7 @@ const CONCRETE_ROUTE_KEYS = new Set([ export const DEFAULT_MODEL_ROLE_CONSTRAINTS = Object.freeze({ orchestrator: Object.freeze(["long-context", "strict-json"]), worker: Object.freeze(["coding"]), - validator: Object.freeze(["strict-json", "review"]), + validator: Object.freeze(["strict-json", "review", "lineage-diverse-from-worker"]), reviewer: Object.freeze(["review", "lineage-diverse-from-worker"]), adversary: Object.freeze(["review", "lineage-diverse-from-worker"]), }); diff --git a/src/resources/extensions/sf/uok/model-route-evidence.js b/src/resources/extensions/sf/uok/model-route-evidence.js index 2e12d1a83..596d1c940 100644 --- a/src/resources/extensions/sf/uok/model-route-evidence.js +++ b/src/resources/extensions/sf/uok/model-route-evidence.js @@ -23,9 +23,13 @@ export function modelRoleForUnitType(unitType) { if (normalized.includes("review") || normalized.includes("scrutiny")) { return "reviewer"; } + if (normalized.includes("partner") || normalized.includes("advocate")) { + return "reviewer"; + } if ( normalized.includes("adversar") || normalized.includes("challenge") || + normalized.includes("combatant") || normalized.includes("red-team") || normalized.includes("redteam") ) {