feat: enforce purpose-first adversarial review
This commit is contained in:
parent
acafee06e2
commit
c2f101734f
25 changed files with 858 additions and 86 deletions
|
|
@ -54,9 +54,14 @@ Required sections for non-trivial exports:
|
|||
- **First line** — what it returns / does, present tense.
|
||||
- **Purpose:** — why it exists; the value it protects.
|
||||
- **Consumer:** — who calls it in production. No consumer = symbol shouldn't exist yet.
|
||||
- **What tests should verify:** — for non-trivial behavior, name the consumer-visible contract the tests must prove. This is not a list of mocks or implementation steps.
|
||||
|
||||
A bare `/** Helper. */` is a code smell. Either write the purpose or delete the symbol.
|
||||
|
||||
Purpose text is a contract, not decoration. If implementation and purpose
|
||||
diverge, fix the code or rewrite the purpose with a new failing contract test
|
||||
first.
|
||||
|
||||
### Module-level JSDoc
|
||||
|
||||
```js
|
||||
|
|
|
|||
|
|
@ -73,6 +73,10 @@ If any field is missing: `BLOCKED: purpose unclear — [which field is missing]`
|
|||
|
||||
Treat the contract as a **falsifiable hypothesis**: name the evidence that would prove it wrong before implementation locks in. A contract without a falsifier is half a contract.
|
||||
|
||||
**Purpose outranks implementation momentum.** If code, tests, docs, or agent
|
||||
memory disagree, the purpose/consumer/contract chain wins. Fix the artifact that
|
||||
drifted; do not reinterpret the purpose to excuse the implementation.
|
||||
|
||||
## Workflow (mapped to sf's phase machine)
|
||||
|
||||
### Research phase — name the problem
|
||||
|
|
@ -99,8 +103,11 @@ Clarify highest-impact unknowns first: behaviour, acceptance criteria, data inva
|
|||
For non-trivial contracts, pressure-test before locking the plan via the [`advisory-partner`](../src/resources/extensions/sf/skills/advisory-partner/SKILL.md) skill — this is sf's adversarial review surface, already wired into the Q3/Q4 gates and `validate-milestone`. It runs with the **validation** model, distinct from the planning/execution model — that's the point.
|
||||
|
||||
1. **Advocate pass** — strengthen the best version of the contract.
|
||||
2. **Challenger pass** — attack assumptions AND propose an alternative. A challenger anchored to the advocate's framing is not adversarial.
|
||||
3. **Falsifier (required gate, blocks Plan→Execute):** `FALSIFIER: this contract is wrong if [specific observable condition].` Generic falsifiers ("wrong if it doesn't work") are process failures.
|
||||
2. **Partner pass** — improve the plan as a collaborator: sharpen the purpose,
|
||||
name the real consumer, remove unnecessary scope, and identify the smallest
|
||||
proof that creates value.
|
||||
3. **Challenger pass** — attack assumptions AND propose an alternative. A challenger anchored to the advocate's framing is not adversarial.
|
||||
4. **Falsifier (required gate, blocks Plan→Execute):** `FALSIFIER: this contract is wrong if [specific observable condition].` Generic falsifiers ("wrong if it doesn't work") are process failures.
|
||||
|
||||
**Find the devil and find the experts:**
|
||||
|
||||
|
|
@ -179,10 +186,14 @@ After completing a step, state confidence as a number `0.0–1.0` and a one-line
|
|||
| Purpose & consumer | 0.95 | Run an adversarial review wave (advisory-partner Q3/Q5). |
|
||||
| Contract test | 0.90 | Adversarial review wave. |
|
||||
| Implementation | 0.95 | Add a specialist reviewer for the touched boundary (e.g. provider/transport/security). |
|
||||
| Final evidence | 0.97 | Full adversarial: advocate + challenger + specialist. |
|
||||
| Final evidence | 0.98 | Full adversarial: advocate + partner + challenger + specialist. |
|
||||
|
||||
Skip the gate for trivial steps (typo fix, exhaustive matches with full coverage). The gate earns its keep on I/O boundaries, async loading, protocol integration, and anything touching real backends or models.
|
||||
|
||||
Below `0.98`, do not ship non-trivial autonomous behaviour. Persist the gap as
|
||||
an adversarial finding, write the missing contract test, or explicitly downgrade
|
||||
the scope before claiming completion.
|
||||
|
||||
LLM confidence numbers are poorly calibrated in absolute terms — the *relative* signal matters. If you write 0.7, you know you're guessing. Act on that.
|
||||
|
||||
## Tests Find Gaps
|
||||
|
|
|
|||
|
|
@ -77,6 +77,14 @@ export function resolveExpectedArtifactPath(unitType, unitId, base) {
|
|||
const dir = resolveMilestonePath(base, mid);
|
||||
return dir ? join(dir, buildMilestoneFileName(mid, "VALIDATION")) : null;
|
||||
}
|
||||
case "challenge": {
|
||||
if (sid) {
|
||||
const dir = resolveSlicePath(base, mid, sid);
|
||||
return dir ? join(dir, buildSliceFileName(sid, "CHALLENGE")) : null;
|
||||
}
|
||||
const dir = resolveMilestonePath(base, mid);
|
||||
return dir ? join(dir, buildMilestoneFileName(mid, "CHALLENGE")) : null;
|
||||
}
|
||||
case "complete-milestone": {
|
||||
const dir = resolveMilestonePath(base, mid);
|
||||
return dir ? join(dir, buildMilestoneFileName(mid, "SUMMARY")) : null;
|
||||
|
|
@ -130,6 +138,10 @@ export function diagnoseExpectedArtifact(unitType, unitId, base) {
|
|||
return `${relSliceFile(base, mid, sid, "ASSESSMENT")} (UAT assessment result)`;
|
||||
case "validate-milestone":
|
||||
return `${relMilestoneFile(base, mid, "VALIDATION")} (milestone validation report)`;
|
||||
case "challenge":
|
||||
return sid
|
||||
? `${relSliceFile(base, mid, sid, "CHALLENGE")} (adversarial slice challenge)`
|
||||
: `${relMilestoneFile(base, mid, "CHALLENGE")} (adversarial milestone challenge)`;
|
||||
case "complete-milestone":
|
||||
return `${relMilestoneFile(base, mid, "SUMMARY")} (milestone summary)`;
|
||||
default:
|
||||
|
|
|
|||
|
|
@ -484,9 +484,9 @@ export async function selectAndApplyModel(
|
|||
);
|
||||
}
|
||||
}
|
||||
// ── Lineage-diverse filter (adversary / reviewer roles) ───────────────────
|
||||
// When the dispatching role is "adversary" or "reviewer" and its constraint
|
||||
// set includes "lineage-diverse-from-worker", remove candidates that share
|
||||
// ── Lineage-diverse filter (review roles) ──────────────────────────────────
|
||||
// When the dispatching role carries "lineage-diverse-from-worker", remove
|
||||
// candidates that share
|
||||
// root-vendor lineage with the worker model to prevent rubber-stamp reviews.
|
||||
// Fail-open: if filtering would leave zero candidates, log a warning and keep
|
||||
// the full eligible set so the dispatch is never completely blocked.
|
||||
|
|
@ -499,7 +499,9 @@ export async function selectAndApplyModel(
|
|||
{
|
||||
const normalizedRole = String(role ?? "").toLowerCase();
|
||||
const isLineageRole =
|
||||
normalizedRole === "adversary" || normalizedRole === "reviewer";
|
||||
normalizedRole === "adversary" ||
|
||||
normalizedRole === "reviewer" ||
|
||||
normalizedRole === "validator";
|
||||
if (isLineageRole) {
|
||||
const roleConstraints =
|
||||
DEFAULT_MODEL_ROLE_CONSTRAINTS[normalizedRole] ?? [];
|
||||
|
|
@ -783,7 +785,7 @@ export async function selectAndApplyModel(
|
|||
}
|
||||
// Enforce lineage-diverse-from-worker: skip candidates whose root vendor
|
||||
// matches the worker's. lineageAllowedKeys is populated only when the
|
||||
// constraint applies (adversary / reviewer with a known worker model id).
|
||||
// constraint applies for a review role with a known worker model id.
|
||||
if (lineageAllowedKeys) {
|
||||
const key = `${model.provider.toLowerCase()}/${model.id.toLowerCase()}`;
|
||||
if (!lineageAllowedKeys.has(key)) {
|
||||
|
|
|
|||
|
|
@ -3689,6 +3689,7 @@ export async function buildChallengePrompt(
|
|||
challengeTarget,
|
||||
challengeMode,
|
||||
base,
|
||||
options = {},
|
||||
) {
|
||||
const resolveArtifact = async (key) => {
|
||||
switch (key) {
|
||||
|
|
@ -3710,6 +3711,9 @@ export async function buildChallengePrompt(
|
|||
milestoneTitle: midTitle,
|
||||
challengeTarget: challengeTarget ?? "milestone",
|
||||
challengeMode: challengeMode ?? "red-team",
|
||||
saveSummaryScope: options.sliceId
|
||||
? `milestone_id: ${mid}, slice_id: ${options.sliceId}, artifact_type: "CHALLENGE"`
|
||||
: `milestone_id: ${mid}, artifact_type: "CHALLENGE"`,
|
||||
inlinedContext,
|
||||
skillActivation: buildSkillActivationBlock({
|
||||
base,
|
||||
|
|
|
|||
|
|
@ -60,6 +60,26 @@ import { logError, logWarning } from "./workflow-logger.js";
|
|||
// Re-export so existing consumers of auto-recovery.ts keep working.
|
||||
export { diagnoseExpectedArtifact, resolveExpectedArtifactPath };
|
||||
// ─── Artifact Resolution & Verification ───────────────────────────────────────
|
||||
/**
|
||||
* Return true when a challenge report contains the minimum review lenses.
|
||||
*
|
||||
* Purpose: prevent one challenge verdict from becoming "truth"; SF requires
|
||||
* self-review, partner review, and adversary review evidence before a completed
|
||||
* slice or milestone is considered challenged.
|
||||
*
|
||||
* Consumer: challenge artifact verification and auto-dispatch backfill guards.
|
||||
*/
|
||||
export function hasMinimumChallengeRounds(content) {
|
||||
const text = String(content ?? "");
|
||||
return (
|
||||
/self(?:[-\s]+review|[-\s]+critique|[-\s]+check)/i.test(text) &&
|
||||
/partner(?:[-\s]+review|[-\s]+round|[-\s]+pass|[-\s]+check)?/i.test(text) &&
|
||||
/(adversary|adversarial|combatant|red[-\s]*team)(?:[-\s]+review|[-\s]+round|[-\s]+pass|[-\s]+check)?/i.test(
|
||||
text,
|
||||
)
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Check whether a milestone produced implementation artifacts (non-`.sf/` files)
|
||||
* in the git history. Uses `git log --name-only` to inspect all commits on the
|
||||
|
|
@ -320,6 +340,14 @@ export function verifyExpectedArtifact(unitType, unitId, base) {
|
|||
const validationContent = readFileSync(absPath, "utf-8");
|
||||
if (!isValidationTerminal(validationContent)) return false;
|
||||
}
|
||||
if (unitType === "challenge") {
|
||||
const challengeContent = readFileSync(absPath, "utf-8");
|
||||
if (!/Overall Verdict/i.test(challengeContent)) return false;
|
||||
if (!/\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i.test(challengeContent)) {
|
||||
return false;
|
||||
}
|
||||
if (!hasMinimumChallengeRounds(challengeContent)) return false;
|
||||
}
|
||||
if (unitType === "plan-milestone" || unitType === "roadmap-meeting") {
|
||||
try {
|
||||
const roadmapContent = readFileSync(absPath, "utf-8");
|
||||
|
|
|
|||
|
|
@ -393,13 +393,13 @@ export function registerDbTools(pi) {
|
|||
name: "save_summary",
|
||||
label: "Save Summary",
|
||||
description:
|
||||
"Save a research, summary, context, or assessment artifact to disk with an auto-computed path. " +
|
||||
"Call this to persist planning or research output (e.g. a research brief, context doc, or summary) for a milestone, slice, or task.",
|
||||
"Save a research, summary, context, assessment, challenge, or ops artifact to disk with an auto-computed path. " +
|
||||
"Call this to persist planning, research, adversarial review, or operational output for a milestone, slice, or task.",
|
||||
promptSnippet:
|
||||
"Save a planning artifact (SUMMARY/RESEARCH/CONTEXT/ASSESSMENT) to disk",
|
||||
"Save a planning artifact (SUMMARY/RESEARCH/CONTEXT/ASSESSMENT/CHALLENGE) to disk",
|
||||
promptGuidelines: [
|
||||
"milestone_id is required; slice_id and task_id are optional and determine the file path.",
|
||||
"artifact_type must be one of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT.",
|
||||
"artifact_type must be one of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT, CHALLENGE, DEPLOY, ROLLBACK, SMOKE, RELEASE.",
|
||||
"Use CONTEXT-DRAFT for incremental saves; use CONTEXT only for the final milestone context after verification.",
|
||||
],
|
||||
parameters: Type.Object({
|
||||
|
|
@ -412,7 +412,7 @@ export function registerDbTools(pi) {
|
|||
),
|
||||
artifact_type: Type.String({
|
||||
description:
|
||||
"One of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT",
|
||||
"One of: SUMMARY, RESEARCH, CONTEXT, ASSESSMENT, CONTEXT-DRAFT, CHALLENGE, DEPLOY, ROLLBACK, SMOKE, RELEASE",
|
||||
}),
|
||||
content: Type.String({
|
||||
description: "The full markdown content of the artifact",
|
||||
|
|
|
|||
|
|
@ -166,6 +166,11 @@ export const KNOWN_UNIT_TYPES = [
|
|||
"run-uat",
|
||||
"complete-milestone",
|
||||
"validate-milestone",
|
||||
"challenge",
|
||||
"deploy",
|
||||
"smoke-production",
|
||||
"release",
|
||||
"rollback",
|
||||
"rewrite-docs",
|
||||
"discuss-milestone",
|
||||
"discuss-slice",
|
||||
|
|
|
|||
|
|
@ -18,6 +18,19 @@ You are executing SF autonomous mode.
|
|||
|
||||
You are the adversary agent. Your job is NOT to complete work — it is to rigorously attack the correctness, safety, and assumptions of what has been built or planned. You represent the failure modes the product team has not considered.
|
||||
|
||||
Challenge output is evidence, not truth. A challenge report is not acceptable
|
||||
until it contains all three review rounds:
|
||||
|
||||
1. **Self-review round** — the original model's best internal critique of its
|
||||
own likely blind spots and weak assumptions.
|
||||
2. **Partner round** — the constructive partner view: strengthen the purpose,
|
||||
consumer, smallest valuable proof, and remediation shape.
|
||||
3. **Adversary round** — the hostile/combatant view: break the claim, find
|
||||
counterexamples, and name blocking risks.
|
||||
|
||||
If any round is missing, the challenge is incomplete even if it has an Overall
|
||||
Verdict.
|
||||
|
||||
### Challenge rules
|
||||
|
||||
1. **Be ruthless but specific.** Every finding must include: what breaks, under what condition, and what the impact is.
|
||||
|
|
@ -36,6 +49,19 @@ Based on `{{challengeMode}}`:
|
|||
|
||||
### Output format
|
||||
|
||||
Start with these required sections:
|
||||
|
||||
```
|
||||
## Self-review Round
|
||||
<best critique from the model that produced or would defend the work>
|
||||
|
||||
## Partner Round
|
||||
<constructive partner review: purpose, consumer, minimal remediation>
|
||||
|
||||
## Adversary Round
|
||||
<hostile/combatant review: failure modes and counterexamples>
|
||||
```
|
||||
|
||||
For each finding:
|
||||
|
||||
```
|
||||
|
|
@ -54,11 +80,25 @@ After all findings, provide an **Overall Verdict**:
|
|||
- `NEEDS-REMEDIATION` — one or more critical/high findings must be addressed before deploy.
|
||||
- `ADVISORY` — findings are low severity; proceed with awareness.
|
||||
|
||||
Call `save_summary` with `milestone_id: {{milestoneId}}`, `artifact_type: "CHALLENGE"`, and the full challenge report as content.
|
||||
Call `save_summary` with `{{saveSummaryScope}}`, and the full challenge report as content.
|
||||
|
||||
### Report sf-internal observations
|
||||
|
||||
If you observe sf-the-tool friction during this unit, file via `report_issue`.
|
||||
If the target is already closed and you find real remediation work, do **not**
|
||||
reopen or edit the closed slice/milestone yourself. Closed history stays closed.
|
||||
Instead:
|
||||
|
||||
- File each critical/high finding with `report_issue` using
|
||||
`kind: "adversarial-finding"`, the finding severity, concrete evidence, and
|
||||
falsifiable acceptance criteria.
|
||||
- Recommend a new remediation slice/task in the finding. The planner/triage
|
||||
path owns opening new work from that evidence.
|
||||
- Use `NEEDS-REMEDIATION` only when the finding should block the current
|
||||
milestone/deploy path. Use `ADVISORY` for follow-up work that can be queued
|
||||
without blocking.
|
||||
|
||||
If you observe other sf-the-tool friction during this unit, file via
|
||||
`report_issue`.
|
||||
|
||||
When done, say: "Challenge {{milestoneId}} complete — verdict: <verdict>."
|
||||
|
||||
|
|
|
|||
|
|
@ -1,6 +1,6 @@
|
|||
Plan milestone {{milestoneId}} ("{{milestoneTitle}}"). Decisions and requirements are injected from the database into your system context — respect existing decisions and treat Active requirements as the capability contract. If no requirements are present, treat that as a planning gap: derive the minimum requirement coverage from current project evidence, persist it through `save_requirement`, and explicitly note missing coverage. Use the **Roadmap** output template below to shape the milestone planning payload you send to `plan_milestone`. Start the `vision` field with the milestone purpose before implementation detail, include the structured `productResearch` payload when the work is product-facing, workflow-facing, developer-experience, or market-positioning, and make each slice `goal` state the slice purpose before mechanics. If the milestone changes how SF is driven, observed, integrated, or automated, keep the axes separate in the roadmap: surface (TUI/CLI/web/editor/machine), protocol (ACP/RPC/stdio/HTTP/wire), output format (text/json/stream-json), run control (manual/assisted/autonomous), and permission profile (restricted/normal/trusted/unrestricted). Call `plan_milestone` to persist the milestone planning fields and render `{{milestoneId}}-ROADMAP.md` from DB state. Do **not** write `{{milestoneId}}-ROADMAP.md`, `ROADMAP.md`, or other planning artifacts manually. If planning produces structural decisions, call `save_decision`. {{skillActivation}} Fill the Horizontal Checklist section with cross-cutting concerns considered during planning (requirements re-read, decisions re-evaluated, graceful shutdown, revenue paths, auth boundary, shared resources, reconnection). Omit for trivial milestones.
|
||||
|
||||
Before calling `plan_milestone`, run a bounded **Vision Alignment Meeting** for the milestone and roadmap as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is allowed to be broader and more nuanced than slice planning. Include at least these participant lenses:
|
||||
Before calling `plan_milestone`, run a bounded **Vision Alignment Meeting** for the milestone and roadmap as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. Do not set a `model` override for Partner, Combatant, or validation-style reviewers; SF's reviewer/adversary routing must choose a lineage-diverse model family from the worker whenever possible. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is allowed to be broader and more nuanced than slice planning. Include at least these participant lenses:
|
||||
- Product Manager
|
||||
- User Advocate
|
||||
- Customer Panel
|
||||
|
|
|
|||
|
|
@ -51,7 +51,7 @@ If milestone research exists (inlined above), trust those findings and skip redu
|
|||
|
||||
Narrate your decomposition reasoning - why you're grouping work this way, what risks are driving the order, what verification strategy you're choosing and why. Use complete sentences rather than planner shorthand or fragmentary notes.
|
||||
|
||||
Before you persist the roadmap, run a bounded **Vision Alignment Meeting** as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses:
|
||||
Before you persist the roadmap, run a bounded **Vision Alignment Meeting** as a real multi-agent review. Use the `subagent` tool in `mode: "debate"` with `rounds: 2` and a separate task for each participant lens below. Do **not** merely simulate every participant inside this planner response. Use only supported agent names: `planner`, `reviewer`, `researcher`, and `scout`. Put the stakeholder role name inside the task text; do not invent agent names such as `combatant`, `delivery-lead`, `product-manager`, or `customer-panel`. Do not set a `model` override for Partner, Combatant, or validation-style reviewers; SF's reviewer/adversary routing must choose a lineage-diverse model family from the worker whenever possible. If the `subagent` tool is unavailable or fails after one retry, record that explicitly in `trigger` and run the structured meeting inline as a degraded fallback. This is broader than slice planning and should feel allowed to be chatty and nuanced. Gather the strongest additions, cuts, and ordering changes from these participant lenses:
|
||||
|
||||
Each role has a **purpose** (why it exists in the ceremony) and a **depth contract** (what a meaningful contribution must contain). A contribution that restates the roadmap title without evidence, specific risks, or concrete subsystem references is not meaningful — it is a rubber stamp.
|
||||
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ All relevant context has been preloaded below — the roadmap, all slice summari
|
|||
### Step 1 — Dispatch Parallel Reviewers
|
||||
|
||||
Call `subagent` with `tasks: [...]` containing ALL THREE reviewers simultaneously.
|
||||
Use `agent: "reviewer"` for every validation reviewer. Do not use `code`, `coder`, or `worker` here — this is review/validation work, not implementation.
|
||||
Use `agent: "reviewer"` for every validation reviewer. Do not use `code`, `coder`, or `worker` here — this is review/validation work, not implementation. Do not set a `model` override; SF's reviewer/validator routing should choose a model family that is lineage-diverse from the worker when possible.
|
||||
|
||||
**Pass `parentTrace`** at the batch level. For milestone validation, the
|
||||
"parent" is the slice work that built this milestone — pass a one-line-per-
|
||||
|
|
|
|||
|
|
@ -67,7 +67,20 @@ If you can't fill this in, the plan is incomplete.
|
|||
|
||||
---
|
||||
|
||||
### 2. Five Challenger Questions
|
||||
### 2. Partner Pass
|
||||
|
||||
Before attacking the plan, improve it as a partner:
|
||||
|
||||
- Restate the purpose in one sentence.
|
||||
- Name the real production consumer.
|
||||
- Remove scope that does not serve that purpose.
|
||||
- Identify the smallest proof point that gives the consumer value.
|
||||
|
||||
If this pass cannot name purpose and consumer, stop with `RECONSIDER`.
|
||||
|
||||
---
|
||||
|
||||
### 3. Five Challenger Questions
|
||||
|
||||
Answer each from the artifact — not from general knowledge:
|
||||
|
||||
|
|
@ -88,7 +101,7 @@ Write the objection. Then answer it. If the answer is weak, that's a flag.
|
|||
|
||||
---
|
||||
|
||||
### 3. Trap Scan
|
||||
### 4. Trap Scan
|
||||
|
||||
| Trap | Warning Sign |
|
||||
|------|-------------|
|
||||
|
|
@ -100,7 +113,7 @@ Write the objection. Then answer it. If the answer is weak, that's a flag.
|
|||
|
||||
---
|
||||
|
||||
### 4. Verdict
|
||||
### 5. Verdict
|
||||
|
||||
```
|
||||
ADVISORY VERDICT: [PROCEED / PROCEED WITH CAVEAT / RECONSIDER]
|
||||
|
|
|
|||
|
|
@ -121,10 +121,14 @@ After each major step, state confidence as `0.0–1.0` plus a one-line reason.
|
|||
| Purpose & consumer | 0.95 | Run `advisory-partner` Q3 (abuse) + Q5 (strongest objection). |
|
||||
| Contract test | 0.90 | Adversarial review wave. |
|
||||
| Implementation | 0.95 | Add a specialist reviewer for the boundary touched. |
|
||||
| Evidence | 0.97 | Full advocate + challenger + specialist. |
|
||||
| Evidence | 0.98 | Full advocate + partner + challenger + specialist. |
|
||||
|
||||
LLM confidence is poorly calibrated in absolute terms — the relative signal matters. If you write 0.7, you're guessing. Act on it.
|
||||
|
||||
Below `0.98`, non-trivial autonomous behaviour does not ship. Persist the gap
|
||||
as an adversarial finding, write the missing contract test, or explicitly shrink
|
||||
scope before claiming completion.
|
||||
|
||||
### 8. Verify Consumer and Record Evidence
|
||||
|
||||
- Re-run `rg` for the changed symbol; confirm a real caller still depends on it.
|
||||
|
|
|
|||
|
|
@ -50,6 +50,7 @@ import {
|
|||
mergeDeltaPatches,
|
||||
readIsolationMode,
|
||||
} from "./isolation.js";
|
||||
import { resolveLineageDiverseSubagentModel } from "./lineage-routing.js";
|
||||
import { composeAgentPrompt } from "./prompt-parts.js";
|
||||
import { registerWorker, updateWorker } from "./worker-registry.js";
|
||||
|
||||
|
|
@ -371,6 +372,25 @@ function validateSubagentInvocationAgainstInheritance(
|
|||
}
|
||||
return { ok: true };
|
||||
}
|
||||
|
||||
function resolveSubagentModelOverride({
|
||||
agents,
|
||||
agentName,
|
||||
task,
|
||||
requestedModel,
|
||||
parentModel,
|
||||
availableModels,
|
||||
}) {
|
||||
const { agent, effectiveName } = resolveAgentByName(agents, agentName);
|
||||
return resolveLineageDiverseSubagentModel({
|
||||
agentName: effectiveName ?? agentName,
|
||||
task,
|
||||
requestedModel,
|
||||
agentModel: agent?.model,
|
||||
parentModel,
|
||||
availableModels,
|
||||
});
|
||||
}
|
||||
async function executeSubagentInvocation({
|
||||
defaultCwd,
|
||||
agents,
|
||||
|
|
@ -383,6 +403,8 @@ async function executeSubagentInvocation({
|
|||
cmuxSplitsEnabled,
|
||||
useIsolation,
|
||||
inheritanceEnvelope,
|
||||
parentModel,
|
||||
availableModels,
|
||||
}) {
|
||||
const makeDetails = (mode) => (results) => ({
|
||||
mode,
|
||||
|
|
@ -430,7 +452,14 @@ async function executeSubagentInvocation({
|
|||
signal,
|
||||
chainUpdate,
|
||||
makeDetails("chain"),
|
||||
step.model ?? params.model,
|
||||
resolveSubagentModelOverride({
|
||||
agents,
|
||||
agentName: step.agent,
|
||||
task: taskForStep,
|
||||
requestedModel: step.model ?? params.model,
|
||||
parentModel,
|
||||
availableModels,
|
||||
}),
|
||||
inheritanceEnvelope,
|
||||
);
|
||||
results.push(result);
|
||||
|
|
@ -582,7 +611,14 @@ async function executeSubagentInvocation({
|
|||
async (t, index) => {
|
||||
const resultIndex = (round - 1) * batchTasks.length + index;
|
||||
const prompt = buildDebatePrompt(t, round, transcript);
|
||||
const taskModelOverride = t.model ?? params.model;
|
||||
const taskModelOverride = resolveSubagentModelOverride({
|
||||
agents,
|
||||
agentName: t.agent,
|
||||
task: prompt,
|
||||
requestedModel: t.model ?? params.model,
|
||||
parentModel,
|
||||
availableModels,
|
||||
});
|
||||
const result = cmuxSplitsEnabled
|
||||
? await runSingleAgentInCmuxSplit(
|
||||
cmuxClient,
|
||||
|
|
@ -749,11 +785,18 @@ async function executeSubagentInvocation({
|
|||
batchSize,
|
||||
batchId,
|
||||
);
|
||||
const taskModelOverride = t.model ?? params.model;
|
||||
const taskWithTrace = composeTaskWithParentTrace(
|
||||
t.task,
|
||||
t.parentTrace ?? params.parentTrace,
|
||||
);
|
||||
const taskModelOverride = resolveSubagentModelOverride({
|
||||
agents,
|
||||
agentName: t.agent,
|
||||
task: taskWithTrace,
|
||||
requestedModel: t.model ?? params.model,
|
||||
parentModel,
|
||||
availableModels,
|
||||
});
|
||||
const runTask = () =>
|
||||
cmuxSplitsEnabled
|
||||
? runSingleAgentInCmuxSplit(
|
||||
|
|
@ -861,7 +904,14 @@ async function executeSubagentInvocation({
|
|||
signal,
|
||||
onUpdate,
|
||||
makeDetails("single"),
|
||||
params.model,
|
||||
resolveSubagentModelOverride({
|
||||
agents,
|
||||
agentName: params.agent,
|
||||
task: singleTaskWithTrace,
|
||||
requestedModel: params.model,
|
||||
parentModel,
|
||||
availableModels,
|
||||
}),
|
||||
inheritanceEnvelope,
|
||||
)
|
||||
: await runSingleAgent(
|
||||
|
|
@ -874,7 +924,14 @@ async function executeSubagentInvocation({
|
|||
signal,
|
||||
onUpdate,
|
||||
makeDetails("single"),
|
||||
params.model,
|
||||
resolveSubagentModelOverride({
|
||||
agents,
|
||||
agentName: params.agent,
|
||||
task: singleTaskWithTrace,
|
||||
requestedModel: params.model,
|
||||
parentModel,
|
||||
availableModels,
|
||||
}),
|
||||
inheritanceEnvelope,
|
||||
);
|
||||
if (isolation) {
|
||||
|
|
@ -1994,6 +2051,10 @@ export default function (pi) {
|
|||
});
|
||||
const cmuxClient = CmuxClient.fromPreferences(effectivePreferences);
|
||||
const cmuxSplitsEnabled = cmuxClient.getConfig().splits;
|
||||
const parentModel = ctx.model
|
||||
? { provider: ctx.model.provider, id: ctx.model.id }
|
||||
: null;
|
||||
const availableModels = ctx.modelRegistry?.getAvailable?.() ?? [];
|
||||
// Resolve isolation mode
|
||||
const isolationMode = readIsolationMode();
|
||||
const useIsolation = Boolean(params.isolated) && isolationMode !== "none";
|
||||
|
|
@ -2116,6 +2177,8 @@ export default function (pi) {
|
|||
cmuxSplitsEnabled,
|
||||
useIsolation,
|
||||
inheritanceEnvelope,
|
||||
parentModel,
|
||||
availableModels,
|
||||
}),
|
||||
}
|
||||
: null;
|
||||
|
|
@ -2138,6 +2201,8 @@ export default function (pi) {
|
|||
cmuxSplitsEnabled,
|
||||
useIsolation,
|
||||
inheritanceEnvelope,
|
||||
parentModel,
|
||||
availableModels,
|
||||
}).then((result) => {
|
||||
// Append agent turn BEFORE deliverResult fires the notification.
|
||||
if (jobId && isSingleMode) {
|
||||
|
|
@ -2176,6 +2241,8 @@ export default function (pi) {
|
|||
cmuxSplitsEnabled,
|
||||
useIsolation,
|
||||
inheritanceEnvelope,
|
||||
parentModel,
|
||||
availableModels,
|
||||
});
|
||||
},
|
||||
renderCall(args, theme) {
|
||||
|
|
|
|||
65
src/resources/extensions/sf/subagent/lineage-routing.js
Normal file
65
src/resources/extensions/sf/subagent/lineage-routing.js
Normal file
|
|
@ -0,0 +1,65 @@
|
|||
import { isSameRootVendor } from "../uok/model-role-policy.js";
|
||||
|
||||
const REVIEW_AGENT_RE =
|
||||
/\b(reviewer|review-code|sf-reviewer|partner|advocate|adversary|combatant|critic|validator)\b/i;
|
||||
const REVIEW_TASK_RE =
|
||||
/\b(partner|advocate|reviewer|review|adversary|adversarial|combatant|challenger|critic|validation|validator)\b/i;
|
||||
|
||||
function modelKey(model) {
|
||||
if (!model) return undefined;
|
||||
if (typeof model === "string") return model;
|
||||
if (model.provider && model.id) return `${model.provider}/${model.id}`;
|
||||
return model.id;
|
||||
}
|
||||
|
||||
/**
|
||||
* Return true when a subagent request is review-like and needs lineage diversity.
|
||||
*
|
||||
* Purpose: keep partner/advisory/adversarial lanes from sharing the worker's
|
||||
* blind spots just because they are all launched through the generic
|
||||
* `subagent` tool.
|
||||
*
|
||||
* Consumer: subagent/index.js before launching single, chain, parallel, or
|
||||
* debate subagents.
|
||||
*/
|
||||
export function isLineageDiverseReviewSubagent(agentName, task) {
|
||||
return (
|
||||
REVIEW_AGENT_RE.test(String(agentName ?? "")) ||
|
||||
REVIEW_TASK_RE.test(String(task ?? ""))
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Resolve a subagent model override that avoids the parent worker lineage.
|
||||
*
|
||||
* Purpose: enforce R113 at launch time: partner, reviewer, validator, and
|
||||
* combatant/adversary subagents should use a different model family from the
|
||||
* parent worker when an alternative is available. If no alternative exists,
|
||||
* fail open and keep the requested model rather than blocking review entirely.
|
||||
*
|
||||
* Consumer: subagent/index.js model override resolution.
|
||||
*/
|
||||
export function resolveLineageDiverseSubagentModel({
|
||||
agentName,
|
||||
task,
|
||||
requestedModel,
|
||||
agentModel,
|
||||
parentModel,
|
||||
availableModels = [],
|
||||
}) {
|
||||
const requested = requestedModel ?? agentModel;
|
||||
if (!isLineageDiverseReviewSubagent(agentName, task)) return requestedModel;
|
||||
const parentKey = modelKey(parentModel);
|
||||
if (!parentKey) return requestedModel;
|
||||
const requestedKey = modelKey(requested);
|
||||
if (requestedKey && !isSameRootVendor(requestedKey, parentKey)) {
|
||||
return requestedModel;
|
||||
}
|
||||
const candidates = availableModels.filter((model) => {
|
||||
const key = modelKey(model);
|
||||
return key && !isSameRootVendor(key, parentKey);
|
||||
});
|
||||
const selected = candidates[0];
|
||||
const selectedKey = modelKey(selected);
|
||||
return selectedKey ?? requestedModel;
|
||||
}
|
||||
|
|
@ -16,8 +16,10 @@ const parallelPromptMock = vi.hoisted(() =>
|
|||
);
|
||||
|
||||
vi.mock("../auto-prompts.js", () => ({
|
||||
buildChallengePrompt: vi.fn(async (...args) => `challenge ${args.join(" ")}`),
|
||||
buildCompleteMilestonePrompt: vi.fn(async () => "complete milestone"),
|
||||
buildCompleteSlicePrompt: vi.fn(async () => "complete slice"),
|
||||
buildDeployPrompt: vi.fn(async () => "deploy"),
|
||||
buildDiscussMilestonePrompt: vi.fn(async () => "discuss milestone"),
|
||||
buildDiscussProjectPrompt: vi.fn(async () => "discuss project"),
|
||||
buildDiscussRequirementsPrompt: vi.fn(async () => "discuss requirements"),
|
||||
|
|
@ -33,14 +35,22 @@ vi.mock("../auto-prompts.js", () => ({
|
|||
buildResearchProjectPrompt: vi.fn(async () => "research project"),
|
||||
buildResearchMilestonePrompt: vi.fn(async () => "research milestone"),
|
||||
buildResearchSlicePrompt: vi.fn(async () => "research slice"),
|
||||
buildReleasePrompt: vi.fn(async () => "release"),
|
||||
buildRewriteDocsPrompt: vi.fn(async () => "rewrite docs"),
|
||||
buildRollbackPrompt: vi.fn(async () => "rollback"),
|
||||
buildRunUatPrompt: vi.fn(async () => "run uat"),
|
||||
buildSmokeProductionPrompt: vi.fn(async () => "smoke production"),
|
||||
buildValidateMilestonePrompt: vi.fn(async () => "validate milestone"),
|
||||
buildWorkflowPreferencesPrompt: vi.fn(async () => "workflow preferences"),
|
||||
checkNeedsReassessment: vi.fn(async () => null),
|
||||
checkNeedsRunUat: vi.fn(async () => null),
|
||||
}));
|
||||
|
||||
vi.mock("../preferences-models.js", () => ({
|
||||
_initPrefsLoader: vi.fn(),
|
||||
resolveModelWithFallbacksForUnit: vi.fn(() => ({ primary: "mock-model" })),
|
||||
}));
|
||||
|
||||
import {
|
||||
closeDatabase,
|
||||
insertAssessment,
|
||||
|
|
@ -326,4 +336,235 @@ describe("resolveDispatch canonical milestone plan", () => {
|
|||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("completed_slice_without_challenge_dispatches_slice_challenge_by_default", async () => {
|
||||
const base = makeTempDir("sf-dispatch-slice-challenge-");
|
||||
try {
|
||||
mkdirSync(join(base, ".sf"), { recursive: true });
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
insertMilestone({
|
||||
id: "M901",
|
||||
title: "Purpose challenge gate",
|
||||
status: "active",
|
||||
});
|
||||
insertSlice({
|
||||
milestoneId: "M901",
|
||||
id: "S01",
|
||||
title: "Completed work",
|
||||
status: "complete",
|
||||
sequence: 1,
|
||||
});
|
||||
|
||||
const result = await resolveDispatch({
|
||||
state: { phase: "completing-milestone" },
|
||||
mid: "M901",
|
||||
midTitle: "Purpose challenge gate",
|
||||
basePath: base,
|
||||
prefs: { phases: {} },
|
||||
session: {},
|
||||
pipelineVariant: "standard",
|
||||
});
|
||||
|
||||
expect(result).toMatchObject({
|
||||
action: "dispatch",
|
||||
unitType: "challenge",
|
||||
unitId: "M901/S01/challenge",
|
||||
});
|
||||
expect(result.prompt).toContain("slice S01");
|
||||
} finally {
|
||||
closeDatabase();
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("completing_milestone_after_slice_challenges_dispatches_milestone_challenge_before_completion", async () => {
|
||||
const base = makeTempDir("sf-dispatch-milestone-challenge-");
|
||||
try {
|
||||
mkdirSync(join(base, ".sf"), { recursive: true });
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
insertMilestone({
|
||||
id: "M902",
|
||||
title: "Milestone challenge gate",
|
||||
status: "active",
|
||||
});
|
||||
insertSlice({
|
||||
milestoneId: "M902",
|
||||
id: "S01",
|
||||
title: "Completed work",
|
||||
status: "complete",
|
||||
sequence: 1,
|
||||
});
|
||||
insertAssessment({
|
||||
path: ".sf/milestones/M902/slices/S01/S01-CHALLENGE.md",
|
||||
milestoneId: "M902",
|
||||
sliceId: "S01",
|
||||
scope: "slice-challenge:S01",
|
||||
status: "pass",
|
||||
fullContent:
|
||||
"## Self-review Round\nok\n## Partner Round\nok\n## Adversary Round\nok\n## Overall Verdict\n- `PASS`\n",
|
||||
});
|
||||
insertAssessment({
|
||||
path: ".sf/milestones/M902/M902-VALIDATION.md",
|
||||
milestoneId: "M902",
|
||||
scope: "milestone-validation",
|
||||
status: "pass",
|
||||
fullContent: "verdict: pass\nOperational: MET\n",
|
||||
});
|
||||
|
||||
const result = await resolveDispatch({
|
||||
state: { phase: "completing-milestone" },
|
||||
mid: "M902",
|
||||
midTitle: "Milestone challenge gate",
|
||||
basePath: base,
|
||||
prefs: { phases: {} },
|
||||
session: {},
|
||||
pipelineVariant: "standard",
|
||||
});
|
||||
|
||||
expect(result).toMatchObject({
|
||||
action: "dispatch",
|
||||
unitType: "challenge",
|
||||
unitId: "M902",
|
||||
});
|
||||
expect(result.prompt).toContain("milestone");
|
||||
} finally {
|
||||
closeDatabase();
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("completed_slice_when_challenge_file_has_no_verdict_dispatches_challenge_again", async () => {
|
||||
const base = makeTempDir("sf-dispatch-invalid-challenge-");
|
||||
try {
|
||||
mkdirSync(join(base, ".sf", "milestones", "M904", "slices", "S01"), {
|
||||
recursive: true,
|
||||
});
|
||||
writeFileSync(
|
||||
join(
|
||||
base,
|
||||
".sf",
|
||||
"milestones",
|
||||
"M904",
|
||||
"slices",
|
||||
"S01",
|
||||
"S01-CHALLENGE.md",
|
||||
),
|
||||
"# partial challenge without verdict\n",
|
||||
);
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
insertMilestone({
|
||||
id: "M904",
|
||||
title: "Invalid challenge backfill",
|
||||
status: "active",
|
||||
});
|
||||
insertSlice({
|
||||
milestoneId: "M904",
|
||||
id: "S01",
|
||||
title: "Completed work",
|
||||
status: "complete",
|
||||
sequence: 1,
|
||||
});
|
||||
|
||||
const result = await resolveDispatch({
|
||||
state: { phase: "planning" },
|
||||
mid: "M904",
|
||||
midTitle: "Invalid challenge backfill",
|
||||
basePath: base,
|
||||
prefs: { phases: {} },
|
||||
session: {},
|
||||
pipelineVariant: "standard",
|
||||
});
|
||||
|
||||
expect(result).toMatchObject({
|
||||
action: "dispatch",
|
||||
unitType: "challenge",
|
||||
unitId: "M904/S01/challenge",
|
||||
});
|
||||
} finally {
|
||||
closeDatabase();
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("skipped_slice_without_challenge_does_not_backfill_challenge", async () => {
|
||||
const base = makeTempDir("sf-dispatch-skipped-challenge-");
|
||||
try {
|
||||
mkdirSync(join(base, ".sf"), { recursive: true });
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
insertMilestone({
|
||||
id: "M905",
|
||||
title: "Skipped slice challenge",
|
||||
status: "active",
|
||||
});
|
||||
insertSlice({
|
||||
milestoneId: "M905",
|
||||
id: "S01",
|
||||
title: "Skipped work",
|
||||
status: "skipped",
|
||||
sequence: 1,
|
||||
});
|
||||
|
||||
const result = await resolveDispatch({
|
||||
state: { phase: "planning" },
|
||||
mid: "M905",
|
||||
midTitle: "Skipped slice challenge",
|
||||
basePath: base,
|
||||
prefs: { phases: {} },
|
||||
session: {},
|
||||
pipelineVariant: "standard",
|
||||
});
|
||||
|
||||
expect(result?.unitType).not.toBe("challenge");
|
||||
} finally {
|
||||
closeDatabase();
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
|
||||
test("completing_milestone_when_adversarial_review_disabled_keeps_completion_path", async () => {
|
||||
const base = makeTempDir("sf-dispatch-no-challenge-");
|
||||
try {
|
||||
mkdirSync(join(base, ".sf"), { recursive: true });
|
||||
openDatabase(join(base, ".sf", "sf.db"));
|
||||
insertMilestone({
|
||||
id: "M903",
|
||||
title: "Explicitly disabled challenge",
|
||||
status: "active",
|
||||
});
|
||||
insertSlice({
|
||||
milestoneId: "M903",
|
||||
id: "S01",
|
||||
title: "Completed work",
|
||||
status: "complete",
|
||||
sequence: 1,
|
||||
});
|
||||
insertAssessment({
|
||||
path: ".sf/milestones/M903/M903-VALIDATION.md",
|
||||
milestoneId: "M903",
|
||||
scope: "milestone-validation",
|
||||
status: "pass",
|
||||
fullContent: "verdict: pass\nOperational: MET\n",
|
||||
});
|
||||
|
||||
const result = await resolveDispatch({
|
||||
state: { phase: "completing-milestone" },
|
||||
mid: "M903",
|
||||
midTitle: "Explicitly disabled challenge",
|
||||
basePath: base,
|
||||
prefs: { phases: {}, deploy: { adversarial_review: false } },
|
||||
session: {},
|
||||
pipelineVariant: "standard",
|
||||
});
|
||||
|
||||
expect(result).toMatchObject({
|
||||
action: "dispatch",
|
||||
unitType: "complete-milestone",
|
||||
unitId: "M903",
|
||||
prompt: "complete milestone",
|
||||
});
|
||||
} finally {
|
||||
closeDatabase();
|
||||
cleanup(base);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -11,8 +11,10 @@ import { join } from "node:path";
|
|||
import { afterEach, test } from "vitest";
|
||||
import { dispatchDirectPhase } from "../auto-direct-dispatch.js";
|
||||
import { verifyExpectedArtifact } from "../auto-recovery.js";
|
||||
import { executeSummarySave } from "../tools/workflow-tool-executors.js";
|
||||
import {
|
||||
closeDatabase,
|
||||
getAssessmentByScope,
|
||||
insertMilestone,
|
||||
insertSlice,
|
||||
openDatabase,
|
||||
|
|
@ -130,3 +132,80 @@ test("dispatchDirectPhase_when_db_has_no_completed_slices_ignores_stale_roadmap_
|
|||
},
|
||||
]);
|
||||
});
|
||||
|
||||
test("verifyExpectedArtifact_challenge_requires_overall_verdict", () => {
|
||||
const project = makeProject();
|
||||
const milestoneDir = join(project, ".sf", "milestones", "M990");
|
||||
writeFileSync(join(milestoneDir, "M990-CHALLENGE.md"), "# Challenge\n");
|
||||
assert.equal(verifyExpectedArtifact("challenge", "M990", project), false);
|
||||
|
||||
writeFileSync(
|
||||
join(milestoneDir, "M990-CHALLENGE.md"),
|
||||
["# Challenge", "", "## Overall Verdict", "- `PASS`", ""].join("\n"),
|
||||
);
|
||||
assert.equal(verifyExpectedArtifact("challenge", "M990", project), false);
|
||||
|
||||
writeFileSync(
|
||||
join(milestoneDir, "M990-CHALLENGE.md"),
|
||||
[
|
||||
"# Challenge",
|
||||
"",
|
||||
"## Self-review Round",
|
||||
"Internal critique.",
|
||||
"",
|
||||
"## Partner Round",
|
||||
"Constructive critique.",
|
||||
"",
|
||||
"## Adversary Round",
|
||||
"Hostile critique.",
|
||||
"",
|
||||
"## Overall Verdict",
|
||||
"- `PASS`",
|
||||
"",
|
||||
].join("\n"),
|
||||
);
|
||||
assert.equal(verifyExpectedArtifact("challenge", "M990", project), true);
|
||||
});
|
||||
|
||||
test("executeSummarySave_challenge_persists_assessment_scope", async () => {
|
||||
const project = makeProject();
|
||||
|
||||
const result = await executeSummarySave(
|
||||
{
|
||||
milestone_id: "M990",
|
||||
slice_id: "S01",
|
||||
artifact_type: "CHALLENGE",
|
||||
content: [
|
||||
"# Challenge",
|
||||
"",
|
||||
"## Self-review Round",
|
||||
"Internal critique.",
|
||||
"",
|
||||
"## Partner Round",
|
||||
"Constructive critique.",
|
||||
"",
|
||||
"## Adversary Round",
|
||||
"Hostile critique.",
|
||||
"",
|
||||
"## Finding 1: none",
|
||||
"",
|
||||
"## Overall Verdict",
|
||||
"- `ADVISORY`",
|
||||
"",
|
||||
].join("\n"),
|
||||
},
|
||||
project,
|
||||
);
|
||||
|
||||
assert.equal(result.isError, undefined);
|
||||
const assessment = getAssessmentByScope("M990", "slice-challenge:S01");
|
||||
assert.equal(assessment?.status, "advisory");
|
||||
assert.equal(
|
||||
assessment?.path,
|
||||
".sf/milestones/M990/slices/S01/S01-CHALLENGE.md",
|
||||
);
|
||||
assert.equal(
|
||||
verifyExpectedArtifact("challenge", "M990/S01/challenge", project),
|
||||
true,
|
||||
);
|
||||
});
|
||||
|
|
|
|||
|
|
@ -333,23 +333,23 @@ describe("selectAndApplyModel lineage-diverse wiring", () => {
|
|||
);
|
||||
});
|
||||
|
||||
// ── Non-lineage roles are not filtered ───────────────────────────────────────
|
||||
test("validator_role_is_not_filtered", async () => {
|
||||
// ── Validator review is lineage-diverse too ──────────────────────────────────
|
||||
test("validator_explicit_workerModelId_filters_matching_vendor", async () => {
|
||||
const candidates = [KIMI, ANTHROPIC, GOOGLE];
|
||||
// validator's constraints are strict-json + review, NO lineage-diverse-from-worker
|
||||
// → it should be able to pick kimi even when workerModelId is kimi
|
||||
// We just assert no throw and that the call completes normally.
|
||||
let threw = false;
|
||||
try {
|
||||
await dispatchFor({
|
||||
candidates,
|
||||
role: "validator",
|
||||
workerModelId: "kimi-k2.6",
|
||||
});
|
||||
} catch {
|
||||
threw = true;
|
||||
}
|
||||
assert.equal(threw, false, "validator dispatch should not throw");
|
||||
const applied = await dispatchFor({
|
||||
candidates,
|
||||
role: "validator",
|
||||
workerModelId: "kimi-k2.6",
|
||||
});
|
||||
if (applied === null) return;
|
||||
const isMoonshot =
|
||||
applied.provider === "kimi-coding" ||
|
||||
applied.id.toLowerCase().startsWith("kimi-");
|
||||
assert.equal(
|
||||
isMoonshot,
|
||||
false,
|
||||
`validator should not get kimi when worker was kimi, got: ${applied.provider}/${applied.id}`,
|
||||
);
|
||||
});
|
||||
|
||||
// ── modelRoleForUnitType maps challenge → adversary ──────────────────────────
|
||||
|
|
@ -360,6 +360,10 @@ describe("selectAndApplyModel lineage-diverse wiring", () => {
|
|||
"../uok/model-route-evidence.js"
|
||||
);
|
||||
assert.equal(modelRoleForUnitType("challenge"), "adversary");
|
||||
assert.equal(modelRoleForUnitType("combatant-review"), "reviewer");
|
||||
assert.equal(modelRoleForUnitType("combatant"), "adversary");
|
||||
assert.equal(modelRoleForUnitType("partner"), "reviewer");
|
||||
assert.equal(modelRoleForUnitType("advocate"), "reviewer");
|
||||
assert.equal(modelRoleForUnitType("review-slice"), "reviewer");
|
||||
assert.equal(modelRoleForUnitType("validate-milestone"), "validator");
|
||||
assert.equal(modelRoleForUnitType("execute-task"), "worker");
|
||||
|
|
|
|||
|
|
@ -88,7 +88,11 @@ describe("model role policy", () => {
|
|||
validator: {
|
||||
role: "validator",
|
||||
mode: "auto",
|
||||
constraints: ["strict-json", "review"],
|
||||
constraints: [
|
||||
"strict-json",
|
||||
"review",
|
||||
"lineage-diverse-from-worker",
|
||||
],
|
||||
},
|
||||
reviewer: {
|
||||
role: "reviewer",
|
||||
|
|
@ -140,6 +144,14 @@ describe("model role policy", () => {
|
|||
);
|
||||
});
|
||||
|
||||
test("validator_default_constraints_include_lineage_diverse_from_worker", () => {
|
||||
const policy = normalizeRolePolicy("validator");
|
||||
assert.ok(
|
||||
policy.constraints.includes("lineage-diverse-from-worker"),
|
||||
`expected lineage-diverse-from-worker in validator defaults, got: ${policy.constraints.join(", ")}`,
|
||||
);
|
||||
});
|
||||
|
||||
test("validateRolePolicy_adversary_with_lineage_diverse_from_worker_is_valid", () => {
|
||||
const result = validateRolePolicy("adversary", {
|
||||
constraints: ["lineage-diverse-from-worker"],
|
||||
|
|
|
|||
|
|
@ -0,0 +1,78 @@
|
|||
import assert from "node:assert/strict";
|
||||
import { test } from "vitest";
|
||||
|
||||
import {
|
||||
isLineageDiverseReviewSubagent,
|
||||
resolveLineageDiverseSubagentModel,
|
||||
} from "../subagent/lineage-routing.js";
|
||||
|
||||
const availableModels = [
|
||||
{ provider: "kimi-coding", id: "kimi-k2.6" },
|
||||
{ provider: "anthropic", id: "claude-sonnet-4-5" },
|
||||
{ provider: "google-gemini-cli", id: "gemini-3-pro" },
|
||||
];
|
||||
|
||||
test("isLineageDiverseReviewSubagent_detects_partner_and_combatant_lanes", () => {
|
||||
assert.equal(isLineageDiverseReviewSubagent("reviewer", "audit"), true);
|
||||
assert.equal(
|
||||
isLineageDiverseReviewSubagent("reviewer", "Partner: strengthen plan"),
|
||||
true,
|
||||
);
|
||||
assert.equal(
|
||||
isLineageDiverseReviewSubagent("reviewer", "Combatant: break plan"),
|
||||
true,
|
||||
);
|
||||
assert.equal(isLineageDiverseReviewSubagent("scout", "find files"), false);
|
||||
});
|
||||
|
||||
test("resolveLineageDiverseSubagentModel_when_review_inherits_worker_family_selects_other_family", () => {
|
||||
const model = resolveLineageDiverseSubagentModel({
|
||||
agentName: "reviewer",
|
||||
task: "Partner review",
|
||||
requestedModel: undefined,
|
||||
agentModel: undefined,
|
||||
parentModel: { provider: "kimi-coding", id: "kimi-k2.6" },
|
||||
availableModels,
|
||||
});
|
||||
|
||||
assert.equal(model, "anthropic/claude-sonnet-4-5");
|
||||
});
|
||||
|
||||
test("resolveLineageDiverseSubagentModel_when_explicit_same_family_selects_other_family", () => {
|
||||
const model = resolveLineageDiverseSubagentModel({
|
||||
agentName: "reviewer",
|
||||
task: "Combatant review",
|
||||
requestedModel: "kimi-coding/kimi-k2.6",
|
||||
agentModel: undefined,
|
||||
parentModel: { provider: "kimi-coding", id: "kimi-k2.6" },
|
||||
availableModels,
|
||||
});
|
||||
|
||||
assert.equal(model, "anthropic/claude-sonnet-4-5");
|
||||
});
|
||||
|
||||
test("resolveLineageDiverseSubagentModel_when_non_review_keeps_requested_model", () => {
|
||||
const model = resolveLineageDiverseSubagentModel({
|
||||
agentName: "scout",
|
||||
task: "Find files",
|
||||
requestedModel: "kimi-coding/kimi-k2.6",
|
||||
agentModel: undefined,
|
||||
parentModel: { provider: "kimi-coding", id: "kimi-k2.6" },
|
||||
availableModels,
|
||||
});
|
||||
|
||||
assert.equal(model, "kimi-coding/kimi-k2.6");
|
||||
});
|
||||
|
||||
test("resolveLineageDiverseSubagentModel_when_no_alternative_fails_open", () => {
|
||||
const model = resolveLineageDiverseSubagentModel({
|
||||
agentName: "reviewer",
|
||||
task: "Partner review",
|
||||
requestedModel: "kimi-coding/kimi-k2.6",
|
||||
agentModel: undefined,
|
||||
parentModel: { provider: "kimi-coding", id: "kimi-k2.6" },
|
||||
availableModels: [{ provider: "kimi-coding", id: "kimi-k2.6" }],
|
||||
});
|
||||
|
||||
assert.equal(model, "kimi-coding/kimi-k2.6");
|
||||
});
|
||||
|
|
@ -12,6 +12,7 @@ import {
|
|||
getMilestone,
|
||||
getSliceStatusSummary,
|
||||
getSliceTaskCounts,
|
||||
insertAssessment,
|
||||
readTransaction,
|
||||
saveGateResult,
|
||||
setSliceUatVerdict,
|
||||
|
|
@ -34,6 +35,11 @@ export const SUPPORTED_SUMMARY_ARTIFACT_TYPES = [
|
|||
"CONTEXT",
|
||||
"ASSESSMENT",
|
||||
"CONTEXT-DRAFT",
|
||||
"CHALLENGE",
|
||||
"DEPLOY",
|
||||
"ROLLBACK",
|
||||
"SMOKE",
|
||||
"RELEASE",
|
||||
];
|
||||
export function isSupportedSummaryArtifactType(artifactType) {
|
||||
return SUPPORTED_SUMMARY_ARTIFACT_TYPES.includes(artifactType);
|
||||
|
|
@ -132,6 +138,20 @@ export async function executeSummarySave(params, basePath = process.cwd()) {
|
|||
},
|
||||
basePath,
|
||||
);
|
||||
if (params.artifact_type === "CHALLENGE") {
|
||||
const verdict = extractChallengeVerdict(params.content);
|
||||
insertAssessment({
|
||||
path: `.sf/${relativePath}`,
|
||||
milestoneId: params.milestone_id,
|
||||
sliceId: params.slice_id ?? null,
|
||||
taskId: params.task_id ?? null,
|
||||
status: verdict,
|
||||
scope: params.slice_id
|
||||
? `slice-challenge:${params.slice_id}`
|
||||
: "milestone-challenge",
|
||||
fullContent: params.content,
|
||||
});
|
||||
}
|
||||
// Persist UAT verdict to DB when an ASSESSMENT is saved at slice scope.
|
||||
// This makes checkNeedsRunUat DB-only — no file fallback needed.
|
||||
if (
|
||||
|
|
@ -194,6 +214,16 @@ export async function executeSummarySave(params, basePath = process.cwd()) {
|
|||
};
|
||||
}
|
||||
}
|
||||
|
||||
function extractChallengeVerdict(content) {
|
||||
const text = String(content ?? "");
|
||||
const explicit = text.match(
|
||||
/Overall Verdict[\s\S]{0,200}?\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i,
|
||||
);
|
||||
if (explicit) return explicit[1].toLowerCase();
|
||||
const fallback = text.match(/\b(PASS|NEEDS-REMEDIATION|ADVISORY)\b/i);
|
||||
return fallback ? fallback[1].toLowerCase() : "unknown";
|
||||
}
|
||||
export async function executeTaskComplete(params, basePath = process.cwd()) {
|
||||
const dbAvailable = await ensureDbOpen(basePath);
|
||||
if (!dbAvailable) {
|
||||
|
|
|
|||
|
|
@ -39,7 +39,12 @@ import {
|
|||
buildValidateMilestonePrompt,
|
||||
buildWorkflowPreferencesPrompt,
|
||||
} from "../auto-prompts.js";
|
||||
import { hasImplementationArtifacts } from "../auto-recovery.js";
|
||||
import {
|
||||
hasMinimumChallengeRounds,
|
||||
hasImplementationArtifacts,
|
||||
verifyExpectedArtifact,
|
||||
} from "../auto-recovery.js";
|
||||
import { resolveExpectedArtifactPath } from "../auto-artifact-paths.js";
|
||||
import { getCanonicalMilestonePlan } from "../canonical-milestone-plan.js";
|
||||
import { resolveDeepProjectSetupState } from "../deep-project-setup-policy.js";
|
||||
import { resolveEscalation } from "../escalation.js";
|
||||
|
|
@ -79,6 +84,7 @@ import { createScheduleStore } from "../schedule/schedule-store.js";
|
|||
import {
|
||||
getMilestone,
|
||||
getMilestoneSlices,
|
||||
getAssessmentByScope,
|
||||
getMilestoneValidationAssessment,
|
||||
getPendingGates,
|
||||
getRuntimeCounter,
|
||||
|
|
@ -208,6 +214,66 @@ async function readMilestoneValidationForDispatch(basePath, mid) {
|
|||
path: validationFile,
|
||||
};
|
||||
}
|
||||
|
||||
function adversarialReviewEnabled(prefs) {
|
||||
return (
|
||||
prefs?.deploy?.adversarial_review !== false &&
|
||||
prefs?.phases?.skip_adversarial_review !== true
|
||||
);
|
||||
}
|
||||
|
||||
function challengeAssessmentScope(sliceId) {
|
||||
return sliceId ? `slice-challenge:${sliceId}` : "milestone-challenge";
|
||||
}
|
||||
|
||||
function challengeUnitId(milestoneId, sliceId) {
|
||||
return sliceId ? `${milestoneId}/${sliceId}/challenge` : milestoneId;
|
||||
}
|
||||
|
||||
function challengeAlreadyRecorded(basePath, milestoneId, sliceId = null) {
|
||||
const expectedPath = resolveExpectedArtifactPath(
|
||||
"challenge",
|
||||
challengeUnitId(milestoneId, sliceId),
|
||||
basePath,
|
||||
);
|
||||
if (
|
||||
expectedPath &&
|
||||
existsSync(expectedPath) &&
|
||||
verifyExpectedArtifact(
|
||||
"challenge",
|
||||
challengeUnitId(milestoneId, sliceId),
|
||||
basePath,
|
||||
)
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
if (!isDbAvailable()) return false;
|
||||
const assessment = getAssessmentByScope(
|
||||
milestoneId,
|
||||
challengeAssessmentScope(sliceId),
|
||||
);
|
||||
if (!assessment) return false;
|
||||
const status = String(assessment.status ?? "")
|
||||
.trim()
|
||||
.toLowerCase();
|
||||
if (["", "unknown"].includes(status)) return false;
|
||||
return hasMinimumChallengeRounds(assessment.full_content);
|
||||
}
|
||||
|
||||
function findCompletedSliceNeedingChallenge(basePath, milestoneId) {
|
||||
if (!isDbAvailable()) return null;
|
||||
return getMilestoneSlices(milestoneId)
|
||||
.filter((slice) => slice.status === "complete" || slice.status === "done")
|
||||
.sort(
|
||||
(a, b) =>
|
||||
(a.sequence ?? Number.MAX_SAFE_INTEGER) -
|
||||
(b.sequence ?? Number.MAX_SAFE_INTEGER) || a.id.localeCompare(b.id),
|
||||
)
|
||||
.find(
|
||||
(slice) => !challengeAlreadyRecorded(basePath, milestoneId, slice.id),
|
||||
);
|
||||
}
|
||||
|
||||
function canonicalPlanStop(mid, plan) {
|
||||
return {
|
||||
action: "stop",
|
||||
|
|
@ -887,6 +953,27 @@ export const DISPATCH_RULES = [
|
|||
return null;
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "completed-slice → challenge",
|
||||
match: async ({ mid, midTitle, basePath, prefs }) => {
|
||||
if (!adversarialReviewEnabled(prefs)) return null;
|
||||
const slice = findCompletedSliceNeedingChallenge(basePath, mid);
|
||||
if (!slice) return null;
|
||||
return {
|
||||
action: "dispatch",
|
||||
unitType: "challenge",
|
||||
unitId: challengeUnitId(mid, slice.id),
|
||||
prompt: await buildChallengePrompt(
|
||||
mid,
|
||||
midTitle,
|
||||
`slice ${slice.id}`,
|
||||
prefs?.deploy?.adversarial_mode ?? "red-team",
|
||||
basePath,
|
||||
{ sliceId: slice.id },
|
||||
),
|
||||
};
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "reassess-roadmap (post-completion)",
|
||||
match: async ({ state, mid, midTitle, basePath, prefs }) => {
|
||||
|
|
@ -1613,7 +1700,7 @@ export const DISPATCH_RULES = [
|
|||
},
|
||||
{
|
||||
name: "completing-milestone → complete-milestone",
|
||||
match: async ({ state, mid, midTitle, basePath }) => {
|
||||
match: async ({ state, mid, midTitle, basePath, prefs }) => {
|
||||
if (state.phase !== "completing-milestone") return null;
|
||||
// Safety guard (#2675): completion is only automatic after a pass verdict.
|
||||
// Non-pass terminal verdicts are still terminal for validation loops, but
|
||||
|
|
@ -1799,6 +1886,23 @@ export const DISPATCH_RULES = [
|
|||
} catch {
|
||||
// Non-fatal advisory
|
||||
}
|
||||
if (
|
||||
adversarialReviewEnabled(prefs) &&
|
||||
!challengeAlreadyRecorded(basePath, mid)
|
||||
) {
|
||||
return {
|
||||
action: "dispatch",
|
||||
unitType: "challenge",
|
||||
unitId: challengeUnitId(mid),
|
||||
prompt: await buildChallengePrompt(
|
||||
mid,
|
||||
midTitle,
|
||||
"milestone",
|
||||
prefs?.deploy?.adversarial_mode ?? "red-team",
|
||||
basePath,
|
||||
),
|
||||
};
|
||||
}
|
||||
return {
|
||||
action: "dispatch",
|
||||
unitType: "complete-milestone",
|
||||
|
|
@ -1807,38 +1911,6 @@ export const DISPATCH_RULES = [
|
|||
};
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "completing-milestone → challenge",
|
||||
match: async ({ state, mid, midTitle, basePath, prefs }) => {
|
||||
if (state.phase !== "completing-milestone") return null;
|
||||
if (!prefs?.deploy?.adversarial_review) return null;
|
||||
// Only trigger if no challenge assessment exists for this milestone yet
|
||||
try {
|
||||
const { getDatabase } = await import("./sf-db.js");
|
||||
const db = getDatabase(basePath);
|
||||
const row = db
|
||||
.prepare(
|
||||
"SELECT id FROM assessments WHERE milestone_id = ? AND artifact_type = 'CHALLENGE' LIMIT 1",
|
||||
)
|
||||
.get(mid);
|
||||
if (row) return null;
|
||||
} catch {
|
||||
return null;
|
||||
}
|
||||
return {
|
||||
action: "dispatch",
|
||||
unitType: "challenge",
|
||||
unitId: `challenge-${mid}`,
|
||||
prompt: await buildChallengePrompt(
|
||||
mid,
|
||||
midTitle,
|
||||
"milestone",
|
||||
prefs?.deploy?.adversarial_mode ?? "red-team",
|
||||
basePath,
|
||||
),
|
||||
};
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "completing-milestone → release",
|
||||
match: async ({ state, mid, midTitle, basePath, prefs }) => {
|
||||
|
|
|
|||
|
|
@ -38,13 +38,9 @@ export const SUPPORTED_MODEL_ROLES = Object.freeze([
|
|||
* role's resolved model). Prevents "looks fine to me" rubber-stamp by
|
||||
* lineage-twins. When the worker model is unknown or maps to "unknown" vendor,
|
||||
* this constraint is treated as a no-op (fail-open rather than blocking all
|
||||
* candidates). Enforcement is best-effort at the model-selection layer.
|
||||
*
|
||||
* TODO(lineage-diverse-from-worker): wire handler in auto-model-selection.js
|
||||
* selectAndApplyModel — after the policy gate resolves `routingEligibleModels`,
|
||||
* call resolveWorkerLineage(resolvedWorkerModelId) and filter candidates with
|
||||
* !isSameRootVendor(candidate.id, workerVendor). See rootVendorFor() in this
|
||||
* file for the canonical vendor mapping.
|
||||
* candidates). Enforcement happens in auto-model-selection.js before final
|
||||
* route resolution. See rootVendorFor() in this file for the canonical vendor
|
||||
* mapping.
|
||||
*/
|
||||
export const SUPPORTED_MODEL_ROLE_CONSTRAINTS = Object.freeze([
|
||||
"coding",
|
||||
|
|
@ -79,7 +75,7 @@ const CONCRETE_ROUTE_KEYS = new Set([
|
|||
export const DEFAULT_MODEL_ROLE_CONSTRAINTS = Object.freeze({
|
||||
orchestrator: Object.freeze(["long-context", "strict-json"]),
|
||||
worker: Object.freeze(["coding"]),
|
||||
validator: Object.freeze(["strict-json", "review"]),
|
||||
validator: Object.freeze(["strict-json", "review", "lineage-diverse-from-worker"]),
|
||||
reviewer: Object.freeze(["review", "lineage-diverse-from-worker"]),
|
||||
adversary: Object.freeze(["review", "lineage-diverse-from-worker"]),
|
||||
});
|
||||
|
|
|
|||
|
|
@ -23,9 +23,13 @@ export function modelRoleForUnitType(unitType) {
|
|||
if (normalized.includes("review") || normalized.includes("scrutiny")) {
|
||||
return "reviewer";
|
||||
}
|
||||
if (normalized.includes("partner") || normalized.includes("advocate")) {
|
||||
return "reviewer";
|
||||
}
|
||||
if (
|
||||
normalized.includes("adversar") ||
|
||||
normalized.includes("challenge") ||
|
||||
normalized.includes("combatant") ||
|
||||
normalized.includes("red-team") ||
|
||||
normalized.includes("redteam")
|
||||
) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue