diff --git a/src/headless-triage.ts b/src/headless-triage.ts index 0dce57de7..ddffe0cad 100644 --- a/src/headless-triage.ts +++ b/src/headless-triage.ts @@ -24,9 +24,13 @@ * Consumer: headless.ts when command === "triage". */ -import { existsSync } from "node:fs"; +import { spawn } from "node:child_process"; +import { randomUUID } from "node:crypto"; +import { existsSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; import { join } from "node:path"; import { createJiti } from "@mariozechner/jiti"; +import { parse as parseYaml } from "yaml"; import { resolveBundledSourceResource } from "./bundled-resource-path.js"; import { getSfEnv } from "./env.js"; @@ -60,7 +64,9 @@ export interface HandleTriageOptions { list?: boolean; max?: number; run?: boolean; + apply?: boolean; model?: string; + agentRunner?: AgentRunner; } export interface HandleTriageResult { @@ -77,6 +83,480 @@ interface TriageCandidate { effortEstimate?: number; } +interface AgentConfig { + name: string; + model?: string; + tools?: string[]; + systemPrompt: string; + promptParts?: string[]; + source?: string; + filePath?: string; +} + +interface AgentRunResult { + ok: boolean; + output: string; + stderr?: string; + exitCode?: number; +} + +type AgentRunner = ( + agent: AgentConfig, + task: string, + options?: { tools?: string[]; model?: string }, +) => Promise; + +/** + * Triage-decider's output contract is a YAML fenced block with key + * `decisions:`. Parse it. Returns null when no plan is present or YAML + * fails to load — runTriageApply treats null as "do not apply" (safe + * default: when in doubt, never mutate). + * + * Why a structured plan instead of letting the decider call resolve_issue + * directly: codex review 2026-05-14 flagged that the original sequential + * design (decider → rubber-duck) let the decider mutate state during its + * own turn, before rubber-duck ever saw the decisions. This parser pulls + * the proposed actions out of the decider's text so they can be reviewed + * BEFORE any resolve_issue call. + */ +export interface TriageDecision { + id: string; + outcome: "fix" | "promote" | "close"; + evidenceKind?: string; + reason?: string; + proposedApproach?: string; + requirementId?: string; +} + +export function parseTriagePlan(text: string): TriageDecision[] | null { + if (typeof text !== "string" || text.length === 0) return null; + const fenceMatch = text.match(/```ya?ml\s*\n([\s\S]*?)\n```/i); + if (!fenceMatch) return null; + const yamlBody = fenceMatch[1]; + let parsed: unknown; + try { + parsed = parseYaml(yamlBody); + } catch { + return null; + } + const root = parsed as Record | null; + const decisions = root?.decisions; + if (!Array.isArray(decisions)) return null; + const out: TriageDecision[] = []; + for (const raw of decisions) { + if (!raw || typeof raw !== "object") continue; + const item = raw as Record; + const id = typeof item.id === "string" ? item.id : null; + const outcome = item.outcome; + if (!id) continue; + if (outcome !== "fix" && outcome !== "promote" && outcome !== "close") + continue; + const decision: TriageDecision = { id, outcome }; + if (typeof item.evidence_kind === "string") { + decision.evidenceKind = item.evidence_kind; + } else if (typeof item.evidenceKind === "string") { + decision.evidenceKind = item.evidenceKind; + } + if (typeof item.reason === "string") decision.reason = item.reason; + if (typeof item.proposed_approach === "string") { + decision.proposedApproach = item.proposed_approach; + } else if (typeof item.proposedApproach === "string") { + decision.proposedApproach = item.proposedApproach; + } + if (typeof item.requirement_id === "string") { + decision.requirementId = item.requirement_id; + } else if (typeof item.requirementId === "string") { + decision.requirementId = item.requirementId; + } + out.push(decision); + } + return out; +} + +function buildSfPrintLaunchArgs( + promptPath: string, + task: string, + agent: AgentConfig, + options: { tools?: string[]; model?: string } = {}, +): { command: string; args: string[] } { + const sfBinPath = process.env.SF_BIN_PATH || process.argv[1]; + const baseArgs = [ + "--mode", + "text", + "-p", + "--no-session", + "--append-system-prompt", + promptPath, + ]; + const tools = options.tools ?? agent.tools; + if (tools && tools.length > 0) baseArgs.push("--tools", tools.join(",")); + const model = options.model ?? agent.model; + if (model) baseArgs.push("--model", model); + baseArgs.push(`Task: ${task}`); + if (!sfBinPath) return { command: "sf", args: baseArgs }; + if (sfBinPath.endsWith(".js") || sfBinPath.endsWith(".ts")) { + return { command: process.execPath, args: [sfBinPath, ...baseArgs] }; + } + return { command: sfBinPath, args: baseArgs }; +} + +async function defaultAgentRunner( + agent: AgentConfig, + task: string, + options: { tools?: string[]; model?: string } = {}, +): Promise { + const tmpDir = mkdtempSync(join(tmpdir(), "sf-triage-agent-")); + const promptPath = join(tmpDir, `${agent.name}.md`); + // Compose the system prompt via the prompt-parts registry. Dynamic + // import because src/resources/ is excluded from the root tsconfig + // (extensions get their own build). If the module isn't available + // fall back to the agent's raw systemPrompt — degrades gracefully. + const promptPartsModule = (await jiti.import( + sfExtensionPath("subagent/prompt-parts"), + )) as { + composeAgentPrompt?: ( + agent: AgentConfig, + context: { cwd: string; surface: string; tools?: string[] }, + ) => string; + }; + const composed = + promptPartsModule.composeAgentPrompt?.(agent, { + cwd: process.cwd(), + surface: "headless", + tools: options.tools ?? agent.tools, + }) ?? agent.systemPrompt; + const appendedPrompt = `${composed}\n\n## Task Input\n\n${task}`; + writeFileSync(promptPath, appendedPrompt, { encoding: "utf-8", mode: 0o600 }); + try { + const launch = buildSfPrintLaunchArgs( + promptPath, + "Run the task input from the appended system prompt.", + agent, + options, + ); + return await new Promise((resolve) => { + const proc = spawn(launch.command, launch.args, { + cwd: process.cwd(), + env: process.env, + shell: false, + stdio: ["ignore", "pipe", "pipe"], + }); + let stdout = ""; + let stderr = ""; + proc.stdout.on("data", (chunk) => { + stdout += chunk.toString(); + }); + proc.stderr.on("data", (chunk) => { + stderr += chunk.toString(); + }); + proc.on("error", (err) => { + resolve({ + ok: false, + output: stdout, + stderr: err instanceof Error ? err.message : String(err), + exitCode: 1, + }); + }); + proc.on("close", (code) => { + resolve({ + ok: (code ?? 1) === 0, + output: stdout.trim(), + stderr: stderr.trim(), + exitCode: code ?? 1, + }); + }); + }); + } finally { + rmSync(tmpDir, { recursive: true, force: true }); + } +} + +async function emitTriageApplyJournal( + cwd: string, + flowId: string, + seq: number, + eventType: string, + data: Record = {}, +): Promise { + try { + const journalModule = (await jiti.import(sfExtensionPath("journal"))) as { + emitJournalEvent?: ( + basePath: string, + entry: Record, + ) => void; + }; + journalModule.emitJournalEvent?.(cwd, { + ts: new Date().toISOString(), + flowId, + seq, + eventType, + data, + }); + } catch { + // Journal is best-effort; the apply result remains authoritative. + } +} + +export async function runTriageApply( + cwd: string, + prompt: string, + options: { + model?: string; + agentRunner?: AgentRunner; + candidateCount?: number; + } = {}, +): Promise<{ + ok: boolean; + agreed: boolean; + error?: string; + deciderOutput?: string; + reviewOutput?: string; + resolvedIds: string[]; + flowId: string; +}> { + const flowId = `triage-apply-${randomUUID()}`; + let seq = 0; + const emit = (eventType: string, data: Record = {}) => + emitTriageApplyJournal(cwd, flowId, seq++, eventType, data); + await emit("triage-apply-start", { + candidateCount: options.candidateCount ?? null, + }); + const agentsModule = (await jiti.import( + sfExtensionPath("subagent/agents"), + )) as { + discoverAgents?: (cwd: string, scope: string) => { agents: AgentConfig[] }; + }; + const agents = agentsModule.discoverAgents?.(cwd, "both").agents ?? []; + const triageDecider = agents.find((agent) => agent.name === "triage-decider"); + const rubberDuck = agents.find((agent) => agent.name === "rubber-duck"); + if (!triageDecider || !rubberDuck) { + const missing = [ + triageDecider ? null : "triage-decider", + rubberDuck ? null : "rubber-duck", + ] + .filter(Boolean) + .join(", "); + await emit("triage-apply-failed", { reason: "missing-agent", missing }); + return { + ok: false, + agreed: false, + error: `Missing built-in agent(s): ${missing}`, + resolvedIds: [], + flowId, + }; + } + // Trusted-source guard (codex review 2026-05-14): when --apply will + // mutate the ledger, BOTH agents must be SF-shipped built-ins. A + // project-level override could silently disable rubber-duck's + // independence — operators can still customize behavior, but they + // have to use --review mode where mutations aren't auto-applied. + if (triageDecider.source !== "builtin" || rubberDuck.source !== "builtin") { + await emit("triage-apply-failed", { + reason: "untrusted-agent-source", + triageDeciderSource: triageDecider.source, + rubberDuckSource: rubberDuck.source, + }); + return { + ok: false, + agreed: false, + error: `Refusing to --apply with non-builtin agents (triage-decider=${triageDecider.source}, rubber-duck=${rubberDuck.source}). Use --review for inspect-only mode, or remove the project/user override.`, + resolvedIds: [], + flowId, + }; + } + + const runner = options.agentRunner ?? defaultAgentRunner; + + // Phase 1: triage-decider runs in PLAN-ONLY mode. Drop resolve_issue + // from its tool list (the YAML already drops it, but this is defense- + // in-depth in case a project override resurrects it). The decider + // emits a YAML decision plan; we parse it post-hoc. + const decider = await runner(triageDecider, prompt, { + model: options.model, + tools: ["view", "grep", "glob", "git_log"], + }); + await emit("triage-apply-decider-finished", { + ok: decider.ok, + exitCode: decider.exitCode ?? null, + }); + if (!decider.ok) { + await emit("triage-apply-failed", { reason: "decider-failed" }); + return { + ok: false, + agreed: false, + error: decider.stderr || "triage-decider failed", + deciderOutput: decider.output, + resolvedIds: [], + flowId, + }; + } + + // Parse the structured plan. If the decider didn't emit a valid plan + // (missing fenced block, malformed YAML, no decisions key), refuse to + // apply — we can't audit what we can't read. + const plan = parseTriagePlan(decider.output); + if (!plan || plan.length === 0) { + await emit("triage-apply-failed", { reason: "no-plan" }); + return { + ok: false, + agreed: false, + error: + "triage-decider did not emit a parseable decision plan (expected a fenced ```yaml block with a `decisions:` key)", + deciderOutput: decider.output, + resolvedIds: [], + flowId, + }; + } + await emit("triage-apply-plan-parsed", { + decisionCount: plan.length, + outcomes: plan.reduce>((acc, d) => { + acc[d.outcome] = (acc[d.outcome] ?? 0) + 1; + return acc; + }, {}), + }); + + // Phase 2: rubber-duck reviews the plan with read-only tools. The + // review task explicitly hands the plan as the artifact under + // scrutiny — the reviewer's job is to spot bad calls before they land. + const reviewTask = [ + "Review this self-feedback triage decision PLAN. The plan has NOT yet been applied — your verdict gates whether any resolve_issue mutation runs.", + 'Return "rubber-duck: agree" only if every decision in the plan is safe and coherent against the current code/ledger state.', + "On disagreement, name each concerning decision explicitly so the operator (or a follow-up apply pass) can pull just those entries out and proceed with the rest.", + "", + "## Original triage prompt (the ledger entries the decider saw)", + prompt, + "", + "## triage-decider output (includes the plan as a fenced yaml block)", + decider.output, + ].join("\n"); + const review = await runner(rubberDuck, reviewTask, { + model: options.model, + tools: ["view", "grep", "glob", "git_log", "query_journal"], + }); + const agreed = /^rubber-duck:\s*agree\b/im.test(review.output.trim()); + await emit( + agreed + ? "triage-apply-rubber-duck-agree" + : "triage-apply-rubber-duck-disagree", + { + ok: review.ok, + exitCode: review.exitCode ?? null, + }, + ); + if (!review.ok) { + await emit("triage-apply-failed", { reason: "rubber-duck-failed" }); + return { + ok: false, + agreed: false, + error: review.stderr || "rubber-duck failed", + deciderOutput: decider.output, + reviewOutput: review.output, + resolvedIds: [], + flowId, + }; + } + if (!agreed) { + // Disagreement is a clean pause, not a failure. The plan and the + // review are both persisted in the decision report; the operator + // can read both and act. + return { + ok: false, + agreed: false, + error: "rubber-duck disagreed — pausing for operator review", + deciderOutput: decider.output, + reviewOutput: review.output, + resolvedIds: [], + flowId, + }; + } + + // Phase 3: apply the plan. We (this runner) call resolve_issue for + // each close/promote decision; fix decisions get surfaced for the + // operator but never auto-mutate. Mutations happen ONCE, post-review, + // and the resolvedIds we return reflect actual ledger state. + const resolvedIds = await applyTriagePlan(cwd, plan, emit); + return { + ok: true, + agreed: true, + deciderOutput: decider.output, + reviewOutput: review.output, + resolvedIds, + flowId, + }; +} + +/** + * Apply an approved decision plan. Calls markResolved (via the SF + * extension's self-feedback writer, which runs the existing writer- + * layer constraints — accepted evidence kinds, commit-exists check for + * agent-fix, etc.) for each close/promote decision. Fix decisions are + * not auto-applied; they require operator implementation work. + * + * Returns the list of ledger ids that were actually closed/promoted. + */ +async function applyTriagePlan( + cwd: string, + plan: TriageDecision[], + emit: (eventType: string, data?: Record) => Promise, +): Promise { + const resolvedIds: string[] = []; + const sfModule = (await jiti.import(sfExtensionPath("self-feedback"))) as { + markResolved?: ( + entryId: string, + resolution: Record, + basePath?: string, + ) => boolean; + }; + if (typeof sfModule.markResolved !== "function") { + await emit("triage-apply-mutation-failed", { + reason: "markResolved-unavailable", + }); + return resolvedIds; + } + for (const decision of plan) { + if (decision.outcome === "fix") { + // Fix decisions are operator handoffs — surface in the report + // (via the caller's deciderOutput / decision plan), don't mutate. + await emit("triage-apply-fix-pending-operator", { id: decision.id }); + continue; + } + const evidenceKind = + decision.evidenceKind ?? + (decision.outcome === "promote" + ? "promoted-to-requirement" + : "human-clear"); + const evidence: Record = { kind: evidenceKind }; + if (decision.outcome === "promote" && decision.requirementId) { + evidence.requirementId = decision.requirementId; + } + const reason = decision.reason ?? ""; + try { + const ok = sfModule.markResolved(decision.id, { reason, evidence }, cwd); + if (ok) { + resolvedIds.push(decision.id); + await emit("triage-apply-resolved", { + id: decision.id, + outcome: decision.outcome, + evidenceKind, + }); + } else { + await emit("triage-apply-mutation-rejected", { + id: decision.id, + outcome: decision.outcome, + evidenceKind, + note: "writer layer refused the resolution", + }); + } + } catch (err) { + await emit("triage-apply-mutation-failed", { + id: decision.id, + error: err instanceof Error ? err.message : String(err), + }); + } + } + return resolvedIds; +} + /** * Render the triage queue or canonical triage prompt to stdout. * @@ -147,20 +627,18 @@ export async function handleTriage( if (candidates.length === 0) { if (options.json) { - process.stdout.write( - `${JSON.stringify({ ok: true, candidates: [] })}\n`, - ); + process.stdout.write(`${JSON.stringify({ ok: true, candidates: [] })}\n`); } else { process.stdout.write("No open self-feedback candidates to triage.\n"); } return { exitCode: 0 }; } - // --run takes precedence over --json/--list because they describe the - // OUTPUT FORMAT, not the action. With --run, --json controls whether - // the run-result is JSON vs. human text. Without --run, --json emits + // --run/--apply take precedence over --json/--list because they describe the + // ACTION, not the output format. With --run/--apply, --json controls whether + // the result is JSON vs. human text. Without an action, --json emits // the candidate digest as JSON (the inspect path). - if (!options.run) { + if (!options.run && !options.apply) { if (options.json) { process.stdout.write( `${JSON.stringify({ @@ -186,8 +664,7 @@ export async function handleTriage( ); for (const c of candidates) { const impact = c.impactScore != null ? `i${c.impactScore}` : "i?"; - const effort = - c.effortEstimate != null ? `e${c.effortEstimate}` : "e?"; + const effort = c.effortEstimate != null ? `e${c.effortEstimate}` : "e?"; process.stdout.write( ` [${c.severity}] ${impact} ${effort} ${c.id} ${c.kind}\n`, ); @@ -208,11 +685,45 @@ export async function handleTriage( return { exitCode: 1 }; } - if (!options.run) { + if (!options.run && !options.apply) { process.stdout.write(`${prompt}\n`); return { exitCode: 0 }; } + if (options.apply) { + process.stderr.write( + "[triage] applying via triage-decider -> rubber-duck (this can take a few minutes)…\n", + ); + const result = await runTriageApply(cwd, prompt, { + model: options.model, + agentRunner: options.agentRunner, + candidateCount: candidates.length, + }); + const payload = { + ok: result.ok, + agreed: result.agreed, + error: result.error, + flowId: result.flowId, + resolvedIds: result.resolvedIds, + deciderOutput: result.deciderOutput, + reviewOutput: result.reviewOutput, + }; + if (options.json) { + process.stdout.write(`${JSON.stringify(payload)}\n`); + } else if (result.ok) { + process.stdout.write( + `Triage apply complete: rubber-duck agreed (${result.resolvedIds.length} resolved)\n`, + ); + if (result.resolvedIds.length > 0) { + process.stdout.write(`Resolved: ${result.resolvedIds.join(", ")}\n`); + } + } else { + process.stdout.write(`[triage] apply blocked: ${result.error}\n`); + if (result.reviewOutput) process.stdout.write(`${result.reviewOutput}\n`); + } + return { exitCode: result.ok ? 0 : 1 }; + } + // --run: dispatch the prompt via @singularity-forge/ai completeSimple, // capture the decision text, persist to .sf/triage/decisions/.md. // Same shape as `sf headless reflect --run`. The model's output is a diff --git a/src/resources/extensions/sf/agents/triage-decider.agent.yaml b/src/resources/extensions/sf/agents/triage-decider.agent.yaml index 6ed51cec9..c84e35af9 100644 --- a/src/resources/extensions/sf/agents/triage-decider.agent.yaml +++ b/src/resources/extensions/sf/agents/triage-decider.agent.yaml @@ -1,77 +1,89 @@ name: triage-decider displayName: Self-Feedback Triage Decider description: > - Reads the open self-feedback queue and decides each entry's outcome - (Fix, Promote, or Close). Calls resolve_issue directly for closures - and promotions; surfaces fixable entries to the operator with a - proposed approach. Wired by `sf headless triage --apply` after the - rubber-duck review stage agrees. + Reads the open self-feedback queue and proposes a decision plan + (Fix, Promote, or Close per entry). PLAN-ONLY: this agent does NOT + call resolve_issue directly. The plan is reviewed by rubber-duck; + only on agreement does the triage --apply runner execute the plan. + This separation exists so a bad decision can be blocked before any + mutation lands (see sf-mp5lnlbc-ty5fec / codex review 2026-05-14). tools: - - resolve_issue - view - grep - glob - git_log -# promptParts mirrors copilot's declarative composition matrix. SF doesn't -# yet honor these flags at runtime — they document INTENT so the day the -# prompt-composition runtime lands, this agent picks it up automatically -# without a YAML edit. Today's effective behavior is: the full `prompt:` -# below is used verbatim. promptParts: - includeAISafety: true - includeToolInstructions: true - includeParallelToolCalling: false - includeCustomAgentInstructions: false - includeEnvironmentContext: true + - aiSafety + - toolInstructions + - environmentContext prompt: | You are SF's self-feedback triage decider. Your job is to give each open forge-local self-feedback entry a decision — sitting open forever is the failure mode. + **PLAN-ONLY MODE:** You do not call resolve_issue. You produce a + structured decision plan. A separate review stage (rubber-duck) will + read your plan and either approve or block it before any mutation + happens. This separation is intentional: it prevents bad decisions + from landing without a second opinion. + For each entry, choose exactly one outcome: A. Fix it. The defect is real, in scope, and worth fixing now. - Describe the smallest coherent change. Do NOT - implement — surface the proposed approach. + Describe the smallest coherent change as + proposed_approach. Implementation is handed back + to the operator — your job is to decide, not code. B. Promote it. Real defect, but the right place to track is a - requirement, not a self-feedback entry. Call - resolve_issue with evidence kind - "promoted-to-requirement" after ensuring the - requirement row exists. + requirement, not a self-feedback entry. The apply + runner will close with evidence.kind = + "promoted-to-requirement"; cite the requirement + id you intend. C. Close it. The entry is no longer of value: stale, superseded, false positive, or not worth a fix at SF's current - priorities. Call resolve_issue with evidence kind - "human-clear" and a reason that names WHY. + priorities. The apply runner will close with + evidence.kind = "human-clear" and your reason. ## Decision procedure 1. For each entry: verify the claim still applies against the current code (use grep / view / git_log). - 2. If outcome A (fix): describe the smallest coherent change and - surface it as a `## Proposed fix for ` section. Do not call - resolve_issue — the operator (or a follow-up implementation pass) - handles the actual code edit + commit. - 3. If outcome B (promote): call resolve_issue with - `{kind: "promoted-to-requirement", requirementId: }` after - ensuring the requirement row exists. - 4. If outcome C (close): call resolve_issue with - `{kind: "human-clear"}` and a `reason` that names WHY the entry is - no longer of value (stale, superseded by , false - positive, out-of-scope). Be specific — a future reader should be - able to tell whether re-opening makes sense. - 5. Never use evidence kind `"auto-version-bump"` — that kind is - reserved for the automatic version-bump resolver and would - re-open under the credibility check. + 2. Choose outcome A, B, or C per entry. Closing entries deliberately + is valid and expected — not every entry needs a code fix. + 3. Never propose evidence.kind = "auto-version-bump" — that kind is + reserved for the automatic version-bump resolver and would re-open + under the credibility check. ## Tool boundaries - - You have resolve_issue (close/promote entries), view/grep/glob/git_log - (read-only investigation). You do NOT have edit/write/bash. Code - fixes go to the operator — your job is decisions, not implementation. + - You have read-only investigation: view, grep, glob, git_log. You + do NOT have resolve_issue, edit, write, or bash. Mutation is the + apply runner's job, after rubber-duck reviews your plan. ## Output contract - End your final message with the literal line: + Your final message MUST end with a single fenced YAML block named + `decisions` containing one entry per ledger item. Format: + + ```yaml + decisions: + - id: sf-xxxx-yyyyyy + outcome: close # one of: fix | promote | close + evidence_kind: human-clear # required for close (human-clear) or + # promote (promoted-to-requirement); + # omit for fix + reason: > + Specific WHY. Cite commit / entry / file paths as evidence. + A future reader should be able to tell whether re-opening makes + sense. + proposed_approach: > # only for outcome: fix + Smallest coherent change. Files to touch, key invariants. Brief. + requirement_id: REQ-xxx # only for outcome: promote + ``` + + Then on a line by itself: `Self-feedback triage complete.` - This marker confirms the decision pass terminated cleanly. + This marker confirms the plan pass terminated cleanly. If you cannot + produce a complete plan for any reason, omit the marker and explain + what blocked you — the apply runner will treat absence of the marker + as "do not apply".