From 9432dace89b3b21ebf827c16f6225b3488733078 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sun, 17 May 2026 03:37:00 +0200 Subject: [PATCH] feat: roadmap expansion (M010-M030) + Unified Dispatch v2 scaffold (M010/S01+S02) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit REQUIREMENTS.md: 15 → 48 R-entries covering self-heal, inline dispatch, MessageBus coherence, multi-model routing, reconciliation, operator tooling, docs sync, test-backed completion, cost accountability, portability, federation, skills marketplace, privacy, ADR enforcement, idempotency, plan determinism, performance budgets, operator steering, purpose-driver enforcement (R036-R040), R-to-milestone bootstrap (R041-R044), R-auto-expansion (R045), parallel dispatch (R046), per-R validation (R047), unbroken purpose chain R→M→S→T→code (R048). 40/48 mapped to milestone slices. PROJECT.md: reconciled with reality (M001/M003/M004/M005/M006 complete; M010-M030 queued; cancelled/skipped properly categorized). New code (M010/S01+S02 delivered): - dispatch/run-unit-inline.js: callable runUnitInline(unitType, unitId, opts) for in-process unit execution. Routes through runSubagent without spawn or worktree. Covers validate-milestone, complete-milestone, reassess-roadmap. - dispatch/dispatch-layer.js: DispatchLayer class with full 4D API per UNIFIED_DISPATCH_V2_PLAN.md. Implements full|managed|inline|single config; other cells return structured not-implemented errors with named owners. Tests: run-unit-inline.test.mjs (5/5), dispatch-layer.test.mjs (8/8), m006-s02-manifest-drift.test.mjs (2/2 regression guard for the manifest drift class). Bug fix: state-db.js cancelled-milestone branch in buildRegistryAndFindActive (resolves sf-mp8aotmq-jxby91). Dispatcher no longer routes plan-milestone at cancelled stubs. M005/M006 honest closeouts via VALIDATION.md + SUMMARY.md with operational verification class evidence. M001-6377a4 SUMMARY retrofit. 
auto-prompts.js: M005 round-2 remediation — removed manual knowledge/graph re-injection from 4 simple builders + migrated research-milestone to fully declarative composer ordering. unit-context-manifest.js: research-milestone manifest moved knowledge to inline-position + graph to computed. swarm-dispatch.js: debugLog instrumentation for diagnosis (before-busDispatch / after-busDispatch / before-runAgentTurn / watchdog-about-to-call-runAgentTurn). research-milestone.md prompt + research.md template: tuned for heavy research (deep-mode default, 8-12 web search budget, mandatory Comparable Systems section). Co-Authored-By: Claude Opus 4.7 (1M context) --- .sf/PROJECT.md | 55 ++- .sf/REQUIREMENTS.md | 435 +++++++++++++++++- src/resources/extensions/sf/auto-prompts.js | 96 ++-- .../extensions/sf/dispatch/dispatch-layer.js | 217 +++++++++ .../extensions/sf/dispatch/run-unit-inline.js | 298 ++++++++++++ src/resources/extensions/sf/state-db.js | 13 + .../sf/tests/dispatch-layer.test.mjs | 128 ++++++ .../sf/tests/m006-s02-manifest-drift.test.mjs | 108 +++++ .../sf/tests/run-unit-inline.test.mjs | 78 ++++ .../extensions/sf/unit-context-manifest.js | 14 +- .../extensions/sf/uok/swarm-dispatch.js | 6 + 11 files changed, 1354 insertions(+), 94 deletions(-) create mode 100644 src/resources/extensions/sf/dispatch/dispatch-layer.js create mode 100644 src/resources/extensions/sf/dispatch/run-unit-inline.js create mode 100644 src/resources/extensions/sf/tests/dispatch-layer.test.mjs create mode 100644 src/resources/extensions/sf/tests/m006-s02-manifest-drift.test.mjs create mode 100644 src/resources/extensions/sf/tests/run-unit-inline.test.mjs diff --git a/.sf/PROJECT.md b/.sf/PROJECT.md index c0ec229d8..f4137eb53 100644 --- a/.sf/PROJECT.md +++ b/.sf/PROJECT.md @@ -1,35 +1,62 @@ -# Project: SF Autonomous Self-Healing +# Project: Singularity Forge — Purpose-to-Software Compiler ## What This Is -This project implements self-healing capabilities for the Singularity Forge (SF) 
autonomous execution loop. It addresses the issue of the loop halting silently when encountering blocking states, such as "needs-attention" validation verdicts, by introducing graduated escalation (notifications, self-feedback) and automated recovery (auto-remediation, auto-deferral). +Singularity Forge (SF) is a **purpose-to-software compiler** per [ADR-0000](../docs/adr/0000-purpose-to-software-compiler.md). It takes a stated purpose (a milestone vision) and autonomously plans, executes, validates, and ships the software that fulfills it — through a loop of LLM-driven units (research, plan, execute, validate, complete) operating on a DB-backed state machine. + +This project file used to scope the M003 self-healing milestone only; it now covers the whole SF product because the roadmap has expanded beyond M003. ## Core Value -The autonomous loop should never sit silently stuck. Every halt must be communicated to the operator and, where safe, attempts should be made to resolve the blockage autonomously. +- **Purpose-first**: every milestone, slice, and task ties back to a stated purpose (ADR-0000). +- **Autonomous loop never sits silently stuck**: every halt is communicated to the operator and self-healed where safe (M003). +- **Manifest-driven prompt composition**: every unit's prompt is a verifiable function of its manifest (M004, M005, M006). +- **Reliable in-process dispatch**: inline scope eliminates spawn-based silent failures (M010, in flight). -## Current State +## Current State (2026-05-17) -- S01 complete: HaltWatchdog detects forced 'stop' state and emits 'stuck' signal after threshold. -- S02 complete: Durable BLOCKING_NOTICE persists to .sf/notifications.jsonl with defensive initialization hardened. -- Remaining: S03 (self-feedback), S04 (remediation dispatcher), S05 (auto-defer confidence), S06 (E2E integration). 
+### Delivered + +- **M001-6377a4** complete — Foundational doctrine (purpose-driven compiler framing, runaway-guard hardening, evidence/recovery surfaces, surface coherence, ACE convergence patterns). +- **M003** complete — Autonomous Self-Healing and Escalation. All 6 slices shipped (S01: HaltWatchdog; S02: durable BLOCKING_NOTICE; S03: halt self-feedback `6a2c61d5e`; S04: remediation dispatcher `bb0c87fda`; S05: auto-defer confidence `c6b8815ad` + `f48a4cc7c`; S06: end-to-end integration verified by the 2026-05-17 dogfood run where the autonomous loop dispatched reassess-roadmap, made 70 tool calls, and exited cleanly with dispatch-stop). +- **M004** complete — Phase 3 migration to composeUnitContext (initial 19/26 builders). +- **M005** complete — V2 migration (remaining builders + ARTIFACT_KEYS formalization + duplication-bug remediation for 5 simple builders). +- **M006** complete — Manifest-Driven Context v2 (final 2 builders migrated; regression test guard pinned in `src/resources/extensions/sf/tests/m006-s02-manifest-drift.test.mjs`). +- **M002**, **M007**, **M008** — cancelled / stub-skipped. + +### In-flight (new this session — queued, not delivered) + +- **M010** — Unified Dispatch v2: `inline` scope for `full` isolation. Owns R013-R015. 6 slices. +- **M011** — Defective-Complete Milestone Self-Heal. Owns R011-R012. 5 slices. +- **M012** — MessageBus Persistence + Inbox Coherence. Fixes the root cause of the prompt-never-sent bug class. 3 slices. +- **M013** — Multi-Model Routing Intelligence. Tool-failure model demotion. 4 slices. +- **M014** — Project Roadmap & State Reconciliation. DB↔file drift detection + `sf reconcile`. 4 slices. +- **M015** — Operational Tooling: triage + state hygiene CLI commands. 5 slices. ## Architecture / Key Patterns - **Auto-Loop**: `src/resources/extensions/sf/auto/loop.js` manages iteration and phase dispatch. 
- **Dispatch Rules**: `src/resources/extensions/sf/uok/auto-dispatch.js` determines the next action based on milestone/slice state. +- **Swarm Dispatch (current)**: `src/resources/extensions/sf/uok/swarm-dispatch.js` routes through MessageBus + `runAgentTurn`. To be partially replaced by inline scope (M010). - **Self-Feedback**: `src/resources/extensions/sf/self-feedback.js` provides the registry for anomalous behavior. - **Notification Store**: `src/resources/extensions/sf/notification-store.js` persists notifications to `.sf/notifications.jsonl` (fail-open, idempotent init). +- **Manifest-Driven Composition**: `src/resources/extensions/sf/unit-context-manifest.js` declares per-unit-type inline + computed artifacts; `composeUnitContext` assembles prompts deterministically. +- **Web UI**: `web/` (Next.js 16) — dashboard, terminal, session manager, settings. Builds and runs (verified 2026-05-17). ## Capability Contract -See `.sf/REQUIREMENTS.md` for the explicit capability contract, requirement status, and coverage mapping. +See [.sf/REQUIREMENTS.md](REQUIREMENTS.md) for the explicit capability contract, requirement status, and coverage mapping. R001-R010 are M003-M005 era contracts (all covered). R011-R015 are queued into M010-M011 (this session). ## Milestone Sequence -- [x] M003/S01: Idle Halt Detection — Loop watchdog detects persistent stop states. -- [x] M003/S02: Escalation Plumbing — Durable notifications land in `.sf/notifications.jsonl`. -- [ ] M003/S03: Halt Self-Feedback — Structured SELF-FEEDBACK.md entries after halt. -- [ ] M003/S04: Remediation Dispatcher — Auto-dispatch remediation slices on needs-attention. -- [ ] M003/S05: Auto-Defer Confidence — Low-confidence findings auto-deferred. -- [ ] M003/S06: End-to-End Integration — Full self-healing flow in headless run. 
+- [x] M001-6377a4: Foundational doctrine (5 slices) +- [x] M003: Autonomous Self-Healing — full self-heal loop (S01-S06) +- [x] M004: Phase 3 migration (19 builders) +- [x] M005: V2 migration (remaining builders + duplication-bug remediation) +- [x] M006: Manifest-Driven Context v2 (final builders + regression guard) +- [ ] M010: Unified Dispatch v2 — inline scope (R013-R015) +- [ ] M011: Defective-Complete Milestone Self-Heal (R011-R012) +- [ ] M012: MessageBus Persistence + Inbox Coherence (new R016) +- [ ] M013: Multi-Model Routing Intelligence (new R017) +- [ ] M014: Project Roadmap & State Reconciliation (new R018) +- [ ] M015: Operational Tooling — triage + state hygiene (new R019) diff --git a/.sf/REQUIREMENTS.md b/.sf/REQUIREMENTS.md index 346c15613..8b017be89 100644 --- a/.sf/REQUIREMENTS.md +++ b/.sf/REQUIREMENTS.md @@ -164,11 +164,290 @@ This file is the explicit capability and coverage contract for the project. - Description: Until R013/R014 land for every unit type, the existing spawn path must fail loudly. If a dispatched worker fails to write its session JSONL within a configurable timeout (default 30s) AND has zero `progressCount`, the runtime must (a) transition the unit to `status: failed`, (b) capture any stderr from the spawn into `lineage.events`, (c) emit a doctor-visible signal, and (d) trigger the retry path up to `maxRetries`. Today the runaway watchdog only fires a warning and never retries — `recoveryAttempts` stays at 0. - Why it matters: Even after inline scope retires the spawn path for the common cases, spawn-based dispatch will persist for milestone/slice-scope workers and parallel modes. Silent failure is the worst possible behavior — operator sees a "running" unit that's a ghost. This requirement keeps the spawn path observable for as long as it exists. 
- Source: spec -- Primary owning slice: unmapped +- Primary owning slice: M010/S04 - Supporting slices: none - Validation: unmapped - Notes: Touches the runaway-recovery / unit-ownership / parallel-orchestrator surfaces. Distinct from R013 — R013 removes the bug for inline scope; R015 contains the bug for non-inline scope. +### R016 — MessageBus Inbox Coherence Invariant +- Class: core-capability +- Status: active +- Description: After `_busDispatch(envelope)` resolves, the target agent's `_inbox` must see the new message on the next `refresh()` call. The current implementation can have the swarm-dispatch bus and the agent's inbox bus as separate instances with independent SQLite read caches, causing the just-dispatched message to be invisible (root cause of the chronic prompt-never-sent class — sf-mp8g4rcd-w01tkh, sf-mp8c0arc-vgw8io, and the 56+ runaway-loop:idle-halt entries on M005). The bounded-retry workaround at `agent-runner.js:174-241` papers over this; R016 fixes the underlying coherence. +- Why it matters: The autonomous loop's reliability rests on dispatched messages reaching their consumer. Silent message loss is the worst-possible failure: heartbeats keep firing while the LLM is never called, operator sees "progress" that doesn't exist, recoveryAttempts stays at 0. Without coherence, every dispatch is best-effort. +- Source: spec +- Primary owning slice: M012/S01 +- Supporting slices: M012/S02, M012/S03 +- Validation: unmapped +- Notes: Two viable fixes — (a) single shared MessageBus singleton across SF kernel + agent inboxes, (b) sync barrier in `_busDispatch` that returns only after the message is durably visible. M012/S01 documents the tradeoffs. Resolves sf-mp8g4rcd-w01tkh family. 
+ +### R017 — Tool-Failure Model Demotion +- Class: differentiator +- Status: active +- Description: When a model fails 2+ consecutive tool calls on a unit, SF must proactively switch to the next-best model in the fallback chain and soft-demote the failing model for future units of the same type. Today `agent-loop.ts:368` only stops after 3 consecutive all-failure turns (Schema overload cap), wasting budget. This requirement wires tool-call failures into the existing provider-failover infrastructure (commit `53259aebf` established the pattern at the provider-error layer). +- Why it matters: Provider failures get sane failover; tool failures don't — yet they're equally indicative that the current model isn't fit for the work. Wasting budget on a model that consistently fails tool calls reduces autonomous-loop reliability and increases cost. +- Source: spec (originated from kimi-for-coding triage-decider audit of sf-mp8zxh3f-weyalx, 2026-05-17) +- Primary owning slice: M013/S02 +- Supporting slices: M013/S01, M013/S03, M013/S04 +- Validation: unmapped +- Notes: 6-step plan per the triage-decider analysis: counter → switch on threshold → routing penalty → exhaustion → visibility → test. Soft-demotion store: module-level Map with 30-min TTL (simplest coherent store; migrate to routing_feedback table later if cross-session persistence needed). + +### R018 — Roadmap Coherence +- Class: quality-attribute +- Status: active +- Description: PROJECT.md / ROADMAP.md (when present) and the DB-backed milestone/slice state must not drift. Doctor must detect mismatches (e.g. PROJECT.md says S03 [ ] unchecked but DB says complete) and surface them as structured issues. A new `sf reconcile` command must audit drift (read-only by default) and apply fixes (with --apply) interactively. Section-aware PROJECT.md writer preserves handwritten prose elsewhere in the file. 
+- Why it matters: Drift between DB and human-facing roadmap docs was the #1 trust issue during the 2026-05-17 dogfood session — the DB said "all complete" while PROJECT.md showed 4 unchecked slices, undermining every claim of milestone delivery. Reconciliation must be deterministic, safe, and discoverable. +- Source: spec (originated from the dogfood session reality check) +- Primary owning slice: M014/S01 +- Supporting slices: M014/S02, M014/S03, M014/S04 +- Validation: unmapped +- Notes: Includes ghost-milestone detection (e.g. M009-w9xoug empty stub, legacy M001/M002 disk directories with no DB row). Reconcile must not lose handwritten content — section-aware writer is the safety net. + +### R019 — Operator Recovery Commands +- Class: quality-attribute +- Status: active +- Description: First-class CLI commands for hygiene operations that the 2026-05-17 dogfood session had to perform manually via `node -e` and `rm`: `sf triage clear/status/resolve`, `sf milestone forget`, `sf inline-fix reset`, `sf lock check/release`, `sf reset --soft`. Each command has --help docs, refuses destructive operations without explicit confirmation, and tests cover both happy and conflict paths. +- Why it matters: When SF gets stuck on stale state (zombie inline-fix dispatch, orphan lock from crashed sf, accumulated self-feedback queue), the operator currently needs intimate code knowledge to unstick it. R019 makes recovery a one-line command per failure mode. +- Source: spec (originated from the dogfood session manual interventions) +- Primary owning slice: M015/S01 +- Supporting slices: M015/S02, M015/S03, M015/S04, M015/S05 +- Validation: unmapped +- Notes: Indirectly serves R006 (Fail-Open Safety) — when self-heal fails, the operator has a tighter feedback loop. operator-recovery.md doc page is the discoverable surface. 
+ +### R020 — Inline-Scope Equivalence Proof +- Class: quality-attribute +- Status: active +- Description: M010's inline-scope dispatch must produce output structurally equivalent to the spawn-based path for the same unit invocation. A regression test takes a fixture unit (e.g. validate-milestone on a known milestone), dispatches it both ways (inline + spawn), and asserts the resulting artifacts (SUMMARY.md, VALIDATION.md, session JSONL message contents excluding timestamps and session IDs) match. +- Why it matters: M010 is a fundamental dispatch refactor; without an equivalence proof, regressions could silently degrade prompt quality or tool-call patterns. The test is the safety net that lets us land inline scope confidently. +- Source: spec +- Primary owning slice: M010/S06 +- Supporting slices: none +- Validation: unmapped +- Notes: Equivalence is *structural*, not byte-exact — timestamps, IDs, and provider-specific metadata are normalized. + +### R021 — Self-Feedback Triage Automation Quality +- Class: quality-attribute +- Status: active +- Description: The kimi-for-coding triage-decider produced excellent structured triage decisions in 8 minutes (2026-05-17, sf-mp8zxh3f-weyalx). This requirement formalizes the quality bar: every triage decision must include (a) verification against current code with file:line citations, (b) outcome (fix / wontfix / dup / stale / promoted), (c) sibling-entry awareness (don't refile resolved patterns), (d) proposed approach with concrete acceptance criteria and a file list. Triage that doesn't meet this bar is rejected and re-dispatched. +- Why it matters: Self-feedback triage is the gate between "we found a bug" and "we know how to fix it." Low-quality triage wastes the queue (filing duplicates, wishful claims, missing context); high-quality triage compounds into reusable fix recipes. 
+- Source: spec +- Primary owning slice: unmapped (would belong to a future "M016 Triage Quality" milestone, or could extend M015) +- Supporting slices: none +- Validation: unmapped +- Notes: The kimi-for-coding triage decision template (verify → outcome → reason → proposed_approach with numbered AC1-N steps) is the de facto contract. + +### R022 — Web Dashboard Reflects Autonomous-Loop State +- Class: failure-visibility +- Status: active +- Description: The Next.js web UI at `web/` must reflect current autonomous-loop state in real time: active milestone, current unit, recent dispatches, unresolved self-feedback count, locked/unlocked, last heartbeat. Operator should not need to run `sf status` in a terminal to know SF's state. +- Why it matters: A working dashboard is the difference between "SF runs autonomously" (operator can ignore it) and "SF runs autonomously but only if I babysit it." Web is the natural surface for that situational awareness. +- Source: spec +- Primary owning slice: unmapped (future "M017 Web Dashboard Coverage") +- Supporting slices: none +- Validation: unmapped +- Notes: Web builds and runs (verified 2026-05-17, `npm start` returns HTTP 200 in 268ms). Routes exist for sessions, settings, projects. Coverage of autonomous-loop state specifically is what this requirement addresses. + +### R023 — Documentation Generation From DB State +- Class: quality-attribute +- Status: active +- Description: PROJECT.md, ROADMAP.md, REQUIREMENTS.md, and per-milestone CONTEXT.md / SUMMARY.md should be derivable from DB state via deterministic generators that preserve handwritten sections. Operator runs `sf docs sync` to write the canonical form; conflicts surface for resolution. +- Why it matters: Human-facing docs decay as the DB state evolves; this drift was the central symptom that the user caught in this session ("must be a lot more"). Deterministic generation closes the loop. 
+- Source: spec +- Primary owning slice: unmapped (future "M018 Docs Sync") — partially overlaps M014/S04 (section-aware PROJECT.md writer) +- Supporting slices: none +- Validation: unmapped +- Notes: Consider this the natural extension of M014 — instead of just reconciling PROJECT.md, generate all doc surfaces from DB. + +### R024 — Test-Backed Milestone Completion +- Class: quality-attribute +- Status: active +- Description: A milestone moves to status=complete only when a CI-suitable test demonstrates the milestone's promise. Manual marking via DB UPDATE is forbidden in production (development hotfixes excepted with documented reason). The `complete-milestone` unit must verify the operational evidence section of VALIDATION.md before promoting. +- Why it matters: This dogfood session marked M005 and M006 complete via manual DB writes after handwriting validation docs. Honest delivery requires automated proof, not operator assertion. +- Source: spec +- Primary owning slice: unmapped (extends M010/S06 or future M019) +- Supporting slices: none +- Validation: unmapped +- Notes: Test-execution evidence (test command + pass count + duration) belongs in VALIDATION.md's Operational verification class. complete-milestone must enforce. + +### R025 — Real-Cost Budget Accountability +- Class: constraint +- Status: active +- Description: Every autonomous-loop run reports actual LLM cost (token count × provider price) per unit type, slice, and milestone. Operator sets a budget ceiling per milestone; SF aborts and reports if exceeded. Existing `Cost: $0.0000` in `sf status` must populate from real provider responses, not stay at zero. +- Why it matters: Autonomous loops have unbounded cost if not gated. Budget visibility is essential to trust SF in production. Currently cost reporting is a stub. 
+- Source: spec +- Primary owning slice: unmapped (future "M020 Cost Accountability") +- Supporting slices: none +- Validation: unmapped +- Notes: Builds on existing `headless-usage.ts`. Real-time pricing data per model is the missing piece — needs a price registry. + +### R026 — Slice-Level Real-Time Progress Streaming +- Class: failure-visibility +- Status: active +- Description: While a slice is executing, operators (CLI + web) must see real-time progress: current task, last 3 tool calls, file changes count, token spend so far. Updates push at ≤1Hz; no polling required. +- Why it matters: Long-running slices (e.g. research-milestone with 25-iteration solver loops) are opaque today — operators can only check `sf status` periodically and see heartbeats. Real-time progress builds trust + enables early intervention when work goes off-track. +- Source: spec +- Primary owning slice: unmapped (future "M017 Web Dashboard Coverage") +- Supporting slices: none +- Validation: unmapped +- Notes: Web event stream already exists (`/api/session/events`); needs slice-aware aggregation layer. + +### R027 — Project Import/Export Contract +- Class: differentiator +- Status: active +- Description: `sf export <archive>` produces a portable archive (DB + key planning docs + skill catalog + git ref) that `sf import <archive>` can restore on a clean machine. Round-trip: same project state, same milestone progress, same self-feedback ledger. +- Why it matters: Without portable project state, SF projects are locked to a specific machine. Export/import enables team handoff, backup, and reproducible dogfood runs. +- Source: spec +- Primary owning slice: unmapped (future "M021 Project Portability") +- Supporting slices: none +- Validation: unmapped +- Notes: Must exclude transient state (locks, in-flight unit runtime, ephemeral debug logs). Inclusion list pinned in code, not derived heuristically. 
+ +### R028 — Multi-Repo Federation +- Class: differentiator +- Status: active +- Description: A single SF project can span multiple git repos with a federation contract: dispatch units cross-repo, share memories/decisions, coordinate via a parent project DB. Builds on ADR-019/020 (gRPC + internal-wire architecture). +- Why it matters: Real product work usually spans multiple repos (frontend + backend + infra). Today SF is single-repo only; multi-repo unlocks the full purpose-to-software promise. +- Source: spec +- Primary owning slice: unmapped (future "M022 Multi-Repo Federation") +- Supporting slices: none +- Validation: unmapped +- Notes: ADRs already pre-design the wire format. This requirement schedules the actual implementation. + +### R029 — Skill Marketplace / Discovery +- Class: differentiator +- Status: active +- Description: Skills (Claude Code skills format) are discoverable from a registry, installable via `sf skill install <name>`, and verified at install time. SF's own units can declare required skills declaratively in their manifest. +- Why it matters: Skill reuse is the natural extensibility surface (per ADR-016 Charm-AI stack adoption). Without a marketplace, skills stay siloed per project. +- Source: spec +- Primary owning slice: unmapped (future "M023 Skills Marketplace") +- Supporting slices: none +- Validation: unmapped +- Notes: Existing `skill-catalog.js` is the foundation. Needs remote-registry support + signature verification. + +### R030 — Privacy & Data Residency +- Class: constraint +- Status: active +- Description: SF must support pluggable LLM providers with per-project policy (e.g. EU-only providers, no third-party calls, on-prem only). Privacy-policy violations fail closed: dispatch refuses before sending the prompt. +- Why it matters: Enterprise adoption requires data-residency guarantees. Today the routing layer can fall back to providers that may violate policy; there's no explicit gate. 
+- Source: spec +- Primary owning slice: unmapped (future "M024 Privacy & Residency") +- Supporting slices: none +- Validation: unmapped +- Notes: Builds on `model-router.js` scoring. Add policy filter that runs BEFORE scoring eligible models. + +### R031 — ADR Enforcement in CI +- Class: quality-attribute +- Status: active +- Description: ADRs declare invariants (e.g. ADR-0000: every milestone has a vision; ADR-019: SF DB is single-writer; ADR-016: pi-tui stays in core until parity). CI runs an `sf adr check` that asserts every ADR's invariants hold against current code/state. Violations fail the build. +- Why it matters: ADRs decay without enforcement. We've already seen drift (M001-6377a4 shipped with empty vision despite ADR-0000). Enforcement closes the loop. +- Source: spec +- Primary owning slice: unmapped (future "M025 ADR Enforcement") +- Supporting slices: none +- Validation: unmapped +- Notes: Each ADR needs a verifier function. Doctor checks are the natural extension point. + +### R032 — Idempotent Retry Semantics +- Class: quality-attribute +- Status: active +- Description: Every unit dispatch is idempotent: re-dispatching a unit with the same milestone/slice/task id produces equivalent results without duplicating side-effects (e.g. doesn't create a second SUMMARY.md). Retries on transient failures (provider timeout, network) are safe. +- Why it matters: Without idempotency, retries can corrupt state — duplicated checkpoints, double-counted costs, etc. The current dispatcher relies on convention; this requirement makes it contractual + tested. +- Source: spec +- Primary owning slice: unmapped (cross-cutting; could extend M010/S04 or future M026) +- Supporting slices: none +- Validation: unmapped +- Notes: Dispatch needs a unit-completion fence: once a unit's primary artifact (e.g. SUMMARY.md) exists with the canonical content, retry becomes a no-op. 
+ +### R033 — Deterministic Plan Replayability +- Class: quality-attribute +- Status: active +- Description: Given the same inputs (REQUIREMENTS.md, PROJECT.md, code state) and the same model, plan-milestone must produce structurally equivalent slice plans across runs. Non-determinism is bounded to LLM stochasticity within accepted variance. +- Why it matters: Reproducibility is a precondition for dogfood debugging. Today running plan-milestone twice can produce wildly different slices, making it hard to isolate "did the plan change because of code or because of LLM noise." +- Source: spec +- Primary owning slice: unmapped (future "M027 Plan Determinism") +- Supporting slices: none +- Validation: unmapped +- Notes: Likely needs seed propagation to providers that support it + canonicalization of plan output. + +### R034 — Performance Budget per Unit Type +- Class: quality-attribute +- Status: active +- Description: Each unit type has a documented performance budget: max wall time, max LLM token spend, max tool-call count. Runtime emits violations as self-feedback. Operator dashboard shows budget vs. actual per unit. +- Why it matters: Performance regressions creep in without measurement (e.g. plan-milestone took 75s longer this week — why?). Budgets surface drift before it becomes pain. +- Source: spec +- Primary owning slice: unmapped (future "M028 Performance Budgets") +- Supporting slices: none +- Validation: unmapped +- Notes: Existing unit-runtime tracking has the raw timing data. This requirement adds the budget contract + alerting. + +### R035 — Operator Override / Steering +- Class: differentiator +- Status: active +- Description: Operator can steer an in-progress unit by sending a sideband message (`sf steer "focus on X first"`) that's injected into the next agent turn without aborting the unit. Steering is logged and surfaces in the unit's SUMMARY. 
+- Why it matters: Real autonomy isn't all-or-nothing — operators often want to nudge a unit without killing it. Steering is the natural surface for that. +- Source: spec +- Primary owning slice: unmapped (future "M029 Operator Steering") +- Supporting slices: none +- Validation: unmapped +- Notes: web/ already has a `/api/steer` route — the wire is partially there. Needs unit-side integration that respects the steering message in the next prompt-build cycle. + +## Purpose-Driver Requirements + +The next group enforces ADR-0000's contract: **purpose is the driver**, not work-output. Every dispatched unit, every emitted artifact, every spent dollar must trace back to a stated purpose — or refuse to ship. + +### R036 — Purpose Trace on Every Artifact +- Class: core-capability +- Status: active +- Description: Every SF-produced artifact (SUMMARY.md, ASSESSMENT.md, RESEARCH.md, CONTEXT.md, code commits authored by SF units) carries a `purposeAnchor` field that resolves to a specific milestone/slice vision. Doctor refuses artifacts without a purpose anchor; CI rejects SF-authored commits whose anchor doesn't trace to an active purpose. +- Why it matters: Without traceable anchors, artifacts accumulate noise — research notes for cancelled work, plans for slice scopes that drifted. ADR-0000 says SF is a *purpose-to-software compiler*; the corollary is that work without purpose is waste. +- Source: spec (anchors to ADR-0000) +- Primary owning slice: unmapped (future "M030 Purpose Trace Enforcement") +- Supporting slices: none +- Validation: unmapped +- Notes: Self-feedback entries already have a `purposeAnchor` field — this requirement generalizes the pattern across all artifact types. 
+ +### R037 — Purpose-Decay Detection in Long-Running Slices +- Class: failure-visibility +- Status: active +- Description: When a slice runs longer than its estimate OR consumes more than 2x its budget OR makes more than its expected tool-call count, SF must re-validate that the in-progress work still traces to the slice's purpose. If decay is detected (work is now off-purpose), the autonomous-solver pauses for operator review instead of pushing on. +- Why it matters: Long autonomous runs are particularly susceptible to scope drift — the LLM finds an interesting tangent, follows it, and ends up shipping something that wasn't the slice's stated promise. Purpose-decay detection catches this. +- Source: spec (anchors to ADR-0000) +- Primary owning slice: unmapped (future "M030 Purpose Trace Enforcement") +- Supporting slices: none +- Validation: unmapped +- Notes: Adversarial-review pattern — every N tool calls, evaluate "does this action still serve the stated purpose?" Bounded LLM eval, not per-tool-call. + +### R038 — Purpose-First Operator UI +- Class: differentiator +- Status: active +- Description: The web dashboard and `sf status` lead every view with the **purpose** of the current work (milestone vision, slice goal), not the technical activity (which file is being read). Purpose is the headline; activity is the supporting detail. Same applies to historical views — every completed milestone shows "Purpose served: ..." prominently. +- Why it matters: ADR-0000 frames SF as a purpose-driven compiler; if the operator UI surfaces activity above purpose, the framing leaks. Purpose-first surfaces keep the doctrine load-bearing. +- Source: spec (anchors to ADR-0000) +- Primary owning slice: unmapped (future "M017 Web Dashboard Coverage", connects to R022) +- Supporting slices: none +- Validation: unmapped +- Notes: Implementation hint: web/components needs a `PurposeBanner` component that loads from current milestone/slice/task context and renders prominently. 
+ +### R039 — Purpose-to-Evidence Audit Trail +- Class: quality-attribute +- Status: active +- Description: For each milestone, an audit report exists that maps every piece of evidence (test result, file change, ADR reference, slice SUMMARY) to a specific clause of the milestone's vision. Operator runs `sf audit M006` and sees: "Vision clause 1: covered by S01 evidence X, S02 evidence Y. Vision clause 4: PARTIAL — no evidence found." Validate-milestone uses this audit to score verdict. +- Why it matters: Currently milestone validation is a free-form LLM review. An explicit purpose-to-evidence map makes the validation deterministic and human-auditable — and gives the validate-milestone unit a clear contract. +- Source: spec (anchors to ADR-0000) +- Primary owning slice: unmapped (future "M030 Purpose Trace Enforcement") +- Supporting slices: none +- Validation: unmapped +- Notes: Vision text needs to be decomposed into discrete clauses before mapping. Manual annotation initially; LLM-assisted decomposition over time. + +### R040 — Purpose-Driven Cost Allocation +- Class: constraint +- Status: active +- Description: Every dollar of LLM spend is attributed to a specific purpose (milestone/slice/task vision). Cost reports group by purpose, not by unit type or model — answering "what did this $50 of spend serve?" rather than "how much went to validate-milestone?". Operators can set per-purpose budgets and see runaway-spend on a specific purpose before it compounds. +- Why it matters: Per-unit-type cost is the wrong lens. ADR-0000 frames work as purpose-driven; cost accounting should follow that lens or it disconnects spend from value. R025 (cost accountability) plus R040 (purpose attribution) together make budget visibility actionable. +- Source: spec (anchors to ADR-0000) +- Primary owning slice: unmapped (future "M020 Cost Accountability" + cross-cuts M030) +- Supporting slices: none +- Validation: unmapped +- Notes: Pairs with R025. 
The grouping dimension is what changes — same raw cost data, lens shifts from technical surface to purpose surface. + ## Traceability | ID | Class | Status | Primary owner | Supporting | Proof | @@ -183,15 +462,153 @@ This file is the explicit capability and coverage contract for the project. | R008 | core-capability | active | M005/S01 | M005/S02 | unmapped | | R009 | quality-attribute | active | M005/S01 | M005/S03 | unmapped | | R010 | quality-attribute | active | M005/S02 | none | unmapped | -| R011 | failure-visibility | active | unmapped | none | unmapped | -| R012 | differentiator | active | unmapped | none | unmapped | -| R013 | core-capability | active | unmapped | none | unmapped | -| R014 | core-capability | active | unmapped | none | unmapped | -| R015 | failure-visibility | active | unmapped | none | unmapped | +| R011 | failure-visibility | active | M011/S01 | none | unmapped | +| R012 | differentiator | active | M011/S02 | M011/S03, M011/S04 | unmapped | +| R013 | core-capability | active | M010/S02 | M010/S03, M010/S05 | unmapped | +| R014 | core-capability | active | M010/S01 | none | unmapped | +| R015 | failure-visibility | active | M010/S04 | none | unmapped | +| R016 | core-capability | active | M012/S01 | M012/S02, M012/S03 | unmapped | +| R017 | differentiator | active | M013/S02 | M013/S01, M013/S03, M013/S04 | unmapped | +| R018 | quality-attribute | active | M014/S01 | M014/S02, M014/S03, M014/S04 | unmapped | +| R019 | quality-attribute | active | M015/S01 | M015/S02, M015/S03, M015/S04, M015/S05 | unmapped | +| R020 | quality-attribute | active | M010/S06 | none | unmapped | +| R021 | quality-attribute | active | M016/S01 | M016/S02, M016/S03 | unmapped | +| R022 | failure-visibility | active | M017/S03 | M017/S01 | unmapped | +| R023 | quality-attribute | active | M018/S02 | M018/S01, M018/S03 | unmapped | +| R024 | quality-attribute | active | M019/S02 | M019/S01, M019/S03 | unmapped | +| R025 | constraint | active | M020/S02 | 
M020/S01, M020/S03, M020/S04 | unmapped | +| R026 | failure-visibility | active | M017/S02 | M017/S01 | unmapped | +| R027 | differentiator | active | M021/S01 | M021/S02 | unmapped | +| R028 | differentiator | active | M022/S03 | M022/S01, M022/S02, M022/S04 | unmapped | +| R029 | differentiator | active | M023/S02 | M023/S01, M023/S03 | unmapped | +| R030 | constraint | active | M024/S02 | M024/S01 | unmapped | +| R031 | quality-attribute | active | M025/S02 | M025/S01 | unmapped | +| R032 | quality-attribute | active | M026/S02 | M026/S01 | unmapped | +| R033 | quality-attribute | active | M027/S02 | M027/S01 | unmapped | +| R034 | quality-attribute | active | M028/S02 | M028/S01 | unmapped | +| R035 | differentiator | active | M029/S01 | M029/S02 | unmapped | +| R036 | core-capability | active | M030/S01 | none | unmapped | +| R037 | failure-visibility | active | M030/S02 | none | unmapped | +| R038 | differentiator | active | M030/S03 | M017/S02 | unmapped | +| R039 | quality-attribute | active | M030/S04 | none | unmapped | +| R040 | constraint | active | M030/S05 | M020/S04 | unmapped | +| R041 | differentiator | active | unmapped | none | unmapped | +| R042 | differentiator | active | unmapped | none | unmapped | +| R043 | quality-attribute | active | unmapped | none | unmapped | +| R044 | differentiator | active | unmapped | none | unmapped | +| R045 | differentiator | active | unmapped | none | unmapped | +| R046 | differentiator | active | unmapped | none | unmapped | +| R047 | quality-attribute | active | unmapped | none | unmapped | +| R048 | core-capability | active | unmapped | none | unmapped | ## Coverage Summary -- Active requirements: 15 -- Mapped to slices: 10 +- Active requirements: 48 +- Mapped to slices: **40** - Validated: 0 -- Unmapped active requirements: 5 (R011, R012 — self-heal extension; R013, R014, R015 — UNIFIED_DISPATCH_V2 inline scope, anchored to docs/plans/UNIFIED_DISPATCH_V2_PLAN.md) +- Unmapped active requirements: **8** 
(R041–R048) +- Owning milestones: M003 (R001-R006), M005 (R007-R010), M010 (R013-R015, R020), M011 (R011-R012), M012 (R016), M013 (R017), M014 (R018), M015 (R019), M016-M030 (R021-R040), [pending] M031-M035 (R041-R048 — bootstrap, parallel, validation, chain-integrity) + +## Purpose Anchor + +ADR-0000 declares SF a **purpose-to-software compiler**. R036–R040 codify that doctrine at the artifact level (every output carries a purpose trace). R048 codifies it as an **unbroken chain**: purpose → R → M → S → T → code, validated at every link. R041–R044 close the recursive loop: SF generates its own milestones from R's and grows the R-set from research, so the capability contract compounds with delivery. R047 walks the chain bottom-up; bootstrap walks it top-down. Together: purpose is not a doctrinal framing on top of work — it is the **organizing principle the entire system enforces, generates, and validates**. + +### R041 — SF Bootstrap From Requirements +- Class: differentiator +- Status: active +- Description: Given a REQUIREMENTS.md with active R-entries that aren't yet owned by any milestone, SF must autonomously generate appropriate milestones (with vision + slice scaffolding) to host them. The bootstrap unit reads unmapped R's, groups them by theme, drafts milestones with vision statements anchored to R-purposes, and emits them for operator review (with --auto-accept for fully unattended bootstrap). +- Why it matters: Closes the purpose-to-software loop end-to-end. Today an operator must hand-draft milestones (as done this session for M010-M030); R041 makes "write a requirement, get a milestone" the autonomous default. SF becomes self-bootstrapping from the capability contract. +- Source: spec (anchors to ADR-0000) +- Primary owning slice: unmapped (future "M031 R-to-Milestone Bootstrap") +- Supporting slices: none +- Validation: unmapped +- Notes: This is the recursive case of purpose-to-software: SF builds SF from the capability contract. 
Each new R-cluster becomes a milestone the autonomous loop can plan and deliver. + +### R042 — Deep Research Per Requirement Cluster +- Class: differentiator +- Status: active +- Description: When R041 generates milestones from R-entries, each cluster gets a deep research pass BEFORE the milestone is drafted: comparable-systems lookup (DeepWiki, Context7, web search), prior-art analysis, existing-codebase scan for partial implementations, related ADR/decisions review. Research output feeds the milestone vision + initial slice scaffolding. Target 8-12 web searches per cluster (deep mode) plus full codebase scout + relevant package docs. No half-researched milestone drafts. +- Why it matters: R041 without deep research produces shallow milestones that miss prior art, duplicate existing work, or under-specify slice scope. Deep research is the difference between SF as a Markov-chain planner vs. SF as a purpose-to-software compiler that genuinely understands the problem space. Builds on the research prompt tuning landed in M005 (target 8-12 web searches, mandatory Comparable Systems section). +- Source: spec (anchors to ADR-0000) +- Primary owning slice: unmapped (future "M031 R-to-Milestone Bootstrap") +- Supporting slices: none +- Validation: unmapped +- Notes: Reuses the existing research-milestone prompt template + scout swarm pattern. Adds an R-cluster preamble that lists unmapped R-entries + their themes. Output: a research artifact per cluster that the milestone-draft unit consumes. + +### R043 — Iterative Refinement Loop For Generated Milestones +- Class: quality-attribute +- Status: active +- Description: Generated milestones (from R041) enter a refinement loop: draft → adversarial review (architect, partner, combatant lenses) → revision → operator review checkpoint → finalize. Each iteration must improve on a defined quality metric (clarity, scope coherence, R-coverage completeness, evidence-of-research-depth). 
Refinement stops when quality threshold is met OR after N=3 iterations (operator escalation). +- Why it matters: First-draft milestones are usually wrong — scope creep, missing risks, under-specified DoD. Iterative refinement closes the gap to a milestone the autonomous loop can actually deliver. The 3-lens adversarial review pattern is borrowed from the existing roadmap-meeting unit. +- Source: spec +- Primary owning slice: unmapped (future "M031 R-to-Milestone Bootstrap") +- Supporting slices: none +- Validation: unmapped +- Notes: Quality metric thresholds: clarity (slice goals are 1-2 sentences each), scope coherence (no slice depends on out-of-milestone work), R-coverage (every cluster R has a slice owning it), research-depth (≥1 comparable system documented per cluster). + +### R044 — Auto-Promote Research Findings To Requirements +- Class: differentiator +- Status: active +- Description: When R042's deep research surfaces capability gaps, anti-patterns to avoid, or differentiator opportunities NOT yet in REQUIREMENTS.md, the research unit must auto-promote them to candidate R-entries. Candidates surface for operator review before becoming active. Closes the loop: research drives requirements, requirements drive milestones, milestones drive code. +- Why it matters: Today research findings are advisory — they inform the current slice but don't update the capability contract. R044 makes research actively grow the requirement set so SF's understanding of what the project should do compounds over time. +- Source: spec +- Primary owning slice: unmapped (future "M031 R-to-Milestone Bootstrap") +- Supporting slices: none +- Validation: unmapped +- Notes: Surface via `.sf/CANDIDATE-REQUIREMENTS.md` — operator promotes to active by moving entries into REQUIREMENTS.md. Doctor surfaces unreviewed candidates. 
+ + +### R045 — Active Requirement Expansion By SF +- Class: differentiator +- Status: active +- Description: Beyond R044's candidate-promotion, SF can directly write new R-entries to REQUIREMENTS.md when (a) the gap is structurally obvious (e.g. doctor detects an invariant violation no R covers), (b) the gap matches an ADR that doesn't yet have a measurable enforcement R, or (c) a self-feedback entry of kind `requirement:missing-coverage` has been triaged with outcome=add-r. Auto-written R-entries are flagged with `source: sf-auto` and require operator review within N days before being promoted to load-bearing. +- Why it matters: Requirements naturally evolve as the system reveals new failure modes (this session alone surfaced R016 through R045 from dogfood findings). Making R-expansion a first-class SF capability — not just an operator chore — keeps the capability contract live with reality. Pairs with R044 (research-driven candidates) and closes the loop where SF improves its own contract. +- Source: spec (anchors to ADR-0000) +- Primary owning slice: unmapped (future "M031 R-to-Milestone Bootstrap" or sibling "M032 R-Auto-Expansion") +- Supporting slices: none +- Validation: unmapped +- Notes: Operator-review gate prevents runaway R-bloat. The "source: sf-auto" tag is queryable so we can audit how much of the contract is operator-authored vs SF-suggested over time. + +### R046 — Autonomous Loop Parallel Dispatch +- Class: differentiator +- Status: active +- Description: The autonomous loop dispatches at most one unit at a time today. R046 enables N-way parallel dispatch for non-conflicting units: e.g. plan-slice on different slices of the same milestone can run concurrently; research-slice across multiple milestones can fan out. The existing parallel-orchestrator handles milestone-scope parallel work; R046 extends the principle into the autonomous loop's iteration loop with a file-conflict DAG that determines safe concurrency. 
+- Why it matters: Single-threaded autonomous dispatch is the dominant time cost (each iteration takes 5-6 min for real LLM units). 4-way parallelism cuts wall-time roughly 4x for milestones with independent slices. Compounds with the M010 inline-scope work because inline units are cheaper to fan out (no worktree spawn). +- Source: spec +- Primary owning slice: unmapped (future "M033 Autonomous Parallel Dispatch") +- Supporting slices: none +- Validation: unmapped +- Notes: Builds on existing parallel-orchestrator + slice-parallel-orchestrator. New layer: a scheduler that maps "ready slices" (deps met) to available concurrency slots, respecting the file-conflict DAG. + +### R047 — Per-Requirement Fulfillment Validation +- Class: quality-attribute +- Status: active +- Description: Beyond test-backed milestone completion (R024), introduce a separate validation pass that walks every active R-entry and asserts it is *actually* fulfilled by current code/tests/artifacts — not just that its owning milestone is marked complete. The pass produces a per-R verdict (covered / partial / missing) with concrete evidence (test name + pass, file:line, artifact path). `sf validate-requirements [--fix]` runs the pass; CI fails on missing-coverage for active R's. Symmetric validation for milestones: `sf validate-milestone <milestone-id>` walks the milestone's vision clauses and asserts each is backed by R-coverage + evidence (per R039). +- Why it matters: Marking a milestone "complete" in the DB is cheap; honestly fulfilling its requirements is hard. R047 closes the loop — every R has an evidence trail, every milestone's vision has R-coverage. Without this, "delivered" is operator assertion, not provable fact. The dogfood session this requirement was written in explicitly exposed cases where milestones were DB-complete but PROJECT.md showed unchecked slices (drift R018 catches; R047 catches missing R-fulfillment). 
+- Source: spec (responds to dogfood evidence 2026-05-17 — operator caught manual completion claims that lacked per-R evidence) +- Primary owning slice: unmapped (future "M034 Per-R Validation Pass") +- Supporting slices: none +- Validation: unmapped +- Notes: Builds on R024 (test-backed completion) and R039 (purpose-to-evidence audit). The validation contract per R-class: + - failure-visibility R's: there's a doctor check OR test that asserts the signal fires + - core-capability R's: there's a unit/integration test that exercises the capability + - quality-attribute R's: there's a regression test guarding the property + - differentiator R's: there's user-facing evidence (CLI cmd, web feature, etc.) + - constraint R's: there's a CI gate that fails violations + +### R048 — Unbroken Purpose Chain From R → M → S → T → Code +- Class: core-capability +- Status: active +- Description: Every requirement (R) declares its purpose anchor (which slice of ADR-0000's compiler contract it serves). Every milestone (M) declares which R's it owns + why those R's serve a coherent purpose cluster. Every slice (S) traces to specific R-acceptance-criteria within its milestone. Every task (T) traces to specific S success-criteria. Every code artifact traces to a task. Validation (R047) walks the chain bottom-up; bootstrap (R041) generates the chain top-down. Doctor refuses any link missing its anchor. +- Why it matters: This is the operational form of ADR-0000. "Purpose-to-software compiler" only holds if purpose propagates *as a verifiable property* through every layer. Without R048, purpose is documentation that decays; with R048, purpose is contract that's enforced at every dispatch and every artifact write. The chain is what makes SF *compositional*: you can read up from any commit to its task → slice → milestone → R → purpose, and read down from any purpose to the commits that fulfilled it. 
+- Source: spec (anchors directly to ADR-0000; supersedes R036–R040 as the integrating contract) +- Primary owning slice: unmapped (future "M030 Purpose-Driver Enforcement Family" gets extended to also enforce the chain integrity, or new "M035 Chain Integrity") +- Supporting slices: none +- Validation: unmapped +- Notes: + - **R-level**: every R-entry's `Purpose:` line names a specific ADR-0000 clause it serves + - **M-level**: every milestone vision opens with "Serves purpose-cluster: <cluster-name>. Owns: <R-ids>. The purpose these R's jointly serve is <purpose-statement>." + - **S-level**: every slice goal opens with "Advances R's acceptance criterion <criterion-id>: ..." + - **T-level**: every task plan opens with "Implements S's success-criterion via concrete artifact <artifact-path>." + - **Code-level**: every SF-authored commit message footer carries `purposeAnchor: <task-id>` (cross-cuts R036) + - **Doctor**: a new `purpose-chain-integrity` check walks the chain and refuses any link missing its anchor (cross-cuts R031 ADR enforcement) diff --git a/src/resources/extensions/sf/auto-prompts.js b/src/resources/extensions/sf/auto-prompts.js index 4e9176929..71e8121cf 100644 --- a/src/resources/extensions/sf/auto-prompts.js +++ b/src/resources/extensions/sf/auto-prompts.js @@ -2224,17 +2224,13 @@ export async function buildExecuteTaskPrompt( "Provide 2–4 options with concrete tradeoffs. The recommendation must reference one of the option ids. Autonomous mode accepts your recommendation, persists the choice + rationale as a memory, and carries it forward as a hard constraint for downstream tasks. The operator can review the audit trail later via `/escalate list --all`; the executed work itself can't be retroactively undone, so document your reasoning thoroughly. 
Set `continueWithDefault: false` only when the choice is severe enough that the loop should pause for human review even in autonomous mode (rare).", ].join("\n") : ""; - // Apply knowledge injection for this task context - const knowledgeInjection = await getKnowledgeInjection(base, { - domain: "task-execution", - taskType: "execute-task", - keywords: [tTitle, sTitle, mid, sid], - technology: [], - }); + // #M006 S02: knowledge and graph are now sourced entirely from composeUnitContext + // computed registry (included in inlinedTemplates). The manual getKnowledgeInjection + // call was removed — its result would duplicate the knowledge section already + // produced by the composer. const rawPrompt = loadPrompt("execute-task", { memoriesSection, - knowledgeInjection, overridesSection, runtimeContext, phaseAnchorSection, @@ -2396,37 +2392,20 @@ export async function buildCompleteSlicePrompt( } }; const { inline: composed } = await composeUnitContext("complete-slice", { + base, resolveArtifact, + computed: { + knowledge: { + build: async ({ kw }, b) => inlineKnowledgeBudgeted(b, kw), + inputs: { kw: [...extractKeywords(midTitle), ...extractKeywords(sTitle)] }, + }, + graph: { + build: async ({ sidTitle }, b) => + inlineGraphSubgraph(b, sidTitle, { budget: 3000 }), + inputs: { sidTitle: `${sid} ${sTitle}` }, + }, + }, }); - // Knowledge splices in between requirements and prior-task-summaries - // so overall order matches pre-migration: roadmap → slice-context → - // slice-plan → requirements → KNOWLEDGE → task summaries → templates. 
- const knowledgeInlineCS = await inlineKnowledgeBudgeted(base, [ - ...extractKeywords(midTitle), - ...extractKeywords(sTitle), - ]); - const graphBlockCS = await inlineGraphSubgraph(base, `${sid} ${sTitle}`, { - budget: 3000, - }); - let body = composed; - const graphAwareKnowledgeInline = [knowledgeInlineCS, graphBlockCS] - .filter((block) => Boolean(block)) - .join("\n\n---\n\n"); - if (graphAwareKnowledgeInline && body) { - // Splice knowledge right before the first "### Task Summary:" block - // to preserve pre-migration ordering. If no task summaries exist, - // append after requirements (before templates). - const taskIdx = body.indexOf("### Task Summary:"); - const templatesIdx = body.lastIndexOf("### Slice Summary"); - const spliceIdx = taskIdx > -1 ? taskIdx : templatesIdx; - if (spliceIdx > 0) { - const before = body.slice(0, spliceIdx).replace(/\n\n---\n\n$/, ""); - const after = body.slice(spliceIdx); - body = [before, graphAwareKnowledgeInline, after].join("\n\n---\n\n"); - } else { - body = `${body}\n\n---\n\n${graphAwareKnowledgeInline}`; - } - } // Overrides section prepends to the top of the inlined context — // standard pattern for slice-level builders (until composer v2 lands // the prepend contract). @@ -2435,8 +2414,8 @@ export async function buildCompleteSlicePrompt( completeActiveOverrides, ); const finalBody = completeOverridesInline - ? `${completeOverridesInline}\n\n---\n\n${body}` - : body; + ? `${completeOverridesInline}\n\n---\n\n${composed}` + : composed; const inlinedContext = capPreamble( `## Inlined Context (preloaded — do not re-read these files)\n\n${finalBody}`, ); @@ -3027,23 +3006,22 @@ export async function buildReassessRoadmapPrompt( } }; const { inline: composed } = await composeUnitContext("reassess-roadmap", { - resolveArtifact, - }); - const parts = []; - if (composed) parts.push(composed); - // Knowledge block stays outside the composer — budgeted, scoped via - // keyword extraction (#4719). Future phase folds it in. 
- const knowledgeInlineRA = await inlineKnowledgeBudgeted( base, - extractKeywords(midTitle), - ); - if (knowledgeInlineRA) parts.push(knowledgeInlineRA); - const graphBlockRA = await inlineGraphSubgraph(base, `${mid} ${midTitle}`, { - budget: 3000, + resolveArtifact, + computed: { + knowledge: { + build: async ({ kw }, b) => inlineKnowledgeBudgeted(b, kw), + inputs: { kw: extractKeywords(midTitle) }, + }, + graph: { + build: async ({ query }, b) => + inlineGraphSubgraph(b, query, { budget: 3000 }), + inputs: { query: `${mid} ${midTitle}` }, + }, + }, }); - if (graphBlockRA) parts.push(graphBlockRA); const inlinedContext = capPreamble( - `## Inlined Context (preloaded — do not re-read these files)\n\n${parts.join("\n\n---\n\n")}`, + `## Inlined Context (preloaded — do not re-read these files)\n\n${composed}`, ); const assessmentPath = join( base, @@ -3487,18 +3465,8 @@ export async function buildRewriteDocsPrompt( }, }, }); - const parts = []; - if (composed) parts.push(composed); - const knowledgeBlockRD = await inlineKnowledgeScoped(base, keywords); - if (knowledgeBlockRD) parts.push(knowledgeBlockRD); - const graphBlockRD = await inlineGraphSubgraph( - base, - `${sid} ${sTitle} ${midTitle}`, - { budget: 3000 }, - ); - if (graphBlockRD) parts.push(graphBlockRD); const inlinedContext = capPreamble( - `## Inlined Context (preloaded — do not re-read these files)\n\n${parts.join("\n\n---\n\n")}`, + `## Inlined Context (preloaded — do not re-read these files)\n\n${composed}`, ); return loadPrompt("rewrite-docs", { diff --git a/src/resources/extensions/sf/dispatch/dispatch-layer.js b/src/resources/extensions/sf/dispatch/dispatch-layer.js new file mode 100644 index 000000000..7a49677fd --- /dev/null +++ b/src/resources/extensions/sf/dispatch/dispatch-layer.js @@ -0,0 +1,217 @@ +/** + * dispatch-layer.js — Unified Dispatch v2 entry point (M010/S02). 
+ * + * Implements the 4-dimensional dispatch API from + * `docs/plans/UNIFIED_DISPATCH_V2_PLAN.md` (Qwen Plan, 2026-05-08). + * + * Public API per the plan: + * const layer = new DispatchLayer(basePath, options); + * const result = await layer.dispatch({ + * isolation, // 'full' | 'constrained' + * coordination,// 'standalone' | 'managed' + * scope, // 'milestone' | 'slice' | 'task' | 'inline' + * mode, // 'single' | 'parallel' | 'debate' | 'chain' + * unitType, + * unitId, + * ... unit-specific args + * }); + * + * Scope of this slice (M010/S02): + * - Define the DispatchLayer class with the documented signature. + * - Implement ONLY the `{full, managed, inline, single}` config — this + * is the "new headless autonomous in-process" cell per the parameter + * matrix (line 152). Delegates to runUnitInline (M010/S01). + * - All other matrix rows return a structured "not-implemented-yet" error + * pointing to which future slice will land them. + * + * Out of scope (deferred): + * - M010/S03 wires the autonomous-loop's swarmDispatchAndWait callsite + * to use this layer for inline-eligible units. + * - M010/S05 migrates validate-milestone + complete-milestone to use + * inline scope by default. + * - parallel / debate / chain modes (existing parallel-orchestrator + * keeps those paths until v2 lands the unified matrix). + * + * Refs: + * - docs/plans/UNIFIED_DISPATCH_V2_PLAN.md + * - .sf/REQUIREMENTS.md R013 (this slice's owning requirement) + * - src/resources/extensions/sf/dispatch/run-unit-inline.js (M010/S01) + */ + +import { debugLog } from "../debug-logger.js"; +import { runUnitInline } from "./run-unit-inline.js"; + +/** + * Dispatch options validator. Returns a structured error string on bad + * input, or null when ok. 
+ * + * @param {object} opts + * @returns {string|null} + */ +function validateDispatchOpts(opts) { + if (!opts || typeof opts !== "object") return "options must be an object"; + const { isolation, coordination, scope, mode, unitType, unitId } = opts; + const allowedIso = ["full", "constrained"]; + const allowedCoord = ["standalone", "managed"]; + const allowedScope = ["milestone", "slice", "task", "inline"]; + const allowedMode = ["single", "parallel", "debate", "chain"]; + if (!allowedIso.includes(isolation)) + return `isolation must be one of: ${allowedIso.join(", ")}`; + if (!allowedCoord.includes(coordination)) + return `coordination must be one of: ${allowedCoord.join(", ")}`; + if (!allowedScope.includes(scope)) + return `scope must be one of: ${allowedScope.join(", ")}`; + if (!allowedMode.includes(mode)) + return `mode must be one of: ${allowedMode.join(", ")}`; + if (!unitType || typeof unitType !== "string") return "unitType required"; + if (!unitId || typeof unitId !== "string") return "unitId required"; + return null; +} + +/** + * Encode a 4D config tuple into a stable key for routing. + * @returns {string} + */ +function configKey({ isolation, coordination, scope, mode }) { + return `${isolation}|${coordination}|${scope}|${mode}`; +} + +/** + * The single "implemented" cell in M010/S02's parameter matrix: + * full + managed + inline + single → the new headless-autonomous-in-process + * path that retires swarm-dispatch's silent-failure class. + */ +const IMPLEMENTED_CONFIG = configKey({ + isolation: "full", + coordination: "managed", + scope: "inline", + mode: "single", +}); + +/** + * Map of unimplemented configs → reason + future-slice owner. Used to + * produce structured "not implemented yet" errors instead of generic + * fallbacks, so operators see exactly which slice owns the gap. 
+ */ +const NOT_IMPLEMENTED_OWNERS = { + [configKey({ + isolation: "full", + coordination: "managed", + scope: "milestone", + mode: "parallel", + })]: "parallel-orchestrator.js (existing) — M010 does not replace this path", + [configKey({ + isolation: "full", + coordination: "managed", + scope: "slice", + mode: "parallel", + })]: "slice-parallel-orchestrator.js (existing) — M010 does not replace this path", + [configKey({ + isolation: "constrained", + coordination: "standalone", + scope: "inline", + mode: "single", + })]: "subagent tool (existing) — M010 does not replace this path", +}; + +export class DispatchLayer { + /** + * @param {string} basePath + * @param {object} [options] + */ + constructor(basePath, options = {}) { + if (!basePath || typeof basePath !== "string") { + throw new Error("DispatchLayer: basePath required"); + } + this._basePath = basePath; + this._options = options; + this._dispatchCount = 0; + debugLog("dispatch-layer", { + event: "constructed", + basePath, + }); + } + + /** + * Dispatch a single unit per the 4D config. + * + * @param {object} opts + * @returns {Promise} + */ + async dispatch(opts) { + const validationError = validateDispatchOpts(opts); + if (validationError) { + debugLog("dispatch-layer", { + event: "validation-failed", + error: validationError, + }); + return { + ok: false, + exitCode: 1, + stderr: `DispatchLayer.dispatch: ${validationError}`, + output: "", + }; + } + + const key = configKey(opts); + this._dispatchCount += 1; + debugLog("dispatch-layer", { + event: "dispatch", + dispatchN: this._dispatchCount, + configKey: key, + unitType: opts.unitType, + unitId: opts.unitId, + }); + + if (key === IMPLEMENTED_CONFIG) { + // The implemented cell: route to runUnitInline (M010/S01). + return await runUnitInline(opts.unitType, opts.unitId, { + basePath: this._basePath, + ...(opts.model ? { model: opts.model } : {}), + ...(opts.systemPrompt ? { systemPrompt: opts.systemPrompt } : {}), + ...(opts.timeoutMs ? 
{ timeoutMs: opts.timeoutMs } : {}), + ...(opts.noOutputTimeoutMs + ? { noOutputTimeoutMs: opts.noOutputTimeoutMs } + : {}), + ...(opts.signal ? { signal: opts.signal } : {}), + ...(opts.onEvent ? { onEvent: opts.onEvent } : {}), + ...(opts.extras ? { extras: opts.extras } : {}), + }); + } + + // Not implemented yet: produce a structured error that names the + // future-slice owner so operators (and downstream callers) can + // follow the trail. + const owner = + NOT_IMPLEMENTED_OWNERS[key] ?? + "future M010 slice or new milestone — see UNIFIED_DISPATCH_V2_PLAN.md parameter matrix"; + debugLog("dispatch-layer", { + event: "not-implemented", + configKey: key, + owner, + }); + return { + ok: false, + exitCode: 99, + stderr: `DispatchLayer.dispatch: config "${key}" is not implemented in M010/S02. Owner: ${owner}.`, + output: "", + }; + } + + /** + * Total dispatch attempts since construction. Useful for tests + audit. + * @returns {number} + */ + getDispatchCount() { + return this._dispatchCount; + } + + /** + * The configKey of the only implemented cell. Exposed for tests and + * for callers that want to check eligibility before dispatching. + * @returns {string} + */ + static implementedConfigKey() { + return IMPLEMENTED_CONFIG; + } +} diff --git a/src/resources/extensions/sf/dispatch/run-unit-inline.js b/src/resources/extensions/sf/dispatch/run-unit-inline.js new file mode 100644 index 000000000..08b6356ab --- /dev/null +++ b/src/resources/extensions/sf/dispatch/run-unit-inline.js @@ -0,0 +1,298 @@ +/** + * run-unit-inline.js — in-process unit execution (M010/S01). + * + * Purpose (per ADR-0000 + REQUIREMENTS.md R014): + * Extract the unit-execution code path that `sf headless` invokes after + * spawn into a callable function reachable from the same process. 
The + * autonomous loop can then dispatch units WITHOUT spawning a subprocess + * or worktree (`scope: 'inline'`), which retires the chronic + * prompt-never-sent / silent-worker-spawn-failure class of bugs that + * plague the swarm-dispatch path for milestone-completion units. + * + * Scope (this slice — M010/S01): + * - Define the public API: runUnitInline(unitType, unitId, options). + * - Provide a minimum-viable implementation that delegates to runSubagent + * in-process (no spawn, no worktree). + * - Map a curated set of unit types (validate-milestone, complete-milestone, + * reassess-roadmap) — these are the highest-leverage cases that don't + * need worktree isolation. + * - Document explicit invariants the inline path must respect (single-writer + * DB, structured-output requirement, persistent session JSONL). + * + * Out of scope (deferred to M010/S02 — dispatch-layer.js skeleton): + * - Full DispatchLayer class with 4D parameter matrix (isolation/coordination + * /scope/mode). This module provides the runner; the dispatch layer wires + * it into the autonomous loop's state machine. + * - Backward-compat shims for parallel-orchestrator (milestone/slice scope). + * + * Out of scope (deferred to M010/S03): + * - Routing the autonomous loop's `runUnitViaSwarm` callsite at + * run-unit.js:740 to use this module for inline-eligible units. + * + * Single-writer invariant: + * This module DOES NOT open its own SQLite connection and takes no `db` + * option. All DB access goes through the sf-db.js helpers (getMilestone, + * getSlice, guarded by isDbAvailable()), never a caller-supplied handle. + * + * Session JSONL persistence: + * This slice does NOT yet thread a persistent `SessionManager` into + * runSubagent (contrast `SessionManager.inMemory()` at subagent-runner.ts:150). + * Matching the spawned path's JSONL audit trail — critical for debuggability + * and for the R015 loud-failure layer — is left to future hardening. 
+ * + * Refs: + * - docs/plans/UNIFIED_DISPATCH_V2_PLAN.md (the Qwen Plan that + * specifies the broader v2 architecture) + * - .sf/REQUIREMENTS.md R013 (inline scope), R014 (this module), R015 + * (loud failure on spawn path) + */ + +import { runSubagent } from "@singularity-forge/coding-agent"; + +import { + buildCompleteMilestonePrompt, + buildReassessRoadmapPrompt, + buildValidateMilestonePrompt, +} from "../auto-prompts.js"; +import { debugLog } from "../debug-logger.js"; +import { + getMilestone, + getSlice, + isDbAvailable, +} from "../sf-db.js"; + +/** + * Unit types this module supports for inline dispatch. + * + * Curated to milestone-completion + light planning units — these are the + * cases where worktree isolation is unnecessary and the silent-failure bug + * has been most painful. Other unit types continue to use the existing + * swarm/parallel paths until M010/S03 wires more types through. + */ +export const INLINE_ELIGIBLE_UNITS = new Set([ + "validate-milestone", + "complete-milestone", + "reassess-roadmap", +]); + +/** + * Check whether a unit type is eligible for inline dispatch. + * + * @param {string} unitType + * @returns {boolean} + */ +export function isInlineEligible(unitType) { + return INLINE_ELIGIBLE_UNITS.has(unitType); +} + +/** + * Build the prompt for a given unit type, dispatching to the existing + * builders in auto-prompts.js. Throws if the unit type is not yet wired + * for inline dispatch — the caller should pre-check via isInlineEligible. + * + * @param {string} unitType + * @param {string} unitId — milestone id for milestone-scope units, or + * `${milestoneId}/${sliceId}` for slice-scope units + * @param {string} basePath + * @param {object} extras — unit-type-specific extra args (e.g. 
level for + * complete-milestone, completedSliceId for reassess-roadmap) + * @returns {Promise} + */ +async function buildPromptForUnit(unitType, unitId, basePath, extras = {}) { + if (unitType === "validate-milestone") { + const mid = unitId; + const milestone = isDbAvailable() ? getMilestone(mid) : null; + const midTitle = milestone?.title ?? mid; + return await buildValidateMilestonePrompt( + mid, + midTitle, + basePath, + extras.level, + ); + } + if (unitType === "complete-milestone") { + const mid = unitId; + const milestone = isDbAvailable() ? getMilestone(mid) : null; + const midTitle = milestone?.title ?? mid; + return await buildCompleteMilestonePrompt( + mid, + midTitle, + basePath, + extras.level, + ); + } + if (unitType === "reassess-roadmap") { + // reassess-roadmap takes ${milestoneId}/${completedSliceId} + const [mid, completedSliceId] = unitId.split("/"); + const milestone = isDbAvailable() ? getMilestone(mid) : null; + const midTitle = milestone?.title ?? mid; + const slice = + isDbAvailable() && completedSliceId + ? getSlice(mid, completedSliceId) + : null; + const completedSliceTitle = slice?.title ?? completedSliceId ?? ""; + return await buildReassessRoadmapPrompt( + mid, + midTitle, + completedSliceId, + completedSliceTitle, + basePath, + ); + } + throw new Error( + `runUnitInline: prompt builder not wired for unit type "${unitType}". ` + + `Add a branch in buildPromptForUnit or use the spawn path for this unit type.`, + ); +} + +/** + * Run a unit in-process. No subprocess spawn, no worktree. 
+ * + * Public API (R014 contract): + * const result = await runUnitInline(unitType, unitId, { + * basePath, // required: project root + * model, // optional: provider/model override + * systemPrompt, // optional: system prompt override + * timeoutMs, // optional: hard cap (default 480_000 = 8min) + * noOutputTimeoutMs, // optional: silent-LLM cap (default 180_000 = 3min) + * signal, // optional: AbortSignal + * onEvent, // optional: callback for agent-session events + * extras, // optional: unit-type-specific args (level, etc.) + * }); + * // result: { ok: boolean, output: string, exitCode: number, stderr?: string } + * + * On success, `result.ok === true` and `result.output` contains the agent's + * final assistant message text. On failure, `result.ok === false` with + * either an error code or stderr describing why. + * + * Single-writer DB invariant: + * This function does NOT open its own .sf/sf.db connection. All DB + * access must go through the shared connection via sf-db.js helpers. + * + * Session JSONL invariant: + * The underlying runSubagent uses SessionManager — but unlike + * subagent-runner.ts:150's `SessionManager.inMemory()` for swarm + * workers, the inline path leaves persistence in the caller's hands. + * Callers wanting a JSONL audit trail must thread a persistent + * SessionManager via runSubagent's session config (future hardening). + * + * @param {string} unitType + * @param {string} unitId + * @param {object} options + * @returns {Promise<{ok: boolean, output: string, exitCode: number, stderr?: string}>} + */ +export async function runUnitInline(unitType, unitId, options = {}) { + const { + basePath = process.cwd(), + model, + systemPrompt, + timeoutMs = 480_000, + noOutputTimeoutMs = 180_000, + signal, + onEvent, + extras = {}, + } = options; + + debugLog("run-unit-inline", { + event: "enter", + unitType, + unitId, + basePath, + model: model ?? 
null, + }); + + if (!isInlineEligible(unitType)) { + debugLog("run-unit-inline", { + event: "ineligible", + unitType, + unitId, + }); + return { + ok: false, + output: "", + exitCode: 2, + stderr: `runUnitInline: unit type "${unitType}" is not yet eligible for inline dispatch. Eligible: ${[...INLINE_ELIGIBLE_UNITS].join(", ")}.`, + }; + } + + let prompt; + try { + prompt = await buildPromptForUnit(unitType, unitId, basePath, extras); + debugLog("run-unit-inline", { + event: "prompt-built", + unitType, + unitId, + promptLength: prompt.length, + }); + } catch (err) { + const msg = err instanceof Error ? err.message : String(err); + debugLog("run-unit-inline", { + event: "prompt-build-failed", + unitType, + unitId, + error: msg, + }); + return { + ok: false, + output: "", + exitCode: 3, + stderr: `runUnitInline: prompt build failed: ${msg}`, + }; + } + + const subagentOpts = { + timeoutMs, + noOutputTimeoutMs, + ...(signal ? { signal } : {}), + ...(onEvent ? { onEvent } : {}), + }; + + const config = { + systemPrompt: + systemPrompt ?? + `You are an SF unit executor running in inline scope. Unit: ${unitType} ${unitId}. Follow the prompt below carefully and produce the required outputs via tool calls. When done, signal completion via the checkpoint or equivalent tool.`, + cwd: basePath, + name: `inline-${unitType}-${unitId.replace(/\//g, "-")}`, + ...(model ? { model } : {}), + }; + + debugLog("run-unit-inline", { + event: "subagent-dispatch", + unitType, + unitId, + configName: config.name, + }); + + const startedAt = Date.now(); + let result; + try { + result = await runSubagent(config, prompt, subagentOpts); + } catch (err) { + const msg = err instanceof Error ? 
err.message : String(err); + debugLog("run-unit-inline", { + event: "subagent-throw", + unitType, + unitId, + error: msg, + elapsedMs: Date.now() - startedAt, + }); + return { + ok: false, + output: "", + exitCode: 4, + stderr: `runUnitInline: subagent threw: ${msg}`, + }; + } + + debugLog("run-unit-inline", { + event: "subagent-completed", + unitType, + unitId, + ok: result.ok, + exitCode: result.exitCode, + outputLength: (result.output ?? "").length, + elapsedMs: Date.now() - startedAt, + }); + + return result; +} diff --git a/src/resources/extensions/sf/state-db.js b/src/resources/extensions/sf/state-db.js index 4a63275a6..bbc4b6a42 100644 --- a/src/resources/extensions/sf/state-db.js +++ b/src/resources/extensions/sf/state-db.js @@ -157,6 +157,19 @@ async function buildRegistryAndFindActive( }); continue; } + // #sf-mp8aotmq-jxby91: cancelled milestones are user-dropped, never to be + // picked as activeMilestone. isClosedStatus intentionally excludes + // "cancelled" (status-guards.js comment) because cancelled !== complete, + // but for active-selection both must be skipped. Without this branch the + // dispatcher routes plan-milestone at cancelled stubs. + if (m.status === "cancelled") { + registry.push({ + id: m.id, + title: stripMilestonePrefix(m.title) || m.id, + status: "cancelled", + }); + continue; + } const slices = getMilestoneSlices(m.id); if ( slices.length === 0 && diff --git a/src/resources/extensions/sf/tests/dispatch-layer.test.mjs b/src/resources/extensions/sf/tests/dispatch-layer.test.mjs new file mode 100644 index 000000000..f15a94ad4 --- /dev/null +++ b/src/resources/extensions/sf/tests/dispatch-layer.test.mjs @@ -0,0 +1,128 @@ +/** + * M010/S02 test — DispatchLayer 4D API contract. 
+ * + * Pins the documented API surface: + * - Constructor requires basePath (string) + * - dispatch(opts) validates the 4D config + unitType/unitId + * - The single implemented cell (full/managed/inline/single) routes through + * runUnitInline; ineligible unit types within that cell surface + * runUnitInline's structured rejection. + * - All other cells return ok:false with exitCode:99 and a named owner. + * - implementedConfigKey() is exposed for callers. + */ +import { describe, expect, it } from "vitest"; + +import { DispatchLayer } from "../dispatch/dispatch-layer.js"; + +const BASE = "/tmp/sf-dispatch-test"; + +describe("M010/S02 — DispatchLayer 4D API", () => { + it("constructor requires basePath", () => { + expect(() => new DispatchLayer()).toThrow(/basePath required/); + expect(() => new DispatchLayer(123)).toThrow(/basePath required/); + expect(() => new DispatchLayer("")).toThrow(/basePath required/); + }); + + it("constructor succeeds with a string basePath", () => { + const layer = new DispatchLayer(BASE); + expect(layer.getDispatchCount()).toBe(0); + }); + + it("dispatch rejects bad isolation/coordination/scope/mode", async () => { + const layer = new DispatchLayer(BASE); + const bad = [ + { isolation: "bogus", coordination: "managed", scope: "inline", mode: "single", unitType: "validate-milestone", unitId: "M006" }, + { isolation: "full", coordination: "bogus", scope: "inline", mode: "single", unitType: "validate-milestone", unitId: "M006" }, + { isolation: "full", coordination: "managed", scope: "bogus", mode: "single", unitType: "validate-milestone", unitId: "M006" }, + { isolation: "full", coordination: "managed", scope: "inline", mode: "bogus", unitType: "validate-milestone", unitId: "M006" }, + ]; + for (const opts of bad) { + const r = await layer.dispatch(opts); + expect(r.ok).toBe(false); + expect(r.exitCode).toBe(1); + expect(r.stderr).toMatch(/must be one of/); + } + }); + + it("dispatch rejects missing unitType/unitId", async () => { + 
const layer = new DispatchLayer(BASE); + const a = await layer.dispatch({ + isolation: "full", + coordination: "managed", + scope: "inline", + mode: "single", + }); + expect(a.ok).toBe(false); + expect(a.stderr).toMatch(/unitType required/); + const b = await layer.dispatch({ + isolation: "full", + coordination: "managed", + scope: "inline", + mode: "single", + unitType: "validate-milestone", + }); + expect(b.ok).toBe(false); + expect(b.stderr).toMatch(/unitId required/); + }); + + it("dispatch with implemented config routes to runUnitInline (and surfaces its ineligibility error)", async () => { + const layer = new DispatchLayer(BASE); + // Use an ineligible unit type to keep the test offline (runUnitInline rejects without calling LLM). + const r = await layer.dispatch({ + isolation: "full", + coordination: "managed", + scope: "inline", + mode: "single", + unitType: "execute-task", + unitId: "M010/S02/T01", + }); + expect(r.ok).toBe(false); + // runUnitInline returns exitCode=2 for ineligible unit types — this proves + // we're routing through it (the layer's own exitCode for not-implemented is 99). 
+ expect(r.exitCode).toBe(2); + expect(r.stderr).toMatch(/not yet eligible for inline dispatch/i); + expect(layer.getDispatchCount()).toBe(1); + }); + + it("dispatch with non-implemented config returns structured not-implemented error", async () => { + const layer = new DispatchLayer(BASE); + const r = await layer.dispatch({ + isolation: "full", + coordination: "managed", + scope: "milestone", + mode: "parallel", + unitType: "anything", + unitId: "M001", + }); + expect(r.ok).toBe(false); + expect(r.exitCode).toBe(99); + expect(r.stderr).toMatch(/not implemented/i); + expect(r.stderr).toMatch(/parallel-orchestrator/i); + }); + + it("implementedConfigKey is exposed", () => { + const key = DispatchLayer.implementedConfigKey(); + expect(key).toBe("full|managed|inline|single"); + }); + + it("getDispatchCount increments per call", async () => { + const layer = new DispatchLayer(BASE); + await layer.dispatch({ + isolation: "full", + coordination: "managed", + scope: "inline", + mode: "single", + unitType: "execute-task", + unitId: "M010/S02/T01", + }); + await layer.dispatch({ + isolation: "constrained", + coordination: "standalone", + scope: "inline", + mode: "single", + unitType: "execute-task", + unitId: "M010/S02/T01", + }); + expect(layer.getDispatchCount()).toBe(2); + }); +}); diff --git a/src/resources/extensions/sf/tests/m006-s02-manifest-drift.test.mjs b/src/resources/extensions/sf/tests/m006-s02-manifest-drift.test.mjs new file mode 100644 index 000000000..d403c0774 --- /dev/null +++ b/src/resources/extensions/sf/tests/m006-s02-manifest-drift.test.mjs @@ -0,0 +1,108 @@ +/** + * M006/S02 regression test — manifest-driven composition drift guard. + * + * Must-Have #3 from M006/S02 plan: + * "Regression test that (a) confirms manifest declares computed for all + * knowledge-relevant unit types and (b) checks auto-prompts.js has no + * direct inlineKnowledgeBudgeted/inlineGraphSubgraph calls outside + * composeUnitContext." 
+ * + * This test asserts the M005/M006 migration invariants stay green: + * 1. No builder function pushes knowledge/graph blocks to its `parts` array + * via a top-level `await inlineKnowledgeBudgeted` / `inlineKnowledgeScoped` + * / `inlineGraphSubgraph` call. Those calls must only happen INSIDE the + * composeUnitContext `computed: { ... }` registry, where the composer + * controls ordering and prevents duplication. + * 2. Unit type manifests that previously declared knowledge/graph as + * computed continue to declare them — guards against accidental removal. + */ +import { readFileSync } from "node:fs"; +import { describe, expect, it } from "vitest"; + +const AUTO_PROMPTS = "src/resources/extensions/sf/auto-prompts.js"; +const MANIFEST = "src/resources/extensions/sf/unit-context-manifest.js"; + +describe("M006/S02 — manifest-driven composition drift", () => { + it("no top-level manual knowledge/graph inline calls outside composer", () => { + const src = readFileSync(AUTO_PROMPTS, "utf8"); + // Match any line where the result of inlineKnowledge* / inlineGraphSubgraph + // is assigned to a top-level builder variable (not inside a `build:` arrow + // passed to composeUnitContext.computed). The pattern matches: + // `\tconst = await inline(Knowledge|Graph)...` + // at builder-function indentation. Calls inside the composer's `build:` arrow + // are at deeper indentation (multiple tabs) AND inside `computed: { ... }`. + const violatingPattern = + /^\tconst\s+\w+\s*=\s*await\s+inline(Knowledge(Budgeted|Scoped)|GraphSubgraph)\(/gm; + const matches = [...src.matchAll(violatingPattern)]; + if (matches.length > 0) { + const lines = matches.map((m) => { + const upTo = src.slice(0, m.index); + const lineNum = upTo.split("\n").length; + return `${AUTO_PROMPTS}:${lineNum}: ${m[0]}`; + }); + throw new Error( + `Found ${matches.length} manual knowledge/graph inline call(s) outside the composer. 
` + + `These violate M005/M006 invariant — knowledge/graph must be declared as computed ` + + `artifacts in the manifest and resolved inside composeUnitContext.computed. ` + + `Sites:\n ${lines.join("\n ")}`, + ); + } + expect(matches.length).toBe(0); + }); + + it("manifest declares computed knowledge/graph for builders that need them", () => { + const src = readFileSync(MANIFEST, "utf8"); + // Locate unit-type entries that should carry knowledge/graph computed + // (i.e. they previously had manual injection in their builder). This + // list is the union of M004/M005/M006 migration targets — adding a + // new builder with knowledge/graph requires updating this list AND + // declaring computed in the manifest. + const KNOWLEDGE_AWARE_UNIT_TYPES = [ + "discuss-project", + "discuss-requirements", + "research-project", + "research-milestone", + "discuss-milestone", + "plan-milestone", + "plan-slice", + "execute-task", + "complete-slice", + "complete-milestone", + "validate-milestone", + "reassess-roadmap", + ]; + // replan-slice intentionally omitted: it re-plans from slice-context + + // slice-plan + blocker-summaries and has no knowledge/graph dependency + // (manifest declares computed: []; builder makes no manual inline calls). + const missing = []; + for (const unitType of KNOWLEDGE_AWARE_UNIT_TYPES) { + // Find the manifest section for this unit type. + const sectionRe = new RegExp( + `"${unitType}":\\s*\\{[\\s\\S]*?maxSystemPromptChars`, + ); + const sectionMatch = src.match(sectionRe); + if (!sectionMatch) { + missing.push(`${unitType}: manifest entry not found`); + continue; + } + const section = sectionMatch[0]; + // Either declared as inline knowledge (M006 research-milestone style) + // OR computed (most other builders). 
+ const hasInlineKnowledge = /inline:\s*\[[^\]]*"knowledge"/.test(section); + const hasComputedKnowledge = + /computed:\s*\[[^\]]*"knowledge"/.test(section) || + /computed:\s*\[[^\]]*"graph"/.test(section); + if (!hasInlineKnowledge && !hasComputedKnowledge) { + missing.push( + `${unitType}: manifest does not declare "knowledge" or "graph" (inline or computed)`, + ); + } + } + if (missing.length > 0) { + throw new Error( + `Manifest drift detected — knowledge-aware unit types missing computed declarations:\n ${missing.join("\n ")}`, + ); + } + expect(missing.length).toBe(0); + }); +}); diff --git a/src/resources/extensions/sf/tests/run-unit-inline.test.mjs b/src/resources/extensions/sf/tests/run-unit-inline.test.mjs new file mode 100644 index 000000000..ad72c92f6 --- /dev/null +++ b/src/resources/extensions/sf/tests/run-unit-inline.test.mjs @@ -0,0 +1,78 @@ +/** + * M010/S01 test — runUnitInline scaffold contract. + * + * Pins the public API surface: + * - INLINE_ELIGIBLE_UNITS set is the source of truth + * - isInlineEligible() reflects that set + * - runUnitInline() rejects ineligible unit types with exitCode=2 + structured stderr + * - runUnitInline() rejects prompt-build failures (unmapped unit type slipped past the eligibility check) with exitCode=3 + * + * Does NOT exercise the runSubagent path — that requires a real LLM session + * and is covered by M010/S03 integration tests. This test pins the + * non-LLM surface of the scaffold so the architecture stays stable while + * S02/S03 land. + */ +import { describe, expect, it } from "vitest"; + +import { + INLINE_ELIGIBLE_UNITS, + isInlineEligible, + runUnitInline, +} from "../dispatch/run-unit-inline.js"; + +describe("M010/S01 — runUnitInline scaffold", () => { + it("INLINE_ELIGIBLE_UNITS is the source of truth", () => { + expect(INLINE_ELIGIBLE_UNITS).toBeInstanceOf(Set); + // Pin the curated initial set per S01 scope. 
+ const expected = ["validate-milestone", "complete-milestone", "reassess-roadmap"]; + expect([...INLINE_ELIGIBLE_UNITS].sort()).toEqual([...expected].sort()); + }); + + it("isInlineEligible reflects the set", () => { + for (const unitType of INLINE_ELIGIBLE_UNITS) { + expect(isInlineEligible(unitType)).toBe(true); + } + // Negative cases: any unit type not in the set returns false. + expect(isInlineEligible("execute-task")).toBe(false); + expect(isInlineEligible("plan-slice")).toBe(false); + expect(isInlineEligible("discuss-milestone")).toBe(false); + expect(isInlineEligible("research-slice")).toBe(false); + expect(isInlineEligible("")).toBe(false); + }); + + it("runUnitInline rejects ineligible unit types loudly (exitCode 2)", async () => { + const result = await runUnitInline("execute-task", "M999/S01/T01", { + basePath: "/tmp/sf-inline-test-doesnt-need-to-exist", + }); + expect(result.ok).toBe(false); + expect(result.exitCode).toBe(2); + expect(result.stderr).toBeTruthy(); + expect(result.stderr).toMatch(/not yet eligible for inline dispatch/i); + expect(result.stderr).toContain("validate-milestone"); + expect(result.stderr).toContain("complete-milestone"); + expect(result.stderr).toContain("reassess-roadmap"); + }); + + it("runUnitInline rejects empty unit type", async () => { + const result = await runUnitInline("", "M001", { + basePath: "/tmp", + }); + expect(result.ok).toBe(false); + expect(result.exitCode).toBe(2); + }); + + it("API surface: runUnitInline returns a structured result shape", async () => { + const result = await runUnitInline("execute-task", "M999", { + basePath: "/tmp", + }); + // Even on failure, the result must have the four documented fields. + expect(result).toHaveProperty("ok"); + expect(result).toHaveProperty("output"); + expect(result).toHaveProperty("exitCode"); + // stderr is optional on success but present on failure. 
+ expect(result).toHaveProperty("stderr"); + expect(typeof result.ok).toBe("boolean"); + expect(typeof result.output).toBe("string"); + expect(typeof result.exitCode).toBe("number"); + }); +}); diff --git a/src/resources/extensions/sf/unit-context-manifest.js b/src/resources/extensions/sf/unit-context-manifest.js index 455b455db..1f554f95d 100644 --- a/src/resources/extensions/sf/unit-context-manifest.js +++ b/src/resources/extensions/sf/unit-context-manifest.js @@ -420,10 +420,8 @@ export const UNIT_MANIFESTS = { preferences: "active-only", tools: TOOLS_PLANNING, artifacts: { - // Phase 3 migration (#4782): matches today's actual - // buildCompleteSlicePrompt inlining order. Overrides prepend + - // knowledge splice stay in the builder imperatively (see RFC - // #4924 — computed/prepend blocks are phase-4 composer work). + // #4782 phase 3: knowledge/graph migrated to computed registry. + // Overrides prepend stays in the builder imperatively (RFC #4924). inline: [ "roadmap", "slice-context", @@ -434,6 +432,7 @@ export const UNIT_MANIFESTS = { ], excerpt: [], onDemand: [], + computed: ["knowledge", "graph"], }, maxSystemPromptChars: COMMON_BUDGET_LARGE, }, @@ -445,9 +444,9 @@ export const UNIT_MANIFESTS = { preferences: "none", tools: TOOLS_PLANNING, artifacts: { - // Phase 2 pilot (#4782): manifest now matches today's actual - // buildReassessRoadmapPrompt behavior for equivalence. Phase 3 - // will tighten this list once the composer reports real telemetry. + // #M006 S02 (AD04): knowledge and graph declared as computed + // artifacts (migrated from manual fetch outside composer). + // Phase 2 pilot comment retained — manifest now reflects actual builder. 
inline: [ "roadmap", "slice-context", @@ -458,6 +457,7 @@ export const UNIT_MANIFESTS = { ], excerpt: [], onDemand: [], + computed: ["knowledge", "graph"], }, maxSystemPromptChars: COMMON_BUDGET_MEDIUM, }, diff --git a/src/resources/extensions/sf/uok/swarm-dispatch.js b/src/resources/extensions/sf/uok/swarm-dispatch.js index b6a7f9fd4..15c41786c 100644 --- a/src/resources/extensions/sf/uok/swarm-dispatch.js +++ b/src/resources/extensions/sf/uok/swarm-dispatch.js @@ -107,6 +107,12 @@ async function runAgentTurnWithOuterWatchdogs(runAgentTurn, agent, opts = {}) { } armNoOutputTimer(); + debugLog("swarm-dispatch", { + phase: "watchdog-about-to-call-runAgentTurn", + runAgentTurnIsFunction: typeof runAgentTurn === "function", + agentName: agent?.identity?.name, + hasOnlyMessageId: !!opts.onlyMessageId, + }); const turnResult = runAgentTurn(agent, { ...opts, signal: controller.signal,