From 617608347dc444bf7ebb656e758abe7fffe323e1 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Sat, 2 May 2026 17:25:53 +0200 Subject: [PATCH] fix(sf): align auto-mode prompts to canonical sf_task_complete / sf_slice_complete MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Auto-mode prompts called legacy aliases (sf_complete_task, sf_complete_slice) while guided used canonical (sf_task_complete, sf_slice_complete). The divergence was locked in by the test 'auto execute-task requires legacy completion alias until prompt contract is aligned' — explicit tech debt marker. Migrated: - workflow-mcp.ts getRequiredWorkflowToolsForAutoUnit: returns canonical - prompts/execute-task.md: 4 callsites - prompts/complete-slice.md: 3 callsites - prompts/reactive-execute.md: any (none on this file) - workflow-mcp.test.ts: assertion + transport-error fixtures - Test rename: 'requires legacy completion alias' → 'requires canonical' The aliases stay registered (sf_complete_task → sf_task_complete) so external callers and old session resumes don't break. Tool-naming.test.ts still asserts both names route to the same handler. Resolves: sf-moohqbza-yyq8sd. Tests: workflow-mcp + tool-naming 29/29 pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- TODO.md | 100 +++++++++++++++++- .../extensions/sf/prompts/complete-slice.md | 6 +- .../extensions/sf/prompts/execute-task.md | 8 +- .../extensions/sf/tests/workflow-mcp.test.ts | 8 +- src/resources/extensions/sf/workflow-mcp.ts | 4 +- 5 files changed, 112 insertions(+), 14 deletions(-) diff --git a/TODO.md b/TODO.md index dbb8f9678..4015911a8 100644 --- a/TODO.md +++ b/TODO.md @@ -1 +1,99 @@ -# Raw Dump Inbox\n\n## Eval Candidates\n\n1. Test note for CI mode verification +# Raw Dump Inbox + +## Eval Candidates + +1. Test note for CI mode verification + +--- + +## SF Hardening Backlog From Claude Code Scan + +### Goal + +Make SF auto-mode impossible to loop from broken docs, missing artifacts, stale runtime state, or ambiguous background-unit completion. `ROADMAP.md` must become a rendered human artifact, not executable dispatch state. + +### Track 1 - Canonical State And No-Doc-Loop Dispatch + +- [ ] Add `getCanonicalMilestonePlan(basePath, milestoneId)` as the only dispatch-facing milestone plan accessor. +- [ ] Prefer DB slice rows when DB is available and populated. +- [ ] Add `.sf/milestones/Mxxx/Mxxx-ROADMAP.json` as the structured fallback projection. +- [ ] Treat `ROADMAP.md` as rendered display only. +- [ ] Keep Markdown parsing only for import, migration, doctor repair, and parser tests. +- [ ] Move parallel research, single-slice research, prior-slice guard, UAT/validation dispatch, and prompt slice enumeration to canonical plan state. +- [ ] Add generated marker/hash metadata to `ROADMAP.md`. +- [ ] Stop dispatch when DB/projection/Markdown disagree. +- [ ] Add `/sf doctor --fix` support to re-render generated roadmap artifacts from canonical state. + +### Track 2 - Unit Runtime FSM + +- [ ] Introduce durable unit runtime state under `.sf/runtime/units/*.json`. +- [ ] Model unit states: `queued`, `claimed`, `running`, `progress`, `completed`, `failed`, `blocked`, `cancelled`, `stale`, `runaway-recovered`, `notified`. +- [ ] Persist `retryCount`, `maxRetries`, `lastHeartbeatAt`, `lastProgressAt`, `lastOutputAt`, `outputPath`, `watchdogReason`, and `notifiedAt`. +- [ ] Prevent redispatch for terminal units with `notifiedAt`. +- [ ] Allow retry only when status is retryable and retry budget remains. +- [ ] Require explicit reset to rerun failed synthetic units like `parallel-research`. + +### Track 3 - Progress And Liveness + +- [ ] Separate heartbeat, progress, and output growth. +- [ ] Treat silent-but-running as valid only when heartbeat is fresh. +- [ ] Add watchdog classifiers: dead PID, expired lease, no heartbeat, no output growth, permission prompt, interactive prompt, runaway recovery. +- [ ] Extend `sf headless --output-format json query` with active unit, status, elapsed time, retry count, watchdog reason, last progress time, and output path. +- [ ] Later: render TUI/footer status rows from the same runtime model. + +### Track 4 - Event And Interrupt Policy + +- [ ] Add explicit event origins: `user-message`, `system-steer`, `task-notification`, `memory-event`, `background-completion`, `permission-request`. +- [ ] Add interrupt behaviors: `interrupt`, `queue`, `block`, `drop-if-stale`. +- [ ] Default user-origin messages to interrupt active work. +- [ ] Default system/task/memory events to non-interrupting unless explicitly marked. +- [ ] Scope queues so main loop does not consume subagent-directed events and subagents do not consume main user prompts. +- [ ] Ensure background completions enqueue once and set `notifiedAt`. + +### Track 5 - Tool, Plugin, And Permission Boundaries + +- [ ] Add explicit tool contracts for read/write behavior, concurrency safety, permission requirements, and interrupt behavior. +- [ ] Treat each background worker as a durable task with `taskId`, `parentUnitId`, status, output path, retry budget, and notification marker. +- [ ] Add doctor checks for suspicious hook/tool/plugin config. +- [ ] Ensure runtime permission errors name the denied tool/action and the relevant policy. + +### Track 6 - Release And Privacy Hygiene + +- [ ] Add packed-artifact scanner before release. +- [ ] Fail release artifacts containing inline source maps, `sourcesContent`, `.ts/.tsx` source, local absolute paths, secrets, or debug-only strings. +- [ ] Use `npm pack --dry-run --json` plus unpack inspection for npm artifacts. +- [ ] Add telemetry wrappers that allow numeric/boolean metadata by default and require reviewed wrappers for string metadata. +- [ ] Add tests for no-telemetry mode and no-nonessential-network mode. + +### Eval Candidates + +- [ ] Bad roadmap: DB has 2 slices, `ROADMAP.md` has 6 stale rows. Expected: dispatch uses DB/projection or stops; never dispatches stale rows. +- [ ] Projection fallback: DB unavailable, `ROADMAP.json` exists. Expected: dispatch succeeds from projection. +- [ ] Legacy unsafe fallback: DB unavailable, only `ROADMAP.md` exists. Expected: dispatch stops with doctor/migration instruction. +- [ ] Drift detection: `ROADMAP.md` marker hash mismatches projection. Expected: `/sf doctor` reports drift. +- [ ] Drift repair: `/sf doctor --fix` re-renders Markdown and clears drift. +- [ ] Synthetic unit failure: `parallel-research` is `runaway-recovered`. Expected: cannot redispatch unless explicitly reset. +- [ ] Notification idempotency: terminal unit with `notifiedAt` does not enqueue another completion notification. +- [ ] Retry budget: retryable failure increments `retryCount`; exceeding `maxRetries` becomes terminal. +- [ ] Stale heartbeat: missing heartbeat becomes `stale`, not infinite redispatch. +- [ ] Interrupt policy: user steering interrupts active work; memory/system/task notifications do not interrupt by default. +- [ ] Queue scoping: subagent-scoped notifications are not consumed by the main loop. +- [ ] Release scanner: fixture artifact containing `sourcesContent` fails; clean packed artifact passes. + +### Verification Commands + +- [ ] `npx vitest run src/resources/extensions/sf/tests/parallel-research-dispatch.test.ts --config vitest.config.ts --reporter=verbose` +- [ ] Focused tests for canonical plan, doctor drift, runtime FSM, interrupt policy, and release scanner. +- [ ] `npm run typecheck:extensions` +- [ ] `sf headless --output-format json query` against bad-roadmap and failed-runtime fixtures. + +### Implementation Order + +1. Canonical plan accessor and structured roadmap projection. +2. Dispatch migration away from Markdown parsing. +3. Doctor drift detection and repair. +4. Unit runtime FSM and redispatch policy. +5. Headless query liveness fields. +6. Event interrupt policy. +7. Tool/plugin permission contracts. +8. Release artifact scanner and privacy wrappers. diff --git a/src/resources/extensions/sf/prompts/complete-slice.md b/src/resources/extensions/sf/prompts/complete-slice.md index 36a4bb173..7536c1f87 100644 --- a/src/resources/extensions/sf/prompts/complete-slice.md +++ b/src/resources/extensions/sf/prompts/complete-slice.md @@ -27,12 +27,12 @@ Then: 4. If the slice plan includes observability/diagnostic surfaces, confirm they work. Skip this for simple slices that don't have observability sections. 5. Address every gate listed in the **Gates to Close** section above — each gate maps to a specific slice-summary section the handler inspects (for example, Q8 maps to **Operational Readiness**: health signal, failure signal, recovery procedure, and monitoring gaps). Leaving a section empty records the gate as `omitted`. 6. If this slice produced evidence that a requirement changed status (Active → Validated, Active → Deferred, etc.), call `sf_requirement_update` with the requirement ID, updated `status`, and `validation` evidence. Do NOT write `.sf/REQUIREMENTS.md` directly — the engine renders it from the database. -7. Prepare the slice completion content you will pass to `sf_complete_slice` using the camelCase fields `milestoneId`, `sliceId`, `sliceTitle`, `oneLiner`, `narrative`, `verification`, and `uatContent`. Do **not** manually write `{{sliceSummaryPath}}`. Do **not** manually write `{{sliceUatPath}}` — the DB-backed tool is the canonical write path for both artifacts. +7. Prepare the slice completion content you will pass to `sf_slice_complete` using the camelCase fields `milestoneId`, `sliceId`, `sliceTitle`, `oneLiner`, `narrative`, `verification`, and `uatContent`. Do **not** manually write `{{sliceSummaryPath}}`. Do **not** manually write `{{sliceUatPath}}` — the DB-backed tool is the canonical write path for both artifacts. 8. Draft the UAT content you will pass as `uatContent` — a concrete UAT script with real test cases derived from the slice plan and task summaries. Include preconditions, numbered steps with expected outcomes, and edge cases. This must NOT be a placeholder or generic template — tailor every test case to what this slice actually built. 9. Review task summaries for `key_decisions`. Append any significant decisions to `.sf/DECISIONS.md` if missing. 10. Review task summaries for patterns, gotchas, or non-obvious lessons learned. If any would save future agents from repeating investigation or hitting the same issues, append them to `.sf/KNOWLEDGE.md`. Only add entries that are genuinely useful — don't pad with obvious observations. 10b. Scan task summaries and the slice's activity log for sf-internal anomalies that the per-task agents may not have reported individually — repeated `Git stage failed`, `Verification failed … advisory`, `Safety: N unexpected file change(s)`, brittle gate predicates, etc. For any genuine sf-the-tool defect that surfaced during this slice but was NOT already filed via `sf_self_report`, file it now via `sf_self_report` with appropriate severity. This is the slice-level sweep — task-level agents file individual reports during execution; the slice-close agent catches systemic issues only visible across multiple tasks. -11. Call `sf_complete_slice` with the camelCase fields `milestoneId`, `sliceId`, `sliceTitle`, `oneLiner`, `narrative`, `verification`, and `uatContent`, plus any optional enrichment fields you have. Do NOT manually mark the roadmap checkbox — the tool writes to the DB, renders `{{sliceSummaryPath}}` and `{{sliceUatPath}}`, and updates the ROADMAP.md projection automatically. +11. Call `sf_slice_complete` with the camelCase fields `milestoneId`, `sliceId`, `sliceTitle`, `oneLiner`, `narrative`, `verification`, and `uatContent`, plus any optional enrichment fields you have. Do NOT manually mark the roadmap checkbox — the tool writes to the DB, renders `{{sliceSummaryPath}}` and `{{sliceUatPath}}`, and updates the ROADMAP.md projection automatically. 12. Do not run git commands — the system commits your changes and handles any merge after this unit succeeds. 13. Update `.sf/PROJECT.md` if it exists — refresh current state if needed: use the `write` tool with `path: ".sf/PROJECT.md"` and `content` containing the full updated document reflecting current project state. Do NOT use the `edit` tool for this — PROJECT.md is a full-document refresh. @@ -40,6 +40,6 @@ Then: **File system safety:** Task summaries are preloaded in the inlined context above. Task artifacts use a **flat file layout** — files such as `T01-SUMMARY.md` and `T02-SUMMARY.md` live directly inside the `tasks/` directory, not inside per-task subdirectories like `tasks/T01/SUMMARY.md`. If you need to re-read any of them, use `find .sf/milestones/{{milestoneId}}/slices/{{sliceId}}/tasks -name "*-SUMMARY.md"` to list file paths first. Never use `tasks/*/SUMMARY.md`, and never pass `{{slicePath}}` or any other directory path directly to the `read` tool. The `read` tool only accepts file paths, not directories. -**You MUST call `sf_complete_slice` with the slice summary and UAT content before finishing. The tool persists to both DB and disk and renders `{{sliceSummaryPath}}` and `{{sliceUatPath}}` automatically.** +**You MUST call `sf_slice_complete` with the slice summary and UAT content before finishing. The tool persists to both DB and disk and renders `{{sliceSummaryPath}}` and `{{sliceUatPath}}` automatically.** When done, say: "Slice {{sliceId}} complete." diff --git a/src/resources/extensions/sf/prompts/execute-task.md b/src/resources/extensions/sf/prompts/execute-task.md index 3693c8c5b..24de649f7 100644 --- a/src/resources/extensions/sf/prompts/execute-task.md +++ b/src/resources/extensions/sf/prompts/execute-task.md @@ -44,7 +44,7 @@ Then: - If you need a one-off script, scratch file, generated fixture, or temporary helper to understand or verify the work, either delete it before completion or promote it into the durable artifact named by the task plan. - Do not leave duplicate sources of truth. When temporary/seed data is normalized into a canonical location, update downstream code to read only the canonical path and remove or clearly mark the old copy as non-authoritative. - Do not satisfy verification with an ad-hoc helper when the task asks for a durable harness, command, test, or report. The durable planned artifact must own the repeatable check. - - Before calling `sf_complete_task` with `milestoneId`, `sliceId`, and `taskId`, inspect `git status --short` and make sure every changed/untracked file is intentional, in-scope, and either listed in the task plan/summary or explicitly explained as a local adaptation. + - Before calling `sf_task_complete` with `milestoneId`, `sliceId`, and `taskId`, inspect `git status --short` and make sure every changed/untracked file is intentional, in-scope, and either listed in the task plan/summary or explicitly explained as a local adaptation. 6. Write or update tests as part of execution — tests are verification, not an afterthought. If the slice plan defines test files in its Verification section and this is the first task, create them (they should initially fail). 7. When implementing non-trivial runtime behavior (async flows, API boundaries, background processes, error paths), add or preserve agent-usable observability. Skip this for simple changes where it doesn't apply. @@ -87,14 +87,14 @@ Then: 18. If you made an architectural, pattern, library, or observability decision during this task that downstream work should know about, append it to `.sf/DECISIONS.md` (read the template at `~/.sf/agent/extensions/sf/templates/decisions.md` if the file doesn't exist yet). Not every task produces decisions — only append when a meaningful choice was made. 19. If you discover a non-obvious rule, recurring gotcha, or useful pattern during execution, append it to `.sf/KNOWLEDGE.md`. Only add entries that would save future agents from repeating your investigation. Don't add obvious things. 20. Read the template at `~/.sf/agent/extensions/sf/templates/task-summary.md` -21. Use that template to prepare the completion content you will pass to `sf_complete_task` using the camelCase fields `milestoneId`, `sliceId`, `taskId`, `oneLiner`, `narrative`, `verification`, and `verificationEvidence`. Do **not** manually write `{{taskSummaryPath}}` — the DB-backed tool is the canonical write path and renders the summary file for you. -22. Call `sf_complete_task` with milestoneId, sliceId, taskId, and the completion fields derived from the template. This is your final required step — do NOT manually edit PLAN.md checkboxes. The tool marks the task complete, updates the DB, renders `{{taskSummaryPath}}`, and updates PLAN.md automatically. +21. Use that template to prepare the completion content you will pass to `sf_task_complete` using the camelCase fields `milestoneId`, `sliceId`, `taskId`, `oneLiner`, `narrative`, `verification`, and `verificationEvidence`. Do **not** manually write `{{taskSummaryPath}}` — the DB-backed tool is the canonical write path and renders the summary file for you. +22. Call `sf_task_complete` with milestoneId, sliceId, taskId, and the completion fields derived from the template. This is your final required step — do NOT manually edit PLAN.md checkboxes. The tool marks the task complete, updates the DB, renders `{{taskSummaryPath}}`, and updates PLAN.md automatically. 23. Do not run git commands — the system reads your task summary after completion and creates a meaningful commit from it (type inferred from title, message from your one-liner, key files from frontmatter). Write a clear, specific one-liner in the summary — it becomes the commit message. All work stays in your working directory: `{{workingDirectory}}`. **Autonomous execution:** Do not call `ask_user_questions` or `secure_env_collect`. You are running in auto-mode — there is no human available to answer questions. Make reasonable assumptions and document them in the task summary. If a decision genuinely requires human input, note it in the summary and proceed with the best available option. -**You MUST call `sf_complete_task` before finishing. Do not manually write `{{taskSummaryPath}}`.** +**You MUST call `sf_task_complete` before finishing. Do not manually write `{{taskSummaryPath}}`.** When done, say: "Task {{taskId}} complete." diff --git a/src/resources/extensions/sf/tests/workflow-mcp.test.ts b/src/resources/extensions/sf/tests/workflow-mcp.test.ts index 97090542e..b5816ec3f 100644 --- a/src/resources/extensions/sf/tests/workflow-mcp.test.ts +++ b/src/resources/extensions/sf/tests/workflow-mcp.test.ts @@ -71,9 +71,9 @@ test("guided execute-task requires canonical task completion tool", () => { ]); }); -test("auto execute-task requires legacy completion alias until prompt contract is aligned", () => { +test("auto execute-task requires canonical sf_task_complete (prompt contract aligned)", () => { assert.deepEqual(getRequiredWorkflowToolsForAutoUnit("execute-task"), [ - "sf_complete_task", + "sf_task_complete", ]); }); @@ -643,7 +643,7 @@ test("transport compatibility discovers the bundled MCP server without env overr test("transport compatibility now allows auto execute-task over workflow MCP surface", () => { const error = getWorkflowTransportSupportError( "claude-code", - ["sf_complete_task"], + ["sf_task_complete"], { projectRoot: "/tmp/project", env: { SF_WORKFLOW_MCP_COMMAND: "node" }, @@ -694,7 +694,7 @@ test("transport compatibility now allows plan-slice over workflow MCP surface", test("transport compatibility now allows complete-slice over workflow MCP surface", () => { const error = getWorkflowTransportSupportError( "claude-code", - ["sf_complete_slice"], + ["sf_slice_complete"], { projectRoot: "/tmp/project", env: { SF_WORKFLOW_MCP_COMMAND: "node" }, diff --git a/src/resources/extensions/sf/workflow-mcp.ts b/src/resources/extensions/sf/workflow-mcp.ts index bf4cbfee1..7cd97dc83 100644 --- a/src/resources/extensions/sf/workflow-mcp.ts +++ b/src/resources/extensions/sf/workflow-mcp.ts @@ -444,9 +444,9 @@ export function getRequiredWorkflowToolsForAutoUnit( case "execute-task": case "execute-task-simple": case "reactive-execute": - return ["sf_complete_task"]; + return ["sf_task_complete"]; case "complete-slice": - return ["sf_complete_slice"]; + return ["sf_slice_complete"]; case "replan-slice": return ["sf_replan_slice"]; case "reassess-roadmap":