// singularity-forge/src/resources/extensions/sf/tests/prompt-contracts.test.ts
//
// 383 lines
// 19 KiB
// TypeScript

import test from "node:test";
import assert from "node:assert/strict";
import { readFileSync } from "node:fs";
import { join } from "node:path";
/** Directory holding the prompt templates, resolved against the current working directory. */
const promptsDir = join(process.cwd(), "src/resources/extensions/sf/prompts");

/**
 * Loads one prompt template from {@link promptsDir}.
 *
 * @param name - Prompt file name without the `.md` extension.
 * @returns The file contents decoded as UTF-8.
 * @throws Whatever `readFileSync` throws when the file cannot be read (e.g. it does not exist).
 */
function readPrompt(name: string): string {
  const promptPath = join(promptsDir, `${name}.md`);
  return readFileSync(promptPath, { encoding: "utf-8" });
}
// reactive-execute: subagent summaries are authoritative and no batch commit is requested.
test("reactive-execute prompt keeps task summaries with subagents and avoids batch commits", () => {
  const text = readPrompt("reactive-execute");
  const required = [
    /subagent-written summary as authoritative/i,
    /Do NOT create a batch commit/i,
  ];
  const forbidden = [
    /\*\*Write task summaries\*\*/i,
    /\*\*Commit\*\* all changes/i,
  ];
  for (const pattern of required) assert.match(text, pattern);
  for (const pattern of forbidden) assert.doesNotMatch(text, pattern);
});
// run-uat: the UAT mode is templated and runtime evidence wording is present.
test("run-uat prompt branches on dynamic UAT mode and supports runtime evidence", () => {
  const text = readPrompt("run-uat");
  const required = [
    /\*\*Detected UAT mode:\*\*\s*`\{\{uatType\}\}`/,
    /uatType:\s*\{\{uatType\}\}/,
    /live-runtime/,
    /browser\/runtime\/network/i,
    /NEEDS-HUMAN/,
  ];
  for (const pattern of required) assert.match(text, pattern);
  // The mode must come from the {{uatType}} placeholder, not a literal `artifact-driven` value.
  assert.doesNotMatch(text, /uatType:\s*artifact-driven/);
});
// workflow-start: autonomy wording is present, per-phase confirmation wording is absent.
test("workflow-start prompt defaults to autonomy instead of per-phase confirmation", () => {
  const text = readPrompt("workflow-start");
  const required = [/Keep moving by default/i, /Decision gates, not ceremony/i];
  const forbidden = [/confirm with the user before proceeding/i, /Gate between phases/i];
  required.forEach((pattern) => assert.match(text, pattern));
  forbidden.forEach((pattern) => assert.doesNotMatch(text, pattern));
});
// system: mentions CODEBASE.md, the /sf codebase subcommands, and the auto-refresh wording.
test("system prompt references CODEBASE.md and /sf codebase", () => {
  const systemPrompt = readPrompt("system");
  const expectations = [
    /CODEBASE\.md/,
    /\/sf codebase \[generate\|update\|stats\|rag\]/,
    /auto-refreshes it when tracked files change/i,
  ];
  for (const expected of expectations) {
    assert.match(systemPrompt, expected);
  }
});
// system: the Project RAG code-intelligence wording is present.
test("system prompt routes broad code search through optional Project RAG when available", () => {
  const systemPrompt = readPrompt("system");
  [
    /PROJECT CODE INTELLIGENCE/,
    /Project RAG is configured/,
    /hybrid semantic \+ BM25 code retrieval/i,
  ].forEach((expected) => assert.match(systemPrompt, expected));
});
// system: the hard rules against fabricated user input are present.
test("system prompt hard rules forbid fabricating user responses", () => {
  const systemPrompt = readPrompt("system");
  const hardRules = [
    /never fabricate, simulate, or role-play user responses/i,
    /never generate markers like `?\[User\]`?, `?\[Human\]`?, `?User:`?/i,
    /ask one question round \(1-3 questions\), then stop and wait for the user's actual response/i,
    /ask_user_questions.*only valid structured user input/i,
  ];
  for (const rule of hardRules) {
    assert.match(systemPrompt, rule);
  }
});
// system: the Question Efficiency Contract and its individual clauses are present.
test("system prompt makes question rounds efficient and progress-oriented", () => {
  const systemPrompt = readPrompt("system");
  const clauses = [
    /Question Efficiency Contract/i,
    /State current understanding in 2-5 concise bullets/i,
    /Name the blocked decision/i,
    /continue with a documented assumption instead of blocking/i,
    /After each answer, summarize what changed/i,
  ];
  for (const clause of clauses) {
    assert.match(systemPrompt, clause);
  }
});
// discuss: implementation questions are allowed when they matter; the experience-only rule is absent.
test("discuss prompt allows implementation questions when they materially matter", () => {
  const text = readPrompt("discuss");
  const required = [
    /Lead with experience, but ask implementation when it materially matters/i,
    /Never fabricate, simulate, or role-play user responses/i,
    /Ask one question round \(1-3 questions\) per turn, then stop and wait for the user's actual response/i,
    /one gate, not two/i,
  ];
  for (const pattern of required) assert.match(text, pattern);
  assert.doesNotMatch(text, /Questions must be about the experience, not the implementation/i);
});
// guided-discuss-milestone / guided-discuss-slice: same checks applied to both prompts.
test("guided discussion prompts avoid wrap-up prompts after every round", () => {
  const milestone = readPrompt("guided-discuss-milestone");
  const slice = readPrompt("guided-discuss-slice");
  const noMetaWrapUp = /Do \*\*not\*\* ask a meta "ready to wrap up\?" question after every round/i;
  const noFabrication = /Never fabricate or simulate user input/i;

  assert.match(milestone, noMetaWrapUp);
  assert.match(slice, noMetaWrapUp);
  // The scripted wrap-up sentence must not appear in either prompt.
  assert.doesNotMatch(milestone, /I think I have a solid picture of this milestone\. Ready to wrap up/i);
  assert.doesNotMatch(slice, /I think I have a solid picture of this slice\. Ready to wrap up/i);
  assert.match(milestone, noFabrication);
  assert.match(slice, noFabrication);
});
// Both guided discussion prompts must contain each of the shared phrases below.
test("guided discussion prompts require understanding and progress before questions", () => {
  const milestone = readPrompt("guided-discuss-milestone");
  const slice = readPrompt("guided-discuss-slice");
  const sharedPhrases = [
    /Understanding \+ progress preface/i,
    /Current understanding/i,
    /Blocked decision/i,
    /After each answer, summarize what materially changed/i,
  ];
  for (const phrase of sharedPhrases) {
    // Milestone first, then slice, for every phrase.
    assert.match(milestone, phrase);
    assert.match(slice, phrase);
  }
});
// discuss: the question-round structure headings and follow-up rule are present.
test("discuss prompt keeps each question round tied to progress", () => {
  const text = readPrompt("discuss");
  const roundShape = [
    /Question round shape/i,
    /Current understanding/i,
    /Blocked decision/i,
    /Why these questions/i,
    /documented assumption/i,
    /After each answer, summarize what materially changed/i,
  ];
  roundShape.forEach((pattern) => assert.match(text, pattern));
});
// guided-discuss-milestone: the depth gate id is templated with the milestone id.
test("guided milestone discussion scopes depth verification to the milestone id", () => {
  const text = readPrompt("guided-discuss-milestone");
  const scopedGateId = /depth_verification_\{\{milestoneId\}\}/;
  const legacyGateWording = /depth_verification_confirm" — this enables the write-gate downstream/i;
  assert.match(text, scopedGateId, "depth verification id should include the milestone id");
  assert.doesNotMatch(text, legacyGateWording, "legacy global depth gate wording should be gone");
});
// discuss-headless: project-knowledge steps are present; CODEBASE-ANALYSIS.md is not referenced.
test("headless milestone creation builds project knowledge before final context", () => {
  const text = readPrompt("discuss-headless");
  const required = [
    /\.sf\/CODEBASE\.md/,
    /stack signals, critical paths, verification commands, skill needs/i,
    /Discover needed skills/i,
    /Use code intelligence when available/i,
    /A headless run that only prints reflection has failed its contract/i,
    /Build project knowledge first/i,
  ];
  for (const pattern of required) assert.match(text, pattern);
  assert.doesNotMatch(text, /\.sf\/CODEBASE-ANALYSIS\.md/);
});
// discuss-headless: depth gate id, recommended option, draft file and draft message are present.
test("headless milestone creation preserves depth gate and draft fallback", () => {
  const text = readPrompt("discuss-headless");
  const required = [
    /depth_verification_\{\{milestoneId\}\}_confirm/,
    /Proceed with final context \(Recommended\)/,
    /\{\{milestoneId\}\}-CONTEXT-DRAFT\.md/,
    /Do \*\*not\*\* call `sf_plan_milestone`/,
    /Milestone \{\{milestoneId\}\} drafted for discussion\./,
  ];
  for (const pattern of required) assert.match(text, pattern);
  // A blanket "no questions" rule must not appear.
  assert.doesNotMatch(text, /\*\*DO NOT ask the user any questions\*\*/);
});
// discuss-headless: wording describing the single final gate is present.
test("headless milestone creation uses one final question gate, not exploratory questions", () => {
  const text = readPrompt("discuss-headless");
  const gateWording = [
    /The final gate is the only question in headless mode/i,
    /not an exploratory question round/i,
    /compact depth summary/i,
    /write or should remain a draft/i,
  ];
  gateWording.forEach((pattern) => assert.match(text, pattern));
});
// queue: must wait for the user between rounds and must not treat silence as permission.
test("queue prompt requires waiting for user response between rounds", () => {
  const text = readPrompt("queue");
  const required = [
    /Never fabricate or simulate user input during this discussion/i,
    /Ask 1-3 questions per round, then wait for the user's response before asking the next round\./i,
  ];
  for (const pattern of required) assert.match(text, pattern);
  assert.doesNotMatch(text, /treat that as permission to continue/i);
});
// guided-resume-task: the continue file is kept until superseded; immediate deletion is not instructed.
test("guided-resume-task prompt preserves recovery state until work is superseded", () => {
  const text = readPrompt("guided-resume-task");
  const required = [
    /Do \*\*not\*\* delete the continue file immediately/i,
    /successfully completed or you have written a newer summary\/continue artifact/i,
  ];
  for (const pattern of required) assert.match(text, pattern);
  assert.doesNotMatch(text, /Delete the continue file after reading it/i);
});
// ─── Prompt migration: execute-task → sf_complete_task ───────────────
// execute-task: the completion tool name appears somewhere in the prompt.
test("execute-task prompt references sf_complete_task tool", () => {
  assert.match(readPrompt("execute-task"), /sf_complete_task/);
});
// execute-task: the tool is the write path; a numbered "Write {{taskSummaryPath}}" step is absent.
test("execute-task prompt uses sf_complete_task as canonical summary write path", () => {
  const text = readPrompt("execute-task");
  const required = [
    /\{\{taskSummaryPath\}\}/,
    /sf_complete_task/,
    /DB-backed tool is the canonical write path/i,
    /Do \*\*not\*\* manually write `?\{\{taskSummaryPath\}\}`?/i,
  ];
  for (const pattern of required) assert.match(text, pattern);
  assert.doesNotMatch(text, /^\d+\.\s+Write `?\{\{taskSummaryPath\}\}`?\s*$/m);
});
// execute-task: neither manual-checkbox phrasing appears.
test("execute-task prompt does not instruct LLM to toggle checkboxes manually", () => {
  const text = readPrompt("execute-task");
  const forbidden = [/change \[ \] to \[x\]/, /Mark \{\{taskId\}\} done in/];
  for (const pattern of forbidden) {
    assert.doesNotMatch(text, pattern);
  }
});
// execute-task: both path placeholders are still present.
test("execute-task prompt still contains template variables for context", () => {
  const text = readPrompt("execute-task");
  const placeholders = [/\{\{taskSummaryPath\}\}/, /\{\{planPath\}\}/];
  for (const placeholder of placeholders) {
    assert.match(text, placeholder);
  }
});
// guided-execute-task: the tool name (sf_task_complete spelling) appears in the prompt.
test("guided-execute-task prompt references sf_task_complete tool", () => {
  assert.match(readPrompt("guided-execute-task"), /sf_task_complete/);
});
// guided-execute-task: no "Write {{taskId}}-SUMMARY.md ... mark it done" instruction.
test("guided-execute-task prompt does not instruct manual file write", () => {
  const manualWriteStep = /Write `?\{\{taskId\}\}-SUMMARY\.md`?.*mark it done/i;
  assert.doesNotMatch(readPrompt("guided-execute-task"), manualWriteStep);
});
// ─── Prompt migration: complete-slice → sf_complete_slice ────────────
// complete-slice: the completion tool name appears somewhere in the prompt.
test("complete-slice prompt references sf_complete_slice tool", () => {
  assert.match(readPrompt("complete-slice"), /sf_complete_slice/);
});
// complete-slice: the manual-checkbox phrasing is absent.
test("complete-slice prompt does not instruct LLM to toggle checkboxes manually", () => {
  const manualCheckbox = /change \[ \] to \[x\]/;
  assert.doesNotMatch(readPrompt("complete-slice"), manualCheckbox);
});
// guided-complete-slice: the tool name (sf_slice_complete spelling) appears in the prompt.
test("guided-complete-slice prompt references sf_slice_complete tool", () => {
  assert.match(readPrompt("guided-complete-slice"), /sf_slice_complete/);
});
// complete-slice: placeholders and tool are present; numbered manual "Write ..." steps are absent.
test("complete-slice prompt instructs writing summary and UAT files before tool call", () => {
  const text = readPrompt("complete-slice");
  const required = [
    /\{\{sliceSummaryPath\}\}/,
    /\{\{sliceUatPath\}\}/,
    /sf_complete_slice/,
    /DB-backed tool is the canonical write path/i,
    /Do \*\*not\*\* manually write `?\{\{sliceSummaryPath\}\}`?/i,
    /Do \*\*not\*\* manually write `?\{\{sliceUatPath\}\}`?/i,
  ];
  const forbidden = [
    /^\d+\.\s+Write `?\{\{sliceSummaryPath\}\}`?.*$/m,
    /^\d+\.\s+Write `?\{\{sliceUatPath\}\}`?.*$/m,
  ];
  for (const pattern of required) assert.match(text, pattern);
  for (const pattern of forbidden) assert.doesNotMatch(text, pattern);
});
// complete-slice: both review files are still mentioned.
test("complete-slice prompt preserves decisions and knowledge review steps", () => {
  const text = readPrompt("complete-slice");
  for (const reviewFile of [/DECISIONS\.md/, /KNOWLEDGE\.md/]) {
    assert.match(text, reviewFile);
  }
});
// validate-milestone: the tool is the write path; "Write to {{validationPath}}:" is absent.
test("validate-milestone prompt uses sf_validate_milestone as canonical validation write path", () => {
  const text = readPrompt("validate-milestone");
  const required = [
    /sf_validate_milestone/,
    /\{\{validationPath\}\}/,
    /DB-backed tool is the canonical write path/i,
    /Do \*\*not\*\* manually write `?\{\{validationPath\}\}`?/i,
  ];
  for (const pattern of required) assert.match(text, pattern);
  assert.doesNotMatch(text, /Write to `?\{\{validationPath\}\}`?:/i);
});
// complete-slice: both path placeholders are still present.
test("complete-slice prompt still contains template variables for context", () => {
  const text = readPrompt("complete-slice");
  const placeholders = [/\{\{sliceSummaryPath\}\}/, /\{\{sliceUatPath\}\}/];
  for (const placeholder of placeholders) {
    assert.match(text, placeholder);
  }
});
// plan-milestone: names the planning tool and forbids manual planning-artifact writes.
test("plan-milestone prompt references DB-backed planning tool and explicitly forbids manual roadmap writes", () => {
  const text = readPrompt("plan-milestone");
  const manualWriteBan =
    /Do \*\*not\*\* write `?\{\{outputPath\}\}`?, `?ROADMAP\.md`?, or other planning artifacts manually/i;
  assert.match(text, /sf_plan_milestone/);
  assert.match(text, manualWriteBan);
});
// guided-plan-milestone: names the planning tool and forbids manual planning-artifact writes.
test("guided-plan-milestone prompt references DB-backed planning tool and explicitly forbids manual roadmap writes", () => {
  const text = readPrompt("guided-plan-milestone");
  const manualWriteBan =
    /Do \*\*not\*\* write `?\{\{milestoneId\}\}-ROADMAP\.md`?, `?ROADMAP\.md`?, or other planning artifacts manually/i;
  assert.match(text, /sf_plan_milestone/);
  assert.match(text, manualWriteBan);
});
// plan-slice: the warning against relying on direct PLAN.md writes is present.
test("plan-slice prompt no longer frames direct PLAN writes as the source of truth", () => {
  const warning = /Do \*\*not\*\* rely on direct `PLAN\.md` writes as the source of truth/i;
  assert.match(readPrompt("plan-slice"), warning);
});
// plan-slice: both planning tools are named and the canonical-write-path sentence is present.
test("plan-slice prompt explicitly names sf_plan_slice as DB-backed planning tool", () => {
  const text = readPrompt("plan-slice");
  const expectations = [
    /sf_plan_slice/,
    /sf_plan_task/,
    // The prompt should describe the DB-backed tool as the canonical write path.
    /DB-backed tool is the canonical write path/i,
  ];
  for (const expected of expectations) {
    assert.match(text, expected);
  }
});
// plan-slice: no numbered step consisting solely of "Write {{outputPath}}".
test("plan-slice prompt does not instruct direct file writes as a primary step", () => {
  // A line such as "3. Write `{{outputPath}}`" would match; the test name says such a step must not be primary.
  const numberedWriteStep = /^\d+\.\s+Write `?\{\{outputPath\}\}`?\s*$/m;
  assert.doesNotMatch(readPrompt("plan-slice"), numberedWriteStep);
});
// plan-slice: sf_plan_task is mentioned and sf_plan_slice is stated to handle task persistence.
test("plan-slice prompt clarifies sf_plan_slice handles task persistence", () => {
  const text = readPrompt("plan-slice");
  // Per the original author's note: sf_plan_slice persists tasks in its transaction,
  // so no separate sf_plan_task calls are needed.
  const expectations = [/sf_plan_task/, /sf_plan_slice` handles task persistence/i];
  for (const expected of expectations) {
    assert.match(text, expected);
  }
});
// replan-slice: the tool is named and the "Degraded fallback" wording is absent.
test("replan-slice prompt uses sf_replan_slice as canonical DB-backed tool", () => {
  const text = readPrompt("replan-slice");
  assert.match(text, /sf_replan_slice/);
  // Per the original author's note: the degraded fallback (direct file writes) was removed
  // because DB tools are always available.
  assert.doesNotMatch(text, /Degraded fallback/i);
});
// reassess-roadmap: the tool name appears somewhere in the prompt.
test("reassess-roadmap prompt references sf_reassess_roadmap tool", () => {
  assert.match(readPrompt("reassess-roadmap"), /sf_reassess_roadmap/);
});
// validate-milestone: three reviewers and their review areas are named.
test("validate-milestone prompt dispatches parallel reviewers", () => {
  const text = readPrompt("validate-milestone");
  const reviewerMarkers = [/Reviewer A/, /Reviewer B/, /Reviewer C/];
  const reviewAreas = [
    /Requirements Coverage/,
    /Cross-Slice Integration/,
    /Assessment & Acceptance Criteria/,
    /assessment evidence/i,
  ];
  for (const marker of reviewerMarkers) assert.match(text, marker);
  for (const area of reviewAreas) assert.match(text, area);
});
// ─── Prompt migration: replan-slice → sf_replan_slice ────────────────
// replan-slice: the tool name appears somewhere in the prompt.
test("replan-slice prompt names sf_replan_slice as the tool to use", () => {
  assert.match(readPrompt("replan-slice"), /sf_replan_slice/);
});
// ─── Prompt migration: reassess-roadmap → sf_reassess_roadmap ───────
// reassess-roadmap: the tool name appears somewhere in the prompt.
test("reassess-roadmap prompt names sf_reassess_roadmap as the tool to use", () => {
  assert.match(readPrompt("reassess-roadmap"), /sf_reassess_roadmap/);
});
// ─── Bug #2933: prompt parameter names must match camelCase TypeBox schema ───
// execute-task: the first line naming the completion tool must use camelCase parameter names.
test("execute-task prompt uses camelCase parameter names matching TypeBox schema", () => {
  const prompt = readPrompt("execute-task");
  // The sf_complete_task tool schema uses camelCase: milestoneId, sliceId, taskId.
  // Prompts must NOT tell the LLM to use snake_case (milestone_id, slice_id, task_id).
  //
  // Take the first line that names the tool under either spelling. The `?? ""` fallback keeps
  // the variable typed as `string`, so no non-null assertions are needed below; an empty
  // string is falsy, so `assert.ok` still fails with the same message when no line is found.
  const toolCallLine =
    prompt.split("\n").find((line) => /sf_complete_task|sf_task_complete/.test(line)) ?? "";
  assert.ok(toolCallLine, "prompt must contain a sf_complete_task or sf_task_complete tool call line");

  // Negative: no snake_case parameter names on that line.
  assert.doesNotMatch(toolCallLine, /milestone_id/, "must use milestoneId, not milestone_id");
  assert.doesNotMatch(toolCallLine, /slice_id/, "must use sliceId, not slice_id");
  assert.doesNotMatch(toolCallLine, /task_id/, "must use taskId, not task_id");

  // Positive: must mention the camelCase names.
  assert.match(toolCallLine, /milestoneId/);
  assert.match(toolCallLine, /sliceId/);
  assert.match(toolCallLine, /taskId/);
});
// complete-slice: the first line naming the completion tool must use camelCase parameter names.
test("complete-slice prompt uses camelCase parameter names matching TypeBox schema", () => {
  const prompt = readPrompt("complete-slice");
  // The sf_complete_slice tool schema uses camelCase: milestoneId, sliceId.
  //
  // Take the first line that names the tool under either spelling. The `?? ""` fallback keeps
  // the variable typed as `string`, so no non-null assertions are needed below; an empty
  // string is falsy, so `assert.ok` still fails with the same message when no line is found.
  const toolCallLine =
    prompt.split("\n").find((line) => /sf_complete_slice|sf_slice_complete/.test(line)) ?? "";
  assert.ok(toolCallLine, "prompt must contain a sf_complete_slice or sf_slice_complete tool call line");

  // Negative: no snake_case parameter names on that line.
  assert.doesNotMatch(toolCallLine, /milestone_id/, "must use milestoneId, not milestone_id");
  assert.doesNotMatch(toolCallLine, /slice_id/, "must use sliceId, not slice_id");

  // Positive: must mention the camelCase names.
  assert.match(toolCallLine, /milestoneId/);
  assert.match(toolCallLine, /sliceId/);
});
// ─── File system safety: complete-slice parity with complete-milestone (#2935) ──
// complete-slice: the "File system safety" section and its directory-path warning are present.
test("complete-slice prompt includes filesystem safety guard against EISDIR", () => {
  const text = readPrompt("complete-slice");
  const missingSectionMessage =
    "complete-slice.md must include a 'File system safety' instruction to prevent EISDIR errors when the LLM passes a directory path to the read tool";
  const missingWarningMessage =
    "complete-slice.md must warn against passing directory paths to the read tool";

  assert.match(text, /File system safety/i, missingSectionMessage);
  assert.match(text, /never pass.*directory path.*directly to the.*read.*tool/i, missingWarningMessage);
});
// complete-milestone: regression check that its "File system safety" section is still present.
test("complete-milestone prompt still has its filesystem safety guard (regression)", () => {
  const failureMessage = "complete-milestone.md must keep its filesystem safety guard";
  assert.match(readPrompt("complete-milestone"), /File system safety/i, failureMessage);
});
// reactive-execute: checkbox wording is absent and "completion tool calls" is present.
test("reactive-execute prompt references tool calls instead of checkbox updates", () => {
  const text = readPrompt("reactive-execute");
  for (const checkboxWording of [/checkbox updates/, /checkbox edits/]) {
    assert.doesNotMatch(text, checkboxWording);
  }
  assert.match(text, /completion tool calls/);
});