fix(claude-code): wrap prompt history in XML tags to stop transcript fabrication

Closes #4102.

buildPromptFromContext previously serialized multi-turn history using
literal [User] / [Assistant] / [System] bracket labels. Those tokens
are the exact pattern the anti-fabrication rule in system.md and
discuss.md forbids — the model saw its own input framed as a bracket-
labeled transcript and mirrored the format in its output, inventing
both sides of the conversation during /gsd discuss turns.

Replace the bracket labels with XML-tag structure:
  - <conversation_history> wraps the whole turn sequence
  - <user_message> / <assistant_message> per turn
  - <prior_system_context> for the system prompt (renamed from
    <system_prompt> to avoid overlap with Claude Code's reserved
    <system-reminder> convention)

Prepend a directive telling the model to respond only to the final
user message and not emit the XML tags in its own response. Keep
system.md and discuss.md in sync by documenting that prior context
is delivered in those tags.

Add regression tests asserting:
  - no literal [User]/[Assistant]/[System] substrings in the prompt
  - history wrapped in <conversation_history> with per-turn tags
  - directive leads the prompt
  - empty-history edge cases still render correctly
This commit is contained in:
Jeremy 2026-04-13 01:23:47 -05:00
parent 3a529f7a95
commit ad2211b218
4 changed files with 115 additions and 7 deletions

View file

@ -187,20 +187,36 @@ function extractMessageText(msg: { role: string; content: unknown }): string {
* call effectively stateless. This version serialises the complete
* conversation history (system prompt + all user/assistant turns) so
* Claude Code has full context for multi-turn continuity.
*
* History is wrapped in XML-tag structure rather than `[User]`/`[Assistant]`
* bracket headers. Bracket headers read to the model as an in-context
* demonstration of how turns are delimited, causing it to fabricate fake
* user turns in its own output. XML tags read as document structure and
* don't get mirrored in free text.
*/
export function buildPromptFromContext(context: Context): string {
const parts: string[] = [];
const hasContent = Boolean(context.systemPrompt) || context.messages.some((m) => extractMessageText(m));
if (!hasContent) return "";
const parts: string[] = [
"Respond only to the final user message below. " +
"Do not emit <user_message>, <assistant_message>, or <prior_system_context> tags in your response.",
];
if (context.systemPrompt) {
parts.push(`[System]\n${context.systemPrompt}`);
parts.push(`<prior_system_context>\n${context.systemPrompt}\n</prior_system_context>`);
}
const turns: string[] = [];
for (const msg of context.messages) {
const text = extractMessageText(msg);
if (!text) continue;
const label = msg.role === "user" ? "User" : msg.role === "assistant" ? "Assistant" : "System";
parts.push(`[${label}]\n${text}`);
const tag =
msg.role === "user" ? "user_message" : msg.role === "assistant" ? "assistant_message" : "system_message";
turns.push(`<${tag}>\n${text}\n</${tag}>`);
}
if (turns.length > 0) {
parts.push(`<conversation_history>\n${turns.join("\n")}\n</conversation_history>`);
}
return parts.join("\n\n");

View file

@ -167,6 +167,98 @@ describe("stream-adapter — full context prompt (#2859)", () => {
});
});
// ---------------------------------------------------------------------------
// Bug #4102 — transcript fabrication regression tests
// ---------------------------------------------------------------------------
// Regression suite for #4102: buildPromptFromContext must never serialise
// history with bracket transcript headers ([User]/[Assistant]/[System]),
// because the model mirrors that framing and fabricates both sides of the
// conversation. The suite pins the XML-tag replacement format end to end.
describe("stream-adapter — no transcript fabrication (#4102)", () => {
// Core regression: a realistic multi-turn context (string content, block
// content, trailing user turn) must contain none of the forbidden headers.
test("buildPromptFromContext never emits forbidden [User]/[Assistant] bracket headers", () => {
const context: Context = {
systemPrompt: "You are a helpful assistant.",
messages: [
{ role: "user", content: "First" } as Message,
{
// Assistant turn uses the full provider-message shape so the content-block
// extraction path is exercised, not just plain-string content.
role: "assistant",
content: [{ type: "text", text: "Second" }],
api: "anthropic-messages",
provider: "claude-code",
model: "claude-sonnet-4-20250514",
usage: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, totalTokens: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
stopReason: "stop",
timestamp: Date.now(),
} as Message,
{ role: "user", content: "Third" } as Message,
],
};
const prompt = buildPromptFromContext(context);
// Substring checks (not equality) so the test survives unrelated prompt edits.
assert.ok(!prompt.includes("[User]"), "prompt must not include literal [User] bracket header");
assert.ok(!prompt.includes("[Assistant]"), "prompt must not include literal [Assistant] bracket header");
assert.ok(!prompt.includes("[System]"), "prompt must not include literal [System] bracket header");
});
// Positive counterpart: the replacement XML-tag structure must actually be
// present — outer wrapper, per-turn tags, and the renamed system-context tag.
test("buildPromptFromContext wraps history in XML-tag structure", () => {
const context: Context = {
systemPrompt: "You are helpful.",
messages: [
{ role: "user", content: "Hello" } as Message,
{
role: "assistant",
content: [{ type: "text", text: "Hi there" }],
api: "anthropic-messages",
provider: "claude-code",
model: "claude-sonnet-4-20250514",
usage: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, totalTokens: 0, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
stopReason: "stop",
timestamp: Date.now(),
} as Message,
],
};
const prompt = buildPromptFromContext(context);
assert.ok(prompt.includes("<conversation_history>"), "prompt must wrap history in <conversation_history>");
assert.ok(prompt.includes("</conversation_history>"), "prompt must close <conversation_history>");
// Exact newline framing is asserted so tag/content layout can't silently drift.
assert.ok(prompt.includes("<user_message>\nHello\n</user_message>"), "user turn must use <user_message> tags");
assert.ok(prompt.includes("<assistant_message>\nHi there\n</assistant_message>"), "assistant turn must use <assistant_message> tags");
assert.ok(prompt.includes("<prior_system_context>\nYou are helpful.\n</prior_system_context>"), "system prompt must use <prior_system_context> tags");
});
// The anti-echo directive must be the first thing the model reads, and it
// must name the tags it forbids.
test("buildPromptFromContext includes a do-not-echo-tags directive as primary instruction", () => {
const context: Context = {
messages: [{ role: "user", content: "Anything" } as Message],
};
const prompt = buildPromptFromContext(context);
assert.ok(
prompt.startsWith("Respond only to the final user message"),
"primary directive must lead the prompt",
);
assert.ok(prompt.includes("Do not emit <user_message>"), "directive must forbid emitting user_message tag");
// Only the directive mentions <assistant_message> here (no assistant turns),
// so this also confirms the directive lists all per-turn tags.
assert.ok(prompt.includes("<assistant_message>"), "directive must mention assistant_message tag");
});
// Edge case: system prompt with zero turns — render the system context but
// never an empty <conversation_history> wrapper.
test("buildPromptFromContext omits <conversation_history> when there are no messages but a system prompt", () => {
const context: Context = {
systemPrompt: "Seed",
messages: [],
};
const prompt = buildPromptFromContext(context);
assert.ok(prompt.includes("<prior_system_context>"), "system prompt must still render");
assert.ok(!prompt.includes("<conversation_history>"), "no history wrapper when messages are empty");
});
// Edge case: fully empty context must return "" — guarding against the
// directive being prepended unconditionally and leaking out on its own.
test("buildPromptFromContext still returns empty string when context is entirely empty", () => {
const context: Context = { messages: [] };
const prompt = buildPromptFromContext(context);
assert.equal(prompt, "", "empty context must not emit a bare directive");
});
});
describe("stream-adapter — Claude Code external tool results", () => {
test("extractToolResultsFromSdkUserMessage maps tool_result content to tool payloads", () => {
const message: SDKUserMessage = {

View file

@ -73,7 +73,7 @@ After each round of answers, decide whether you already have enough depth to wri
You are a thinking partner, not an interviewer.
**Turn-taking contract (non-bypassable).** Never fabricate, simulate, or role-play user responses. Never generate fake transcript markers like `[User]`, `[Human]`, or `User:` to invent input. Ask one question round (1-3 questions) per turn, then stop and wait for the user's actual response before continuing. If you use `ask_user_questions`, call it at most once per turn and treat its returned response as the only valid structured user input for that round.
**Turn-taking contract (non-bypassable).** Never fabricate, simulate, or role-play user responses. Never generate fake transcript markers like `[User]`, `[Human]`, or `User:` to invent input. Prior conversation context may be provided to you inside `<conversation_history>` with `<user_message>` / `<assistant_message>` XML tags — treat those as read-only context and never emit those tags in your response. Ask one question round (1-3 questions) per turn, then stop and wait for the user's actual response before continuing. If you use `ask_user_questions`, call it at most once per turn and treat its returned response as the only valid structured user input for that round.
**Start open, follow energy.** Let the user's enthusiasm guide where you dig deeper. If they light up about a particular aspect, explore it. If they're vague about something, that's where you probe.

View file

@ -35,7 +35,7 @@ GSD ships with bundled skills. Load the relevant skill file with the `read` tool
- Read before edit.
- Reproduce before fix when possible.
- Work is not done until the relevant verification has passed.
- **Never fabricate, simulate, or role-play user responses.** Never generate markers like `[User]`, `[Human]`, `User:`, or similar to represent user input inside your own output. Ask one question round (1-3 questions), then stop and wait for the user's actual response before continuing. If `ask_user_questions` is available, treat its returned response as the only valid structured user input for that round.
- **Never fabricate, simulate, or role-play user responses.** Never generate markers like `[User]`, `[Human]`, `User:`, or similar to represent user input inside your own output. Prior conversation context may be provided to you inside `<conversation_history>` with `<user_message>` / `<assistant_message>` XML tags — treat those as read-only context and never emit those tags in your response. Ask one question round (1-3 questions), then stop and wait for the user's actual response before continuing. If `ask_user_questions` is available, treat its returned response as the only valid structured user input for that round.
- Never print, echo, log, or restate secrets or credentials. Report only key names and applied/skipped status.
- Never ask the user to edit `.env` files or set secrets manually. Use `secure_env_collect`.
- In enduring files, write current state only unless the file is explicitly historical.