fix: exclude lastReasoning from retry diagnostic to prevent hallucination loops (#2663)
formatTraceSummary() is used by getDeepDiagnostic() which feeds into retry prompts in phases.ts. Including the prior assistant's free-text reasoning caused hallucination loops when the previous turn was truncated or malformed — the model would recycle its own interrupted reasoning as if it were diagnostic truth. The fix removes the lastReasoning field from formatTraceSummary() output. The crash recovery path (formatCrashRecoveryBriefing) has its own safe handling of lastReasoning with explicit framing and is not affected. Closes #2195
This commit is contained in:
parent
e9a41c0df1
commit
9f071fc1b0
2 changed files with 168 additions and 3 deletions
|
|
@ -472,9 +472,13 @@ function formatTraceSummary(trace: ExecutionTrace): string {
|
|||
if (trace.errors.length > 0) {
|
||||
parts.push(`Errors: ${trace.errors.slice(-3).join("; ")}`);
|
||||
}
|
||||
if (trace.lastReasoning) {
|
||||
parts.push(`Last reasoning: "${trace.lastReasoning}"`);
|
||||
}
|
||||
// NOTE: lastReasoning is intentionally excluded from the retry diagnostic.
|
||||
// This summary is injected into retry prompts via getDeepDiagnostic() →
|
||||
// phases.ts. Including prior assistant free-text causes hallucination loops
|
||||
// when the previous turn was truncated or malformed. Crash recovery has its
|
||||
// own path (formatCrashRecoveryBriefing) that handles lastReasoning safely
|
||||
// with explicit "Last Agent Reasoning Before Interruption" framing.
|
||||
// See: https://github.com/gsd-build/gsd-2/issues/2195
|
||||
return parts.join("\n");
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,161 @@
|
|||
/**
|
||||
* Regression tests for #2195: formatTraceSummary (used by getDeepDiagnostic →
|
||||
* retry prompts) must NOT include lastReasoning from prior assistant text.
|
||||
*
|
||||
* Including prior assistant free-text in retry diagnostics causes hallucination
|
||||
* loops when the previous turn was truncated or malformed.
|
||||
*
|
||||
* The crash recovery path (formatCrashRecoveryBriefing) has its own safe handling
|
||||
* of lastReasoning and is NOT affected by this change.
|
||||
*/
|
||||
import { describe, test } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { mkdirSync, writeFileSync, rmSync, mkdtempSync } from "node:fs";
|
||||
import { join } from "node:path";
|
||||
import { tmpdir } from "node:os";
|
||||
|
||||
import { extractTrace, getDeepDiagnostic } from "../session-forensics.ts";
|
||||
|
||||
/** Build a minimal assistant text reasoning entry. */
|
||||
function makeAssistantText(text: string): unknown {
|
||||
return {
|
||||
type: "message",
|
||||
message: {
|
||||
role: "assistant",
|
||||
content: [{ type: "text", text }],
|
||||
},
|
||||
};
|
||||
}
|
||||
|
||||
/** Build a minimal assistant tool call + tool result pair. */
|
||||
function makeToolPair(
|
||||
toolName: string,
|
||||
input: Record<string, unknown>,
|
||||
resultText: string,
|
||||
isError: boolean,
|
||||
): unknown[] {
|
||||
const toolCallId = `toolu_${Math.random().toString(36).slice(2, 10)}`;
|
||||
return [
|
||||
{
|
||||
type: "message",
|
||||
message: {
|
||||
role: "assistant",
|
||||
content: [
|
||||
{
|
||||
type: "toolCall",
|
||||
id: toolCallId,
|
||||
name: toolName,
|
||||
arguments: input,
|
||||
},
|
||||
],
|
||||
},
|
||||
},
|
||||
{
|
||||
type: "message",
|
||||
message: {
|
||||
role: "toolResult",
|
||||
toolCallId,
|
||||
toolName,
|
||||
isError,
|
||||
content: [{ type: "text", text: resultText }],
|
||||
},
|
||||
},
|
||||
];
|
||||
}
|
||||
|
||||
describe("retry diagnostic excludes lastReasoning (#2195)", () => {
|
||||
test("extractTrace still captures lastReasoning in the trace object", () => {
|
||||
const entries = [
|
||||
makeAssistantText("I am going to write the summary file now"),
|
||||
...makeToolPair("write", { path: "/tmp/SUMMARY.md" }, "ok", false),
|
||||
makeAssistantText("The task is complete — all files written."),
|
||||
];
|
||||
|
||||
const trace = extractTrace(entries);
|
||||
// extractTrace should still collect lastReasoning for crash recovery
|
||||
assert.ok(trace.lastReasoning.length > 0,
|
||||
"extractTrace should still populate lastReasoning");
|
||||
assert.ok(trace.lastReasoning.includes("all files written"),
|
||||
"lastReasoning should contain the last assistant text");
|
||||
});
|
||||
|
||||
test("getDeepDiagnostic output does NOT contain lastReasoning", () => {
|
||||
// Create a temporary activity directory with a JSONL file
|
||||
const tempBase = mkdtempSync(join(tmpdir(), "gsd-diag-test-"));
|
||||
const gsdDir = join(tempBase, ".gsd");
|
||||
const activityDir = join(gsdDir, "activity");
|
||||
mkdirSync(activityDir, { recursive: true });
|
||||
|
||||
try {
|
||||
// Build entries with both tool calls and assistant reasoning
|
||||
const entries = [
|
||||
makeAssistantText("Let me analyze the codebase structure first"),
|
||||
...makeToolPair("bash", { command: "ls src/" }, "index.ts\nutils.ts", false),
|
||||
makeAssistantText("I see the milestone/M001 branch has a significantly different ... 3. "),
|
||||
];
|
||||
|
||||
// Write JSONL activity file
|
||||
const jsonl = entries.map(e => JSON.stringify(e)).join("\n");
|
||||
writeFileSync(join(activityDir, "2025-01-01T00-00-00.jsonl"), jsonl);
|
||||
|
||||
const diagnostic = getDeepDiagnostic(tempBase);
|
||||
|
||||
// Diagnostic should exist (we have tool calls)
|
||||
assert.ok(diagnostic !== null, "diagnostic should not be null");
|
||||
|
||||
// Diagnostic should contain structured execution evidence
|
||||
assert.ok(diagnostic!.includes("Tool calls completed:"),
|
||||
"should include tool call count");
|
||||
assert.ok(diagnostic!.includes("ls src/"),
|
||||
"should include commands run");
|
||||
|
||||
// Diagnostic must NOT contain the assistant's free-text reasoning
|
||||
assert.ok(!diagnostic!.includes("Last reasoning"),
|
||||
"diagnostic must not include 'Last reasoning' label");
|
||||
assert.ok(!diagnostic!.includes("analyze the codebase"),
|
||||
"diagnostic must not include prior assistant text");
|
||||
assert.ok(!diagnostic!.includes("significantly different"),
|
||||
"diagnostic must not include truncated assistant reasoning");
|
||||
} finally {
|
||||
rmSync(tempBase, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
|
||||
test("getDeepDiagnostic still includes errors and file operations", () => {
|
||||
const tempBase = mkdtempSync(join(tmpdir(), "gsd-diag-test-"));
|
||||
const gsdDir = join(tempBase, ".gsd");
|
||||
const activityDir = join(gsdDir, "activity");
|
||||
mkdirSync(activityDir, { recursive: true });
|
||||
|
||||
try {
|
||||
const entries = [
|
||||
makeAssistantText("Writing the plan file"),
|
||||
...makeToolPair("write", { path: "M001/S01/S01-PLAN.md" }, "ok", false),
|
||||
...makeToolPair("bash", { command: "npm run build" }, "Error: type mismatch", true),
|
||||
makeAssistantText("The build failed, let me investigate"),
|
||||
];
|
||||
|
||||
const jsonl = entries.map(e => JSON.stringify(e)).join("\n");
|
||||
writeFileSync(join(activityDir, "2025-01-01T00-00-00.jsonl"), jsonl);
|
||||
|
||||
const diagnostic = getDeepDiagnostic(tempBase);
|
||||
assert.ok(diagnostic !== null);
|
||||
|
||||
// Structured evidence should be present
|
||||
assert.ok(diagnostic!.includes("S01-PLAN.md"),
|
||||
"should include files written");
|
||||
assert.ok(diagnostic!.includes("npm run build"),
|
||||
"should include commands run");
|
||||
assert.ok(diagnostic!.includes("type mismatch"),
|
||||
"should include errors");
|
||||
|
||||
// But NOT the assistant's free-text
|
||||
assert.ok(!diagnostic!.includes("Writing the plan"),
|
||||
"must not include assistant reasoning");
|
||||
assert.ok(!diagnostic!.includes("build failed"),
|
||||
"must not include assistant reasoning about failures");
|
||||
} finally {
|
||||
rmSync(tempBase, { recursive: true, force: true });
|
||||
}
|
||||
});
|
||||
});
|
||||
Loading…
Add table
Reference in a new issue