fix: cap consecutive tool validation failures to prevent stuck-loop (#3301)
* fix: cap consecutive tool validation failures to prevent stuck-loop (#2783) When the LLM repeatedly emits tool calls with arguments that fail schema validation, the agent loop retries indefinitely — each failed validation returns an error tool result, the LLM retries with the same broken args, and the cycle burns budget with no progress. Add a consecutive-failure counter in runLoop that tracks turns where ALL tool calls fail. After MAX_CONSECUTIVE_VALIDATION_FAILURES (3) consecutive all-error turns, the loop emits a diagnostic stop message and terminates cleanly. The counter resets whenever any tool call in a turn succeeds, so intermittent failures do not trigger early termination. Closes #2783 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: retrigger CI * fix(test): repair agent-loop.test.ts — close unclosed blocks, merge imports Two test suites were concatenated without closing the first suite's it+describe blocks, placing the second suite's imports inside a function body and triggering 'Unexpected "{" ' from esbuild. Merged into a single well-structured file with consolidated imports and proper closings. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: trek-e <trek-e@users.noreply.github.com>
This commit is contained in:
parent
4f6b3433d6
commit
db7a6372a6
2 changed files with 274 additions and 5 deletions
|
|
@ -1,13 +1,16 @@
|
|||
// agent-loop pauseTurn handling tests
|
||||
// Verifies that pause_turn / pauseTurn stop reason causes the inner loop
|
||||
// to continue (re-invoke the LLM) instead of exiting.
|
||||
// Regression test for https://github.com/gsd-build/gsd-2/issues/2869
|
||||
// agent-loop tests
|
||||
// Covers: pauseTurn handling (#2869), schema overload retry cap (#2783)
|
||||
|
||||
import { describe, it } from "node:test";
|
||||
import { describe, it, mock } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { join, dirname } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import { agentLoop, MAX_CONSECUTIVE_VALIDATION_FAILURES } from "./agent-loop.js";
|
||||
import type { AgentContext, AgentLoopConfig, AgentTool, AgentEvent, AgentMessage } from "./types.js";
|
||||
import { AssistantMessageEventStream, EventStream } from "@gsd/pi-ai";
|
||||
import type { AssistantMessage, AssistantMessageEvent, Model } from "@gsd/pi-ai";
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
|
|
@ -43,3 +46,216 @@ describe("agent-loop — pauseTurn handling (#2869)", () => {
|
|||
);
|
||||
});
|
||||
});
|
||||
|
||||
/**
|
||||
* Regression tests for #2783: Stuck-loop on execute-task — tool-call schema
|
||||
* overload causes unbounded retry + budget burn.
|
||||
*
|
||||
* When the LLM repeatedly emits tool calls with arguments that fail schema
|
||||
* validation, the agent loop retries indefinitely. Each failed validation
|
||||
* returns an error tool result, the LLM retries with the same broken args,
|
||||
* and the cycle never breaks — burning budget with no progress.
|
||||
*
|
||||
* The fix caps consecutive validation failures per turn at
|
||||
* MAX_CONSECUTIVE_VALIDATION_FAILURES (default 3). Once the cap is hit, the
|
||||
* loop injects a synthetic stop so the agent terminates cleanly instead of
|
||||
* spinning forever.
|
||||
*/
|
||||
|
||||
// ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// Minimal Model fixture for the anthropic-messages API surface.
// Values are arbitrary but fixed so tests are deterministic; no network
// calls are ever made with this model — streaming is mocked below.
const TEST_MODEL: Model<"anthropic-messages"> = {
  id: "claude-test",
  name: "Test Model",
  api: "anthropic-messages",
  provider: "anthropic",
  contextWindow: 200_000,
  maxOutput: 4096,
  supportsImages: false,
  supportsPromptCache: false,
  thinkingLevel: undefined,
};
|
||||
|
||||
function makeToolWithSchema(): AgentTool<any> {
|
||||
return {
|
||||
name: "write_file",
|
||||
label: "Write File",
|
||||
description: "Write content to a file",
|
||||
parameters: Type.Object({
|
||||
path: Type.String(),
|
||||
content: Type.String(),
|
||||
}),
|
||||
execute: async () => ({
|
||||
content: [{ type: "text" as const, text: "done" }],
|
||||
details: {},
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a mock streamFn that returns assistant messages from a queue.
|
||||
* Each call pops the next message. The messages simulate the LLM repeatedly
|
||||
* emitting the same tool call with broken arguments.
|
||||
*/
|
||||
function createMockStreamFn(responses: AssistantMessage[]) {
|
||||
let callIndex = 0;
|
||||
|
||||
return function mockStreamFn(): AssistantMessageEventStream {
|
||||
const message = responses[callIndex] ?? responses[responses.length - 1];
|
||||
callIndex++;
|
||||
|
||||
const stream = new AssistantMessageEventStream();
|
||||
// Simulate async delivery
|
||||
queueMicrotask(() => {
|
||||
stream.push({ type: "start", partial: message });
|
||||
stream.push({ type: "done", message });
|
||||
stream.end(message);
|
||||
});
|
||||
return stream;
|
||||
};
|
||||
}
|
||||
|
||||
function makeAssistantMessage(overrides: Partial<AssistantMessage> = {}): AssistantMessage {
|
||||
return {
|
||||
role: "assistant",
|
||||
content: [],
|
||||
api: "anthropic-messages",
|
||||
provider: "anthropic",
|
||||
model: "claude-test",
|
||||
usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, totalTokens: 150, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
|
||||
stopReason: "end_turn",
|
||||
timestamp: Date.now(),
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
function makeToolCallMessage(toolCallArgs: Record<string, unknown>): AssistantMessage {
|
||||
return makeAssistantMessage({
|
||||
content: [
|
||||
{
|
||||
type: "toolCall",
|
||||
id: `tc_${Date.now()}_${Math.random()}`,
|
||||
name: "write_file",
|
||||
arguments: toolCallArgs,
|
||||
},
|
||||
],
|
||||
stopReason: "tool_use",
|
||||
});
|
||||
}
|
||||
|
||||
function collectEvents(stream: EventStream<AgentEvent, AgentMessage[]>): Promise<AgentEvent[]> {
|
||||
return new Promise(async (resolve) => {
|
||||
const events: AgentEvent[] = [];
|
||||
for await (const event of stream) {
|
||||
events.push(event);
|
||||
}
|
||||
resolve(events);
|
||||
});
|
||||
}
|
||||
|
||||
// ─── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
// Regression suite for #2783: the agent loop must cap consecutive turns in
// which every tool call fails validation, instead of retrying forever.
describe("agent-loop — schema overload retry cap (#2783)", () => {

  it("terminates after MAX_CONSECUTIVE_VALIDATION_FAILURES consecutive schema failures", async () => {
    const tool = makeToolWithSchema();

    // LLM keeps sending tool calls with invalid args (missing required 'content' field)
    // NOTE(review): the same message object — and therefore the same tool-call
    // id — is reused for every failing turn; presumably the loop does not key
    // results by id across turns. Confirm against agent-loop.ts if ids matter.
    const badToolCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
    const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "end_turn" });

    // Create enough bad responses to exceed the cap, plus a final stop
    const responses: AssistantMessage[] = [];
    for (let i = 0; i < MAX_CONSECUTIVE_VALIDATION_FAILURES + 5; i++) {
      responses.push(badToolCall);
    }
    responses.push(finalStop);

    const mockStream = createMockStreamFn(responses);

    const context: AgentContext = {
      systemPrompt: "You are a test agent.",
      messages: [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
      tools: [tool],
    };

    const config: AgentLoopConfig = {
      model: TEST_MODEL,
      // Drop any non-LLM ("custom") messages before they reach the model.
      convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
      toolExecution: "sequential",
    };

    const stream = agentLoop(
      [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
      context,
      config,
      undefined,
      mockStream as any,
    );

    const events = await collectEvents(stream);

    // Must have terminated (agent_end event present)
    const agentEnd = events.find((e) => e.type === "agent_end");
    assert.ok(agentEnd, "agent loop must emit agent_end after hitting retry cap");

    // Count how many turns had validation errors (tool_execution_end with isError: true)
    const toolErrors = events.filter(
      (e) => e.type === "tool_execution_end" && e.isError === true,
    );

    // Must not exceed the cap
    assert.ok(
      toolErrors.length <= MAX_CONSECUTIVE_VALIDATION_FAILURES,
      `Expected at most ${MAX_CONSECUTIVE_VALIDATION_FAILURES} validation error tool results, got ${toolErrors.length}`,
    );
  });

  it("resets the failure counter when a tool call succeeds", async () => {
    const tool = makeToolWithSchema();

    // Pattern: 2 failures, 1 success, 2 failures, 1 success, then stop.
    // With a cap of 3 this must never trip the early-termination path.
    const badCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
    const goodCall = makeToolCallMessage({ path: "/tmp/test", content: "hello" });
    const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "end_turn" });

    const responses = [badCall, badCall, goodCall, badCall, badCall, goodCall, finalStop];
    const mockStream = createMockStreamFn(responses);

    const context: AgentContext = {
      systemPrompt: "You are a test agent.",
      messages: [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
      tools: [tool],
    };

    const config: AgentLoopConfig = {
      model: TEST_MODEL,
      convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
      toolExecution: "sequential",
    };

    const stream = agentLoop(
      [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
      context,
      config,
      undefined,
      mockStream as any,
    );

    const events = await collectEvents(stream);

    // Must complete successfully since failures never reached cap consecutively
    const agentEnd = events.find((e) => e.type === "agent_end");
    assert.ok(agentEnd, "agent loop must complete normally when failures are interspersed with successes");

    // Should have processed all 6 tool-bearing turns
    const toolExecEnds = events.filter((e) => e.type === "tool_execution_end");
    assert.ok(toolExecEnds.length >= 4, `Expected at least 4 tool executions (2 bad + 1 good + 2 bad + 1 good), got ${toolExecEnds.length}`);
  });

  it("exports MAX_CONSECUTIVE_VALIDATION_FAILURES as a configurable constant", () => {
    // Guard the exported constant's sanity range so a future edit cannot
    // silently disable retries (cap < 2) or reintroduce budget burn (cap > 10).
    assert.equal(typeof MAX_CONSECUTIVE_VALIDATION_FAILURES, "number");
    assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES >= 2, "Cap must be at least 2 to allow one retry");
    assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES <= 10, "Cap must not be unreasonably high");
  });
});
|
||||
|
|
|
|||
|
|
@ -22,6 +22,15 @@ import type {
|
|||
StreamFn,
|
||||
} from "./types.js";
|
||||
|
||||
/**
 * Maximum number of consecutive turns where ALL tool calls in the turn fail
 * schema validation before the loop terminates. This prevents unbounded retry
 * loops when the LLM repeatedly emits tool calls with arguments that cannot
 * pass validation (e.g., schema overload, truncated JSON, missing required
 * fields). The counter resets whenever any tool call in a turn succeeds, so
 * intermittent failures do not trigger early termination.
 * See: https://github.com/gsd-build/gsd-2/issues/2783
 */
export const MAX_CONSECUTIVE_VALIDATION_FAILURES = 3;
|
||||
|
||||
export const ZERO_USAGE = {
|
||||
input: 0,
|
||||
output: 0,
|
||||
|
|
@ -175,6 +184,12 @@ async function runLoop(
|
|||
// Check for steering messages at start (user may have typed while waiting)
|
||||
let pendingMessages: AgentMessage[] = (await config.getSteeringMessages?.()) || [];
|
||||
|
||||
// Track consecutive turns where ALL tool calls fail validation.
|
||||
// When the LLM repeatedly emits tool calls with schema-overloaded or malformed
|
||||
// arguments, each turn produces only error tool results. Without a cap, this
|
||||
// creates an unbounded retry loop that burns budget. (#2783)
|
||||
let consecutiveAllToolErrorTurns = 0;
|
||||
|
||||
// Outer loop: continues when queued follow-up messages arrive after agent would stop
|
||||
while (true) {
|
||||
let hasMoreToolCalls = true;
|
||||
|
|
@ -277,6 +292,44 @@ async function runLoop(
|
|||
currentContext.messages.push(result);
|
||||
newMessages.push(result);
|
||||
}
|
||||
|
||||
// Schema overload detection (#2783): if EVERY tool result in this turn
|
||||
// is an error (validation failure, missing tool, etc.), increment the
|
||||
// consecutive failure counter. If any tool succeeded, reset to zero.
|
||||
const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError);
|
||||
if (allToolsFailed) {
|
||||
consecutiveAllToolErrorTurns++;
|
||||
} else {
|
||||
consecutiveAllToolErrorTurns = 0;
|
||||
}
|
||||
|
||||
if (consecutiveAllToolErrorTurns >= MAX_CONSECUTIVE_VALIDATION_FAILURES) {
|
||||
// Force-stop: the LLM is stuck retrying broken tool calls.
|
||||
// Emit the turn_end and terminate the agent loop cleanly.
|
||||
stream.push({ type: "turn_end", message, toolResults });
|
||||
const stopMessage: AssistantMessage = {
|
||||
role: "assistant",
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: `Agent stopped: ${consecutiveAllToolErrorTurns} consecutive turns with all tool calls failing. This usually means the model is repeatedly sending arguments that do not match the tool schema.`,
|
||||
},
|
||||
],
|
||||
api: config.model.api,
|
||||
provider: config.model.provider,
|
||||
model: config.model.id,
|
||||
usage: ZERO_USAGE,
|
||||
stopReason: "error",
|
||||
errorMessage: "Schema overload: consecutive tool validation failures exceeded cap",
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
emitMessagePair(stream, stopMessage);
|
||||
newMessages.push(stopMessage);
|
||||
stream.push({ type: "turn_end", message: stopMessage, toolResults: [] });
|
||||
stream.push({ type: "agent_end", messages: newMessages });
|
||||
stream.end(newMessages);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
stream.push({ type: "turn_end", message, toolResults });
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue