fix: cap consecutive tool validation failures to prevent stuck-loop (#3301)

* fix: cap consecutive tool validation failures to prevent stuck-loop (#2783)

When the LLM repeatedly emits tool calls with arguments that fail schema
validation, the agent loop retries indefinitely — each failed validation
returns an error tool result, the LLM retries with the same broken args,
and the cycle burns budget with no progress.

Add a consecutive-failure counter in runLoop that tracks turns where ALL
tool calls fail. After MAX_CONSECUTIVE_VALIDATION_FAILURES (3) consecutive
all-error turns, the loop emits a diagnostic stop message and terminates
cleanly. The counter resets whenever any tool call in a turn succeeds, so
intermittent failures do not trigger early termination.
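
In sketch form (condensed from the runLoop changes below; not the
verbatim patch):

    let consecutiveAllToolErrorTurns = 0;
    // after each turn's tool results are collected:
    const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError);
    consecutiveAllToolErrorTurns = allToolsFailed ? consecutiveAllToolErrorTurns + 1 : 0;
    if (consecutiveAllToolErrorTurns >= MAX_CONSECUTIVE_VALIDATION_FAILURES) {
      // push a diagnostic stop message, emit agent_end, and return
    }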

Closes #2783

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

* chore: retrigger CI

* fix(test): repair agent-loop.test.ts — close unclosed blocks, merge imports

Two test suites had been concatenated without closing the first suite's
it and describe blocks, which left the second suite's imports inside a
function body and made esbuild fail with 'Unexpected "{"'. Merged them
into a single well-structured file with consolidated imports and
properly closed blocks.
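
For reference, the repaired shape (illustrative suite names, not the
actual tests):

    import { describe, it } from "node:test";
    import assert from "node:assert/strict";

    describe("suite A", () => {
      it("does X", () => {
        assert.ok(true);
      }); // these closings were missing before the fix, so the next
    });   // suite's imports sat inside this function body

    describe("suite B", () => {
      it("does Y", () => {
        assert.ok(true);
      });
    });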

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

---------

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: trek-e <trek-e@users.noreply.github.com>
Tom Boucher 2026-04-05 01:04:58 -04:00 committed by GitHub
parent 4f6b3433d6
commit db7a6372a6
2 changed files with 274 additions and 5 deletions

agent-loop.test.ts

@@ -1,13 +1,16 @@
// agent-loop pauseTurn handling tests
// Verifies that pause_turn / pauseTurn stop reason causes the inner loop
// to continue (re-invoke the LLM) instead of exiting.
// Regression test for https://github.com/gsd-build/gsd-2/issues/2869
// agent-loop tests
// Covers: pauseTurn handling (#2869), schema overload retry cap (#2783)
import { describe, it } from "node:test";
import { describe, it, mock } from "node:test";
import assert from "node:assert/strict";
import { readFileSync } from "node:fs";
import { join, dirname } from "node:path";
import { fileURLToPath } from "node:url";
import { Type } from "@sinclair/typebox";
import { agentLoop, MAX_CONSECUTIVE_VALIDATION_FAILURES } from "./agent-loop.js";
import type { AgentContext, AgentLoopConfig, AgentTool, AgentEvent, AgentMessage } from "./types.js";
import { AssistantMessageEventStream, EventStream } from "@gsd/pi-ai";
import type { AssistantMessage, AssistantMessageEvent, Model } from "@gsd/pi-ai";
const __dirname = dirname(fileURLToPath(import.meta.url));
@@ -43,3 +46,216 @@ describe("agent-loop — pauseTurn handling (#2869)", () => {
);
});
});
/**
* Regression tests for #2783: Stuck-loop on execute-task tool-call schema
* overload causes unbounded retry + budget burn.
*
* When the LLM repeatedly emits tool calls with arguments that fail schema
* validation, the agent loop retries indefinitely. Each failed validation
* returns an error tool result, the LLM retries with the same broken args,
* and the cycle never breaks, burning budget with no progress.
*
* The fix caps consecutive all-failure turns at
* MAX_CONSECUTIVE_VALIDATION_FAILURES (default 3). Once the cap is hit, the
* loop injects a synthetic stop so the agent terminates cleanly instead of
* spinning forever.
*/
// ─── Helpers ──────────────────────────────────────────────────────────────────
const TEST_MODEL: Model<"anthropic-messages"> = {
id: "claude-test",
name: "Test Model",
api: "anthropic-messages",
provider: "anthropic",
contextWindow: 200_000,
maxOutput: 4096,
supportsImages: false,
supportsPromptCache: false,
thinkingLevel: undefined,
};
function makeToolWithSchema(): AgentTool<any> {
return {
name: "write_file",
label: "Write File",
description: "Write content to a file",
parameters: Type.Object({
path: Type.String(),
content: Type.String(),
}),
execute: async () => ({
content: [{ type: "text" as const, text: "done" }],
details: {},
}),
};
}
/**
* Creates a mock streamFn that returns assistant messages from a queue.
* Each call pops the next message. The messages simulate the LLM repeatedly
* emitting the same tool call with broken arguments.
*/
function createMockStreamFn(responses: AssistantMessage[]) {
let callIndex = 0;
return function mockStreamFn(): AssistantMessageEventStream {
const message = responses[callIndex] ?? responses[responses.length - 1];
callIndex++;
const stream = new AssistantMessageEventStream();
// Simulate async delivery
queueMicrotask(() => {
stream.push({ type: "start", partial: message });
stream.push({ type: "done", message });
stream.end(message);
});
return stream;
};
}
function makeAssistantMessage(overrides: Partial<AssistantMessage> = {}): AssistantMessage {
return {
role: "assistant",
content: [],
api: "anthropic-messages",
provider: "anthropic",
model: "claude-test",
usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, totalTokens: 150, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
stopReason: "end_turn",
timestamp: Date.now(),
...overrides,
};
}
function makeToolCallMessage(toolCallArgs: Record<string, unknown>): AssistantMessage {
return makeAssistantMessage({
content: [
{
type: "toolCall",
id: `tc_${Date.now()}_${Math.random()}`,
name: "write_file",
arguments: toolCallArgs,
},
],
stopReason: "tool_use",
});
}
async function collectEvents(stream: EventStream<AgentEvent, AgentMessage[]>): Promise<AgentEvent[]> {
  // Drain the event stream; an async function (rather than an async Promise
  // executor) lets rejections propagate instead of being swallowed.
  const events: AgentEvent[] = [];
  for await (const event of stream) {
    events.push(event);
  }
  return events;
}
// ─── Tests ────────────────────────────────────────────────────────────────────
describe("agent-loop — schema overload retry cap (#2783)", () => {
it("terminates after MAX_CONSECUTIVE_VALIDATION_FAILURES consecutive schema failures", async () => {
const tool = makeToolWithSchema();
// LLM keeps sending tool calls with invalid args (missing required 'content' field)
const badToolCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "end_turn" });
// Create enough bad responses to exceed the cap, plus a final stop
const responses: AssistantMessage[] = [];
for (let i = 0; i < MAX_CONSECUTIVE_VALIDATION_FAILURES + 5; i++) {
responses.push(badToolCall);
}
responses.push(finalStop);
const mockStream = createMockStreamFn(responses);
const context: AgentContext = {
systemPrompt: "You are a test agent.",
messages: [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
tools: [tool],
};
const config: AgentLoopConfig = {
model: TEST_MODEL,
convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
toolExecution: "sequential",
};
const stream = agentLoop(
[{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
context,
config,
undefined,
mockStream as any,
);
const events = await collectEvents(stream);
// Must have terminated (agent_end event present)
const agentEnd = events.find((e) => e.type === "agent_end");
assert.ok(agentEnd, "agent loop must emit agent_end after hitting retry cap");
// Count validation-error tool results (tool_execution_end with isError: true);
// each failing turn here emits exactly one, so this also counts bad turns.
const toolErrors = events.filter(
(e) => e.type === "tool_execution_end" && e.isError === true,
);
// Must not exceed the cap
assert.ok(
toolErrors.length <= MAX_CONSECUTIVE_VALIDATION_FAILURES,
`Expected at most ${MAX_CONSECUTIVE_VALIDATION_FAILURES} validation error tool results, got ${toolErrors.length}`,
);
});
it("resets the failure counter when a tool call succeeds", async () => {
const tool = makeToolWithSchema();
// Pattern: 2 failures, 1 success, 2 failures, 1 success, then stop
const badCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
const goodCall = makeToolCallMessage({ path: "/tmp/test", content: "hello" });
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "end_turn" });
const responses = [badCall, badCall, goodCall, badCall, badCall, goodCall, finalStop];
const mockStream = createMockStreamFn(responses);
const context: AgentContext = {
systemPrompt: "You are a test agent.",
messages: [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
tools: [tool],
};
const config: AgentLoopConfig = {
model: TEST_MODEL,
convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
toolExecution: "sequential",
};
const stream = agentLoop(
[{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
context,
config,
undefined,
mockStream as any,
);
const events = await collectEvents(stream);
// Must complete successfully since failures never reached cap consecutively
const agentEnd = events.find((e) => e.type === "agent_end");
assert.ok(agentEnd, "agent loop must complete normally when failures are interspersed with successes");
// All six tool-bearing turns (2 bad + 1 good + 2 bad + 1 good) should have
// executed; assert a conservative lower bound on the emitted events.
const toolExecEnds = events.filter((e) => e.type === "tool_execution_end");
assert.ok(toolExecEnds.length >= 4, `Expected at least 4 tool executions, got ${toolExecEnds.length}`);
});
it("exports MAX_CONSECUTIVE_VALIDATION_FAILURES as a constant with sane bounds", () => {
assert.equal(typeof MAX_CONSECUTIVE_VALIDATION_FAILURES, "number");
assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES >= 2, "Cap must be at least 2 to allow one retry");
assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES <= 10, "Cap must not be unreasonably high");
});
});

agent-loop.ts

@@ -22,6 +22,15 @@ import type {
StreamFn,
} from "./types.js";
/**
* Maximum number of consecutive turns where ALL tool calls in the turn fail
* schema validation before the loop terminates. This prevents unbounded retry
* loops when the LLM repeatedly emits tool calls with arguments that cannot
* pass validation (e.g., schema overload, truncated JSON, missing required
* fields). See: https://github.com/gsd-build/gsd-2/issues/2783
*/
export const MAX_CONSECUTIVE_VALIDATION_FAILURES = 3;
export const ZERO_USAGE = {
input: 0,
output: 0,
@@ -175,6 +184,12 @@ async function runLoop(
// Check for steering messages at start (user may have typed while waiting)
let pendingMessages: AgentMessage[] = (await config.getSteeringMessages?.()) || [];
// Track consecutive turns where ALL tool calls fail validation.
// When the LLM repeatedly emits tool calls with schema-overloaded or malformed
// arguments, each turn produces only error tool results. Without a cap, this
// creates an unbounded retry loop that burns budget. (#2783)
let consecutiveAllToolErrorTurns = 0;
// Outer loop: continues when queued follow-up messages arrive after agent would stop
while (true) {
let hasMoreToolCalls = true;
@@ -277,6 +292,44 @@
currentContext.messages.push(result);
newMessages.push(result);
}
// Schema overload detection (#2783): if EVERY tool result in this turn
// is an error (validation failure, missing tool, etc.), increment the
// consecutive failure counter. If any tool succeeded, reset to zero.
const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError);
if (allToolsFailed) {
consecutiveAllToolErrorTurns++;
} else {
consecutiveAllToolErrorTurns = 0;
}
if (consecutiveAllToolErrorTurns >= MAX_CONSECUTIVE_VALIDATION_FAILURES) {
// Force-stop: the LLM is stuck retrying broken tool calls.
// Emit the turn_end and terminate the agent loop cleanly.
stream.push({ type: "turn_end", message, toolResults });
const stopMessage: AssistantMessage = {
role: "assistant",
content: [
{
type: "text",
text: `Agent stopped: ${consecutiveAllToolErrorTurns} consecutive turns with all tool calls failing. This usually means the model is repeatedly sending arguments that do not match the tool schema.`,
},
],
api: config.model.api,
provider: config.model.provider,
model: config.model.id,
usage: ZERO_USAGE,
stopReason: "error",
errorMessage: "Schema overload: consecutive tool validation failures exceeded cap",
timestamp: Date.now(),
};
emitMessagePair(stream, stopMessage);
newMessages.push(stopMessage);
stream.push({ type: "turn_end", message: stopMessage, toolResults: [] });
stream.push({ type: "agent_end", messages: newMessages });
stream.end(newMessages);
return;
}
}
stream.push({ type: "turn_end", message, toolResults });