fix: cap consecutive tool validation failures to prevent stuck-loop (#3301)
* fix: cap consecutive tool validation failures to prevent stuck-loop (#2783) When the LLM repeatedly emits tool calls with arguments that fail schema validation, the agent loop retries indefinitely — each failed validation returns an error tool result, the LLM retries with the same broken args, and the cycle burns budget with no progress. Add a consecutive-failure counter in runLoop that tracks turns where ALL tool calls fail. After MAX_CONSECUTIVE_VALIDATION_FAILURES (3) consecutive all-error turns, the loop emits a diagnostic stop message and terminates cleanly. The counter resets whenever any tool call in a turn succeeds, so intermittent failures do not trigger early termination. Closes #2783 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com> * chore: retrigger CI * fix(test): repair agent-loop.test.ts — close unclosed blocks, merge imports Two test suites were concatenated without closing the first suite's it+describe blocks, placing the second suite's imports inside a function body and triggering 'Unexpected "{" ' from esbuild. Merged into a single well-structured file with consolidated imports and proper closings. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> --------- Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com> Co-authored-by: trek-e <trek-e@users.noreply.github.com>
This commit is contained in:
parent
4f6b3433d6
commit
db7a6372a6
2 changed files with 274 additions and 5 deletions
|
|
@ -1,13 +1,16 @@
|
|||
// agent-loop pauseTurn handling tests
|
||||
// Verifies that pause_turn / pauseTurn stop reason causes the inner loop
|
||||
// to continue (re-invoke the LLM) instead of exiting.
|
||||
// Regression test for https://github.com/gsd-build/gsd-2/issues/2869
|
||||
// agent-loop tests
|
||||
// Covers: pauseTurn handling (#2869), schema overload retry cap (#2783)
|
||||
|
||||
import { describe, it } from "node:test";
|
||||
import { describe, it, mock } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { readFileSync } from "node:fs";
|
||||
import { join, dirname } from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
import { Type } from "@sinclair/typebox";
|
||||
import { agentLoop, MAX_CONSECUTIVE_VALIDATION_FAILURES } from "./agent-loop.js";
|
||||
import type { AgentContext, AgentLoopConfig, AgentTool, AgentEvent, AgentMessage } from "./types.js";
|
||||
import { AssistantMessageEventStream, EventStream } from "@gsd/pi-ai";
|
||||
import type { AssistantMessage, AssistantMessageEvent, Model } from "@gsd/pi-ai";
|
||||
|
||||
const __dirname = dirname(fileURLToPath(import.meta.url));
|
||||
|
||||
|
|
@ -43,3 +46,216 @@ describe("agent-loop — pauseTurn handling (#2869)", () => {
|
|||
);
|
||||
});
|
||||
});
|
||||
|
||||
/**
|
||||
* Regression tests for #2783: Stuck-loop on execute-task — tool-call schema
|
||||
* overload causes unbounded retry + budget burn.
|
||||
*
|
||||
* When the LLM repeatedly emits tool calls with arguments that fail schema
|
||||
* validation, the agent loop retries indefinitely. Each failed validation
|
||||
* returns an error tool result, the LLM retries with the same broken args,
|
||||
* and the cycle never breaks — burning budget with no progress.
|
||||
*
|
||||
* The fix caps consecutive validation failures per turn at
|
||||
* MAX_CONSECUTIVE_VALIDATION_FAILURES (default 3). Once the cap is hit, the
|
||||
* loop injects a synthetic stop so the agent terminates cleanly instead of
|
||||
* spinning forever.
|
||||
*/
|
||||
|
||||
// ─── Helpers ──────────────────────────────────────────────────────────────────
|
||||
|
||||
// Minimal Model fixture for the anthropic-messages API surface.
// Values are arbitrary but fixed so tests are deterministic; no network
// calls are ever made with this model — streaming is mocked below.
const TEST_MODEL: Model<"anthropic-messages"> = {
  id: "claude-test",
  name: "Test Model",
  api: "anthropic-messages",
  provider: "anthropic",
  contextWindow: 200_000,
  maxOutput: 4096,
  supportsImages: false,
  supportsPromptCache: false,
  thinkingLevel: undefined,
};
|
||||
|
||||
function makeToolWithSchema(): AgentTool<any> {
|
||||
return {
|
||||
name: "write_file",
|
||||
label: "Write File",
|
||||
description: "Write content to a file",
|
||||
parameters: Type.Object({
|
||||
path: Type.String(),
|
||||
content: Type.String(),
|
||||
}),
|
||||
execute: async () => ({
|
||||
content: [{ type: "text" as const, text: "done" }],
|
||||
details: {},
|
||||
}),
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Creates a mock streamFn that returns assistant messages from a queue.
|
||||
* Each call pops the next message. The messages simulate the LLM repeatedly
|
||||
* emitting the same tool call with broken arguments.
|
||||
*/
|
||||
function createMockStreamFn(responses: AssistantMessage[]) {
|
||||
let callIndex = 0;
|
||||
|
||||
return function mockStreamFn(): AssistantMessageEventStream {
|
||||
const message = responses[callIndex] ?? responses[responses.length - 1];
|
||||
callIndex++;
|
||||
|
||||
const stream = new AssistantMessageEventStream();
|
||||
// Simulate async delivery
|
||||
queueMicrotask(() => {
|
||||
stream.push({ type: "start", partial: message });
|
||||
stream.push({ type: "done", message });
|
||||
stream.end(message);
|
||||
});
|
||||
return stream;
|
||||
};
|
||||
}
|
||||
|
||||
function makeAssistantMessage(overrides: Partial<AssistantMessage> = {}): AssistantMessage {
|
||||
return {
|
||||
role: "assistant",
|
||||
content: [],
|
||||
api: "anthropic-messages",
|
||||
provider: "anthropic",
|
||||
model: "claude-test",
|
||||
usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, totalTokens: 150, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
|
||||
stopReason: "end_turn",
|
||||
timestamp: Date.now(),
|
||||
...overrides,
|
||||
};
|
||||
}
|
||||
|
||||
function makeToolCallMessage(toolCallArgs: Record<string, unknown>): AssistantMessage {
|
||||
return makeAssistantMessage({
|
||||
content: [
|
||||
{
|
||||
type: "toolCall",
|
||||
id: `tc_${Date.now()}_${Math.random()}`,
|
||||
name: "write_file",
|
||||
arguments: toolCallArgs,
|
||||
},
|
||||
],
|
||||
stopReason: "tool_use",
|
||||
});
|
||||
}
|
||||
|
||||
function collectEvents(stream: EventStream<AgentEvent, AgentMessage[]>): Promise<AgentEvent[]> {
|
||||
return new Promise(async (resolve) => {
|
||||
const events: AgentEvent[] = [];
|
||||
for await (const event of stream) {
|
||||
events.push(event);
|
||||
}
|
||||
resolve(events);
|
||||
});
|
||||
}
|
||||
|
||||
// ─── Tests ────────────────────────────────────────────────────────────────────
|
||||
|
||||
// Regression suite for #2783: the agent loop must cap consecutive turns in
// which every tool call fails validation, instead of retrying forever.
describe("agent-loop — schema overload retry cap (#2783)", () => {

  it("terminates after MAX_CONSECUTIVE_VALIDATION_FAILURES consecutive schema failures", async () => {
    const tool = makeToolWithSchema();

    // LLM keeps sending tool calls with invalid args (missing required 'content' field)
    // NOTE(review): the same message object — and therefore the same tool-call
    // id — is reused for every failing turn; presumably the loop does not key
    // results by id across turns. Confirm against agent-loop.ts if ids matter.
    const badToolCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
    const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "end_turn" });

    // Create enough bad responses to exceed the cap, plus a final stop
    const responses: AssistantMessage[] = [];
    for (let i = 0; i < MAX_CONSECUTIVE_VALIDATION_FAILURES + 5; i++) {
      responses.push(badToolCall);
    }
    responses.push(finalStop);

    const mockStream = createMockStreamFn(responses);

    const context: AgentContext = {
      systemPrompt: "You are a test agent.",
      messages: [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
      tools: [tool],
    };

    const config: AgentLoopConfig = {
      model: TEST_MODEL,
      // Drop any non-LLM ("custom") messages before they reach the model.
      convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
      toolExecution: "sequential",
    };

    const stream = agentLoop(
      [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
      context,
      config,
      undefined,
      mockStream as any,
    );

    const events = await collectEvents(stream);

    // Must have terminated (agent_end event present)
    const agentEnd = events.find((e) => e.type === "agent_end");
    assert.ok(agentEnd, "agent loop must emit agent_end after hitting retry cap");

    // Count how many turns had validation errors (tool_execution_end with isError: true)
    const toolErrors = events.filter(
      (e) => e.type === "tool_execution_end" && e.isError === true,
    );

    // Must not exceed the cap
    assert.ok(
      toolErrors.length <= MAX_CONSECUTIVE_VALIDATION_FAILURES,
      `Expected at most ${MAX_CONSECUTIVE_VALIDATION_FAILURES} validation error tool results, got ${toolErrors.length}`,
    );
  });

  it("resets the failure counter when a tool call succeeds", async () => {
    const tool = makeToolWithSchema();

    // Pattern: 2 failures, 1 success, 2 failures, 1 success, then stop.
    // With a cap of 3 this must never trip the early-termination path.
    const badCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
    const goodCall = makeToolCallMessage({ path: "/tmp/test", content: "hello" });
    const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "end_turn" });

    const responses = [badCall, badCall, goodCall, badCall, badCall, goodCall, finalStop];
    const mockStream = createMockStreamFn(responses);

    const context: AgentContext = {
      systemPrompt: "You are a test agent.",
      messages: [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
      tools: [tool],
    };

    const config: AgentLoopConfig = {
      model: TEST_MODEL,
      convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
      toolExecution: "sequential",
    };

    const stream = agentLoop(
      [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }],
      context,
      config,
      undefined,
      mockStream as any,
    );

    const events = await collectEvents(stream);

    // Must complete successfully since failures never reached cap consecutively
    const agentEnd = events.find((e) => e.type === "agent_end");
    assert.ok(agentEnd, "agent loop must complete normally when failures are interspersed with successes");

    // Should have processed all 6 tool-bearing turns
    const toolExecEnds = events.filter((e) => e.type === "tool_execution_end");
    assert.ok(toolExecEnds.length >= 4, `Expected at least 4 tool executions (2 bad + 1 good + 2 bad + 1 good), got ${toolExecEnds.length}`);
  });

  it("exports MAX_CONSECUTIVE_VALIDATION_FAILURES as a configurable constant", () => {
    // Guard the exported constant's sanity range so a future edit cannot
    // silently disable retries (cap < 2) or reintroduce budget burn (cap > 10).
    assert.equal(typeof MAX_CONSECUTIVE_VALIDATION_FAILURES, "number");
    assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES >= 2, "Cap must be at least 2 to allow one retry");
    assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES <= 10, "Cap must not be unreasonably high");
  });
});
|
||||
|
|
|
|||
|
|
@ -22,6 +22,15 @@ import type {
|
|||
StreamFn,
|
||||
} from "./types.js";
|
||||
|
||||
/**
 * Maximum number of consecutive turns where ALL tool calls in the turn fail
 * schema validation before the loop terminates. This prevents unbounded retry
 * loops when the LLM repeatedly emits tool calls with arguments that cannot
 * pass validation (e.g., schema overload, truncated JSON, missing required
 * fields). The counter resets whenever any tool call in a turn succeeds, so
 * intermittent failures do not trigger early termination.
 * See: https://github.com/gsd-build/gsd-2/issues/2783
 */
export const MAX_CONSECUTIVE_VALIDATION_FAILURES = 3;
|
||||
|
||||
export const ZERO_USAGE = {
|
||||
input: 0,
|
||||
output: 0,
|
||||
|
|
@ -175,6 +184,12 @@ async function runLoop(
|
|||
// Check for steering messages at start (user may have typed while waiting)
|
||||
let pendingMessages: AgentMessage[] = (await config.getSteeringMessages?.()) || [];
|
||||
|
||||
// Track consecutive turns where ALL tool calls fail validation.
|
||||
// When the LLM repeatedly emits tool calls with schema-overloaded or malformed
|
||||
// arguments, each turn produces only error tool results. Without a cap, this
|
||||
// creates an unbounded retry loop that burns budget. (#2783)
|
||||
let consecutiveAllToolErrorTurns = 0;
|
||||
|
||||
// Outer loop: continues when queued follow-up messages arrive after agent would stop
|
||||
while (true) {
|
||||
let hasMoreToolCalls = true;
|
||||
|
|
@ -277,6 +292,44 @@ async function runLoop(
|
|||
currentContext.messages.push(result);
|
||||
newMessages.push(result);
|
||||
}
|
||||
|
||||
// Schema overload detection (#2783): if EVERY tool result in this turn
|
||||
// is an error (validation failure, missing tool, etc.), increment the
|
||||
// consecutive failure counter. If any tool succeeded, reset to zero.
|
||||
const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError);
|
||||
if (allToolsFailed) {
|
||||
consecutiveAllToolErrorTurns++;
|
||||
} else {
|
||||
consecutiveAllToolErrorTurns = 0;
|
||||
}
|
||||
|
||||
if (consecutiveAllToolErrorTurns >= MAX_CONSECUTIVE_VALIDATION_FAILURES) {
|
||||
// Force-stop: the LLM is stuck retrying broken tool calls.
|
||||
// Emit the turn_end and terminate the agent loop cleanly.
|
||||
stream.push({ type: "turn_end", message, toolResults });
|
||||
const stopMessage: AssistantMessage = {
|
||||
role: "assistant",
|
||||
content: [
|
||||
{
|
||||
type: "text",
|
||||
text: `Agent stopped: ${consecutiveAllToolErrorTurns} consecutive turns with all tool calls failing. This usually means the model is repeatedly sending arguments that do not match the tool schema.`,
|
||||
},
|
||||
],
|
||||
api: config.model.api,
|
||||
provider: config.model.provider,
|
||||
model: config.model.id,
|
||||
usage: ZERO_USAGE,
|
||||
stopReason: "error",
|
||||
errorMessage: "Schema overload: consecutive tool validation failures exceeded cap",
|
||||
timestamp: Date.now(),
|
||||
};
|
||||
emitMessagePair(stream, stopMessage);
|
||||
newMessages.push(stopMessage);
|
||||
stream.push({ type: "turn_end", message: stopMessage, toolResults: [] });
|
||||
stream.push({ type: "agent_end", messages: newMessages });
|
||||
stream.end(newMessages);
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
stream.push({ type: "turn_end", message, toolResults });
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue