diff --git a/packages/pi-agent-core/src/agent-loop.test.ts b/packages/pi-agent-core/src/agent-loop.test.ts index 0e61d9701..5ddb2ac84 100644 --- a/packages/pi-agent-core/src/agent-loop.test.ts +++ b/packages/pi-agent-core/src/agent-loop.test.ts @@ -1,13 +1,16 @@ -// agent-loop pauseTurn handling tests -// Verifies that pause_turn / pauseTurn stop reason causes the inner loop -// to continue (re-invoke the LLM) instead of exiting. -// Regression test for https://github.com/gsd-build/gsd-2/issues/2869 +// agent-loop tests +// Covers: pauseTurn handling (#2869), schema overload retry cap (#2783) -import { describe, it } from "node:test"; +import { describe, it, mock } from "node:test"; import assert from "node:assert/strict"; import { readFileSync } from "node:fs"; import { join, dirname } from "node:path"; import { fileURLToPath } from "node:url"; +import { Type } from "@sinclair/typebox"; +import { agentLoop, MAX_CONSECUTIVE_VALIDATION_FAILURES } from "./agent-loop.js"; +import type { AgentContext, AgentLoopConfig, AgentTool, AgentEvent, AgentMessage } from "./types.js"; +import { AssistantMessageEventStream, EventStream } from "@gsd/pi-ai"; +import type { AssistantMessage, AssistantMessageEvent, Model } from "@gsd/pi-ai"; const __dirname = dirname(fileURLToPath(import.meta.url)); @@ -43,3 +46,216 @@ describe("agent-loop — pauseTurn handling (#2869)", () => { ); }); }); + +/** + * Regression tests for #2783: Stuck-loop on execute-task — tool-call schema + * overload causes unbounded retry + budget burn. + * + * When the LLM repeatedly emits tool calls with arguments that fail schema + * validation, the agent loop retries indefinitely. Each failed validation + * returns an error tool result, the LLM retries with the same broken args, + * and the cycle never breaks — burning budget with no progress. + * + * The fix caps consecutive validation failures per turn at + * MAX_CONSECUTIVE_VALIDATION_FAILURES (default 3). Once the cap is hit, the + * loop injects a synthetic stop so the agent terminates cleanly instead of + * spinning forever. + */ + +// ─── Helpers ────────────────────────────────────────────────────────────────── + +const TEST_MODEL: Model<"anthropic-messages"> = { + id: "claude-test", + name: "Test Model", + api: "anthropic-messages", + provider: "anthropic", + contextWindow: 200_000, + maxOutput: 4096, + supportsImages: false, + supportsPromptCache: false, + thinkingLevel: undefined, +}; + +function makeToolWithSchema(): AgentTool { + return { + name: "write_file", + label: "Write File", + description: "Write content to a file", + parameters: Type.Object({ + path: Type.String(), + content: Type.String(), + }), + execute: async () => ({ + content: [{ type: "text" as const, text: "done" }], + details: {}, + }), + }; +} + +/** + * Creates a mock streamFn that returns assistant messages from a queue. + * Each call pops the next message. The messages simulate the LLM repeatedly + * emitting the same tool call with broken arguments. + */ +function createMockStreamFn(responses: AssistantMessage[]) { + let callIndex = 0; + + return function mockStreamFn(): AssistantMessageEventStream { + const message = responses[callIndex] ?? responses[responses.length - 1]; + callIndex++; + + const stream = new AssistantMessageEventStream(); + // Simulate async delivery + queueMicrotask(() => { + stream.push({ type: "start", partial: message }); + stream.push({ type: "done", message }); + stream.end(message); + }); + return stream; + }; +} + +function makeAssistantMessage(overrides: Partial = {}): AssistantMessage { + return { + role: "assistant", + content: [], + api: "anthropic-messages", + provider: "anthropic", + model: "claude-test", + usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, totalTokens: 150, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } }, + stopReason: "end_turn", + timestamp: Date.now(), + ...overrides, + }; +} + +function makeToolCallMessage(toolCallArgs: Record): AssistantMessage { + return makeAssistantMessage({ + content: [ + { + type: "toolCall", + id: `tc_${Date.now()}_${Math.random()}`, + name: "write_file", + arguments: toolCallArgs, + }, + ], + stopReason: "tool_use", + }); +} + +function collectEvents(stream: EventStream): Promise { + return new Promise(async (resolve) => { + const events: AgentEvent[] = []; + for await (const event of stream) { + events.push(event); + } + resolve(events); + }); +} + +// ─── Tests ──────────────────────────────────────────────────────────────────── + +describe("agent-loop — schema overload retry cap (#2783)", () => { + + it("terminates after MAX_CONSECUTIVE_VALIDATION_FAILURES consecutive schema failures", async () => { + const tool = makeToolWithSchema(); + + // LLM keeps sending tool calls with invalid args (missing required 'content' field) + const badToolCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content' + const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "end_turn" }); + + // Create enough bad responses to exceed the cap, plus a final stop + const responses: AssistantMessage[] = []; + for (let i = 0; i < MAX_CONSECUTIVE_VALIDATION_FAILURES + 5; i++) { + responses.push(badToolCall); + } + responses.push(finalStop); + + const mockStream = createMockStreamFn(responses); + + const context: AgentContext = { + systemPrompt: "You are a test agent.", + messages: [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }], + tools: [tool], + }; + + const config: AgentLoopConfig = { + model: TEST_MODEL, + convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"), + toolExecution: "sequential", + }; + + const stream = agentLoop( + [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }], + context, + config, + undefined, + mockStream as any, + ); + + const events = await collectEvents(stream); + + // Must have terminated (agent_end event present) + const agentEnd = events.find((e) => e.type === "agent_end"); + assert.ok(agentEnd, "agent loop must emit agent_end after hitting retry cap"); + + // Count how many turns had validation errors (tool_execution_end with isError: true) + const toolErrors = events.filter( + (e) => e.type === "tool_execution_end" && e.isError === true, + ); + + // Must not exceed the cap + assert.ok( + toolErrors.length <= MAX_CONSECUTIVE_VALIDATION_FAILURES, + `Expected at most ${MAX_CONSECUTIVE_VALIDATION_FAILURES} validation error tool results, got ${toolErrors.length}`, + ); + }); + + it("resets the failure counter when a tool call succeeds", async () => { + const tool = makeToolWithSchema(); + + // Pattern: 2 failures, 1 success, 2 failures, 1 success, then stop + const badCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content' + const goodCall = makeToolCallMessage({ path: "/tmp/test", content: "hello" }); + const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "end_turn" }); + + const responses = [badCall, badCall, goodCall, badCall, badCall, goodCall, finalStop]; + const mockStream = createMockStreamFn(responses); + + const context: AgentContext = { + systemPrompt: "You are a test agent.", + messages: [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }], + tools: [tool], + }; + + const config: AgentLoopConfig = { + model: TEST_MODEL, + convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"), + toolExecution: "sequential", + }; + + const stream = agentLoop( + [{ role: "user", content: [{ type: "text", text: "Write a file" }], timestamp: Date.now() }], + context, + config, + undefined, + mockStream as any, + ); + + const events = await collectEvents(stream); + + // Must complete successfully since failures never reached cap consecutively + const agentEnd = events.find((e) => e.type === "agent_end"); + assert.ok(agentEnd, "agent loop must complete normally when failures are interspersed with successes"); + + // Should have processed all 6 tool-bearing turns + const toolExecEnds = events.filter((e) => e.type === "tool_execution_end"); + assert.ok(toolExecEnds.length >= 4, `Expected at least 4 tool executions (2 bad + 1 good + 2 bad + 1 good), got ${toolExecEnds.length}`); + }); + + it("exports MAX_CONSECUTIVE_VALIDATION_FAILURES as a configurable constant", () => { + assert.equal(typeof MAX_CONSECUTIVE_VALIDATION_FAILURES, "number"); + assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES >= 2, "Cap must be at least 2 to allow one retry"); + assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES <= 10, "Cap must not be unreasonably high"); + }); +}); diff --git a/packages/pi-agent-core/src/agent-loop.ts b/packages/pi-agent-core/src/agent-loop.ts index 82254d3bf..21e2a4531 100644 --- a/packages/pi-agent-core/src/agent-loop.ts +++ b/packages/pi-agent-core/src/agent-loop.ts @@ -22,6 +22,15 @@ import type { StreamFn, } from "./types.js"; +/** + * Maximum number of consecutive turns where ALL tool calls in the turn fail + * schema validation before the loop terminates. This prevents unbounded retry + * loops when the LLM repeatedly emits tool calls with arguments that cannot + * pass validation (e.g., schema overload, truncated JSON, missing required + * fields). See: https://github.com/gsd-build/gsd-2/issues/2783 + */ +export const MAX_CONSECUTIVE_VALIDATION_FAILURES = 3; + export const ZERO_USAGE = { input: 0, output: 0, @@ -175,6 +184,12 @@ async function runLoop( // Check for steering messages at start (user may have typed while waiting) let pendingMessages: AgentMessage[] = (await config.getSteeringMessages?.()) || []; + // Track consecutive turns where ALL tool calls fail validation. + // When the LLM repeatedly emits tool calls with schema-overloaded or malformed + // arguments, each turn produces only error tool results. Without a cap, this + // creates an unbounded retry loop that burns budget. (#2783) + let consecutiveAllToolErrorTurns = 0; + // Outer loop: continues when queued follow-up messages arrive after agent would stop while (true) { let hasMoreToolCalls = true; @@ -277,6 +292,44 @@ async function runLoop( currentContext.messages.push(result); newMessages.push(result); } + + // Schema overload detection (#2783): if EVERY tool result in this turn + // is an error (validation failure, missing tool, etc.), increment the + // consecutive failure counter. If any tool succeeded, reset to zero. + const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError); + if (allToolsFailed) { + consecutiveAllToolErrorTurns++; + } else { + consecutiveAllToolErrorTurns = 0; + } + + if (consecutiveAllToolErrorTurns >= MAX_CONSECUTIVE_VALIDATION_FAILURES) { + // Force-stop: the LLM is stuck retrying broken tool calls. + // Emit the turn_end and terminate the agent loop cleanly. + stream.push({ type: "turn_end", message, toolResults }); + const stopMessage: AssistantMessage = { + role: "assistant", + content: [ + { + type: "text", + text: `Agent stopped: ${consecutiveAllToolErrorTurns} consecutive turns with all tool calls failing. This usually means the model is repeatedly sending arguments that do not match the tool schema.`, + }, + ], + api: config.model.api, + provider: config.model.provider, + model: config.model.id, + usage: ZERO_USAGE, + stopReason: "error", + errorMessage: "Schema overload: consecutive tool validation failures exceeded cap", + timestamp: Date.now(), + }; + emitMessagePair(stream, stopMessage); + newMessages.push(stopMessage); + stream.push({ type: "turn_end", message: stopMessage, toolResults: [] }); + stream.push({ type: "agent_end", messages: newMessages }); + stream.end(newMessages); + return; + } } stream.push({ type: "turn_end", message, toolResults });