From cd14a4c7654d2036a52b16efc45711c8c5995747 Mon Sep 17 00:00:00 2001 From: frizynn Date: Mon, 6 Apr 2026 11:35:41 -0300 Subject: [PATCH] fix(agent-loop): schema overload cap ignores bash execution errors (#3618) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The schema overload detector counted ALL isError tool results toward the consecutive-failure cap, including bash commands that returned non-zero exit codes (e.g. rg/grep exit 1 = 'no matches'). Three consecutive exploratory searches with no matches would trigger the cap and abort the session. Root cause: the allToolsFailed check used toolResults.every(r => r.isError) which conflates preparation-phase errors (schema validation, tool-not-found, tool-blocked) with execution-phase errors (the tool ran successfully but returned a non-zero exit code). Fix: track preparationErrorCount alongside tool results. Only preparation errors (schema/validation failures) increment the consecutive failure counter. Tool execution errors — like bash exit code 1 — are valid usage and do not count toward the cap. Also fixes pre-existing StopReason type mismatches in agent-loop tests (end_turn → stop, tool_use → toolUse). --- packages/pi-agent-core/src/agent-loop.test.ts | 104 +++++++++++++++++- packages/pi-agent-core/src/agent-loop.ts | 55 +++++++-- 2 files changed, 143 insertions(+), 16 deletions(-) diff --git a/packages/pi-agent-core/src/agent-loop.test.ts b/packages/pi-agent-core/src/agent-loop.test.ts index 5ddb2ac84..9eda6af35 100644 --- a/packages/pi-agent-core/src/agent-loop.test.ts +++ b/packages/pi-agent-core/src/agent-loop.test.ts @@ -123,7 +123,7 @@ function makeAssistantMessage(overrides: Partial = {}): Assist provider: "anthropic", model: "claude-test", usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, totalTokens: 150, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } }, - stopReason: "end_turn", + stopReason: "stop", timestamp: Date.now(), ...overrides, }; @@ -139,7 +139,7 @@ function makeToolCallMessage(toolCallArgs: Record): AssistantMe arguments: toolCallArgs, }, ], - stopReason: "tool_use", + stopReason: "toolUse", }); } @@ -162,7 +162,7 @@ describe("agent-loop — schema overload retry cap (#2783)", () => { // LLM keeps sending tool calls with invalid args (missing required 'content' field) const badToolCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content' - const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "end_turn" }); + const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "stop" }); // Create enough bad responses to exceed the cap, plus a final stop const responses: AssistantMessage[] = []; @@ -217,7 +217,7 @@ describe("agent-loop — schema overload retry cap (#2783)", () => { // Pattern: 2 failures, 1 success, 2 failures, 1 success, then stop const badCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content' const goodCall = makeToolCallMessage({ path: "/tmp/test", content: "hello" }); - const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "end_turn" }); + const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "stop" }); const responses = [badCall, badCall, goodCall, badCall, badCall, goodCall, finalStop]; const mockStream = createMockStreamFn(responses); @@ -258,4 +258,100 @@ describe("agent-loop — schema overload retry cap (#2783)", () => { assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES >= 2, "Cap must be at least 2 to allow one retry"); assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES <= 10, "Cap must not be unreasonably high"); }); + + it("does NOT trip schema overload cap on tool execution errors like bash exit code 1 (#3618)", async () => { + // Simulates the real scenario: a tool (bash) that passes validation but + // throws during execution (e.g. rg/grep returning exit code 1 = no matches). + // These are valid tool invocations — the schema was correct, the tool ran, + // it just returned a non-zero exit code. The cap should only trigger for + // preparation/schema failures, not execution failures. + const bashTool: AgentTool = { + name: "bash", + label: "Bash", + description: "Run a bash command", + parameters: Type.Object({ + command: Type.String(), + }), + execute: async () => { + // Simulate bash tool rejecting on non-zero exit code + throw new Error("(no output)\n\nCommand exited with code 1"); + }, + }; + + // LLM sends valid tool calls (schema is correct) that fail at execution + const validBashCall = makeAssistantMessage({ + content: [ + { + type: "toolCall", + id: `tc_bash_${Date.now()}_${Math.random()}`, + name: "bash", + arguments: { command: "rg -l 'nonexistent' src/" }, + }, + ], + stopReason: "toolUse", + }); + const finalStop = makeAssistantMessage({ + content: [{ type: "text", text: "No references found." }], + stopReason: "stop", + }); + + // Send more than MAX_CONSECUTIVE_VALIDATION_FAILURES bash calls that throw + const responses: AssistantMessage[] = []; + for (let i = 0; i < MAX_CONSECUTIVE_VALIDATION_FAILURES + 2; i++) { + responses.push(validBashCall); + } + responses.push(finalStop); + + const mockStream = createMockStreamFn(responses); + + const context: AgentContext = { + systemPrompt: "You are a test agent.", + messages: [{ role: "user", content: [{ type: "text", text: "Search for references" }], timestamp: Date.now() }], + tools: [bashTool], + }; + + const config: AgentLoopConfig = { + model: TEST_MODEL, + convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"), + toolExecution: "sequential", + }; + + const stream = agentLoop( + [{ role: "user", content: [{ type: "text", text: "Search for references" }], timestamp: Date.now() }], + context, + config, + undefined, + mockStream as any, + ); + + const events = await collectEvents(stream); + + // Must complete normally — execution errors should NOT trigger the cap + const agentEnd = events.find((e) => e.type === "agent_end"); + assert.ok(agentEnd, "agent loop must emit agent_end"); + + // Count tool execution errors + const toolErrors = events.filter( + (e) => e.type === "tool_execution_end" && e.isError === true, + ); + + // All bash calls should have been attempted (not capped early) + assert.ok( + toolErrors.length >= MAX_CONSECUTIVE_VALIDATION_FAILURES + 2, + `Expected all ${MAX_CONSECUTIVE_VALIDATION_FAILURES + 2} bash execution errors to be processed (not capped), got ${toolErrors.length}`, + ); + + // The stop message should NOT contain the schema overload text + const allMessages = (agentEnd as any).messages as AgentMessage[]; + const lastMessage = allMessages[allMessages.length - 1]; + const lastText = lastMessage.role === "assistant" + ? (lastMessage as AssistantMessage).content.find((c) => c.type === "text") + : undefined; + if (lastText && lastText.type === "text") { + assert.ok( + !lastText.text.includes("consecutive turns with all tool calls failing"), + "Final message must NOT contain schema overload stop text for execution-only errors", + ); + } + }); }); diff --git a/packages/pi-agent-core/src/agent-loop.ts b/packages/pi-agent-core/src/agent-loop.ts index 21e2a4531..f8c7e9231 100644 --- a/packages/pi-agent-core/src/agent-loop.ts +++ b/packages/pi-agent-core/src/agent-loop.ts @@ -293,13 +293,23 @@ async function runLoop( newMessages.push(result); } - // Schema overload detection (#2783): if EVERY tool result in this turn - // is an error (validation failure, missing tool, etc.), increment the - // consecutive failure counter. If any tool succeeded, reset to zero. - const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError); - if (allToolsFailed) { + // Schema overload detection (#2783): count only preparation-phase + // errors (schema validation, tool-not-found, tool-blocked) toward the + // consecutive failure cap. Tool execution errors — such as bash + // commands returning non-zero exit codes (e.g. grep/rg exit 1 for + // "no matches") — are valid tool usage and must NOT trigger the cap. + // See: #3618 + const hasPreparationErrors = toolExecution.preparationErrorCount > 0; + const allToolsFailedPreparation = + toolResults.length > 0 && + toolExecution.preparationErrorCount === toolResults.length; + if (allToolsFailedPreparation) { consecutiveAllToolErrorTurns++; - } else { + } else if (!hasPreparationErrors) { + // Reset only when there are zero preparation errors this turn. + // Mixed turns (some prep errors, some successes) don't reset, + // but they also don't increment — this avoids masking a + // pattern of alternating schema failures with one working call. consecutiveAllToolErrorTurns = 0; } @@ -452,6 +462,19 @@ async function streamAssistantResponse( return await response.result(); } +/** + * Result from executing tool calls in a turn. Includes metadata about + * error provenance so the schema overload detector can distinguish + * preparation failures (schema validation, tool-not-found, tool-blocked) + * from execution failures (the tool ran but threw, e.g. bash exit code 1). + */ +interface ToolExecutionResult { + toolResults: ToolResultMessage[]; + steeringMessages?: AgentMessage[]; + /** Number of tool results that failed during preparation (validation/schema). */ + preparationErrorCount: number; +} + /** * Execute tool calls from an assistant message. */ @@ -461,7 +484,7 @@ async function executeToolCalls( config: AgentLoopConfig, signal: AbortSignal | undefined, stream: EventStream, -): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> { +): Promise { const toolCalls = assistantMessage.content.filter((c) => c.type === "toolCall") as AgentToolCall[]; if (config.toolExecution === "sequential") { return executeToolCallsSequential(currentContext, assistantMessage, toolCalls, config, signal, stream); @@ -476,9 +499,10 @@ async function executeToolCallsSequential( config: AgentLoopConfig, signal: AbortSignal | undefined, stream: EventStream, -): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> { +): Promise { const results: ToolResultMessage[] = []; let steeringMessages: AgentMessage[] | undefined; + let preparationErrorCount = 0; for (let index = 0; index < toolCalls.length; index++) { const toolCall = toolCalls[index]; @@ -491,6 +515,9 @@ async function executeToolCallsSequential( const preparation = await prepareToolCall(currentContext, assistantMessage, toolCall, config, signal); if (preparation.kind === "immediate") { + if (preparation.isError) { + preparationErrorCount++; + } results.push(emitToolCallOutcome(toolCall, preparation.result, preparation.isError, stream)); } else { const executed = await executePreparedToolCall(preparation, signal, stream); @@ -520,7 +547,7 @@ async function executeToolCallsSequential( } } - return { toolResults: results, steeringMessages }; + return { toolResults: results, steeringMessages, preparationErrorCount }; } async function executeToolCallsParallel( @@ -530,10 +557,11 @@ async function executeToolCallsParallel( config: AgentLoopConfig, signal: AbortSignal | undefined, stream: EventStream, -): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> { +): Promise { const results: ToolResultMessage[] = []; const runnableCalls: PreparedToolCall[] = []; let steeringMessages: AgentMessage[] | undefined; + let preparationErrorCount = 0; for (let index = 0; index < toolCalls.length; index++) { const toolCall = toolCalls[index]; @@ -546,6 +574,9 @@ async function executeToolCallsParallel( const preparation = await prepareToolCall(currentContext, assistantMessage, toolCall, config, signal); if (preparation.kind === "immediate") { + if (preparation.isError) { + preparationErrorCount++; + } results.push(emitToolCallOutcome(toolCall, preparation.result, preparation.isError, stream)); } else { runnableCalls.push(preparation); @@ -562,7 +593,7 @@ async function executeToolCallsParallel( for (const skipped of remainingCalls) { results.push(skipToolCall(skipped, stream)); } - return { toolResults: results, steeringMessages }; + return { toolResults: results, steeringMessages, preparationErrorCount }; } } } @@ -594,7 +625,7 @@ async function executeToolCallsParallel( } } - return { toolResults: results, steeringMessages }; + return { toolResults: results, steeringMessages, preparationErrorCount }; } type PreparedToolCall = {