Merge pull request #3619 from frizynn/fix/3618-schema-overload-bash-exit-code

fix(agent-loop): schema overload cap ignores bash execution errors (#3618)
2026-04-06 10:13:03 -05:00 · 2026-04-06 10:13:03 -05:00 · 0d87df94be
commit 0d87df94be
parent 4aa7fe3940 cd14a4c765
2 changed files with 143 additions and 16 deletions
--- a/packages/pi-agent-core/src/agent-loop.test.ts
+++ b/packages/pi-agent-core/src/agent-loop.test.ts
@ -123,7 +123,7 @@ function makeAssistantMessage(overrides: Partial<AssistantMessage> = {}): Assist
 		provider: "anthropic",
 		model: "claude-test",
 		usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, totalTokens: 150, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
-		stopReason: "end_turn",
+		stopReason: "stop",
 		timestamp: Date.now(),
 		...overrides,
 	};
@ -139,7 +139,7 @@ function makeToolCallMessage(toolCallArgs: Record<string, unknown>): AssistantMe
 				arguments: toolCallArgs,
 			},
 		],
-		stopReason: "tool_use",
+		stopReason: "toolUse",
 	});
 }

@ -162,7 +162,7 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {

 		// LLM keeps sending tool calls with invalid args (missing required 'content' field)
 		const badToolCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
-		const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "end_turn" });
+		const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "stop" });

 		// Create enough bad responses to exceed the cap, plus a final stop
 		const responses: AssistantMessage[] = [];
@ -217,7 +217,7 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {
 		// Pattern: 2 failures, 1 success, 2 failures, 1 success, then stop
 		const badCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
 		const goodCall = makeToolCallMessage({ path: "/tmp/test", content: "hello" });
-		const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "end_turn" });
+		const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "stop" });

 		const responses = [badCall, badCall, goodCall, badCall, badCall, goodCall, finalStop];
 		const mockStream = createMockStreamFn(responses);
@ -258,4 +258,100 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {
 		assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES >= 2, "Cap must be at least 2 to allow one retry");
 		assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES <= 10, "Cap must not be unreasonably high");
 	});
+
+	it("does NOT trip schema overload cap on tool execution errors like bash exit code 1 (#3618)", async () => {
+		// Simulates the real scenario: a tool (bash) that passes validation but
+		// throws during execution (e.g. rg/grep returning exit code 1 = no matches).
+		// These are valid tool invocations — the schema was correct, the tool ran,
+		// it just returned a non-zero exit code. The cap should only trigger for
+		// preparation/schema failures, not execution failures.
+		const bashTool: AgentTool<any> = {
+			name: "bash",
+			label: "Bash",
+			description: "Run a bash command",
+			parameters: Type.Object({
+				command: Type.String(),
+			}),
+			execute: async () => {
+				// Simulate bash tool rejecting on non-zero exit code
+				throw new Error("(no output)\n\nCommand exited with code 1");
+			},
+		};
+
+		// LLM sends valid tool calls (schema is correct) that fail at execution
+		const validBashCall = makeAssistantMessage({
+			content: [
+				{
+					type: "toolCall",
+					id: `tc_bash_${Date.now()}_${Math.random()}`,
+					name: "bash",
+					arguments: { command: "rg -l 'nonexistent' src/" },
+				},
+			],
+			stopReason: "toolUse",
+		});
+		const finalStop = makeAssistantMessage({
+			content: [{ type: "text", text: "No references found." }],
+			stopReason: "stop",
+		});
+
+		// Send more than MAX_CONSECUTIVE_VALIDATION_FAILURES bash calls that throw
+		const responses: AssistantMessage[] = [];
+		for (let i = 0; i < MAX_CONSECUTIVE_VALIDATION_FAILURES + 2; i++) {
+			responses.push(validBashCall);
+		}
+		responses.push(finalStop);
+
+		const mockStream = createMockStreamFn(responses);
+
+		const context: AgentContext = {
+			systemPrompt: "You are a test agent.",
+			messages: [{ role: "user", content: [{ type: "text", text: "Search for references" }], timestamp: Date.now() }],
+			tools: [bashTool],
+		};
+
+		const config: AgentLoopConfig = {
+			model: TEST_MODEL,
+			convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
+			toolExecution: "sequential",
+		};
+
+		const stream = agentLoop(
+			[{ role: "user", content: [{ type: "text", text: "Search for references" }], timestamp: Date.now() }],
+			context,
+			config,
+			undefined,
+			mockStream as any,
+		);
+
+		const events = await collectEvents(stream);
+
+		// Must complete normally — execution errors should NOT trigger the cap
+		const agentEnd = events.find((e) => e.type === "agent_end");
+		assert.ok(agentEnd, "agent loop must emit agent_end");
+
+		// Count tool execution errors
+		const toolErrors = events.filter(
+			(e) => e.type === "tool_execution_end" && e.isError === true,
+		);
+
+		// All bash calls should have been attempted (not capped early)
+		assert.ok(
+			toolErrors.length >= MAX_CONSECUTIVE_VALIDATION_FAILURES + 2,
+			`Expected all ${MAX_CONSECUTIVE_VALIDATION_FAILURES + 2} bash execution errors to be processed (not capped), got ${toolErrors.length}`,
+		);
+
+		// The stop message should NOT contain the schema overload text
+		const allMessages = (agentEnd as any).messages as AgentMessage[];
+		const lastMessage = allMessages[allMessages.length - 1];
+		const lastText = lastMessage.role === "assistant"
+			? (lastMessage as AssistantMessage).content.find((c) => c.type === "text")
+			: undefined;
+		if (lastText && lastText.type === "text") {
+			assert.ok(
+				!lastText.text.includes("consecutive turns with all tool calls failing"),
+				"Final message must NOT contain schema overload stop text for execution-only errors",
+			);
+		}
+	});
 });
--- a/packages/pi-agent-core/src/agent-loop.ts
+++ b/packages/pi-agent-core/src/agent-loop.ts
@ -293,13 +293,23 @@ async function runLoop(
 					newMessages.push(result);
 				}

-				// Schema overload detection (#2783): if EVERY tool result in this turn
-				// is an error (validation failure, missing tool, etc.), increment the
-				// consecutive failure counter. If any tool succeeded, reset to zero.
-				const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError);
-				if (allToolsFailed) {
+				// Schema overload detection (#2783): count only preparation-phase
+				// errors (schema validation, tool-not-found, tool-blocked) toward the
+				// consecutive failure cap. Tool execution errors — such as bash
+				// commands returning non-zero exit codes (e.g. grep/rg exit 1 for
+				// "no matches") — are valid tool usage and must NOT trigger the cap.
+				// See: #3618
+				const hasPreparationErrors = toolExecution.preparationErrorCount > 0;
+				const allToolsFailedPreparation =
+					toolResults.length > 0 &&
+					toolExecution.preparationErrorCount === toolResults.length;
+				if (allToolsFailedPreparation) {
 					consecutiveAllToolErrorTurns++;
-				} else {
+				} else if (!hasPreparationErrors) {
+					// Reset only when there are zero preparation errors this turn.
+					// Mixed turns (some prep errors, some successes) don't reset,
+					// but they also don't increment — this avoids masking a
+					// pattern of alternating schema failures with one working call.
 					consecutiveAllToolErrorTurns = 0;
 				}

@ -452,6 +462,19 @@ async function streamAssistantResponse(
 	return await response.result();
 }

+/**
+ * Result from executing tool calls in a turn. Includes metadata about
+ * error provenance so the schema overload detector can distinguish
+ * preparation failures (schema validation, tool-not-found, tool-blocked)
+ * from execution failures (the tool ran but threw, e.g. bash exit code 1).
+ */
+interface ToolExecutionResult {
+	toolResults: ToolResultMessage[];
+	steeringMessages?: AgentMessage[];
+	/** Number of tool results that failed during preparation (validation/schema). */
+	preparationErrorCount: number;
+}
+
 /**
 * Execute tool calls from an assistant message.
 */
@ -461,7 +484,7 @@ async function executeToolCalls(
 	config: AgentLoopConfig,
 	signal: AbortSignal | undefined,
 	stream: EventStream<AgentEvent, AgentMessage[]>,
-): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
+): Promise<ToolExecutionResult> {
 	const toolCalls = assistantMessage.content.filter((c) => c.type === "toolCall") as AgentToolCall[];
 	if (config.toolExecution === "sequential") {
 		return executeToolCallsSequential(currentContext, assistantMessage, toolCalls, config, signal, stream);
@ -476,9 +499,10 @@ async function executeToolCallsSequential(
 	config: AgentLoopConfig,
 	signal: AbortSignal | undefined,
 	stream: EventStream<AgentEvent, AgentMessage[]>,
-): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
+): Promise<ToolExecutionResult> {
 	const results: ToolResultMessage[] = [];
 	let steeringMessages: AgentMessage[] | undefined;
+	let preparationErrorCount = 0;

 	for (let index = 0; index < toolCalls.length; index++) {
 		const toolCall = toolCalls[index];
@ -491,6 +515,9 @@ async function executeToolCallsSequential(

 		const preparation = await prepareToolCall(currentContext, assistantMessage, toolCall, config, signal);
 		if (preparation.kind === "immediate") {
+			if (preparation.isError) {
+				preparationErrorCount++;
+			}
 			results.push(emitToolCallOutcome(toolCall, preparation.result, preparation.isError, stream));
 		} else {
 			const executed = await executePreparedToolCall(preparation, signal, stream);
@ -520,7 +547,7 @@ async function executeToolCallsSequential(
 		}
 	}

-	return { toolResults: results, steeringMessages };
+	return { toolResults: results, steeringMessages, preparationErrorCount };
 }

 async function executeToolCallsParallel(
@ -530,10 +557,11 @@ async function executeToolCallsParallel(
 	config: AgentLoopConfig,
 	signal: AbortSignal | undefined,
 	stream: EventStream<AgentEvent, AgentMessage[]>,
-): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
+): Promise<ToolExecutionResult> {
 	const results: ToolResultMessage[] = [];
 	const runnableCalls: PreparedToolCall[] = [];
 	let steeringMessages: AgentMessage[] | undefined;
+	let preparationErrorCount = 0;

 	for (let index = 0; index < toolCalls.length; index++) {
 		const toolCall = toolCalls[index];
@ -546,6 +574,9 @@ async function executeToolCallsParallel(

 		const preparation = await prepareToolCall(currentContext, assistantMessage, toolCall, config, signal);
 		if (preparation.kind === "immediate") {
+			if (preparation.isError) {
+				preparationErrorCount++;
+			}
 			results.push(emitToolCallOutcome(toolCall, preparation.result, preparation.isError, stream));
 		} else {
 			runnableCalls.push(preparation);
@ -562,7 +593,7 @@ async function executeToolCallsParallel(
 				for (const skipped of remainingCalls) {
 					results.push(skipToolCall(skipped, stream));
 				}
-				return { toolResults: results, steeringMessages };
+				return { toolResults: results, steeringMessages, preparationErrorCount };
 			}
 		}
 	}
@ -594,7 +625,7 @@ async function executeToolCallsParallel(
 		}
 	}

-	return { toolResults: results, steeringMessages };
+	return { toolResults: results, steeringMessages, preparationErrorCount };
 }

 type PreparedToolCall = {