Merge pull request #3619 from frizynn/fix/3618-schema-overload-bash-exit-code

fix(agent-loop): schema overload cap ignores bash execution errors (#3618)
This commit is contained in:
Jeremy McSpadden 2026-04-06 10:13:03 -05:00 committed by GitHub
commit 0d87df94be
2 changed files with 143 additions and 16 deletions

View file

@@ -123,7 +123,7 @@ function makeAssistantMessage(overrides: Partial<AssistantMessage> = {}): Assist
provider: "anthropic",
model: "claude-test",
usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, totalTokens: 150, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
stopReason: "end_turn",
stopReason: "stop",
timestamp: Date.now(),
...overrides,
};
@@ -139,7 +139,7 @@ function makeToolCallMessage(toolCallArgs: Record<string, unknown>): AssistantMe
arguments: toolCallArgs,
},
],
stopReason: "tool_use",
stopReason: "toolUse",
});
}
@@ -162,7 +162,7 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {
// LLM keeps sending tool calls with invalid args (missing required 'content' field)
const badToolCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "end_turn" });
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "stop" });
// Create enough bad responses to exceed the cap, plus a final stop
const responses: AssistantMessage[] = [];
@@ -217,7 +217,7 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {
// Pattern: 2 failures, 1 success, 2 failures, 1 success, then stop
const badCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
const goodCall = makeToolCallMessage({ path: "/tmp/test", content: "hello" });
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "end_turn" });
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "stop" });
const responses = [badCall, badCall, goodCall, badCall, badCall, goodCall, finalStop];
const mockStream = createMockStreamFn(responses);
@@ -258,4 +258,100 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {
assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES >= 2, "Cap must be at least 2 to allow one retry");
assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES <= 10, "Cap must not be unreasonably high");
});
it("does NOT trip schema overload cap on tool execution errors like bash exit code 1 (#3618)", async () => {
// Simulates the real scenario: a tool (bash) that passes validation but
// throws during execution (e.g. rg/grep returning exit code 1 = no matches).
// These are valid tool invocations — the schema was correct, the tool ran,
// it just returned a non-zero exit code. The cap should only trigger for
// preparation/schema failures, not execution failures.
const bashTool: AgentTool<any> = {
name: "bash",
label: "Bash",
description: "Run a bash command",
parameters: Type.Object({
command: Type.String(),
}),
execute: async () => {
// Simulate bash tool rejecting on non-zero exit code
throw new Error("(no output)\n\nCommand exited with code 1");
},
};
// LLM sends valid tool calls (schema is correct) that fail at execution
const validBashCall = makeAssistantMessage({
content: [
{
type: "toolCall",
id: `tc_bash_${Date.now()}_${Math.random()}`,
name: "bash",
arguments: { command: "rg -l 'nonexistent' src/" },
},
],
stopReason: "toolUse",
});
const finalStop = makeAssistantMessage({
content: [{ type: "text", text: "No references found." }],
stopReason: "stop",
});
// Send more than MAX_CONSECUTIVE_VALIDATION_FAILURES bash calls that throw
const responses: AssistantMessage[] = [];
for (let i = 0; i < MAX_CONSECUTIVE_VALIDATION_FAILURES + 2; i++) {
responses.push(validBashCall);
}
responses.push(finalStop);
const mockStream = createMockStreamFn(responses);
const context: AgentContext = {
systemPrompt: "You are a test agent.",
messages: [{ role: "user", content: [{ type: "text", text: "Search for references" }], timestamp: Date.now() }],
tools: [bashTool],
};
const config: AgentLoopConfig = {
model: TEST_MODEL,
convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
toolExecution: "sequential",
};
const stream = agentLoop(
[{ role: "user", content: [{ type: "text", text: "Search for references" }], timestamp: Date.now() }],
context,
config,
undefined,
mockStream as any,
);
const events = await collectEvents(stream);
// Must complete normally — execution errors should NOT trigger the cap
const agentEnd = events.find((e) => e.type === "agent_end");
assert.ok(agentEnd, "agent loop must emit agent_end");
// Count tool execution errors
const toolErrors = events.filter(
(e) => e.type === "tool_execution_end" && e.isError === true,
);
// All bash calls should have been attempted (not capped early)
assert.ok(
toolErrors.length >= MAX_CONSECUTIVE_VALIDATION_FAILURES + 2,
`Expected all ${MAX_CONSECUTIVE_VALIDATION_FAILURES + 2} bash execution errors to be processed (not capped), got ${toolErrors.length}`,
);
// The stop message should NOT contain the schema overload text
const allMessages = (agentEnd as any).messages as AgentMessage[];
const lastMessage = allMessages[allMessages.length - 1];
const lastText = lastMessage.role === "assistant"
? (lastMessage as AssistantMessage).content.find((c) => c.type === "text")
: undefined;
if (lastText && lastText.type === "text") {
assert.ok(
!lastText.text.includes("consecutive turns with all tool calls failing"),
"Final message must NOT contain schema overload stop text for execution-only errors",
);
}
});
});

View file

@@ -293,13 +293,23 @@ async function runLoop(
newMessages.push(result);
}
// Schema overload detection (#2783): if EVERY tool result in this turn
// is an error (validation failure, missing tool, etc.), increment the
// consecutive failure counter. If any tool succeeded, reset to zero.
const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError);
if (allToolsFailed) {
// Schema overload detection (#2783): count only preparation-phase
// errors (schema validation, tool-not-found, tool-blocked) toward the
// consecutive failure cap. Tool execution errors — such as bash
// commands returning non-zero exit codes (e.g. grep/rg exit 1 for
// "no matches") — are valid tool usage and must NOT trigger the cap.
// See: #3618
const hasPreparationErrors = toolExecution.preparationErrorCount > 0;
const allToolsFailedPreparation =
toolResults.length > 0 &&
toolExecution.preparationErrorCount === toolResults.length;
if (allToolsFailedPreparation) {
consecutiveAllToolErrorTurns++;
} else {
} else if (!hasPreparationErrors) {
// Reset only when there are zero preparation errors this turn.
// Mixed turns (some prep errors, some successes) don't reset,
// but they also don't increment — this avoids masking a
// pattern of alternating schema failures with one working call.
consecutiveAllToolErrorTurns = 0;
}
@@ -452,6 +462,19 @@ async function streamAssistantResponse(
return await response.result();
}
/**
 * Result from executing tool calls in a turn. Includes metadata about
 * error provenance so the schema overload detector can distinguish
 * preparation failures (schema validation, tool-not-found, tool-blocked)
 * from execution failures (the tool ran but threw, e.g. bash exit code 1).
 */
interface ToolExecutionResult {
  /** One result message per tool call processed this turn (errors included). */
  toolResults: ToolResultMessage[];
  /**
   * Extra agent messages surfaced during the turn, when any.
   * NOTE(review): assignment site is not visible in this chunk — presumably
   * user/steering input that arrived mid-execution; confirm in
   * executeToolCallsSequential/Parallel.
   */
  steeringMessages?: AgentMessage[];
  /** Number of tool results that failed during preparation (validation/schema). */
  preparationErrorCount: number;
}
/**
* Execute tool calls from an assistant message.
*/
@@ -461,7 +484,7 @@ async function executeToolCalls(
config: AgentLoopConfig,
signal: AbortSignal | undefined,
stream: EventStream<AgentEvent, AgentMessage[]>,
): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
): Promise<ToolExecutionResult> {
const toolCalls = assistantMessage.content.filter((c) => c.type === "toolCall") as AgentToolCall[];
if (config.toolExecution === "sequential") {
return executeToolCallsSequential(currentContext, assistantMessage, toolCalls, config, signal, stream);
@@ -476,9 +499,10 @@ async function executeToolCallsSequential(
config: AgentLoopConfig,
signal: AbortSignal | undefined,
stream: EventStream<AgentEvent, AgentMessage[]>,
): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
): Promise<ToolExecutionResult> {
const results: ToolResultMessage[] = [];
let steeringMessages: AgentMessage[] | undefined;
let preparationErrorCount = 0;
for (let index = 0; index < toolCalls.length; index++) {
const toolCall = toolCalls[index];
@@ -491,6 +515,9 @@ async function executeToolCallsSequential(
const preparation = await prepareToolCall(currentContext, assistantMessage, toolCall, config, signal);
if (preparation.kind === "immediate") {
if (preparation.isError) {
preparationErrorCount++;
}
results.push(emitToolCallOutcome(toolCall, preparation.result, preparation.isError, stream));
} else {
const executed = await executePreparedToolCall(preparation, signal, stream);
@@ -520,7 +547,7 @@ async function executeToolCallsSequential(
}
}
return { toolResults: results, steeringMessages };
return { toolResults: results, steeringMessages, preparationErrorCount };
}
async function executeToolCallsParallel(
@@ -530,10 +557,11 @@ async function executeToolCallsParallel(
config: AgentLoopConfig,
signal: AbortSignal | undefined,
stream: EventStream<AgentEvent, AgentMessage[]>,
): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
): Promise<ToolExecutionResult> {
const results: ToolResultMessage[] = [];
const runnableCalls: PreparedToolCall[] = [];
let steeringMessages: AgentMessage[] | undefined;
let preparationErrorCount = 0;
for (let index = 0; index < toolCalls.length; index++) {
const toolCall = toolCalls[index];
@@ -546,6 +574,9 @@ async function executeToolCallsParallel(
const preparation = await prepareToolCall(currentContext, assistantMessage, toolCall, config, signal);
if (preparation.kind === "immediate") {
if (preparation.isError) {
preparationErrorCount++;
}
results.push(emitToolCallOutcome(toolCall, preparation.result, preparation.isError, stream));
} else {
runnableCalls.push(preparation);
@@ -562,7 +593,7 @@ async function executeToolCallsParallel(
for (const skipped of remainingCalls) {
results.push(skipToolCall(skipped, stream));
}
return { toolResults: results, steeringMessages };
return { toolResults: results, steeringMessages, preparationErrorCount };
}
}
}
@@ -594,7 +625,7 @@ async function executeToolCallsParallel(
}
}
return { toolResults: results, steeringMessages };
return { toolResults: results, steeringMessages, preparationErrorCount };
}
type PreparedToolCall = {