Merge pull request #3619 from frizynn/fix/3618-schema-overload-bash-exit-code
fix(agent-loop): schema overload cap ignores bash execution errors (#3618)
This commit is contained in:
commit
0d87df94be
2 changed files with 143 additions and 16 deletions
|
|
@ -123,7 +123,7 @@ function makeAssistantMessage(overrides: Partial<AssistantMessage> = {}): Assist
|
|||
provider: "anthropic",
|
||||
model: "claude-test",
|
||||
usage: { input: 100, output: 50, cacheRead: 0, cacheWrite: 0, totalTokens: 150, cost: { input: 0, output: 0, cacheRead: 0, cacheWrite: 0, total: 0 } },
|
||||
stopReason: "end_turn",
|
||||
stopReason: "stop",
|
||||
timestamp: Date.now(),
|
||||
...overrides,
|
||||
};
|
||||
|
|
@ -139,7 +139,7 @@ function makeToolCallMessage(toolCallArgs: Record<string, unknown>): AssistantMe
|
|||
arguments: toolCallArgs,
|
||||
},
|
||||
],
|
||||
stopReason: "tool_use",
|
||||
stopReason: "toolUse",
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -162,7 +162,7 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {
|
|||
|
||||
// LLM keeps sending tool calls with invalid args (missing required 'content' field)
|
||||
const badToolCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
|
||||
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "end_turn" });
|
||||
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "I give up." }], stopReason: "stop" });
|
||||
|
||||
// Create enough bad responses to exceed the cap, plus a final stop
|
||||
const responses: AssistantMessage[] = [];
|
||||
|
|
@ -217,7 +217,7 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {
|
|||
// Pattern: 2 failures, 1 success, 2 failures, 1 success, then stop
|
||||
const badCall = makeToolCallMessage({ path: "/tmp/test" }); // missing 'content'
|
||||
const goodCall = makeToolCallMessage({ path: "/tmp/test", content: "hello" });
|
||||
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "end_turn" });
|
||||
const finalStop = makeAssistantMessage({ content: [{ type: "text", text: "Done." }], stopReason: "stop" });
|
||||
|
||||
const responses = [badCall, badCall, goodCall, badCall, badCall, goodCall, finalStop];
|
||||
const mockStream = createMockStreamFn(responses);
|
||||
|
|
@ -258,4 +258,100 @@ describe("agent-loop — schema overload retry cap (#2783)", () => {
|
|||
assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES >= 2, "Cap must be at least 2 to allow one retry");
|
||||
assert.ok(MAX_CONSECUTIVE_VALIDATION_FAILURES <= 10, "Cap must not be unreasonably high");
|
||||
});
|
||||
|
||||
it("does NOT trip schema overload cap on tool execution errors like bash exit code 1 (#3618)", async () => {
|
||||
// Simulates the real scenario: a tool (bash) that passes validation but
|
||||
// throws during execution (e.g. rg/grep returning exit code 1 = no matches).
|
||||
// These are valid tool invocations — the schema was correct, the tool ran,
|
||||
// it just returned a non-zero exit code. The cap should only trigger for
|
||||
// preparation/schema failures, not execution failures.
|
||||
const bashTool: AgentTool<any> = {
|
||||
name: "bash",
|
||||
label: "Bash",
|
||||
description: "Run a bash command",
|
||||
parameters: Type.Object({
|
||||
command: Type.String(),
|
||||
}),
|
||||
execute: async () => {
|
||||
// Simulate bash tool rejecting on non-zero exit code
|
||||
throw new Error("(no output)\n\nCommand exited with code 1");
|
||||
},
|
||||
};
|
||||
|
||||
// LLM sends valid tool calls (schema is correct) that fail at execution
|
||||
const validBashCall = makeAssistantMessage({
|
||||
content: [
|
||||
{
|
||||
type: "toolCall",
|
||||
id: `tc_bash_${Date.now()}_${Math.random()}`,
|
||||
name: "bash",
|
||||
arguments: { command: "rg -l 'nonexistent' src/" },
|
||||
},
|
||||
],
|
||||
stopReason: "toolUse",
|
||||
});
|
||||
const finalStop = makeAssistantMessage({
|
||||
content: [{ type: "text", text: "No references found." }],
|
||||
stopReason: "stop",
|
||||
});
|
||||
|
||||
// Send more than MAX_CONSECUTIVE_VALIDATION_FAILURES bash calls that throw
|
||||
const responses: AssistantMessage[] = [];
|
||||
for (let i = 0; i < MAX_CONSECUTIVE_VALIDATION_FAILURES + 2; i++) {
|
||||
responses.push(validBashCall);
|
||||
}
|
||||
responses.push(finalStop);
|
||||
|
||||
const mockStream = createMockStreamFn(responses);
|
||||
|
||||
const context: AgentContext = {
|
||||
systemPrompt: "You are a test agent.",
|
||||
messages: [{ role: "user", content: [{ type: "text", text: "Search for references" }], timestamp: Date.now() }],
|
||||
tools: [bashTool],
|
||||
};
|
||||
|
||||
const config: AgentLoopConfig = {
|
||||
model: TEST_MODEL,
|
||||
convertToLlm: (msgs) => msgs.filter((m): m is any => m.role !== "custom"),
|
||||
toolExecution: "sequential",
|
||||
};
|
||||
|
||||
const stream = agentLoop(
|
||||
[{ role: "user", content: [{ type: "text", text: "Search for references" }], timestamp: Date.now() }],
|
||||
context,
|
||||
config,
|
||||
undefined,
|
||||
mockStream as any,
|
||||
);
|
||||
|
||||
const events = await collectEvents(stream);
|
||||
|
||||
// Must complete normally — execution errors should NOT trigger the cap
|
||||
const agentEnd = events.find((e) => e.type === "agent_end");
|
||||
assert.ok(agentEnd, "agent loop must emit agent_end");
|
||||
|
||||
// Count tool execution errors
|
||||
const toolErrors = events.filter(
|
||||
(e) => e.type === "tool_execution_end" && e.isError === true,
|
||||
);
|
||||
|
||||
// All bash calls should have been attempted (not capped early)
|
||||
assert.ok(
|
||||
toolErrors.length >= MAX_CONSECUTIVE_VALIDATION_FAILURES + 2,
|
||||
`Expected all ${MAX_CONSECUTIVE_VALIDATION_FAILURES + 2} bash execution errors to be processed (not capped), got ${toolErrors.length}`,
|
||||
);
|
||||
|
||||
// The stop message should NOT contain the schema overload text
|
||||
const allMessages = (agentEnd as any).messages as AgentMessage[];
|
||||
const lastMessage = allMessages[allMessages.length - 1];
|
||||
const lastText = lastMessage.role === "assistant"
|
||||
? (lastMessage as AssistantMessage).content.find((c) => c.type === "text")
|
||||
: undefined;
|
||||
if (lastText && lastText.type === "text") {
|
||||
assert.ok(
|
||||
!lastText.text.includes("consecutive turns with all tool calls failing"),
|
||||
"Final message must NOT contain schema overload stop text for execution-only errors",
|
||||
);
|
||||
}
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -293,13 +293,23 @@ async function runLoop(
|
|||
newMessages.push(result);
|
||||
}
|
||||
|
||||
// Schema overload detection (#2783): if EVERY tool result in this turn
|
||||
// is an error (validation failure, missing tool, etc.), increment the
|
||||
// consecutive failure counter. If any tool succeeded, reset to zero.
|
||||
const allToolsFailed = toolResults.length > 0 && toolResults.every((r) => r.isError);
|
||||
if (allToolsFailed) {
|
||||
// Schema overload detection (#2783): count only preparation-phase
|
||||
// errors (schema validation, tool-not-found, tool-blocked) toward the
|
||||
// consecutive failure cap. Tool execution errors — such as bash
|
||||
// commands returning non-zero exit codes (e.g. grep/rg exit 1 for
|
||||
// "no matches") — are valid tool usage and must NOT trigger the cap.
|
||||
// See: #3618
|
||||
const hasPreparationErrors = toolExecution.preparationErrorCount > 0;
|
||||
const allToolsFailedPreparation =
|
||||
toolResults.length > 0 &&
|
||||
toolExecution.preparationErrorCount === toolResults.length;
|
||||
if (allToolsFailedPreparation) {
|
||||
consecutiveAllToolErrorTurns++;
|
||||
} else {
|
||||
} else if (!hasPreparationErrors) {
|
||||
// Reset only when there are zero preparation errors this turn.
|
||||
// Mixed turns (some prep errors, some successes) don't reset,
|
||||
// but they also don't increment — this avoids masking a
|
||||
// pattern of alternating schema failures with one working call.
|
||||
consecutiveAllToolErrorTurns = 0;
|
||||
}
|
||||
|
||||
|
|
@ -452,6 +462,19 @@ async function streamAssistantResponse(
|
|||
return await response.result();
|
||||
}
|
||||
|
||||
/**
|
||||
* Result from executing tool calls in a turn. Includes metadata about
|
||||
* error provenance so the schema overload detector can distinguish
|
||||
* preparation failures (schema validation, tool-not-found, tool-blocked)
|
||||
* from execution failures (the tool ran but threw, e.g. bash exit code 1).
|
||||
*/
|
||||
interface ToolExecutionResult {
|
||||
toolResults: ToolResultMessage[];
|
||||
steeringMessages?: AgentMessage[];
|
||||
/** Number of tool results that failed during preparation (validation/schema). */
|
||||
preparationErrorCount: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Execute tool calls from an assistant message.
|
||||
*/
|
||||
|
|
@ -461,7 +484,7 @@ async function executeToolCalls(
|
|||
config: AgentLoopConfig,
|
||||
signal: AbortSignal | undefined,
|
||||
stream: EventStream<AgentEvent, AgentMessage[]>,
|
||||
): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
|
||||
): Promise<ToolExecutionResult> {
|
||||
const toolCalls = assistantMessage.content.filter((c) => c.type === "toolCall") as AgentToolCall[];
|
||||
if (config.toolExecution === "sequential") {
|
||||
return executeToolCallsSequential(currentContext, assistantMessage, toolCalls, config, signal, stream);
|
||||
|
|
@ -476,9 +499,10 @@ async function executeToolCallsSequential(
|
|||
config: AgentLoopConfig,
|
||||
signal: AbortSignal | undefined,
|
||||
stream: EventStream<AgentEvent, AgentMessage[]>,
|
||||
): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
|
||||
): Promise<ToolExecutionResult> {
|
||||
const results: ToolResultMessage[] = [];
|
||||
let steeringMessages: AgentMessage[] | undefined;
|
||||
let preparationErrorCount = 0;
|
||||
|
||||
for (let index = 0; index < toolCalls.length; index++) {
|
||||
const toolCall = toolCalls[index];
|
||||
|
|
@ -491,6 +515,9 @@ async function executeToolCallsSequential(
|
|||
|
||||
const preparation = await prepareToolCall(currentContext, assistantMessage, toolCall, config, signal);
|
||||
if (preparation.kind === "immediate") {
|
||||
if (preparation.isError) {
|
||||
preparationErrorCount++;
|
||||
}
|
||||
results.push(emitToolCallOutcome(toolCall, preparation.result, preparation.isError, stream));
|
||||
} else {
|
||||
const executed = await executePreparedToolCall(preparation, signal, stream);
|
||||
|
|
@ -520,7 +547,7 @@ async function executeToolCallsSequential(
|
|||
}
|
||||
}
|
||||
|
||||
return { toolResults: results, steeringMessages };
|
||||
return { toolResults: results, steeringMessages, preparationErrorCount };
|
||||
}
|
||||
|
||||
async function executeToolCallsParallel(
|
||||
|
|
@ -530,10 +557,11 @@ async function executeToolCallsParallel(
|
|||
config: AgentLoopConfig,
|
||||
signal: AbortSignal | undefined,
|
||||
stream: EventStream<AgentEvent, AgentMessage[]>,
|
||||
): Promise<{ toolResults: ToolResultMessage[]; steeringMessages?: AgentMessage[] }> {
|
||||
): Promise<ToolExecutionResult> {
|
||||
const results: ToolResultMessage[] = [];
|
||||
const runnableCalls: PreparedToolCall[] = [];
|
||||
let steeringMessages: AgentMessage[] | undefined;
|
||||
let preparationErrorCount = 0;
|
||||
|
||||
for (let index = 0; index < toolCalls.length; index++) {
|
||||
const toolCall = toolCalls[index];
|
||||
|
|
@ -546,6 +574,9 @@ async function executeToolCallsParallel(
|
|||
|
||||
const preparation = await prepareToolCall(currentContext, assistantMessage, toolCall, config, signal);
|
||||
if (preparation.kind === "immediate") {
|
||||
if (preparation.isError) {
|
||||
preparationErrorCount++;
|
||||
}
|
||||
results.push(emitToolCallOutcome(toolCall, preparation.result, preparation.isError, stream));
|
||||
} else {
|
||||
runnableCalls.push(preparation);
|
||||
|
|
@ -562,7 +593,7 @@ async function executeToolCallsParallel(
|
|||
for (const skipped of remainingCalls) {
|
||||
results.push(skipToolCall(skipped, stream));
|
||||
}
|
||||
return { toolResults: results, steeringMessages };
|
||||
return { toolResults: results, steeringMessages, preparationErrorCount };
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -594,7 +625,7 @@ async function executeToolCallsParallel(
|
|||
}
|
||||
}
|
||||
|
||||
return { toolResults: results, steeringMessages };
|
||||
return { toolResults: results, steeringMessages, preparationErrorCount };
|
||||
}
|
||||
|
||||
type PreparedToolCall = {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue