fix: add session-level search budget to prevent unbounded native web search (#1309) (#1529)

The Anthropic API's max_uses resets per request — when pause_turn triggers
a resubmit, the model gets a fresh budget each time. This allowed unlimited
total searches across a research unit, overwhelming the TUI render buffer.

Fix:
- Count web_search_tool_result blocks in conversation history on each
  before_provider_request to track cumulative searches per session
- Cap total native searches at 15 per session (3 full turns of 5)
- Dynamically set max_uses to min(5, remaining) — preserves per-turn cap
  while enforcing session ceiling
- When budget exhausted, omit web_search tool entirely instead of letting
  the model hit max_uses_exceeded repeatedly
- Reset counter on session_start (new agent unit)
- Add web search budget guidance to research prompts (defense in depth)

Tests: 5 new tests covering budget tracking, exhaustion, and reset.
All 35 native-search tests pass.
This commit is contained in:
Tom Boucher 2026-03-19 22:08:15 -04:00 committed by GitHub
parent c9d79a829c
commit 7afefc73ac
4 changed files with 255 additions and 9 deletions

View file

@ -25,9 +25,10 @@ Then research the codebase and relevant technologies. Narrate key findings and s
2. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}}
3. Explore relevant code. For small/familiar codebases, use `rg`, `find`, and targeted reads. For large or unfamiliar codebases, use `scout` to build a broad map efficiently before diving in.
4. Use `resolve_library` / `get_library_docs` for unfamiliar libraries — skip this for libraries already used in the codebase
5. Use the **Research** output template from the inlined context above — include only sections that have real content
6. If `.gsd/REQUIREMENTS.md` exists, research against it. Identify which Active requirements are table stakes, likely omissions, overbuilt risks, or domain-standard behaviors the user may or may not want.
7. Write `{{outputPath}}`
5. **Web search budget:** You have a limited budget of web searches (max ~15 per session). Use them strategically — prefer `resolve_library` / `get_library_docs` for library documentation. Do NOT repeat the same or similar queries. If a search didn't find what you need, rephrase once or move on. Target 3-5 total web searches for a typical research unit.
6. Use the **Research** output template from the inlined context above — include only sections that have real content
7. If `.gsd/REQUIREMENTS.md` exists, research against it. Identify which Active requirements are table stakes, likely omissions, overbuilt risks, or domain-standard behaviors the user may or may not want.
8. Write `{{outputPath}}`
## Strategic Questions to Answer

View file

@ -46,8 +46,9 @@ Research what this slice needs. Narrate key findings and surprises as you go —
2. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}}
3. Explore relevant code for this slice's scope. For targeted exploration, use `rg`, `find`, and reads. For broad or unfamiliar subsystems, use `scout` to map the relevant area first.
4. Use `resolve_library` / `get_library_docs` for unfamiliar libraries — skip this for libraries already used in the codebase
5. Use the **Research** output template from the inlined context above — include only sections that have real content. The template is already inlined above; do NOT attempt to read any template file from disk (there is no `templates/SLICE-RESEARCH.md` — the correct template is already present in this prompt).
6. Write `{{outputPath}}`
5. **Web search budget:** You have a limited budget of web searches (max ~15 per session). Use them strategically — prefer `resolve_library` / `get_library_docs` for library documentation. Do NOT repeat the same or similar queries. If a search didn't find what you need, rephrase once or move on. Target 3-5 total web searches for a typical research unit.
6. Use the **Research** output template from the inlined context above — include only sections that have real content. The template is already inlined above; do NOT attempt to read any template file from disk (there is no `templates/SLICE-RESEARCH.md` — the correct template is already present in this prompt).
7. Write `{{outputPath}}`
The slice directory already exists at `{{slicePath}}/`. Do NOT mkdir — just write the file.

View file

@ -16,6 +16,16 @@ export const CUSTOM_SEARCH_TOOL_NAMES = ["search-the-web", "search_and_read", "g
/** Thinking block types that require signature validation by the API */
const THINKING_TYPES = new Set(["thinking", "redacted_thinking"]);
/**
* Maximum number of native web searches allowed per session (agent unit).
* The Anthropic API's `max_uses` is per-request — it resets on each API call.
* When `pause_turn` triggers a resubmit, the model gets a fresh budget.
* This session-level cap prevents unbounded search accumulation (#1309).
*
* 15 = 3 full turns of 5 searches each — generous for research, but bounded.
*/
export const MAX_NATIVE_SEARCHES_PER_SESSION = 15;
/** When true, skip native web search injection and keep Brave/custom tools active on Anthropic. */
export function preferBraveSearch(): boolean {
// preferences.md takes priority over env var
@ -74,6 +84,11 @@ export function registerNativeSearchHooks(pi: NativeSearchPI): { getIsAnthropic:
let isAnthropicProvider = false;
let modelSelectFired = false;
// Session-level native search counter (#1309).
// Tracks cumulative web_search_tool_result blocks across all turns in a session.
// Reset on session_start. Used to compute remaining budget for max_uses.
let sessionSearchCount = 0;
// Track provider changes via model selection — also handles diagnostics
// since model_select fires AFTER session_start and knows the provider.
pi.on("model_select", async (event: any, ctx: any) => {
@ -161,13 +176,41 @@ export function registerNativeSearchHooks(pi: NativeSearchPI): { getIsAnthropic:
);
payload.tools = tools;
// ── Session-level search budget (#1309) ──────────────────────────────
// Count web_search_tool_result blocks in the conversation history to
// determine how many native searches have already been used this session.
// The Anthropic API's max_uses resets per request, so without this guard,
// pause_turn → resubmit cycles allow unlimited total searches.
if (Array.isArray(messages)) {
let historySearchCount = 0;
for (const msg of messages) {
const content = msg.content;
if (!Array.isArray(content)) continue;
for (const block of content) {
if ((block as any)?.type === "web_search_tool_result") {
historySearchCount++;
}
}
}
// Sync counter from history (handles session restore / context replay)
sessionSearchCount = historySearchCount;
}
const remaining = Math.max(0, MAX_NATIVE_SEARCHES_PER_SESSION - sessionSearchCount);
if (remaining <= 0) {
// Budget exhausted — don't inject the search tool at all.
// The model will proceed without web search capability.
return payload;
}
tools.push({
type: "web_search_20250305",
name: "web_search",
// Cap server-side searches per response to prevent the model from
// looping on web_search without synthesizing results (#817).
// 5 searches is generous — most queries need 1-2.
max_uses: 5,
// Cap per-request searches to the lesser of 5 (per-turn cap) or the
// remaining session budget (#1309). This prevents the model from
// consuming unlimited searches via pause_turn → resubmit cycles.
max_uses: Math.min(5, remaining),
});
return payload;
@ -175,6 +218,9 @@ export function registerNativeSearchHooks(pi: NativeSearchPI): { getIsAnthropic:
// Basic startup diagnostics — provider-specific info comes from model_select
pi.on("session_start", async (_event: any, ctx: any) => {
// Reset session-level search budget (#1309)
sessionSearchCount = 0;
const hasBrave = !!process.env.BRAVE_API_KEY;
const hasJina = !!process.env.JINA_API_KEY;
const hasAnswers = !!process.env.BRAVE_ANSWERS_KEY;

View file

@ -5,6 +5,7 @@ import {
stripThinkingFromHistory,
BRAVE_TOOL_NAMES,
CUSTOM_SEARCH_TOOL_NAMES,
MAX_NATIVE_SEARCHES_PER_SESSION,
type NativeSearchPI,
} from "../resources/extensions/search-the-web/native-search.ts";
@ -688,6 +689,203 @@ test("model_select DOES show notification on explicit user set", async () => {
assert.ok(nativeNotif, "Should show notification on explicit 'set' source");
});
// ─── Session-level search budget (#1309) ────────────────────────────────────
test("session search budget: max_uses decreases as history accumulates search results", async () => {
  const mockPi = createMockPI();
  registerNativeSearchHooks(mockPi);

  // Select an Anthropic model so native web search injection is active.
  await mockPi.fire("model_select", {
    type: "model_select",
    model: { provider: "anthropic", name: "claude-sonnet-4-6" },
    previousModel: undefined,
    source: "set",
  });

  // Produces the result blocks ws<first>..ws<last> that completed native
  // searches leave behind in an assistant message.
  const searchResults = (first: number, last: number): any[] =>
    Array.from({ length: last - first + 1 }, (_, offset) => ({
      type: "web_search_tool_result",
      tool_use_id: `ws${first + offset}`,
      content: [],
    }));

  // Two assistant turns of five searches each: ten results already in history.
  const history: any[] = [
    { role: "user", content: "research this topic" },
    {
      role: "assistant",
      content: [...searchResults(1, 5), { type: "text", text: "Here are some results..." }],
    },
    { role: "user", content: "continue" },
    {
      role: "assistant",
      content: [...searchResults(6, 10), { type: "text", text: "More results..." }],
    },
    { role: "user", content: "keep going" },
  ];

  const request: Record<string, unknown> = {
    model: "claude-sonnet-4-6-20250514",
    tools: [{ name: "bash", type: "custom" }],
    messages: history,
  };
  const hookResult = await mockPi.fire("before_provider_request", {
    type: "before_provider_request",
    payload: request,
  });

  // The hook may return the payload or mutate it in place; accept either.
  const toolList = ((hookResult as any)?.tools ?? request.tools) as any[];
  const webSearch = toolList.find((tool: any) => tool.type === "web_search_20250305");

  assert.ok(webSearch, "Should still inject web_search when budget remaining");
  // 15 allowed - 10 used = 5 left, which equals the per-request cap of 5.
  assert.equal(webSearch.max_uses, 5, "Should cap at min(5, remaining)");
});
test("session search budget: reduces max_uses when close to limit", async () => {
  const mockPi = createMockPI();
  registerNativeSearchHooks(mockPi);

  // Select an Anthropic model so native web search injection is active.
  await mockPi.fire("model_select", {
    type: "model_select",
    model: { provider: "anthropic", name: "claude-sonnet-4-6" },
    previousModel: undefined,
    source: "set",
  });

  // Thirteen prior search results (ws0..ws12) leave only two in the budget.
  const usedSearches = 13;
  const assistantContent: any[] = [];
  for (let index = 0; index < usedSearches; index += 1) {
    assistantContent.push({
      type: "web_search_tool_result",
      tool_use_id: `ws${index}`,
      content: [],
    });
  }
  assistantContent.push({ type: "text", text: "results" });

  const request: Record<string, unknown> = {
    model: "claude-sonnet-4-6-20250514",
    tools: [{ name: "bash", type: "custom" }],
    messages: [
      { role: "user", content: "research" },
      { role: "assistant", content: assistantContent },
      { role: "user", content: "more" },
    ],
  };
  const hookResult = await mockPi.fire("before_provider_request", {
    type: "before_provider_request",
    payload: request,
  });

  // The hook may return the payload or mutate it in place; accept either.
  const toolList = ((hookResult as any)?.tools ?? request.tools) as any[];
  const webSearch = toolList.find((tool: any) => tool.type === "web_search_20250305");

  assert.ok(webSearch, "Should still inject when budget > 0");
  // 15 allowed - 13 used = 2 left, below the per-request cap of 5.
  assert.equal(webSearch.max_uses, 2, "Should reduce max_uses to remaining budget");
});
test("session search budget: omits web_search tool when budget exhausted", async () => {
  const mockPi = createMockPI();
  registerNativeSearchHooks(mockPi);

  // Select an Anthropic model so native web search injection is active.
  await mockPi.fire("model_select", {
    type: "model_select",
    model: { provider: "anthropic", name: "claude-sonnet-4-6" },
    previousModel: undefined,
    source: "set",
  });

  // History already holds exactly the session quota of search results,
  // so nothing is left to spend.
  const assistantContent: any[] = [];
  for (let index = 0; index < MAX_NATIVE_SEARCHES_PER_SESSION; index += 1) {
    assistantContent.push({
      type: "web_search_tool_result",
      tool_use_id: `ws${index}`,
      content: [],
    });
  }
  assistantContent.push({ type: "text", text: "results" });

  const request: Record<string, unknown> = {
    model: "claude-sonnet-4-6-20250514",
    tools: [{ name: "bash", type: "custom" }],
    messages: [
      { role: "user", content: "research" },
      { role: "assistant", content: assistantContent },
      { role: "user", content: "more" },
    ],
  };
  const hookResult = await mockPi.fire("before_provider_request", {
    type: "before_provider_request",
    payload: request,
  });

  // The hook may return the payload or mutate it in place; accept either.
  const toolList = ((hookResult as any)?.tools ?? request.tools) as any[];
  const webSearch = toolList.find((tool: any) => tool.type === "web_search_20250305");

  assert.equal(webSearch, undefined, "Should NOT inject web_search when budget exhausted (#1309)");
  // Exhausting the search budget must not strip unrelated tools.
  const bashStillPresent = toolList.some((tool: any) => tool.name === "bash");
  assert.ok(bashStillPresent, "Non-search tools should remain");
});
test("session search budget: resets on session_start", async () => {
  const pi = createMockPI();
  registerNativeSearchHooks(pi);

  // Select an Anthropic model so native web search injection is active.
  await pi.fire("model_select", {
    type: "model_select",
    model: { provider: "anthropic", name: "claude-sonnet-4-6" },
    previousModel: undefined,
    source: "set",
  });

  // First session: a history holding the full quota exhausts the budget.
  const searchBlocks = Array.from({ length: MAX_NATIVE_SEARCHES_PER_SESSION }, (_, i) => ({
    type: "web_search_tool_result",
    tool_use_id: `ws${i}`,
    content: [],
  }));
  let payload: Record<string, unknown> = {
    model: "claude-sonnet-4-6-20250514",
    tools: [{ name: "bash", type: "custom" }],
    messages: [
      { role: "user", content: "research" },
      { role: "assistant", content: [...searchBlocks] },
      { role: "user", content: "more" },
    ],
  };
  await pi.fire("before_provider_request", { type: "before_provider_request", payload });
  let tools = payload.tools as any[];
  assert.ok(!tools.some((t: any) => t.type === "web_search_20250305"), "Budget should be exhausted");

  // New session starts — counter resets
  await pi.fire("session_start", { type: "session_start" });

  // Probe the reset itself. The hook re-syncs its counter from history only
  // when `messages` is an array, so a request WITHOUT `messages` exposes the
  // stored counter: if session_start had not reset it, the exhausted count
  // from the first session would still suppress the tool here. (A request
  // with an empty history alone cannot detect a missing reset, because the
  // history sync would zero the counter anyway.)
  payload = {
    model: "claude-sonnet-4-6-20250514",
    tools: [{ name: "bash", type: "custom" }],
  };
  let result = await pi.fire("before_provider_request", { type: "before_provider_request", payload });
  tools = ((result as any)?.tools ?? payload.tools) as any[];
  let nativeTool = tools.find((t: any) => t.type === "web_search_20250305");
  assert.ok(nativeTool, "Should inject web_search after session reset");
  assert.equal(nativeTool.max_uses, 5, "Should have full per-turn budget after reset");

  // New request with no search history — full budget available
  payload = {
    model: "claude-sonnet-4-6-20250514",
    tools: [{ name: "bash", type: "custom" }],
    messages: [{ role: "user", content: "new research" }],
  };
  result = await pi.fire("before_provider_request", { type: "before_provider_request", payload });
  tools = ((result as any)?.tools ?? payload.tools) as any[];
  nativeTool = tools.find((t: any) => t.type === "web_search_20250305");
  assert.ok(nativeTool, "Should inject web_search after session reset");
  assert.equal(nativeTool.max_uses, 5, "Should have full per-turn budget after reset");
});
test("MAX_NATIVE_SEARCHES_PER_SESSION is exported and equals 15", () => {
  // Pins the published session ceiling so a change to it is a deliberate one.
  const expectedBudget = 15;
  assert.equal(
    MAX_NATIVE_SEARCHES_PER_SESSION,
    expectedBudget,
    "Session budget should be 15 (#1309)",
  );
});
// ─── stripThinkingFromHistory tests ─────────────────────────────────────────
test("stripThinkingFromHistory removes thinking from earlier assistant messages", () => {