diff --git a/src/resources/extensions/gsd/prompts/research-milestone.md b/src/resources/extensions/gsd/prompts/research-milestone.md index b67516e3b..77210ded4 100644 --- a/src/resources/extensions/gsd/prompts/research-milestone.md +++ b/src/resources/extensions/gsd/prompts/research-milestone.md @@ -25,9 +25,10 @@ Then research the codebase and relevant technologies. Narrate key findings and s 2. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}} 3. Explore relevant code. For small/familiar codebases, use `rg`, `find`, and targeted reads. For large or unfamiliar codebases, use `scout` to build a broad map efficiently before diving in. 4. Use `resolve_library` / `get_library_docs` for unfamiliar libraries — skip this for libraries already used in the codebase -5. Use the **Research** output template from the inlined context above — include only sections that have real content -6. If `.gsd/REQUIREMENTS.md` exists, research against it. Identify which Active requirements are table stakes, likely omissions, overbuilt risks, or domain-standard behaviors the user may or may not want. -7. Write `{{outputPath}}` +5. **Web search budget:** You have a limited budget of web searches (max ~15 per session). Use them strategically — prefer `resolve_library` / `get_library_docs` for library documentation. Do NOT repeat the same or similar queries. If a search didn't find what you need, rephrase once or move on. Target 3-5 total web searches for a typical research unit. +6. Use the **Research** output template from the inlined context above — include only sections that have real content +7. If `.gsd/REQUIREMENTS.md` exists, research against it. Identify which Active requirements are table stakes, likely omissions, overbuilt risks, or domain-standard behaviors the user may or may not want. +8. 
Write `{{outputPath}}` ## Strategic Questions to Answer diff --git a/src/resources/extensions/gsd/prompts/research-slice.md b/src/resources/extensions/gsd/prompts/research-slice.md index c440851e1..b64e75257 100644 --- a/src/resources/extensions/gsd/prompts/research-slice.md +++ b/src/resources/extensions/gsd/prompts/research-slice.md @@ -46,8 +46,9 @@ Research what this slice needs. Narrate key findings and surprises as you go — 2. **Skill Discovery ({{skillDiscoveryMode}}):**{{skillDiscoveryInstructions}} 3. Explore relevant code for this slice's scope. For targeted exploration, use `rg`, `find`, and reads. For broad or unfamiliar subsystems, use `scout` to map the relevant area first. 4. Use `resolve_library` / `get_library_docs` for unfamiliar libraries — skip this for libraries already used in the codebase -5. Use the **Research** output template from the inlined context above — include only sections that have real content. The template is already inlined above; do NOT attempt to read any template file from disk (there is no `templates/SLICE-RESEARCH.md` — the correct template is already present in this prompt). -6. Write `{{outputPath}}` +5. **Web search budget:** You have a limited budget of web searches (max ~15 per session). Use them strategically — prefer `resolve_library` / `get_library_docs` for library documentation. Do NOT repeat the same or similar queries. If a search didn't find what you need, rephrase once or move on. Target 3-5 total web searches for a typical research unit. +6. Use the **Research** output template from the inlined context above — include only sections that have real content. The template is already inlined above; do NOT attempt to read any template file from disk (there is no `templates/SLICE-RESEARCH.md` — the correct template is already present in this prompt). +7. Write `{{outputPath}}` The slice directory already exists at `{{slicePath}}/`. Do NOT mkdir — just write the file. 
diff --git a/src/resources/extensions/search-the-web/native-search.ts b/src/resources/extensions/search-the-web/native-search.ts index d57c528b7..46b355e00 100644 --- a/src/resources/extensions/search-the-web/native-search.ts +++ b/src/resources/extensions/search-the-web/native-search.ts @@ -16,6 +16,16 @@ export const CUSTOM_SEARCH_TOOL_NAMES = ["search-the-web", "search_and_read", "g /** Thinking block types that require signature validation by the API */ const THINKING_TYPES = new Set(["thinking", "redacted_thinking"]); +/** + * Maximum number of native web searches allowed per session (agent unit). + * The Anthropic API's `max_uses` is per-request — it resets on each API call. + * When `pause_turn` triggers a resubmit, the model gets a fresh budget. + * This session-level cap prevents unbounded search accumulation (#1309). + * + * 15 = 3 full turns of 5 searches each — generous for research, but bounded. + */ +export const MAX_NATIVE_SEARCHES_PER_SESSION = 15; + /** When true, skip native web search injection and keep Brave/custom tools active on Anthropic. */ export function preferBraveSearch(): boolean { // preferences.md takes priority over env var @@ -74,6 +84,11 @@ export function registerNativeSearchHooks(pi: NativeSearchPI): { getIsAnthropic: let isAnthropicProvider = false; let modelSelectFired = false; + // Session-level native search counter (#1309). + // Tracks cumulative web_search_tool_result blocks across all turns in a session. + // Reset on session_start. Used to compute remaining budget for max_uses. + let sessionSearchCount = 0; + // Track provider changes via model selection — also handles diagnostics // since model_select fires AFTER session_start and knows the provider. 
pi.on("model_select", async (event: any, ctx: any) => { @@ -161,13 +176,41 @@ export function registerNativeSearchHooks(pi: NativeSearchPI): { getIsAnthropic: ); payload.tools = tools; + // ── Session-level search budget (#1309) ────────────────────────────── + // Count web_search_tool_result blocks in the conversation history to + // determine how many native searches have already been used this session. + // The Anthropic API's max_uses resets per request, so without this guard, + // pause_turn → resubmit cycles allow unlimited total searches. + if (Array.isArray(messages)) { + let historySearchCount = 0; + for (const msg of messages) { + const content = msg.content; + if (!Array.isArray(content)) continue; + for (const block of content) { + if ((block as any)?.type === "web_search_tool_result") { + historySearchCount++; + } + } + } + // Sync counter from history (handles session restore / context replay) + sessionSearchCount = historySearchCount; + } + + const remaining = Math.max(0, MAX_NATIVE_SEARCHES_PER_SESSION - sessionSearchCount); + + if (remaining <= 0) { + // Budget exhausted — don't inject the search tool at all. + // The model will proceed without web search capability. + return payload; + } + tools.push({ type: "web_search_20250305", name: "web_search", - // Cap server-side searches per response to prevent the model from - // looping on web_search without synthesizing results (#817). - // 5 searches is generous — most queries need 1-2. - max_uses: 5, + // Cap per-request searches to the lesser of 5 (per-turn cap) or the + // remaining session budget (#1309). This prevents the model from + // consuming unlimited searches via pause_turn → resubmit cycles. 
+ max_uses: Math.min(5, remaining), }); return payload; @@ -175,6 +218,9 @@ export function registerNativeSearchHooks(pi: NativeSearchPI): { getIsAnthropic: // Basic startup diagnostics — provider-specific info comes from model_select pi.on("session_start", async (_event: any, ctx: any) => { + // Reset session-level search budget (#1309) + sessionSearchCount = 0; + const hasBrave = !!process.env.BRAVE_API_KEY; const hasJina = !!process.env.JINA_API_KEY; const hasAnswers = !!process.env.BRAVE_ANSWERS_KEY; diff --git a/src/tests/native-search.test.ts b/src/tests/native-search.test.ts index 3547c0ce2..9cabac87b 100644 --- a/src/tests/native-search.test.ts +++ b/src/tests/native-search.test.ts @@ -5,6 +5,7 @@ import { stripThinkingFromHistory, BRAVE_TOOL_NAMES, CUSTOM_SEARCH_TOOL_NAMES, + MAX_NATIVE_SEARCHES_PER_SESSION, type NativeSearchPI, } from "../resources/extensions/search-the-web/native-search.ts"; @@ -688,6 +689,203 @@ test("model_select DOES show notification on explicit user set", async () => { assert.ok(nativeNotif, "Should show notification on explicit 'set' source"); }); +// ─── Session-level search budget (#1309) ──────────────────────────────────── + +test("session search budget: max_uses decreases as history accumulates search results", async () => { + const pi = createMockPI(); + registerNativeSearchHooks(pi); + + await pi.fire("model_select", { + type: "model_select", + model: { provider: "anthropic", name: "claude-sonnet-4-6" }, + previousModel: undefined, + source: "set", + }); + + // Simulate a conversation with 10 web_search_tool_result blocks in history + const messages: any[] = [ + { role: "user", content: "research this topic" }, + { + role: "assistant", + content: [ + { type: "web_search_tool_result", tool_use_id: "ws1", content: [] }, + { type: "web_search_tool_result", tool_use_id: "ws2", content: [] }, + { type: "web_search_tool_result", tool_use_id: "ws3", content: [] }, + { type: "web_search_tool_result", tool_use_id: "ws4", content: [] 
}, + { type: "web_search_tool_result", tool_use_id: "ws5", content: [] }, + { type: "text", text: "Here are some results..." }, + ], + }, + { role: "user", content: "continue" }, + { + role: "assistant", + content: [ + { type: "web_search_tool_result", tool_use_id: "ws6", content: [] }, + { type: "web_search_tool_result", tool_use_id: "ws7", content: [] }, + { type: "web_search_tool_result", tool_use_id: "ws8", content: [] }, + { type: "web_search_tool_result", tool_use_id: "ws9", content: [] }, + { type: "web_search_tool_result", tool_use_id: "ws10", content: [] }, + { type: "text", text: "More results..." }, + ], + }, + { role: "user", content: "keep going" }, + ]; + + const payload: Record<string, unknown> = { + model: "claude-sonnet-4-6-20250514", + tools: [{ name: "bash", type: "custom" }], + messages, + }; + + const result = await pi.fire("before_provider_request", { + type: "before_provider_request", + payload, + }); + + const tools = ((result as any)?.tools ?? payload.tools) as any[]; + const nativeTool = tools.find((t: any) => t.type === "web_search_20250305"); + assert.ok(nativeTool, "Should still inject web_search when budget remaining"); + // 15 - 10 = 5 remaining, min(5, 5) = 5 + assert.equal(nativeTool.max_uses, 5, "Should cap at min(5, remaining)"); +}); + +test("session search budget: reduces max_uses when close to limit", async () => { + const pi = createMockPI(); + registerNativeSearchHooks(pi); + + await pi.fire("model_select", { + type: "model_select", + model: { provider: "anthropic", name: "claude-sonnet-4-6" }, + previousModel: undefined, + source: "set", + }); + + // 13 search results in history → only 2 remaining + const searchBlocks = Array.from({ length: 13 }, (_, i) => ({ + type: "web_search_tool_result", + tool_use_id: `ws${i}`, + content: [], + })); + + const messages: any[] = [ + { role: "user", content: "research" }, + { role: "assistant", content: [...searchBlocks, { type: "text", text: "results" }] }, + { role: "user", content: "more" }, + ]; + + 
const payload: Record<string, unknown> = { + model: "claude-sonnet-4-6-20250514", + tools: [{ name: "bash", type: "custom" }], + messages, + }; + + const result = await pi.fire("before_provider_request", { + type: "before_provider_request", + payload, + }); + + const tools = ((result as any)?.tools ?? payload.tools) as any[]; + const nativeTool = tools.find((t: any) => t.type === "web_search_20250305"); + assert.ok(nativeTool, "Should still inject when budget > 0"); + // 15 - 13 = 2 remaining + assert.equal(nativeTool.max_uses, 2, "Should reduce max_uses to remaining budget"); +}); + +test("session search budget: omits web_search tool when budget exhausted", async () => { + const pi = createMockPI(); + registerNativeSearchHooks(pi); + + await pi.fire("model_select", { + type: "model_select", + model: { provider: "anthropic", name: "claude-sonnet-4-6" }, + previousModel: undefined, + source: "set", + }); + + // 15+ search results in history → budget exhausted + const searchBlocks = Array.from({ length: MAX_NATIVE_SEARCHES_PER_SESSION }, (_, i) => ({ + type: "web_search_tool_result", + tool_use_id: `ws${i}`, + content: [], + })); + + const messages: any[] = [ + { role: "user", content: "research" }, + { role: "assistant", content: [...searchBlocks, { type: "text", text: "results" }] }, + { role: "user", content: "more" }, + ]; + + const payload: Record<string, unknown> = { + model: "claude-sonnet-4-6-20250514", + tools: [{ name: "bash", type: "custom" }], + messages, + }; + + const result = await pi.fire("before_provider_request", { + type: "before_provider_request", + payload, + }); + + const tools = ((result as any)?.tools ?? 
payload.tools) as any[]; + const nativeTool = tools.find((t: any) => t.type === "web_search_20250305"); + assert.equal(nativeTool, undefined, "Should NOT inject web_search when budget exhausted (#1309)"); + // Other tools should remain + assert.ok(tools.some((t: any) => t.name === "bash"), "Non-search tools should remain"); +}); + +test("session search budget: resets on session_start", async () => { + const pi = createMockPI(); + registerNativeSearchHooks(pi); + + await pi.fire("model_select", { + type: "model_select", + model: { provider: "anthropic", name: "claude-sonnet-4-6" }, + previousModel: undefined, + source: "set", + }); + + // First session: exhaust budget + const searchBlocks = Array.from({ length: MAX_NATIVE_SEARCHES_PER_SESSION }, (_, i) => ({ + type: "web_search_tool_result", + tool_use_id: `ws${i}`, + content: [], + })); + + let payload: Record<string, unknown> = { + model: "claude-sonnet-4-6-20250514", + tools: [{ name: "bash", type: "custom" }], + messages: [ + { role: "user", content: "research" }, + { role: "assistant", content: [...searchBlocks] }, + { role: "user", content: "more" }, + ], + }; + + await pi.fire("before_provider_request", { type: "before_provider_request", payload }); + let tools = (payload.tools as any[]); + assert.ok(!tools.some((t: any) => t.type === "web_search_20250305"), "Budget should be exhausted"); + + // New session starts — counter resets + await pi.fire("session_start", { type: "session_start" }); + + // New request with no history — full budget available + payload = { + model: "claude-sonnet-4-6-20250514", + tools: [{ name: "bash", type: "custom" }], + messages: [{ role: "user", content: "new research" }], + }; + + const result = await pi.fire("before_provider_request", { type: "before_provider_request", payload }); + tools = ((result as any)?.tools ?? 
payload.tools) as any[]; + const nativeTool = tools.find((t: any) => t.type === "web_search_20250305"); + assert.ok(nativeTool, "Should inject web_search after session reset"); + assert.equal(nativeTool.max_uses, 5, "Should have full per-turn budget after reset"); +}); + +test("MAX_NATIVE_SEARCHES_PER_SESSION is exported and equals 15", () => { + assert.equal(MAX_NATIVE_SEARCHES_PER_SESSION, 15, "Session budget should be 15 (#1309)"); +}); + // ─── stripThinkingFromHistory tests ───────────────────────────────────────── test("stripThinkingFromHistory removes thinking from earlier assistant messages", () => {