From ee836142ed2d1c7a9d5086b5afc6a2e985d0e660 Mon Sep 17 00:00:00 2001 From: Mikael Hugo Date: Tue, 5 May 2026 13:29:28 +0200 Subject: [PATCH] fix: harden sift codebase indexing --- src/resources/agents/scout.md | 2 +- .../extensions/sf/bootstrap/system-context.js | 2 +- .../extensions/sf/code-intelligence.d.ts | 6 +- .../extensions/sf/code-intelligence.js | 372 ++++++++++++++++-- .../extensions/sf/prompts/discuss-headless.md | 2 +- .../extensions/sf/prompts/discuss.md | 2 +- .../sf/prompts/guided-discuss-milestone.md | 2 +- .../sf/prompts/guided-discuss-slice.md | 2 +- src/resources/extensions/sf/prompts/queue.md | 2 +- src/resources/extensions/sf/prompts/system.md | 2 +- .../sf/tests/code-intelligence-sift.test.mjs | 133 +++++++ .../extensions/sf/tools/sift-search-tool.js | 26 +- src/resources/extensions/subagent/index.js | 44 ++- 13 files changed, 544 insertions(+), 53 deletions(-) create mode 100644 src/resources/extensions/sf/tests/code-intelligence-sift.test.mjs diff --git a/src/resources/agents/scout.md b/src/resources/agents/scout.md index c06b9ee63..b8c4ebcdd 100644 --- a/src/resources/agents/scout.md +++ b/src/resources/agents/scout.md @@ -8,7 +8,7 @@ You are a scout. Quickly investigate a codebase and return structured findings t Use in-process `grep`, `find`, `ls`, and `lsp` before shelling out. These keep exploration inside SF's tool surface and use native backends where available. -Use `codebase_search` as your PRIMARY tool for conceptual, behavioral, or architectural discovery (e.g. "how does X work?", "where is Y handled?"). It uses Sift-backed hybrid BM25/vector retrieval and is significantly more effective than grep for navigating unfamiliar logic. Use `sift_search` when you need agentic multi-turn research, explicit strategy selection (e.g. `page-index-hybrid`, `path-hybrid`), or planner configuration. Use exact text search (`grep`) only when you already have a specific identifier or filename in mind. 
You are still the scout role; Sift is the powerful primitive you should lead with for exploration. +Use `grep`, `find`, and `ls` for broad orientation first. Use `codebase_search` for conceptual, behavioral, or architectural discovery only with a narrow scope and when the project code-intelligence status says Sift is healthy enough for this repo. Use `sift_search` when you need explicit strategy selection (e.g. `bm25`, `path-hybrid`, `page-index-hybrid`) and a scoped path. If Sift is degraded, slow, or empty, fall back to grep/find/ls and direct reads. Each repo has its own Sift cache under `.sf/runtime/sift/`. Your output will be passed to an agent who has NOT seen the files you explored. diff --git a/src/resources/extensions/sf/bootstrap/system-context.js b/src/resources/extensions/sf/bootstrap/system-context.js index 392da9134..282bc9de3 100644 --- a/src/resources/extensions/sf/bootstrap/system-context.js +++ b/src/resources/extensions/sf/bootstrap/system-context.js @@ -112,7 +112,7 @@ through these tiers IN ORDER. Skip a tier only when it has been demonstrably exhausted, not just because the next tier is faster. 
Tier 1 — Code lookup: - - sift / codebase_search for symbols, patterns, prior usages + - grep/find/ls for broad orientation; scoped sift / codebase_search for symbols, patterns, prior usages when Sift status is healthy for the repo - Read source files (Read tool, file paths from PLAN/CODEBASE) - Inspect .sf/DECISIONS.md, .sf/KNOWLEDGE.md, docs/design-docs/, docs/records/ - Check tests for documented behavior diff --git a/src/resources/extensions/sf/code-intelligence.d.ts b/src/resources/extensions/sf/code-intelligence.d.ts index ea97f587c..f3e54d4f2 100644 --- a/src/resources/extensions/sf/code-intelligence.d.ts +++ b/src/resources/extensions/sf/code-intelligence.d.ts @@ -2,7 +2,11 @@ export const PROJECT_RAG_MCP_SERVER_NAME: string; export function detectProjectRag(projectRoot: string, prefs: Record, env?: NodeJS.ProcessEnv): unknown; export function resolveProjectRagBinary(env?: NodeJS.ProcessEnv): string | null; export function resolveSiftBinary(env?: NodeJS.ProcessEnv): string | null; -export function detectSift(_projectRoot: string, prefs: Record, env?: NodeJS.ProcessEnv): unknown; +export function resolveSiftWarmupRuntimeDirs(projectRoot: string): { searchCache: string; tmpDir: string }; +export function ensureSiftRuntimeDirs(projectRoot: string): { searchCache: string; tmpDir: string }; +export function buildSiftEnv(projectRoot: string, env: NodeJS.ProcessEnv): NodeJS.ProcessEnv; +export function resolveSiftSearchScope(projectRoot: string, scope?: string): string; +export function detectSift(projectRoot: string, prefs: Record, env?: NodeJS.ProcessEnv): unknown; export function ensureSiftIndexWarmup(projectRoot: string, prefs: Record, options?: Record): Promise; export function resolveProjectRagBuildJobs(env?: NodeJS.ProcessEnv): number; export function findProjectRagSourceDir(projectRoot: string, env?: NodeJS.ProcessEnv): string | null; diff --git a/src/resources/extensions/sf/code-intelligence.js b/src/resources/extensions/sf/code-intelligence.js index 
bac56ce80..fca6c252b 100644 --- a/src/resources/extensions/sf/code-intelligence.js +++ b/src/resources/extensions/sf/code-intelligence.js @@ -5,8 +5,8 @@ * accelerators for local code retrieval. */ import { spawn, spawnSync } from "node:child_process"; -import { existsSync, mkdirSync, readFileSync, statSync, writeFileSync, } from "node:fs"; -import { delimiter, join, resolve } from "node:path"; +import { existsSync, mkdirSync, readFileSync, readdirSync, statSync, writeFileSync, } from "node:fs"; +import { delimiter, isAbsolute, join, relative, resolve } from "node:path"; export const PROJECT_RAG_MCP_SERVER_NAME = "project-rag"; const PROJECT_RAG_BINARY_NAME = process.platform === "win32" ? "project-rag.exe" : "project-rag"; const SIFT_BINARY_NAME = process.platform === "win32" ? "sift.exe" : "sift"; @@ -22,8 +22,17 @@ const DEFAULT_SIFT_WARMUP_TTL_MS = 6 * 60 * 60 * 1000; const DEFAULT_SIFT_WARMUP_QUERY = "repo architecture source tests entrypoints configuration"; const DEFAULT_SIFT_WARMUP_LIMIT = 1; const DEFAULT_SIFT_WARMUP_RETRIEVER_TIMEOUT_MS = 30_000; -const DEFAULT_SIFT_WARMUP_HARD_TIMEOUT_SEC = 30; +const DEFAULT_SIFT_WARMUP_HARD_TIMEOUT_SEC = 600; const SIFT_WARMUP_KILL_GRACE_SEC = 10; +const DEFAULT_SIFT_HEALTH_TIMEOUT_MS = 60_000; +const SIFT_HEALTH_CACHE = new Map(); +const SIFT_CACHE_POLLUTION_PATTERNS = [ + { label: ".claude worktrees", pattern: /(?:^|[/\\])\.claude[/\\]/ }, + { label: ".git internals", pattern: /(?:^|[/\\])\.git[/\\]/ }, + { label: "dist-test output", pattern: /(?:^|[/\\])dist-test[/\\]/ }, + { label: "node_modules", pattern: /(?:^|[/\\])node_modules[/\\]/ }, + { label: "package dist output", pattern: /(?:^|[/\\])packages[/\\][^/\\]+[/\\]dist[/\\]/ }, +]; export function resolveSiftWarmupRuntimeDirs(projectRoot) { const runtimeRoot = join(projectRoot, ".sf", "runtime", "sift"); return { @@ -31,6 +40,20 @@ export function resolveSiftWarmupRuntimeDirs(projectRoot) { tmpDir: join(runtimeRoot, "tmp"), }; } +/** + * Ensure the repo-local 
Sift runtime directories exist. + * + * Purpose: keep Sift's search database scoped to the current repository instead + * of sharing a process-global cache across unrelated projects. + * + * Consumer: Sift warmup, status probes, `sift_search`, and `codebase_search`. + */ +export function ensureSiftRuntimeDirs(projectRoot) { + const dirs = resolveSiftWarmupRuntimeDirs(projectRoot); + mkdirSync(dirs.searchCache, { recursive: true }); + mkdirSync(dirs.tmpDir, { recursive: true }); + return dirs; +} export function buildSiftEnv(projectRoot, env) { const dirs = resolveSiftWarmupRuntimeDirs(projectRoot); return { @@ -39,6 +62,27 @@ export function buildSiftEnv(projectRoot, env) { TMPDIR: dirs.tmpDir, }; } +/** + * Resolve a Sift search scope to the form Sift's local ignore matcher expects. + * + * Purpose: preserve `.siftignore` semantics by running Sift from the repository + * root with repo-relative scopes instead of absolute paths. + * + * Consumer: Sift warmup, `sift_search`, and `codebase_search`. + */ +export function resolveSiftSearchScope(projectRoot, scope) { + const normalizedRoot = normalizeProjectRoot(projectRoot); + const requested = typeof scope === "string" && scope.trim() ? scope.trim() : "."; + const absolute = isAbsolute(requested) + ? resolve(requested) + : resolve(normalizedRoot, requested); + const rel = relative(normalizedRoot, absolute); + if (!rel) + return "."; + if (!rel.startsWith("..") && !isAbsolute(rel)) + return rel; + return requested; +} function readJsonConfig(configPath) { if (!existsSync(configPath)) return {}; @@ -195,7 +239,156 @@ export function resolveSiftBinary(env = process.env) { return (lookupExecutable(SIFT_BINARY_NAME, env) ?? (SIFT_BINARY_NAME === "sift" ? 
null : lookupExecutable("sift", env))); } -export function detectSift(_projectRoot, prefs, env = process.env) { +function resolveSiftHealthTimeoutMs(env) { + const raw = env.SF_SIFT_HEALTH_TIMEOUT_MS?.trim(); + if (!raw) + return DEFAULT_SIFT_HEALTH_TIMEOUT_MS; + const parsed = Number.parseInt(raw, 10); + return Number.isFinite(parsed) && parsed > 0 ? parsed : DEFAULT_SIFT_HEALTH_TIMEOUT_MS; +} +function resolveSiftHealthProbePath(projectRoot) { + for (const candidate of ["src", "packages", "tests"]) { + const absolute = join(projectRoot, candidate); + if (existsSync(absolute)) + return candidate; + } + return "."; +} +function runSiftHealthProbe(projectRoot, binaryPath, env) { + const normalizedRoot = normalizeProjectRoot(projectRoot); + const timeoutMs = resolveSiftHealthTimeoutMs(env); + const probePath = resolveSiftHealthProbePath(normalizedRoot); + const cacheKey = [ + normalizedRoot, + binaryPath, + env.SIFT_PATH ?? "", + env.SF_SIFT_HEALTH_TIMEOUT_MS ?? "", + env.SF_SIFT_HEALTHCHECK_DISABLE ?? 
"", + ].join("\0"); + if (SIFT_HEALTH_CACHE.has(cacheKey)) + return SIFT_HEALTH_CACHE.get(cacheKey); + const dirs = ensureSiftRuntimeDirs(normalizedRoot); + if (env.SF_SIFT_HEALTHCHECK_DISABLE === "1") { + const skipped = { + ok: true, + probePath, + timeoutMs, + searchCache: dirs.searchCache, + tmpDir: dirs.tmpDir, + reason: "sift health probe disabled", + }; + SIFT_HEALTH_CACHE.set(cacheKey, skipped); + return skipped; + } + const result = spawnSync(binaryPath, [ + "search", + "--json", + "--strategy", + "bm25", + "--limit", + "1", + "--retriever-timeout-ms", + String(Math.min(timeoutMs, 1_000)), + probePath, + "function", + ], { + cwd: normalizedRoot, + env: buildSiftEnv(normalizedRoot, env), + encoding: "utf-8", + maxBuffer: 1024 * 1024, + timeout: timeoutMs, + }); + const probe = { + ok: result.status === 0, + probePath, + timeoutMs, + searchCache: dirs.searchCache, + tmpDir: dirs.tmpDir, + status: result.status, + signal: result.signal, + stderr: result.stderr, + reason: "", + }; + if (probe.ok) { + probe.reason = `sift scoped health probe passed for ${probePath}`; + } + else if (result.error?.code === "ETIMEDOUT" || result.signal) { + probe.reason = `sift scoped health probe timed out after ${timeoutMs}ms for ${probePath}`; + } + else if (result.error) { + probe.reason = `sift scoped health probe failed: ${result.error.message}`; + } + else { + const detail = String(result.stderr || "").trim(); + probe.reason = detail + ? `sift scoped health probe failed: ${detail.slice(0, 300)}` + : `sift scoped health probe exited ${result.status ?? 
"unknown"}`; + } + SIFT_HEALTH_CACHE.set(cacheKey, probe); + return probe; +} +function listFilesCapped(root, maxFiles = 32) { + const files = []; + const visit = (dir) => { + if (files.length >= maxFiles) + return; + let entries = []; + try { + entries = readdirSync(dir, { withFileTypes: true }); + } + catch { + return; + } + for (const entry of entries) { + if (files.length >= maxFiles) + return; + const path = join(dir, entry.name); + if (entry.isDirectory()) { + visit(path); + } + else if (entry.isFile()) { + files.push(path); + } + } + }; + visit(root); + return files; +} +function inspectSiftCache(projectRoot) { + const dirs = resolveSiftWarmupRuntimeDirs(projectRoot); + const manifestRoot = join(dirs.searchCache, "artifacts", "manifests"); + const samples = []; + for (const manifest of listFilesCapped(manifestRoot, 16)) { + let text = ""; + try { + text = readFileSync(manifest).toString("utf-8"); + } + catch { + continue; + } + for (const { label, pattern } of SIFT_CACHE_POLLUTION_PATTERNS) { + const match = text.match(pattern); + if (match) { + const start = Math.max(0, (match.index ?? 0) - 80); + const end = Math.min(text.length, (match.index ?? 
0) + 160); + const sample = text + .slice(start, end) + .replace(/[^\x20-\x7E]+/g, " ") + .trim(); + samples.push({ label, sample }); + break; + } + } + if (samples.length >= 5) + break; + } + return { + inspected: existsSync(manifestRoot), + polluted: samples.length > 0, + samples, + }; +} +export function detectSift(projectRoot, prefs, env = process.env) { if (prefs?.indexer_backend === "none") { return { backend: "sift", @@ -221,14 +414,58 @@ export function detectSift(_projectRoot, prefs, env = process.env) { reason: "SIFT_PATH is set but does not resolve to an executable file.", }; } + const warmup = readSiftWarmupMarker(projectRoot); + if (warmup?.status === "warming") { + const dirs = ensureSiftRuntimeDirs(projectRoot); + return { + backend: "sift", + status: "warming", + command: binaryPath, + binaryPath, + searchCache: dirs.searchCache, + tmpDir: dirs.tmpDir, + probePath: warmup.scope ?? ".", + reason: `${explicit ? "sift binary resolved from SIFT_PATH" : "sift binary found on PATH"}; repo-local Sift index warmup is still running`, + markerPath: warmup.markerPath, + }; + } + const health = runSiftHealthProbe(projectRoot, binaryPath, env); + if (!health.ok) { + return { + backend: "sift", + status: "degraded", + command: binaryPath, + binaryPath, + searchCache: health.searchCache, + tmpDir: health.tmpDir, + probePath: health.probePath, + reason: `${explicit ? "sift binary resolved from SIFT_PATH" : "sift binary found on PATH"} but ${health.reason}`, + }; + } + const cacheInspection = inspectSiftCache(projectRoot); + if (cacheInspection.polluted) { + return { + backend: "sift", + status: "degraded", + command: binaryPath, + binaryPath, + searchCache: health.searchCache, + tmpDir: health.tmpDir, + probePath: health.probePath, + cacheInspection, + reason: `${explicit ? 
"sift binary resolved from SIFT_PATH" : "sift binary found on PATH"} but repo-local Sift cache contains ignored/generated paths`, + }; + } return { backend: "sift", status: "configured", command: binaryPath, binaryPath, - reason: explicit - ? "sift binary resolved from SIFT_PATH" - : "sift binary found on PATH", + searchCache: health.searchCache, + tmpDir: health.tmpDir, + probePath: health.probePath, + cacheInspection, + reason: `${explicit ? "sift binary resolved from SIFT_PATH" : "sift binary found on PATH"}; ${health.reason}`, }; } function isFreshMarker(markerPath, now, ttlMs) { @@ -237,6 +474,11 @@ function isFreshMarker(markerPath, now, ttlMs) { if (now - stat.mtimeMs >= ttlMs) return false; const parsed = JSON.parse(readFileSync(markerPath, "utf-8")); + if (parsed.schemaVersion === 3) { + if (parsed.status === "warming" && parsed.pid && !isProcessAlive(parsed.pid)) + return false; + return typeof parsed.scope === "string" && parsed.scope.length > 0; + } return (parsed.schemaVersion === 2 && Array.isArray(parsed.args) && parsed.args.at(-2) === "."); @@ -245,6 +487,38 @@ function isFreshMarker(markerPath, now, ttlMs) { return false; } } +function readSiftWarmupMarker(projectRoot) { + const markerPath = join(projectRoot, ".sf", "runtime", "sift-index-warmup.json"); + try { + if (!existsSync(markerPath)) + return null; + const parsed = JSON.parse(readFileSync(markerPath, "utf-8")); + if (parsed.schemaVersion !== 3) + return null; + if (parsed.status !== "warming") + return null; + if (parsed.pid && !isProcessAlive(parsed.pid)) + return null; + const started = Date.parse(parsed.startedAt); + const hardTimeoutSec = Number(parsed.hardTimeoutSec ?? 
DEFAULT_SIFT_WARMUP_HARD_TIMEOUT_SEC); + const expiresAt = started + Math.max(60, hardTimeoutSec + SIFT_WARMUP_KILL_GRACE_SEC) * 1000; + if (!Number.isFinite(started) || Date.now() > expiresAt) + return null; + return { ...parsed, markerPath }; + } + catch { + return null; + } +} +function isProcessAlive(pid) { + try { + process.kill(Number(pid), 0); + return true; + } + catch { + return false; + } +} export function ensureSiftIndexWarmup(projectRoot, prefs, options = {}) { const env = options.env ?? process.env; const backendName = resolveEffectiveCodebaseIndexerBackendName(projectRoot, prefs, env); @@ -254,8 +528,18 @@ export function ensureSiftIndexWarmup(projectRoot, prefs, options = {}) { reason: `effective codebase indexer is ${backendName}`, }; } - const detection = detectSift(projectRoot, prefs, env); - if (detection.status !== "configured" || !detection.binaryPath) { + const detection = detectSift(projectRoot, prefs, { + ...env, + SF_SIFT_HEALTHCHECK_DISABLE: "1", + }); + if (detection.status === "warming") { + return { + status: "skipped", + reason: "sift index warmup is already running", + markerPath: detection.markerPath, + }; + } + if (!["configured", "degraded"].includes(detection.status) || !detection.binaryPath) { return { status: "unavailable", reason: detection.reason, @@ -271,6 +555,7 @@ export function ensureSiftIndexWarmup(projectRoot, prefs, options = {}) { markerPath, }; } + const scope = resolveSiftSearchScope(projectRoot, options.scope ?? "."); const siftArgs = [ "search", "--json", @@ -280,7 +565,7 @@ export function ensureSiftIndexWarmup(projectRoot, prefs, options = {}) { String(options.limit ?? DEFAULT_SIFT_WARMUP_LIMIT), "--retriever-timeout-ms", String(options.retrieverTimeoutMs ?? DEFAULT_SIFT_WARMUP_RETRIEVER_TIMEOUT_MS), - ".", + scope, options.query ?? 
DEFAULT_SIFT_WARMUP_QUERY, ]; const hardTimeoutSec = resolveSiftWarmupHardTimeoutSec(env, options.hardTimeoutSec); @@ -298,27 +583,30 @@ export function ensureSiftIndexWarmup(projectRoot, prefs, options = {}) { : "sift page-index-hybrid warmup started (no timeout(1)/gtimeout on PATH; running unbounded)"; try { const runtimeDirs = resolveSiftWarmupRuntimeDirs(projectRoot); - mkdirSync(join(projectRoot, ".sf", "runtime"), { recursive: true }); - mkdirSync(runtimeDirs.searchCache, { recursive: true }); - mkdirSync(runtimeDirs.tmpDir, { recursive: true }); + ensureSiftRuntimeDirs(projectRoot); const childEnv = buildSiftEnv(projectRoot, env); - writeFileSync(markerPath, `${JSON.stringify({ - schemaVersion: 2, + const marker = { + schemaVersion: 3, + status: "warming", startedAt: new Date(now).toISOString(), command, cwd: projectRoot, args, + scope, siftBinary: detection.binaryPath, hardTimeoutSec: wrapper?.timeoutSec ?? null, searchCache: runtimeDirs.searchCache, tmpDir: runtimeDirs.tmpDir, - }, null, 2)}\n`, "utf-8"); + }; + writeFileSync(markerPath, `${JSON.stringify(marker, null, 2)}\n`, "utf-8"); const child = (options.spawnFn ?? spawn)(command, args, { cwd: projectRoot, env: childEnv, stdio: "ignore", detached: true, }); + marker.pid = child.pid ?? 
null; + writeFileSync(markerPath, `${JSON.stringify(marker, null, 2)}\n`, "utf-8"); child.unref(); return { status: "started", @@ -507,17 +795,23 @@ function buildSiftContextLines(projectRoot, prefs, env = process.env) { } else if (detection.status === "configured" && detection.binaryPath) { lines.push(`- Sift: configured as local CLI \`${detection.binaryPath}\`.`); - lines.push("- Use Sift for broad code retrieval before manual file-by-file reading, " + - "especially conceptual queries, exact identifiers, approximate file/path intent, and synthesis-ready snippets."); - lines.push("- Tool: `sift_search` exposes the full Sift CLI surface — use it for agentic multi-turn search, " + - "explicit strategy selection, and planner configuration."); - lines.push("- Tool: `codebase_search` is the platform-level wrapper — use it for simple conceptual queries."); + lines.push(`- Sift cache: project-scoped at \`${detection.searchCache}\`; do not use a shared/global Sift search database for this repo.`); + lines.push("- Use Sift with explicit, narrow paths after quick `grep`/`find`/`ls` orientation; avoid root-scope searches unless status proves they are responsive."); + lines.push("- Tool: `sift_search` exposes the full Sift CLI surface — prefer direct `bm25`, `path-hybrid`, or `page-index-hybrid` with a scoped `path`."); + lines.push("- Tool: `codebase_search` is the platform-level wrapper — use it only with a scoped `scope` when possible."); lines.push("- Strategy guide: `page-index-hybrid` (strongest recall + structural reranking), " + "`path-hybrid` (filename/path-heavy), `bm25` (fast lexical-only), `vector` (semantic-only)."); - lines.push("- Agent mode: enable `agent: true` on `sift_search` for multi-turn research. 
" + - "Use `agentMode: 'graph'` for disconnected code regions and `plannerStrategy: 'model-driven'` for LLM-guided planning."); - lines.push("- SF runs Sift warmup with a project-scoped `SIFT_SEARCH_CACHE` under `.sf/runtime/sift/` while leaving model cache shared; " + - "if the CLI is missing or fails, continue with `.sf/CODEBASE.md`, native `grep`/`find`/`ls`, `lsp`, and scout."); + lines.push("- If Sift is slow, empty, or times out, continue with `.sf/CODEBASE.md`, native `grep`/`find`/`ls`, `lsp`, and scout."); + } + else if (detection.status === "warming" && detection.binaryPath) { + lines.push(`- Sift: installed at \`${detection.binaryPath}\`; repo-local index warmup is running.`); + lines.push(`- Sift cache: project-scoped at \`${detection.searchCache}\`; do not use a shared/global Sift search database for this repo.`); + lines.push("- Use grep/find/ls and `.sf/CODEBASE.md` for broad orientation while warmup runs. Use narrow `sift_search` paths if needed; broad root-scope Sift may still be cold."); + } + else if (detection.status === "degraded" && detection.binaryPath) { + lines.push(`- Sift: installed at \`${detection.binaryPath}\` but degraded for this repo: ${detection.reason}.`); + lines.push(`- Sift cache: project-scoped at \`${detection.searchCache}\`; do not use a shared/global Sift search database for this repo.`); + lines.push("- Do not use broad Sift/codebase_search as the first exploration step. Prefer native `grep`/`find`/`ls`, `.sf/CODEBASE.md`, and narrow `sift_search` only after reducing scope."); } else { lines.push("- Sift: not available. 
This is optional; continue with `.sf/CODEBASE.md`, native `grep`/`find`/`ls`, `lsp`, and scout."); @@ -537,7 +831,7 @@ export function resolveEffectiveCodebaseIndexerBackendName(projectRoot, prefs, e if (prefs?.indexer_backend) return prefs.indexer_backend; const sift = detectSift(projectRoot, prefs, env); - if (sift.status === "configured") + if (["configured", "warming", "degraded"].includes(sift.status)) return "sift"; return "projectRag"; } @@ -606,8 +900,32 @@ export function formatSiftStatus(projectRoot, prefs, env = process.env) { lines.push(`Command: ${detection.command}`); if (detection.binaryPath) lines.push(`Binary: ${detection.binaryPath}`); + if (detection.searchCache) + lines.push(`Search cache: ${detection.searchCache}`); + if (detection.tmpDir) + lines.push(`Temp dir: ${detection.tmpDir}`); + if (detection.probePath) + lines.push(`Health probe scope: ${detection.probePath}`); + if (detection.markerPath) + lines.push(`Warmup marker: ${detection.markerPath}`); + if (detection.cacheInspection?.polluted) { + lines.push("Cache integrity: polluted - ignored/generated paths were found in repo-local Sift manifests."); + for (const sample of detection.cacheInspection.samples ?? []) { + lines.push(`Cache sample (${sample.label}): ${sample.sample}`); + } + lines.push("Action: remove .sf/runtime/sift/search-cache and warm Sift again from the repo root."); + } + else if (detection.cacheInspection?.inspected) { + lines.push("Cache integrity: ok - no ignored/generated path samples found in inspected manifests."); + } if (detection.status === "configured" && detection.command) { - lines.push(`Operational: ${commandExists(detection.command, env) ? "yes" : "no - configured command is missing"}`); + lines.push(`Operational: ${commandExists(detection.command, env) ? 
"yes - scoped health probe passed" : "no - configured command is missing"}`); + } + else if (detection.status === "warming" && detection.command) { + lines.push("Operational: warming - binary exists and repo-local index warmup is running. Give Sift time on CPU before broad searches."); + } + else if (detection.status === "degraded" && detection.command) { + lines.push("Operational: degraded - binary exists, but the bounded scoped health probe failed. Use narrow paths or fallback search."); } else { lines.push("Operational: no - install rupurt/sift on PATH or set SIFT_PATH."); diff --git a/src/resources/extensions/sf/prompts/discuss-headless.md b/src/resources/extensions/sf/prompts/discuss-headless.md index 594470708..37e50f9ef 100644 --- a/src/resources/extensions/sf/prompts/discuss-headless.md +++ b/src/resources/extensions/sf/prompts/discuss-headless.md @@ -76,7 +76,7 @@ Before anything else, form a diagnosis: What is the core challenge? What is brok - **Measure coverage**: find untested critical paths - **Scan for dead code, stubs, and commented-out features** — abandoned attempts are signals - **Discover needed skills**: identify repo languages, frameworks, data stores, external services, build tools, and domain-specific competencies. Check installed skills first; record installed, missing, and potentially useful skills in `.sf/CODEBASE.md` and `.sf/PM-STRATEGY.md`. -- **Use code intelligence**: use `codebase_search` (or Project RAG tools if configured) as your PRIMARY exploration method for conceptual, behavioral, or architectural discovery before manually reading files. Use `sift_search` for agentic multi-turn research or explicit strategy selection. Fall back to `.sf/CODEBASE.md`, in-process `grep`/`find`/`ls`, and `lsp` only for exact matches or structural navigation. +- **Use code intelligence**: start with `.sf/CODEBASE.md`, in-process `grep`/`find`/`ls`, and `lsp` for broad orientation. 
Use `codebase_search` or `sift_search` only with a scoped path and only when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo; if Sift is degraded, slow, empty, or timing out, keep using grep/find/ls and direct reads. Use Project RAG tools first for broad retrieval if Project RAG is configured. - Use in-process `grep`, `find`, `ls`, and `lsp` before shelling out. Fall back to shell `rg`, `find`, `ast-grep`, or `ls -la` only when the native/in-process tool surface is insufficient. ### Step 2: Check library and ecosystem facts diff --git a/src/resources/extensions/sf/prompts/discuss.md b/src/resources/extensions/sf/prompts/discuss.md index 2ff9f6f04..ed42cb894 100644 --- a/src/resources/extensions/sf/prompts/discuss.md +++ b/src/resources/extensions/sf/prompts/discuss.md @@ -34,7 +34,7 @@ After reflection is confirmed, decide the approach based on the actual scope — Before asking your first question, do a mandatory investigation pass. This is not optional. -1. **Scout the codebase** — use `codebase_search` for conceptual, behavioral, or architectural discovery (e.g. "how does X work?", "where is Y handled?"); use `sift_search` for agentic multi-turn research or explicit strategy selection; use in-process `grep`, `find`, `ls`, and `lsp` for exact identifier matches or structural navigation. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes. +1. **Scout the codebase** — start with in-process `grep`, `find`, `ls`, `.sf/CODEBASE.md`, and `lsp` for broad orientation. Use `codebase_search` or `sift_search` only with a scoped path and only when the `PROJECT CODE INTELLIGENCE` block says Sift is healthy enough for this repo; if Sift is degraded, slow, empty, or timing out, keep using grep/find/ls and direct reads. Use `scout` for broad unfamiliar areas that need a separate explorer. 
Understand what already exists, what patterns are established, what constraints current code imposes. 2. **Check library docs — DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) as the default for any GitHub-hosted library or framework the user mentioned. Fall back to `resolve_library` / `get_library_docs` (Context7) for npm/pypi/crates packages DeepWiki doesn't have. **Context7 free tier is capped at 1000 req/month — spend those on cases DeepWiki can't cover.** Get current facts about capabilities, constraints, API shapes, version-specific behavior. 3. **Web search** — `search-the-web` if the domain is unfamiliar, if you need current best practices, or if the user referenced external services/APIs you need facts about. Use `fetch_page` for full content when snippets aren't enough. diff --git a/src/resources/extensions/sf/prompts/guided-discuss-milestone.md b/src/resources/extensions/sf/prompts/guided-discuss-milestone.md index 5c3491a6f..cf29a89db 100644 --- a/src/resources/extensions/sf/prompts/guided-discuss-milestone.md +++ b/src/resources/extensions/sf/prompts/guided-discuss-milestone.md @@ -15,7 +15,7 @@ Apply `pm-planning` skill thinking throughout: use Working Backwards to anchor o ### Before your first question round Do a lightweight targeted investigation so your questions are grounded in reality: -- Scout the codebase: use `codebase_search` for conceptual, behavioral, or architectural discovery (e.g. "how does X work?", "where is Y handled?"); use `sift_search` for agentic multi-turn research or explicit strategy selection; use in-process `grep`, `find`, `ls`, and `lsp` for exact identifier matches or structural navigation. Use `scout` for broad unfamiliar areas that need a separate explorer. +- Scout the codebase: start with in-process `grep`, `find`, `ls`, `.sf/CODEBASE.md`, and `lsp` for broad orientation. 
Use `codebase_search` or `sift_search` only with a scoped path and only when Sift is healthy for this repo; if Sift is degraded, slow, empty, or timing out, keep using grep/find/ls and direct reads. Use `scout` for broad unfamiliar areas that need a separate explorer. - If the `PROJECT CODE INTELLIGENCE` block says Project RAG is configured, use its MCP search tools for broad concept, symbol, schema, and git-history lookup before manually reading files - Check the roadmap context above (if present) to understand what surrounds this milestone - **Library docs — DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) for any GitHub-hosted library. Fall back to `resolve_library` / `get_library_docs` (Context7) only when DeepWiki doesn't have it (Context7 is capped at 1000 req/month free tier). diff --git a/src/resources/extensions/sf/prompts/guided-discuss-slice.md b/src/resources/extensions/sf/prompts/guided-discuss-slice.md index 369ed97d0..afd57ee8a 100644 --- a/src/resources/extensions/sf/prompts/guided-discuss-slice.md +++ b/src/resources/extensions/sf/prompts/guided-discuss-slice.md @@ -11,7 +11,7 @@ Your goal is **not** to center the discussion on tech stack trivia, naming conve ### Before your first question round Do a lightweight targeted investigation so your questions are grounded in reality: -- Scout the codebase: use `codebase_search` for conceptual, behavioral, or architectural discovery (e.g. "how does X work?", "where is Y handled?"); use `sift_search` for agentic multi-turn research or explicit strategy selection; use in-process `grep`, `find`, `ls`, and `lsp` for exact identifier matches or structural navigation. Use `scout` for broad unfamiliar areas that need a separate explorer. +- Scout the codebase: start with in-process `grep`, `find`, `ls`, `.sf/CODEBASE.md`, and `lsp` for broad orientation. 
Use `codebase_search` or `sift_search` only with a scoped path and only when Sift is healthy for this repo; if Sift is degraded, slow, empty, or timing out, keep using grep/find/ls and direct reads. Use `scout` for broad unfamiliar areas that need a separate explorer. - Check the roadmap context above to understand what surrounds this slice — what comes before, what depends on it - **Library docs — DeepWiki first.** Use `ask_question` / `read_wiki_structure` / `read_wiki_contents` (DeepWiki) for any GitHub-hosted library. Fall back to `resolve_library` / `get_library_docs` (Context7) only when DeepWiki doesn't have it (Context7 is capped at 1000 req/month free tier). - Identify the 3–5 biggest behavioural unknowns: things where the user's answer will materially change what gets built diff --git a/src/resources/extensions/sf/prompts/queue.md b/src/resources/extensions/sf/prompts/queue.md index e26395eb2..fafb8de8a 100644 --- a/src/resources/extensions/sf/prompts/queue.md +++ b/src/resources/extensions/sf/prompts/queue.md @@ -26,7 +26,7 @@ Never fabricate or simulate user input during this discussion. Never generate fa - Check library docs **DeepWiki first** (`ask_question` / `read_wiki_structure` / `read_wiki_contents`) for any GitHub-hosted library or framework — AI-indexed, no free-tier cap. Fall back to Context7 (`resolve_library` / `get_library_docs`) for npm/pypi/crates packages DeepWiki doesn't cover. Context7 free tier is 1000 req/month — don't spend those on cases DeepWiki covers. - Do web searches (`search-the-web`) to verify the landscape — what solutions exist, what's changed recently, what's the current best practice. Use `freshness` for recency-sensitive queries, `domain` to target specific sites. Use `fetch_page` to read the full content of promising URLs when snippets aren't enough. **Budget:** You have a limited number of web searches per turn (typically 3-5). 
Prefer DeepWiki → Context7 → web search for docs; use `search_and_read` for one-shot topic research. Do NOT repeat the same or similar queries. Distribute searches across turns rather than clustering them. -- Scout the codebase: use `codebase_search` for conceptual, behavioral, or architectural discovery (e.g. "how does X work?", "where is Y handled?"); use `sift_search` for agentic multi-turn research or explicit strategy selection; use in-process `grep`, `find`, `ls`, and `lsp` for exact identifier matches or structural navigation. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes. +- Scout the codebase: start with in-process `grep`, `find`, `ls`, `.sf/CODEBASE.md`, and `lsp` for broad orientation. Use `codebase_search` or `sift_search` only with a scoped path and only when Sift is healthy for this repo; if Sift is degraded, slow, empty, or timing out, keep using grep/find/ls and direct reads. Use `scout` for broad unfamiliar areas that need a separate explorer. Understand what already exists, what patterns are established, what constraints current code imposes. Don't go deep — just enough that your next question reflects what's actually true rather than what you assume. diff --git a/src/resources/extensions/sf/prompts/system.md b/src/resources/extensions/sf/prompts/system.md index 64c109804..ec3e525a4 100644 --- a/src/resources/extensions/sf/prompts/system.md +++ b/src/resources/extensions/sf/prompts/system.md @@ -161,7 +161,7 @@ Templates showing the expected format for each artifact type are in: **Code navigation:** Use `lsp` for definition, type_definition, implementation, references, incoming_calls, outgoing_calls, hover, signature, symbols, rename, code_actions, format, and diagnostics. Falls back gracefully if no server is available. Never `grep` for a symbol definition when `lsp` can resolve it semantically. 
Never shell out to prettier/rustfmt/gofmt when `lsp format` is available. After editing code, use `lsp diagnostics` to verify no type errors were introduced. -**Codebase exploration:** For conceptual, behavioral, or architectural discovery (e.g. "how does X work?", "where is Y handled?"), use `codebase_search` first. Its hybrid BM25+Vector retrieval is significantly more effective than grep for navigating unfamiliar logic. For Sift-specific features — agentic multi-turn search, explicit strategy selection, or planner configuration — use `sift_search`. Strategy guide: `page-index-hybrid` (strongest recall + structural reranking, default), `path-hybrid` (filename/path-heavy queries), `bm25` (fast lexical-only), `vector` (semantic-only). Enable `agent: true` with `agentMode: 'graph'` for deep multi-turn research across disconnected code regions, or `plannerStrategy: 'model-driven'` for LLM-guided planning. Use in-process SF tools like `grep` for exact text matches when you already have a specific identifier, and `find`/`ls` for literal filesystem discovery. Use `lsp` for structural navigation (definitions, references). Use `.sf/CODEBASE.md` for durable orientation. If the `PROJECT CODE INTELLIGENCE` block says Project RAG is configured, use its MCP tools for broad hybrid semantic + BM25 code retrieval before manual file-by-file reading. Never read files one-by-one to "explore" — search first, then read what's relevant. +**Codebase exploration:** Start with in-process `grep`, `find`, `ls`, `.sf/CODEBASE.md`, and `lsp` for broad orientation. Use `codebase_search` for conceptual, behavioral, or architectural discovery only after choosing a narrow scope and checking the `PROJECT CODE INTELLIGENCE` block; if Sift is degraded, slow, empty, or timing out, keep using grep/find/ls and direct reads. For Sift-specific features — explicit strategy selection or planner configuration — use `sift_search` with a scoped `path`. 
Strategy guide: `bm25` (fast lexical), `path-hybrid` (filename/path-heavy queries), `page-index-hybrid` (stronger recall + reranking), `vector` (semantic-only). Each repo uses its own Sift cache under `.sf/runtime/sift/`; do not rely on a shared/global Sift database. Use `lsp` for structural navigation (definitions, references). If the `PROJECT CODE INTELLIGENCE` block says Project RAG is configured, use its MCP tools for broad hybrid semantic + BM25 code retrieval before manual file-by-file reading. Never read files one-by-one to "explore" — search first, then read what's relevant. **Swarm dispatch:** Let the system decide whether swarming fits before dispatching multiple execution subagents. Use a 2-3 worker same-model swarm only when the work splits into independent shards with explicit file/directory ownership, shard-local verification, low conflict risk, and clear wall-clock savings. Do not swarm shared-interface edits, lockfiles, migrations, single-failure debugging, or sequence-dependent work. The parent agent remains coordinator: assign ownership, synthesize results, inspect dirty files, resolve conflicts, and run final verification. 
diff --git a/src/resources/extensions/sf/tests/code-intelligence-sift.test.mjs b/src/resources/extensions/sf/tests/code-intelligence-sift.test.mjs new file mode 100644 index 000000000..b8e7432b2 --- /dev/null +++ b/src/resources/extensions/sf/tests/code-intelligence-sift.test.mjs @@ -0,0 +1,133 @@ +import assert from "node:assert/strict"; +import { chmodSync, existsSync, mkdirSync, mkdtempSync, rmSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import { join } from "node:path"; +import { afterEach, test } from "vitest"; + +import { + buildSiftEnv, + detectSift, + ensureSiftRuntimeDirs, + resolveSiftSearchScope, + resolveSiftWarmupRuntimeDirs, + resolveEffectiveCodebaseIndexerBackendName, +} from "../code-intelligence.js"; + +const tmpRoots = []; + +afterEach(() => { + for (const dir of tmpRoots.splice(0)) { + rmSync(dir, { recursive: true, force: true }); + } +}); + +function makeProject() { + const root = mkdtempSync(join(tmpdir(), "sf-sift-project-")); + tmpRoots.push(root); + mkdirSync(join(root, "src"), { recursive: true }); + writeFileSync(join(root, "src", "index.js"), "export const value = 1;\n"); + return root; +} + +function makeFakeSift(script) { + const dir = mkdtempSync(join(tmpdir(), "sf-sift-bin-")); + tmpRoots.push(dir); + const bin = join(dir, "sift"); + writeFileSync(bin, script); + chmodSync(bin, 0o755); + return { dir, bin }; +} + +test("buildSiftEnv_uses_project_scoped_cache_from_project_root", () => { + const projectRoot = makeProject(); + + const env = buildSiftEnv(projectRoot, { PATH: "/usr/bin" }); + + assert.equal(env.SIFT_SEARCH_CACHE, join(projectRoot, ".sf", "runtime", "sift", "search-cache")); + assert.equal(env.TMPDIR, join(projectRoot, ".sf", "runtime", "sift", "tmp")); +}); + +test("resolveSiftSearchScope_normalizes_project_absolute_paths_to_relative", () => { + const projectRoot = makeProject(); + + assert.equal(resolveSiftSearchScope(projectRoot), "."); + assert.equal(resolveSiftSearchScope(projectRoot, 
projectRoot), "."); + assert.equal(resolveSiftSearchScope(projectRoot, join(projectRoot, "src")), "src"); + assert.equal(resolveSiftSearchScope(projectRoot, "src"), "src"); +}); + +test("ensureSiftRuntimeDirs_creates_repo_local_cache_directories", () => { + const projectRoot = makeProject(); + const dirs = ensureSiftRuntimeDirs(projectRoot); + + assert.equal(dirs.searchCache, join(projectRoot, ".sf", "runtime", "sift", "search-cache")); + assert.ok(existsSync(dirs.searchCache)); + assert.ok(existsSync(dirs.tmpDir)); +}); + +test("detectSift_when_probe_times_out_reports_degraded_not_configured", () => { + const projectRoot = makeProject(); + const { bin } = makeFakeSift("#!/bin/sh\nsleep 5\n"); + + const result = detectSift(projectRoot, {}, { + PATH: process.env.PATH ?? "", + SIFT_PATH: bin, + SF_SIFT_HEALTH_TIMEOUT_MS: "50", + }); + + assert.equal(result.status, "degraded"); + assert.equal(result.binaryPath, bin); + assert.match(result.reason, /health probe timed out/i); +}); + +test("resolveEffectiveCodebaseIndexerBackendName_when_sift_is_cold_still_selects_sift", () => { + const projectRoot = makeProject(); + const { bin } = makeFakeSift("#!/bin/sh\nsleep 5\n"); + + const result = resolveEffectiveCodebaseIndexerBackendName(projectRoot, {}, { + PATH: process.env.PATH ?? "", + SIFT_PATH: bin, + SF_SIFT_HEALTH_TIMEOUT_MS: "50", + }); + + assert.equal(result, "sift"); +}); + +test("detectSift_when_probe_succeeds_reports_project_cache", () => { + const projectRoot = makeProject(); + const { bin } = makeFakeSift("#!/bin/sh\nprintf '{\"hits\":[]}\\n'\n"); + + const result = detectSift(projectRoot, {}, { + PATH: process.env.PATH ?? 
"", + SIFT_PATH: bin, + SF_SIFT_HEALTH_TIMEOUT_MS: "1000", + }); + const dirs = resolveSiftWarmupRuntimeDirs(projectRoot); + + assert.equal(result.status, "configured"); + assert.equal(result.binaryPath, bin); + assert.equal(result.searchCache, dirs.searchCache); + assert.equal(result.tmpDir, dirs.tmpDir); +}); + +test("detectSift_when_cache_manifest_contains_ignored_paths_reports_degraded", () => { + const projectRoot = makeProject(); + const { bin } = makeFakeSift("#!/bin/sh\nprintf '{\"hits\":[]}\\n'\n"); + const dirs = ensureSiftRuntimeDirs(projectRoot); + const manifestDir = join(dirs.searchCache, "artifacts", "manifests"); + mkdirSync(manifestDir, { recursive: true }); + writeFileSync( + join(manifestDir, "bad"), + `./src/index.js\n${projectRoot}/.claude/worktrees/agent/src/index.js\n`, + ); + + const result = detectSift(projectRoot, {}, { + PATH: process.env.PATH ?? "", + SIFT_PATH: bin, + SF_SIFT_HEALTH_TIMEOUT_MS: "1000", + }); + + assert.equal(result.status, "degraded"); + assert.match(result.reason, /cache contains ignored\/generated paths/i); + assert.equal(result.cacheInspection.polluted, true); +}); diff --git a/src/resources/extensions/sf/tools/sift-search-tool.js b/src/resources/extensions/sf/tools/sift-search-tool.js index 25212a891..3554df475 100644 --- a/src/resources/extensions/sf/tools/sift-search-tool.js +++ b/src/resources/extensions/sf/tools/sift-search-tool.js @@ -11,7 +11,7 @@ */ import { execFile } from "node:child_process"; import { Type } from "@sinclair/typebox"; -import { resolveSiftBinary } from "../code-intelligence.js"; +import { buildSiftEnv, ensureSiftRuntimeDirs, resolveSiftBinary, resolveSiftSearchScope } from "../code-intelligence.js"; const KNOWN_STRATEGIES = [ "hybrid", @@ -34,7 +34,8 @@ const DEFAULT_TIMEOUT_MS = 60_000; /** * Build the sift CLI argument list from tool parameters. 
*/ -function buildSiftArgs(params) { +function buildSiftArgs(params, projectRoot = process.cwd()) { + const scope = resolveSiftSearchScope(projectRoot, params.path); const args = [ "search", "--json", @@ -57,7 +58,7 @@ function buildSiftArgs(params) { } // Path and query are positional - args.push(params.path ?? ".", params.query); + args.push(scope, params.query); return args; } @@ -104,13 +105,16 @@ function parseSiftOutput(rawStdout, rawStderr) { /** * Execute a sift search with the given parameters. */ -function runSift(binaryPath, args, timeoutMs) { +function runSift(binaryPath, args, timeoutMs, projectRoot) { return new Promise((resolve, reject) => { + ensureSiftRuntimeDirs(projectRoot); const child = execFile( binaryPath, args, { + cwd: projectRoot, encoding: "utf-8", + env: buildSiftEnv(projectRoot, process.env), maxBuffer: 16 * 1024 * 1024, timeout: timeoutMs, }, @@ -154,7 +158,7 @@ export function registerSiftSearchTool(pi) { }), path: Type.Optional( Type.String({ - description: "Directory or file path to search within. Default: current directory ('.').", + description: "Directory or file path to search within. Default: repository root ('.'); absolute paths inside the repo are normalized to repo-relative paths so .siftignore applies.", default: ".", }), ), @@ -236,14 +240,17 @@ export function registerSiftSearchTool(pi) { }; } - const args = buildSiftArgs(params); + const projectRoot = process.cwd(); + const args = buildSiftArgs(params, projectRoot); + const scope = args.at(-2) ?? "."; const timeoutMs = params.timeoutMs ?? 
DEFAULT_TIMEOUT_MS; const startedAt = Date.now(); try { - const { stdout, stderr } = await runSift(binaryPath, args, timeoutMs); + const { stdout, stderr } = await runSift(binaryPath, args, timeoutMs, projectRoot); const elapsedMs = Date.now() - startedAt; const result = parseSiftOutput(stdout, stderr); + const runtimeDirs = ensureSiftRuntimeDirs(projectRoot); // Telemetry: log query outcomes for tuning const { logInfo } = await import("../workflow-logger.js"); @@ -251,14 +258,18 @@ export function registerSiftSearchTool(pi) { query: params.query, strategy: params.strategy ?? DEFAULT_STRATEGY, agent: params.agent ?? false, + path: scope, hitCount: result.hits.length, elapsedMs, binary: binaryPath, + searchCache: runtimeDirs.searchCache, }); const lines = [ `Sift search: "${params.query}"`, `Strategy: ${params.strategy ?? DEFAULT_STRATEGY}${params.agent ? ` | agent: ${params.agentMode ?? "linear"} | planner: ${params.plannerStrategy ?? "heuristic"}` : ""}`, + `Scope: ${scope}`, + `Search cache: ${runtimeDirs.searchCache}`, `Hits: ${result.hits.length} | Elapsed: ${elapsedMs}ms`, "", ]; @@ -288,6 +299,7 @@ export function registerSiftSearchTool(pi) { elapsedMs, hitCount: result.hits.length, hits: result.hits, + searchCache: runtimeDirs.searchCache, }, }; } catch (err) { diff --git a/src/resources/extensions/subagent/index.js b/src/resources/extensions/subagent/index.js index 1aa6fd9d3..56e32f811 100644 --- a/src/resources/extensions/subagent/index.js +++ b/src/resources/extensions/subagent/index.js @@ -22,7 +22,7 @@ import { StringEnum } from "@singularity-forge/pi-ai"; import { getMarkdownTheme, } from "@singularity-forge/pi-coding-agent"; import { Container, Markdown, Spacer, Text } from "@singularity-forge/pi-tui"; import { CmuxClient, shellEscape } from "../cmux/index.js"; -import { buildSiftEnv, resolveSiftBinary } from "../sf/code-intelligence.js"; +import { buildSiftEnv, ensureSiftRuntimeDirs, resolveSiftBinary, resolveSiftSearchScope } from 
"../sf/code-intelligence.js"; import { loadEffectiveSFPreferences } from "../sf/preferences.js"; import { formatTokenCount } from "../shared/mod.js"; import { getCurrentPhase } from "../shared/sf-phase-state.js"; @@ -1789,7 +1789,7 @@ export default function (pi) { description: "Natural-language query describing what to explore (e.g. 'find where the write gate tool_call hooks are registered')", }), scope: Type.Optional(Type.String({ - description: "Path to search within. Defaults to the current working directory. Use the active worktree for isolation.", + description: "Path to search within. Defaults to repository root ('.'); absolute paths inside the repo are normalized to repo-relative paths so .siftignore applies.", })), strategy: Type.Optional(Type.String({ description: "Search strategy: 'path-hybrid' (default), 'page-index-hybrid', 'bm25', or 'path'", @@ -1802,22 +1802,23 @@ export default function (pi) { name: "codebase_search", label: "Code Search", description: [ - "Perform Sift-backed hybrid (BM25 + Vector) retrieval over a codebase scope.", - " Use this as your PRIMARY exploration tool for conceptual, behavioral, or cross-cutting questions", + "Perform Sift-backed hybrid (BM25 + Vector) retrieval over a scoped codebase path.", + " Use this for conceptual, behavioral, or cross-cutting questions only after choosing a narrow scope", " (e.g. 
'how is X handled?', 'where is the logic for Y?', 'find examples of Z').", - " It is significantly more effective than grep for discovering unfamiliar logic and architecture.", + " If Sift status is degraded or the scope is broad, prefer grep/find/ls and retry with a narrower scope.", ].join(""), promptGuidelines: [ - "Use codebase_search BEFORE grep when exploring unfamiliar areas or conceptual patterns.", + "Use grep/find/ls for broad orientation first, then codebase_search with a specific scope for conceptual patterns.", " page-index-hybrid (default): Use for 'How' and 'Why' questions (logic, implementation, reasoning).", " path-hybrid: Use for 'Where' questions (architecture, directory structure, file location).", + " Keep scope narrow enough to avoid root-level Sift timeouts; each repo uses its own SIFT_SEARCH_CACHE under .sf/runtime/sift/.", " Be descriptive in your query: include function names, types, or intent (e.g. 'auth middleware validation').", " This tool is read-only and optimized for evidence gathering before you plan or edit.", ], parameters: CodebaseSearchParams, renderCall(args, theme) { const query = typeof args.query === "string" ? args.query : ""; - const scope = typeof args.scope === "string" ? args.scope : process.cwd(); + const scope = resolveSiftSearchScope(process.cwd(), typeof args.scope === "string" ? args.scope : undefined); const strategy = typeof args.strategy === "string" ? args.strategy : "page-index-hybrid"; const preview = query.length > 90 ? `${query.slice(0, 89).trimEnd()}…` : query; const scopeLabel = scope.length > 70 @@ -1860,7 +1861,8 @@ export default function (pi) { return new Text(rendered, 0, 0); }, async execute(_toolCallId, params, signal) { - const scope = params.scope ?? process.cwd(); + const projectRoot = process.cwd(); + const scope = resolveSiftSearchScope(projectRoot, params.scope); const strategy = params.strategy ?? 
"page-index-hybrid"; const query = params.query; const timeoutMs = typeof params.timeoutMs === "number" && @@ -1868,14 +1870,33 @@ export default function (pi) { ? Math.max(1_000, params.timeoutMs) : CODEBASE_SEARCH_TIMEOUT_MS; const siftBin = resolveSiftBinary(); + if (!siftBin) { + return { + content: [ + { + type: "text", + text: "codebase_search unavailable: sift binary not found. Use grep/find/ls or set SIFT_PATH.", + }, + ], + details: { + operation: "codebase_search", + exitCode: 127, + query, + scope, + strategy, + timeoutMs, + }, + }; + } const args = buildCodebaseSearchArgs(strategy, query, scope); const stderr = []; const stdout = []; let wasAborted = false; let timedOut = false; - const childEnv = buildSiftEnv(scope, process.env); + const runtimeDirs = ensureSiftRuntimeDirs(projectRoot); + const childEnv = buildSiftEnv(projectRoot, process.env); const proc = spawn(siftBin, args, { - cwd: scope, + cwd: projectRoot, env: childEnv, shell: false, stdio: ["ignore", "pipe", "pipe"], @@ -1951,6 +1972,7 @@ export default function (pi) { scope, strategy, timeoutMs, + searchCache: runtimeDirs.searchCache, }, }; } @@ -1977,6 +1999,7 @@ export default function (pi) { scope, strategy, timeoutMs, + searchCache: runtimeDirs.searchCache, }, }; } @@ -1995,6 +2018,7 @@ export default function (pi) { exitCode, siftBin, timeoutMs, + searchCache: runtimeDirs.searchCache, }, }; },