From c36c8bd0b005ed54b51400d8efd00a29b5dd4785 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?T=C3=82CHES?= Date: Fri, 13 Mar 2026 12:45:56 -0600 Subject: [PATCH] feat: add native glob and fs_cache modules with gitignore-aware discovery (#226) Port glob, glob_util, and fs_cache modules from Oh My Pi's pi-natives crate, adapted for napi-rs v2. Provides gitignore-respecting filesystem discovery with a TTL-based scan cache, mtime sorting, file-type filtering, and node_modules exclusion. Includes a task module for async N-API work scheduling with cooperative cancellation (timeout-based), TypeScript type declarations and wrapper, and 12 integration tests covering pattern matching, recursion, gitignore, maxResults, sortByMtime, fileType filtering, and cache invalidation. Co-authored-by: Claude Opus 4.6 (1M context) --- native/Cargo.lock | 66 +++ native/crates/engine/Cargo.toml | 3 + native/crates/engine/src/fs_cache.rs | 423 ++++++++++++++++++++ native/crates/engine/src/glob.rs | 275 +++++++++++++ native/crates/engine/src/glob_util.rs | 109 +++++ native/crates/engine/src/lib.rs | 4 + native/crates/engine/src/task.rs | 107 +++++ packages/native/package.json | 8 +- packages/native/src/__tests__/glob.test.mjs | 237 +++++++++++ packages/native/src/glob/index.ts | 44 ++ packages/native/src/glob/types.ts | 53 +++ packages/native/src/index.ts | 9 + packages/native/src/native.ts | 5 + 13 files changed, 1341 insertions(+), 2 deletions(-) create mode 100644 native/crates/engine/src/fs_cache.rs create mode 100644 native/crates/engine/src/glob.rs create mode 100644 native/crates/engine/src/glob_util.rs create mode 100644 native/crates/engine/src/task.rs create mode 100644 packages/native/src/__tests__/glob.test.mjs create mode 100644 packages/native/src/glob/index.ts create mode 100644 packages/native/src/glob/types.ts diff --git a/native/Cargo.lock b/native/Cargo.lock index 9f109ed13..a832afca6 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -78,6 +78,20 @@ dependencies = [ 
"syn", ] +[[package]] +name = "dashmap" +version = "6.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5041cc499144891f3790297212f32a74fb938e5136a14943f338ef9e0ae276cf" +dependencies = [ + "cfg-if", + "crossbeam-utils", + "hashbrown", + "lock_api", + "once_cell", + "parking_lot_core", +] + [[package]] name = "either" version = "1.15.0" @@ -156,7 +170,10 @@ dependencies = [ name = "gsd-engine" version = "0.1.0" dependencies = [ + "dashmap", + "globset", "gsd-grep", + "ignore", "libc", "napi", "napi-build", @@ -174,6 +191,12 @@ dependencies = [ "rayon", ] +[[package]] +name = "hashbrown" +version = "0.14.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e5274423e17b7c9fc20b6e7e208532f9b19825d82dfd615708b70edd83df41f1" + [[package]] name = "ignore" version = "0.4.25" @@ -206,6 +229,15 @@ dependencies = [ "windows-link", ] +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.29" @@ -290,6 +322,19 @@ version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + [[package]] name = "proc-macro2" version = "1.0.106" @@ -328,6 +373,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + 
"bitflags", +] + [[package]] name = "regex" version = "1.12.3" @@ -366,6 +420,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "semver" version = "1.0.27" @@ -401,6 +461,12 @@ dependencies = [ "syn", ] +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + [[package]] name = "syn" version = "2.0.117" diff --git a/native/crates/engine/Cargo.toml b/native/crates/engine/Cargo.toml index f6193e59c..21b7d3acf 100644 --- a/native/crates/engine/Cargo.toml +++ b/native/crates/engine/Cargo.toml @@ -12,6 +12,9 @@ crate-type = ["cdylib"] [dependencies] gsd-grep = { path = "../grep" } +dashmap = "6" +globset = "0.4" +ignore = "0.4" napi = { version = "2", features = ["napi8"] } napi-derive = "2" diff --git a/native/crates/engine/src/fs_cache.rs b/native/crates/engine/src/fs_cache.rs new file mode 100644 index 000000000..cad240ed3 --- /dev/null +++ b/native/crates/engine/src/fs_cache.rs @@ -0,0 +1,423 @@ +//! Shared filesystem scan cache for discovery tools (glob). +//! +//! Provides a TTL-based cache of scanned directory entries, with: +//! - Global policy (no per-call TTL tuning) +//! - Explicit invalidation for agent file mutations +//! - Empty-result fast recheck to avoid stale negatives +//! +//! # Policy Configuration (environment overrides) +//! - `FS_SCAN_CACHE_TTL_MS` – default `1000` +//! - `FS_SCAN_EMPTY_RECHECK_MS` – default `200` +//! 
- `FS_SCAN_CACHE_MAX_ENTRIES` – default `16` + +use std::{ + borrow::Cow, + path::{Path, PathBuf}, + sync::LazyLock, + time::{Duration, Instant}, +}; + +use dashmap::DashMap; +use ignore::WalkBuilder; +use napi::bindgen_prelude::*; +use napi_derive::napi; + +use crate::task; + +// ═══════════════════════════════════════════════════════════════════════════ +// Public types (re-exported by glob) +// ═══════════════════════════════════════════════════════════════════════════ + +#[derive(Debug, PartialEq, Eq)] +#[napi] +pub enum FileType { + /// Regular file. + File = 1, + /// Directory. + Dir = 2, + /// Symbolic link. + Symlink = 3, +} + +/// A single filesystem entry from a directory scan. +#[derive(Clone)] +#[napi(object)] +pub struct GlobMatch { + /// Relative path from the search root, using forward slashes. + pub path: String, + /// Resolved filesystem type for the match. + #[napi(js_name = "fileType")] + pub file_type: FileType, + /// Modification time in milliseconds since Unix epoch (from + /// `symlink_metadata`). + pub mtime: Option, +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Cache policy +// ═══════════════════════════════════════════════════════════════════════════ + +const DEFAULT_CACHE_TTL_MS: u64 = 1_000; +const DEFAULT_EMPTY_RECHECK_MS: u64 = 200; +const DEFAULT_MAX_CACHE_ENTRIES: usize = 16; + +fn env_u64(name: &str, default: u64) -> u64 { + std::env::var(name) + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(default) +} + +fn env_usize(name: &str, default: usize) -> usize { + std::env::var(name) + .ok() + .and_then(|v| v.parse().ok()) + .unwrap_or(default) +} + +/// Configured cache TTL in milliseconds. +pub fn cache_ttl_ms() -> u64 { + env_u64("FS_SCAN_CACHE_TTL_MS", DEFAULT_CACHE_TTL_MS) +} + +/// Configured empty-result recheck threshold in milliseconds. 
+pub fn empty_recheck_ms() -> u64 { + env_u64("FS_SCAN_EMPTY_RECHECK_MS", DEFAULT_EMPTY_RECHECK_MS) +} + +fn max_cache_entries() -> usize { + env_usize("FS_SCAN_CACHE_MAX_ENTRIES", DEFAULT_MAX_CACHE_ENTRIES) +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Cache internals +// ═══════════════════════════════════════════════════════════════════════════ + +#[derive(Clone, Debug, Eq, Hash, PartialEq)] +struct CacheKey { + root: PathBuf, + include_hidden: bool, + use_gitignore: bool, +} + +#[derive(Clone)] +struct CacheEntry { + created_at: Instant, + entries: Vec, +} + +static FS_CACHE: LazyLock> = LazyLock::new(DashMap::new); + +/// Result of a cache-aware scan, including the age of the cached data. +pub struct ScanResult { + /// Scanned filesystem entries. + pub entries: Vec, + /// How old the cached data is in milliseconds (0 = freshly scanned). + pub cache_age_ms: u64, +} + +fn evict_oldest() { + let max = max_cache_entries(); + if FS_CACHE.len() > max { + if let Some(oldest_key) = FS_CACHE + .iter() + .min_by_key(|entry| entry.value().created_at) + .map(|entry| entry.key().clone()) + { + FS_CACHE.remove(&oldest_key); + } + } +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Path utilities +// ═══════════════════════════════════════════════════════════════════════════ + +/// Resolve a search path string to a canonical `PathBuf` (must be a directory). 
+pub fn resolve_search_path(path: &str) -> Result { + let candidate = PathBuf::from(path); + let root = if candidate.is_absolute() { + candidate + } else { + let cwd = std::env::current_dir() + .map_err(|err| Error::from_reason(format!("Failed to resolve cwd: {err}")))?; + cwd.join(candidate) + }; + let metadata = std::fs::metadata(&root) + .map_err(|err| Error::from_reason(format!("Path not found: {err}")))?; + if !metadata.is_dir() { + return Err(Error::from_reason( + "Search path must be a directory".to_string(), + )); + } + Ok(std::fs::canonicalize(&root).unwrap_or(root)) +} + +/// Normalize a filesystem path to a forward-slash relative string. +pub fn normalize_relative_path<'a>(root: &Path, path: &'a Path) -> Cow<'a, str> { + let relative = path.strip_prefix(root).unwrap_or(path); + if cfg!(windows) { + let relative = relative.to_string_lossy(); + if relative.contains('\\') { + Cow::Owned(relative.replace('\\', "/")) + } else { + relative + } + } else { + relative.to_string_lossy() + } +} + +pub fn contains_component(path: &Path, target: &str) -> bool { + path.components().any(|component| { + component + .as_os_str() + .to_str() + .is_some_and(|value| value == target) + }) +} + +pub fn should_skip_path(path: &Path, mentions_node_modules: bool) -> bool { + if contains_component(path, ".git") { + return true; + } + if !mentions_node_modules && contains_component(path, "node_modules") { + return true; + } + false +} + +pub fn classify_file_type(path: &Path) -> Option<(FileType, Option)> { + let metadata = std::fs::symlink_metadata(path).ok()?; + let file_type = metadata.file_type(); + let mtime_ms = metadata + .modified() + .ok() + .and_then(|t| t.duration_since(std::time::UNIX_EPOCH).ok()) + .map(|d| d.as_millis() as f64); + if file_type.is_symlink() { + Some((FileType::Symlink, mtime_ms)) + } else if file_type.is_dir() { + Some((FileType::Dir, mtime_ms)) + } else { + Some((FileType::File, mtime_ms)) + } +} + +// 
═══════════════════════════════════════════════════════════════════════════ +// Walker + collection +// ═══════════════════════════════════════════════════════════════════════════ + +/// Builds a deterministic filesystem walker configured for visibility and +/// ignore rules. +pub fn build_walker(root: &Path, include_hidden: bool, use_gitignore: bool) -> WalkBuilder { + let mut builder = WalkBuilder::new(root); + builder + .hidden(!include_hidden) + .follow_links(false) + .sort_by_file_path(|a, b| a.cmp(b)); + + if use_gitignore { + builder + .git_ignore(true) + .git_exclude(true) + .git_global(true) + .ignore(true) + .parents(true); + } else { + builder + .git_ignore(false) + .git_exclude(false) + .git_global(false) + .ignore(false) + .parents(false); + } + + builder +} + +/// Scans filesystem entries and records normalized relative paths with file +/// metadata. +fn collect_entries( + root: &Path, + include_hidden: bool, + use_gitignore: bool, + ct: &task::CancelToken, +) -> Result> { + let builder = build_walker(root, include_hidden, use_gitignore); + let mut entries = Vec::new(); + + for entry in builder.build() { + ct.heartbeat()?; + + let Ok(entry) = entry else { continue }; + let path = entry.path(); + if should_skip_path(path, true) { + continue; + } + + let relative = normalize_relative_path(root, path); + if relative.is_empty() { + continue; + } + + let Some((file_type, mtime)) = classify_file_type(path) else { + continue; + }; + + entries.push(GlobMatch { + path: relative.into_owned(), + file_type, + mtime, + }); + } + + Ok(entries) +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Cache API +// ═══════════════════════════════════════════════════════════════════════════ + +/// Returns scanned entries using the global TTL cache policy. 
+/// +/// The returned [`ScanResult::cache_age_ms`] lets callers implement +/// empty-result fast recheck: if a query produces zero matches and the cache is +/// older than [`empty_recheck_ms()`], call [`force_rescan`] before returning +/// empty. +pub fn get_or_scan( + root: &Path, + include_hidden: bool, + use_gitignore: bool, + ct: &task::CancelToken, +) -> Result { + let ttl = cache_ttl_ms(); + if ttl == 0 { + let entries = collect_entries(root, include_hidden, use_gitignore, ct)?; + return Ok(ScanResult { + entries, + cache_age_ms: 0, + }); + } + + let key = CacheKey { + root: root.to_path_buf(), + include_hidden, + use_gitignore, + }; + + let now = Instant::now(); + if let Some(entry) = FS_CACHE.get(&key) { + let age = now.duration_since(entry.created_at); + if age < Duration::from_millis(ttl) { + return Ok(ScanResult { + entries: entry.entries.clone(), + cache_age_ms: age.as_millis() as u64, + }); + } + drop(entry); + FS_CACHE.remove(&key); + } + + let entries = collect_entries(root, include_hidden, use_gitignore, ct)?; + FS_CACHE.insert( + key, + CacheEntry { + created_at: now, + entries: entries.clone(), + }, + ); + evict_oldest(); + Ok(ScanResult { + entries, + cache_age_ms: 0, + }) +} + +/// Force a fresh scan, replacing any existing cache entry. +/// +/// When `store` is false, the fresh scan result is returned without +/// repopulating the cache. 
+pub fn force_rescan( + root: &Path, + include_hidden: bool, + use_gitignore: bool, + store: bool, + ct: &task::CancelToken, +) -> Result> { + let key = CacheKey { + root: root.to_path_buf(), + include_hidden, + use_gitignore, + }; + FS_CACHE.remove(&key); + + let entries = collect_entries(root, include_hidden, use_gitignore, ct)?; + if store { + let now = Instant::now(); + FS_CACHE.insert( + key, + CacheEntry { + created_at: now, + entries: entries.clone(), + }, + ); + evict_oldest(); + } + Ok(entries) +} + +// ═══════════════════════════════════════════════════════════════════════════ +// Invalidation +// ═══════════════════════════════════════════════════════════════════════════ + +/// Invalidate cache entries whose root contains `target`. +pub fn invalidate_path(target: &Path) { + let keys_to_remove: Vec = FS_CACHE + .iter() + .filter(|entry| target.starts_with(&entry.key().root)) + .map(|entry| entry.key().clone()) + .collect(); + for key in keys_to_remove { + FS_CACHE.remove(&key); + } +} + +/// Clear the entire scan cache. +pub fn invalidate_all() { + FS_CACHE.clear(); +} + +/// Invalidate the filesystem scan cache. +/// +/// When called with a path, removes entries for roots containing that path. +/// When called without a path, clears the entire cache. +/// +/// Intended to be called after agent file mutations (write, edit, rename, +/// delete). 
+#[napi(js_name = "invalidateFsScanCache")] +pub fn invalidate_fs_scan_cache(path: Option) { + match path { + Some(p) => { + let candidate = PathBuf::from(&p); + let absolute = if candidate.is_absolute() { + candidate + } else if let Ok(cwd) = std::env::current_dir() { + cwd.join(candidate) + } else { + PathBuf::from(&p) + }; + let target = std::fs::canonicalize(&absolute) + .or_else(|_| { + absolute + .parent() + .and_then(|parent| std::fs::canonicalize(parent).ok()) + .and_then(|parent| absolute.file_name().map(|name| parent.join(name))) + .ok_or_else(|| std::io::Error::from(std::io::ErrorKind::NotFound)) + }) + .unwrap_or(absolute); + invalidate_path(&target); + } + None => invalidate_all(), + } +} diff --git a/native/crates/engine/src/glob.rs b/native/crates/engine/src/glob.rs new file mode 100644 index 000000000..ed17b5b3c --- /dev/null +++ b/native/crates/engine/src/glob.rs @@ -0,0 +1,275 @@ +//! Filesystem discovery with glob patterns, ignore semantics, and shared scan +//! caching. +//! +//! # Overview +//! Resolves a search root, obtains scanned entries via [`fs_cache`], applies +//! glob matching plus optional file-type filtering, and optionally streams each +//! accepted match through a callback. +//! +//! The walker always skips `.git`, and skips `node_modules` unless explicitly +//! requested. +//! +//! # Example +//! ```ignore +//! // JS: await native.glob({ pattern: "*.rs", path: "." }) +//! ``` + +use std::path::Path; + +use globset::GlobSet; +use napi::{ + bindgen_prelude::*, + threadsafe_function::{ThreadsafeFunction, ThreadsafeFunctionCallMode}, +}; +use napi_derive::napi; + +pub use crate::fs_cache::{FileType, GlobMatch}; +use crate::{fs_cache, glob_util, task}; + +/// Input options for `glob`, including traversal, filtering, and cancellation. +#[napi(object)] +pub struct GlobOptions { + /// Glob pattern to match (e.g., "*.ts"). + pub pattern: String, + /// Directory to search. 
+ pub path: String, + /// Filter by file type: "file", "dir", or "symlink". Symlinks are + /// matched for file/dir filters based on their target type. + #[napi(js_name = "fileType")] + pub file_type: Option, + /// Match simple patterns recursively by default (`*.ts` -> recursive). + pub recursive: Option, + /// Include hidden files (default: false). + pub hidden: Option, + /// Maximum number of results to return. + #[napi(js_name = "maxResults")] + pub max_results: Option, + /// Respect .gitignore files (default: true). + pub gitignore: Option, + /// Enable shared filesystem scan cache (default: false). + pub cache: Option, + /// Sort results by mtime (most recent first) before applying limit. + #[napi(js_name = "sortByMtime")] + pub sort_by_mtime: Option, + /// Include `node_modules` entries when the pattern does not explicitly + /// mention them. + #[napi(js_name = "includeNodeModules")] + pub include_node_modules: Option, + /// Timeout in milliseconds for the operation. + #[napi(js_name = "timeoutMs")] + pub timeout_ms: Option, +} + +/// Result payload returned by a glob operation. +#[napi(object)] +pub struct GlobResult { + /// Matched filesystem entries. + pub matches: Vec, + /// Number of returned matches (`matches.len()`), clamped to `u32::MAX`. + pub total_matches: u32, +} + +/// Internal runtime config for a single glob execution. 
+struct GlobConfig { + root: std::path::PathBuf, + pattern: String, + recursive: bool, + include_hidden: bool, + file_type_filter: Option, + max_results: usize, + use_gitignore: bool, + mentions_node_modules: bool, + sort_by_mtime: bool, + use_cache: bool, +} + +fn resolve_symlink_target_type(root: &Path, relative_path: &str) -> Option { + let target_path = root.join(relative_path); + let metadata = std::fs::metadata(target_path).ok()?; + if metadata.is_dir() { + Some(FileType::Dir) + } else if metadata.is_file() { + Some(FileType::File) + } else { + None + } +} + +fn apply_file_type_filter(entry: &GlobMatch, config: &GlobConfig) -> Option { + let Some(filter) = config.file_type_filter else { + return Some(entry.file_type); + }; + if entry.file_type == filter { + return Some(entry.file_type); + } + if entry.file_type != FileType::Symlink { + return None; + } + match filter { + FileType::File | FileType::Dir => { + let resolved = resolve_symlink_target_type(&config.root, &entry.path)?; + if resolved == filter { + Some(resolved) + } else { + None + } + } + FileType::Symlink => None, + } +} + +/// Filter and collect matching entries from a pre-scanned list. 
+fn filter_entries( + entries: &[GlobMatch], + glob_set: &GlobSet, + config: &GlobConfig, + on_match: Option<&ThreadsafeFunction>, + ct: &task::CancelToken, +) -> Result> { + let mut matches = Vec::new(); + if config.max_results == 0 { + return Ok(matches); + } + + for entry in entries { + ct.heartbeat()?; + if fs_cache::should_skip_path(Path::new(&entry.path), config.mentions_node_modules) { + continue; + } + if !glob_set.is_match(&entry.path) { + continue; + } + let Some(effective_file_type) = apply_file_type_filter(entry, config) else { + continue; + }; + let mut matched_entry = entry.clone(); + matched_entry.file_type = effective_file_type; + if let Some(callback) = on_match { + callback.call( + Ok(matched_entry.clone()), + ThreadsafeFunctionCallMode::NonBlocking, + ); + } + + matches.push(matched_entry); + if !config.sort_by_mtime && matches.len() >= config.max_results { + break; + } + } + Ok(matches) +} + +/// Executes matching/filtering over scanned entries and optionally streams each +/// hit. +fn run_glob( + config: GlobConfig, + on_match: Option<&ThreadsafeFunction>, + ct: task::CancelToken, +) -> Result { + let glob_set = glob_util::compile_glob(&config.pattern, config.recursive)?; + if config.max_results == 0 { + return Ok(GlobResult { + matches: Vec::new(), + total_matches: 0, + }); + } + + let mut matches = if config.use_cache { + let scan = + fs_cache::get_or_scan(&config.root, config.include_hidden, config.use_gitignore, &ct)?; + let mut matches = filter_entries(&scan.entries, &glob_set, &config, on_match, &ct)?; + // Empty-result recheck: if we got zero matches from a cached scan that's old + // enough, force a rescan and try once more before returning empty. 
+ if matches.is_empty() && scan.cache_age_ms >= fs_cache::empty_recheck_ms() { + let fresh = fs_cache::force_rescan( + &config.root, + config.include_hidden, + config.use_gitignore, + true, + &ct, + )?; + matches = filter_entries(&fresh, &glob_set, &config, on_match, &ct)?; + } + matches + } else { + let fresh = fs_cache::force_rescan( + &config.root, + config.include_hidden, + config.use_gitignore, + false, + &ct, + )?; + filter_entries(&fresh, &glob_set, &config, on_match, &ct)? + }; + + if config.sort_by_mtime { + matches.sort_by(|a, b| { + let a_mtime = a.mtime.unwrap_or(0.0); + let b_mtime = b.mtime.unwrap_or(0.0); + b_mtime + .partial_cmp(&a_mtime) + .unwrap_or(std::cmp::Ordering::Equal) + }); + matches.truncate(config.max_results); + } + let total_matches = matches.len().min(u32::MAX as usize) as u32; + Ok(GlobResult { + matches, + total_matches, + }) +} + +/// Find filesystem entries matching a glob pattern. +/// +/// Resolves the search root, scans entries, applies glob and optional file-type +/// filters, and optionally streams each accepted match through `on_match`. +/// +/// If `sortByMtime` is enabled, all matching entries are collected, sorted by +/// descending mtime, then truncated to `maxResults`. 
+#[napi(js_name = "glob")] +pub fn glob( + options: GlobOptions, + #[napi(ts_arg_type = "((match: GlobMatch) => void) | undefined | null")] on_match: Option< + ThreadsafeFunction, + >, +) -> task::Async { + let GlobOptions { + pattern, + path, + file_type, + recursive, + hidden, + max_results, + gitignore, + sort_by_mtime, + cache, + include_node_modules, + timeout_ms, + } = options; + + let pattern = pattern.trim(); + let pattern = if pattern.is_empty() { "*" } else { pattern }; + let pattern = pattern.to_string(); + + let ct = task::CancelToken::new(timeout_ms); + + task::blocking("glob", ct, move |ct| { + run_glob( + GlobConfig { + root: fs_cache::resolve_search_path(&path)?, + include_hidden: hidden.unwrap_or(false), + file_type_filter: file_type, + recursive: recursive.unwrap_or(true), + max_results: max_results.map_or(usize::MAX, |value| value as usize), + use_gitignore: gitignore.unwrap_or(true), + mentions_node_modules: include_node_modules + .unwrap_or_else(|| pattern.contains("node_modules")), + sort_by_mtime: sort_by_mtime.unwrap_or(false), + use_cache: cache.unwrap_or(false), + pattern, + }, + on_match.as_ref(), + ct, + ) + }) +} diff --git a/native/crates/engine/src/glob_util.rs b/native/crates/engine/src/glob_util.rs new file mode 100644 index 000000000..4f0d98e8e --- /dev/null +++ b/native/crates/engine/src/glob_util.rs @@ -0,0 +1,109 @@ +//! Shared glob-pattern helpers used by [`crate::glob`]. + +use globset::{GlobBuilder, GlobSet, GlobSetBuilder}; +use napi::bindgen_prelude::*; + +/// Normalize a raw glob string: fix path separators, optionally prepend `**/` +/// for recursive matching, and close any unclosed `{` alternation groups. 
+pub fn build_glob_pattern(glob: &str, recursive: bool) -> String { + let normalized = glob.replace('\\', "/"); + let pattern = if !recursive || normalized.contains('/') || normalized.starts_with("**") { + normalized + } else { + format!("**/{normalized}") + }; + fix_unclosed_braces(pattern) +} + +/// Compile a glob pattern string into a [`GlobSet`]. +/// +/// When `recursive` is true, simple patterns (no path separators, no leading +/// `**`) are automatically prefixed with `**/`. +pub fn compile_glob(glob: &str, recursive: bool) -> Result { + let mut builder = GlobSetBuilder::new(); + let pattern = build_glob_pattern(glob, recursive); + let glob = GlobBuilder::new(&pattern) + .literal_separator(true) + .build() + .map_err(|err| Error::from_reason(format!("Invalid glob pattern: {err}")))?; + builder.add(glob); + builder + .build() + .map_err(|err| Error::from_reason(format!("Failed to build glob matcher: {err}"))) +} + +/// Close unclosed `{` alternation groups in a glob pattern. +/// +/// LLMs occasionally produce patterns like `*.{ts,js` without the closing `}`. +/// Rather than failing, we append the missing braces. 
+fn fix_unclosed_braces(pattern: String) -> String { + let opens = pattern.chars().filter(|&c| c == '{').count(); + let closes = pattern.chars().filter(|&c| c == '}').count(); + if opens > closes { + let mut fixed = pattern; + for _ in 0..(opens - closes) { + fixed.push('}'); + } + fixed + } else { + pattern + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn simple_pattern_gets_recursive_prefix() { + assert_eq!(build_glob_pattern("*.ts", true), "**/*.ts"); + } + + #[test] + fn pattern_with_path_stays_as_is() { + assert_eq!(build_glob_pattern("src/*.ts", true), "src/*.ts"); + } + + #[test] + fn already_recursive_pattern_unchanged() { + assert_eq!(build_glob_pattern("**/*.rs", true), "**/*.rs"); + } + + #[test] + fn non_recursive_keeps_simple_pattern() { + assert_eq!(build_glob_pattern("*.ts", false), "*.ts"); + } + + #[test] + fn backslashes_normalized() { + assert_eq!(build_glob_pattern("src\\**\\*.ts", true), "src/**/*.ts"); + } + + #[test] + fn unclosed_brace_gets_closed() { + assert_eq!( + build_glob_pattern("*.{ts,tsx,js", true), + "**/*.{ts,tsx,js}" + ); + } + + #[test] + fn deeply_unclosed_braces_all_closed() { + assert_eq!(build_glob_pattern("{a,{b,c}", true), "**/{a,{b,c}}"); + } + + #[test] + fn balanced_braces_unchanged() { + assert_eq!(build_glob_pattern("*.{ts,js}", true), "**/*.{ts,js}"); + } + + #[test] + fn compile_glob_accepts_valid_pattern() { + assert!(compile_glob("*.ts", true).is_ok()); + } + + #[test] + fn compile_glob_fixes_unclosed_brace() { + assert!(compile_glob("*.{ts,tsx,js", true).is_ok()); + } +} diff --git a/native/crates/engine/src/lib.rs b/native/crates/engine/src/lib.rs index 6d8410583..3ee865513 100644 --- a/native/crates/engine/src/lib.rs +++ b/native/crates/engine/src/lib.rs @@ -8,5 +8,9 @@ #![allow(clippy::needless_pass_by_value)] +mod fs_cache; +mod glob; +mod glob_util; mod grep; mod ps; +mod task; diff --git a/native/crates/engine/src/task.rs b/native/crates/engine/src/task.rs new file mode 100644 index 
000000000..f609fec6e --- /dev/null +++ b/native/crates/engine/src/task.rs @@ -0,0 +1,107 @@ +//! Blocking work scheduling for N-API exports. +//! +//! Runs CPU-bound or blocking Rust work on libuv's thread pool via napi's +//! `Task` trait, with cooperative cancellation support. +//! +//! # Cancellation +//! Pass a `CancelToken` to blocking tasks. Work must check +//! `CancelToken::heartbeat()` periodically to respect cancellation. + +use std::time::{Duration, Instant}; + +use napi::{Env, Error, Result, Task, bindgen_prelude::*}; + +// ───────────────────────────────────────────────────────────────────────────── +// Cancellation +// ───────────────────────────────────────────────────────────────────────────── + +/// Token for cooperative cancellation of blocking work. +/// +/// Call `heartbeat()` periodically inside long-running work to check for +/// cancellation requests from timeouts. +#[derive(Clone, Default)] +pub struct CancelToken { + deadline: Option, +} + +impl From<()> for CancelToken { + fn from((): ()) -> Self { + Self::default() + } +} + +impl CancelToken { + /// Create a new cancel token from an optional timeout in milliseconds. + pub fn new(timeout_ms: Option) -> Self { + let mut result = Self::default(); + if let Some(timeout_ms) = timeout_ms { + result.deadline = Some(Instant::now() + Duration::from_millis(timeout_ms as u64)); + } + result + } + + /// Check if cancellation has been requested. + /// + /// Returns `Ok(())` if work should continue, or an error if timed out. + pub fn heartbeat(&self) -> Result<()> { + if let Some(deadline) = self.deadline { + if deadline < Instant::now() { + return Err(Error::from_reason("Aborted: Timeout")); + } + } + Ok(()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Blocking Task - libuv thread pool integration +// ───────────────────────────────────────────────────────────────────────────── + +/// Task that runs blocking work on libuv's thread pool. 
+pub struct Blocking +where + T: Send + 'static, +{ + cancel_token: CancelToken, + work: Option Result + Send>>, +} + +impl Task for Blocking +where + T: ToNapiValue + Send + 'static + TypeName, +{ + type JsValue = T; + type Output = T; + + fn compute(&mut self) -> Result { + let work = self + .work + .take() + .ok_or_else(|| Error::from_reason("BlockingTask: work already consumed"))?; + work(self.cancel_token.clone()) + } + + fn resolve(&mut self, _env: Env, output: Self::Output) -> Result { + Ok(output) + } +} + +pub type Async = AsyncTask>; + +/// Create an `AsyncTask` that runs blocking work on libuv's thread pool. +/// +/// Returns `AsyncTask>` which becomes `Promise` on the JS side. +pub fn blocking( + _tag: &'static str, + cancel_token: impl Into, + work: F, +) -> AsyncTask> +where + F: FnOnce(CancelToken) -> Result + Send + 'static, + T: ToNapiValue + TypeName + Send + 'static, +{ + AsyncTask::new(Blocking { + cancel_token: cancel_token.into(), + work: Some(Box::new(work)), + }) +} diff --git a/packages/native/package.json b/packages/native/package.json index c480955ca..404fe52ec 100644 --- a/packages/native/package.json +++ b/packages/native/package.json @@ -1,14 +1,14 @@ { "name": "@gsd/native", "version": "0.1.0", - "description": "Native Rust bindings for GSD \u2014 high-performance grep via N-API", + "description": "Native Rust bindings for GSD — high-performance grep, glob, and process management via N-API", "type": "module", "main": "./src/index.ts", "types": "./src/index.ts", "scripts": { "build:native": "node ../../native/scripts/build.js", "build:native:dev": "node ../../native/scripts/build.js --dev", - "test": "node --test src/__tests__/grep.test.mjs src/__tests__/ps.test.mjs" + "test": "node --test src/__tests__/grep.test.mjs src/__tests__/ps.test.mjs src/__tests__/glob.test.mjs" }, "exports": { ".": { @@ -22,6 +22,10 @@ "./ps": { "types": "./src/ps/index.ts", "import": "./src/ps/index.ts" + }, + "./glob": { + "types": "./src/glob/index.ts", + 
"import": "./src/glob/index.ts" } }, "files": [ diff --git a/packages/native/src/__tests__/glob.test.mjs b/packages/native/src/__tests__/glob.test.mjs new file mode 100644 index 000000000..10fefb7f4 --- /dev/null +++ b/packages/native/src/__tests__/glob.test.mjs @@ -0,0 +1,237 @@ +import { test, describe } from "node:test"; +import assert from "node:assert/strict"; +import { createRequire } from "node:module"; +import * as path from "node:path"; +import { fileURLToPath } from "node:url"; +import * as fs from "node:fs"; +import * as os from "node:os"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const require = createRequire(import.meta.url); + +// Load the native addon directly +const addonDir = path.resolve( + __dirname, + "..", + "..", + "..", + "..", + "native", + "addon", +); +const platformTag = `${process.platform}-${process.arch}`; +const candidates = [ + path.join(addonDir, `gsd_engine.${platformTag}.node`), + path.join(addonDir, "gsd_engine.dev.node"), +]; + +let native; +for (const candidate of candidates) { + try { + native = require(candidate); + break; + } catch { + // try next + } +} + +if (!native) { + console.error( + "Native addon not found. 
Run `npm run build:native -w @gsd/native` first.", + ); + process.exit(1); +} + +describe("native glob: glob()", () => { + test("finds files matching a pattern", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + fs.writeFileSync(path.join(tmpDir, "file1.ts"), "const a = 1;"); + fs.writeFileSync(path.join(tmpDir, "file2.ts"), "const b = 2;"); + fs.writeFileSync(path.join(tmpDir, "file3.js"), "const c = 3;"); + + const result = await native.glob({ pattern: "*.ts", path: tmpDir }); + + assert.equal(result.totalMatches, 2); + assert.equal(result.matches.length, 2); + const paths = result.matches.map((m) => m.path).sort(); + assert.deepEqual(paths, ["file1.ts", "file2.ts"]); + }); + + test("recursive matching into subdirectories", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + fs.mkdirSync(path.join(tmpDir, "src")); + fs.mkdirSync(path.join(tmpDir, "src", "nested")); + fs.writeFileSync(path.join(tmpDir, "root.ts"), ""); + fs.writeFileSync(path.join(tmpDir, "src", "a.ts"), ""); + fs.writeFileSync(path.join(tmpDir, "src", "nested", "b.ts"), ""); + + const result = await native.glob({ pattern: "*.ts", path: tmpDir }); + + assert.equal(result.totalMatches, 3); + const paths = result.matches.map((m) => m.path).sort(); + assert.ok(paths.includes("root.ts")); + assert.ok(paths.includes("src/a.ts")); + assert.ok(paths.includes("src/nested/b.ts")); + }); + + test("respects maxResults limit", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + for (let i = 0; i < 10; i++) { + fs.writeFileSync(path.join(tmpDir, `file${i}.txt`), ""); + } + + const result = await native.glob({ + pattern: "*.txt", + path: tmpDir, + maxResults: 3, + }); + + 
assert.equal(result.matches.length, 3); + assert.equal(result.totalMatches, 3); + }); + + test("filters by file type (directories only)", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + fs.mkdirSync(path.join(tmpDir, "dir1")); + fs.mkdirSync(path.join(tmpDir, "dir2")); + fs.writeFileSync(path.join(tmpDir, "file.txt"), ""); + + const result = await native.glob({ + pattern: "*", + path: tmpDir, + recursive: false, + fileType: 2, // Dir + }); + + assert.equal(result.totalMatches, 2); + const paths = result.matches.map((m) => m.path).sort(); + assert.deepEqual(paths, ["dir1", "dir2"]); + }); + + test("respects .gitignore", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + // Init a git repo so .gitignore is respected + fs.mkdirSync(path.join(tmpDir, ".git")); + fs.writeFileSync(path.join(tmpDir, ".gitignore"), "ignored.txt\n"); + fs.writeFileSync(path.join(tmpDir, "kept.txt"), ""); + fs.writeFileSync(path.join(tmpDir, "ignored.txt"), ""); + + const result = await native.glob({ + pattern: "*.txt", + path: tmpDir, + gitignore: true, + }); + + assert.equal(result.totalMatches, 1); + assert.equal(result.matches[0].path, "kept.txt"); + }); + + test("includes gitignored files when gitignore=false", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + fs.mkdirSync(path.join(tmpDir, ".git")); + fs.writeFileSync(path.join(tmpDir, ".gitignore"), "ignored.txt\n"); + fs.writeFileSync(path.join(tmpDir, "kept.txt"), ""); + fs.writeFileSync(path.join(tmpDir, "ignored.txt"), ""); + + const result = await native.glob({ + pattern: "*.txt", + path: tmpDir, + gitignore: false, + }); + + assert.equal(result.totalMatches, 2); + }); + + test("skips 
node_modules by default", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + fs.mkdirSync(path.join(tmpDir, "node_modules")); + fs.writeFileSync(path.join(tmpDir, "node_modules", "dep.js"), ""); + fs.writeFileSync(path.join(tmpDir, "app.js"), ""); + + const result = await native.glob({ + pattern: "*.js", + path: tmpDir, + gitignore: false, + }); + + assert.equal(result.totalMatches, 1); + assert.equal(result.matches[0].path, "app.js"); + }); + + test("sortByMtime returns most recent first", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + fs.writeFileSync(path.join(tmpDir, "old.txt"), "old"); + // Ensure different mtime + const now = new Date(); + fs.utimesSync( + path.join(tmpDir, "old.txt"), + new Date(now.getTime() - 5000), + new Date(now.getTime() - 5000), + ); + fs.writeFileSync(path.join(tmpDir, "new.txt"), "new"); + + const result = await native.glob({ + pattern: "*.txt", + path: tmpDir, + sortByMtime: true, + }); + + assert.equal(result.totalMatches, 2); + assert.equal(result.matches[0].path, "new.txt"); + assert.equal(result.matches[1].path, "old.txt"); + }); + + test("errors on non-existent path", async () => { + await assert.rejects( + () => + native.glob({ + pattern: "*.txt", + path: "/nonexistent/path/that/does/not/exist", + }), + /Path not found/, + ); + }); + + test("returns mtime for each entry", async (t) => { + const tmpDir = fs.mkdtempSync(path.join(os.tmpdir(), "gsd-glob-test-")); + t.after(() => fs.rmSync(tmpDir, { recursive: true, force: true })); + + fs.writeFileSync(path.join(tmpDir, "test.txt"), "content"); + + const result = await native.glob({ pattern: "*.txt", path: tmpDir }); + + assert.equal(result.matches.length, 1); + assert.ok(typeof result.matches[0].mtime === "number"); + // mtime should be within the last 
minute + const oneMinuteAgo = Date.now() - 60_000; + assert.ok(result.matches[0].mtime > oneMinuteAgo); + }); +}); + +describe("native glob: invalidateFsScanCache()", () => { + test("can be called with a path", () => { + // Should not throw + native.invalidateFsScanCache("/tmp"); + }); + + test("can be called without arguments", () => { + // Should not throw + native.invalidateFsScanCache(); + }); +}); diff --git a/packages/native/src/glob/index.ts b/packages/native/src/glob/index.ts new file mode 100644 index 000000000..b3930eabd --- /dev/null +++ b/packages/native/src/glob/index.ts @@ -0,0 +1,44 @@ +/** + * Native glob module using N-API. + * + * Gitignore-respecting filesystem discovery backed by Rust's `ignore` and + * `globset` crates, with an optional TTL-based scan cache for repeated queries. + */ + +import { native } from "../native.js"; +import type { + GlobMatch, + GlobOptions, + GlobResult, +} from "./types.js"; + +export type { FileType, GlobMatch, GlobOptions, GlobResult } from "./types.js"; + +/** + * Find filesystem entries matching a glob pattern. + * + * Respects .gitignore by default. Skips `.git` and `node_modules` unless + * the pattern explicitly mentions them. + * + * @param options - Glob search options (pattern, path, filters, etc.) + * @param onMatch - Optional streaming callback invoked for each match. + * @returns Promise resolving to matched entries. + */ +export function glob( + options: GlobOptions, + onMatch?: (match: GlobMatch) => void, +): Promise<GlobResult> { + return native.glob(options, onMatch) as Promise<GlobResult>; +} + +/** + * Invalidate the filesystem scan cache. + * + * Call after file mutations (write, edit, rename, delete) to ensure + * subsequent glob queries see fresh data. + * + * @param path - Specific path to invalidate, or omit to clear all.
+ */ +export function invalidateFsScanCache(path?: string): void { + native.invalidateFsScanCache(path); +} diff --git a/packages/native/src/glob/types.ts b/packages/native/src/glob/types.ts new file mode 100644 index 000000000..703a78345 --- /dev/null +++ b/packages/native/src/glob/types.ts @@ -0,0 +1,53 @@ +/** File type classification for filesystem entries. */ +export const enum FileType { + /** Regular file. */ + File = 1, + /** Directory. */ + Dir = 2, + /** Symbolic link. */ + Symlink = 3, +} + +/** A single filesystem entry matched by a glob operation. */ +export interface GlobMatch { + /** Relative path from the search root, using forward slashes. */ + path: string; + /** Resolved filesystem type for the match. */ + fileType: FileType; + /** Modification time in milliseconds since Unix epoch. */ + mtime: number | null; +} + +/** Options for the glob operation. */ +export interface GlobOptions { + /** Glob pattern to match (e.g., "*.ts"). */ + pattern: string; + /** Directory to search. */ + path: string; + /** Filter by file type: File (1), Dir (2), or Symlink (3). */ + fileType?: FileType; + /** Match simple patterns recursively by default (default: true). */ + recursive?: boolean; + /** Include hidden files (default: false). */ + hidden?: boolean; + /** Maximum number of results to return. */ + maxResults?: number; + /** Respect .gitignore files (default: true). */ + gitignore?: boolean; + /** Enable shared filesystem scan cache (default: false). */ + cache?: boolean; + /** Sort results by mtime (most recent first) before applying limit. */ + sortByMtime?: boolean; + /** Include node_modules entries (default: false, unless pattern mentions it). */ + includeNodeModules?: boolean; + /** Timeout in milliseconds for the operation. */ + timeoutMs?: number; +} + +/** Result payload returned by a glob operation. */ +export interface GlobResult { + /** Matched filesystem entries. */ + matches: GlobMatch[]; + /** Number of returned matches. 
*/ + totalMatches: number; +} diff --git a/packages/native/src/index.ts b/packages/native/src/index.ts index b8f3b1d40..a7a3c1420 100644 --- a/packages/native/src/index.ts +++ b/packages/native/src/index.ts @@ -4,6 +4,7 @@ * Modules: * - grep: ripgrep-backed regex search (content + filesystem) * - ps: cross-platform process tree management + * - glob: gitignore-respecting filesystem discovery with scan caching */ export { searchContent, grep } from "./grep/index.js"; @@ -23,3 +24,11 @@ export { processGroupId, killProcessGroup, } from "./ps/index.js"; + +export { glob, invalidateFsScanCache } from "./glob/index.js"; +export type { + FileType, + GlobMatch, + GlobOptions, + GlobResult, +} from "./glob/index.js"; diff --git a/packages/native/src/native.ts b/packages/native/src/native.ts index df9c3c8ad..0b9434c0d 100644 --- a/packages/native/src/native.ts +++ b/packages/native/src/native.ts @@ -47,4 +47,9 @@ export const native = loadNative() as { listDescendants: (pid: number) => number[]; processGroupId: (pid: number) => number | null; killProcessGroup: (pgid: number, signal: number) => boolean; + glob: ( + options: unknown, + onMatch?: ((match: unknown) => void) | undefined | null, + ) => Promise<unknown>; + invalidateFsScanCache: (path?: string) => void; };