diff --git a/native/Cargo.lock b/native/Cargo.lock index 7cb16d2d3..588c75d51 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -540,6 +540,7 @@ dependencies = [ "ignore", "image", "libc", + "memchr", "napi", "napi-build", "napi-derive", diff --git a/native/crates/engine/Cargo.toml b/native/crates/engine/Cargo.toml index 204eef0b2..d4dda258c 100644 --- a/native/crates/engine/Cargo.toml +++ b/native/crates/engine/Cargo.toml @@ -18,6 +18,7 @@ dashmap = "6" globset = "0.4" html-to-markdown-rs = { version = "2", default-features = false } ignore = "0.4" +memchr = "2" image = { version = "0.25", default-features = false, features = [ "png", "jpeg", diff --git a/native/crates/engine/src/lib.rs b/native/crates/engine/src/lib.rs index ed5feb445..b18c3d16e 100644 --- a/native/crates/engine/src/lib.rs +++ b/native/crates/engine/src/lib.rs @@ -23,6 +23,7 @@ mod text; mod ttsr; mod gsd_parser; mod image; +mod truncate; mod json_parse; mod stream_process; mod xxhash; diff --git a/native/crates/engine/src/truncate.rs b/native/crates/engine/src/truncate.rs new file mode 100644 index 000000000..101759894 --- /dev/null +++ b/native/crates/engine/src/truncate.rs @@ -0,0 +1,364 @@ +//! Line-boundary-aware output truncation. +//! +//! Truncates tool output (bash, grep, file reads) at line boundaries, +//! counting by UTF-8 bytes. Three modes: +//! - **head**: keep the first N bytes worth of complete lines +//! - **tail**: keep the last N bytes worth of complete lines +//! - **both**: split budget between head and tail with an elision marker + +use napi_derive::napi; + +#[napi(object)] +pub struct TruncateResult { + /// The truncated (or original) text. + pub text: String, + /// Whether any truncation occurred. + pub truncated: bool, + /// Total number of lines in the original input. + pub original_lines: u32, + /// Number of complete lines kept in the output. + pub kept_lines: u32, +} + +#[napi(object)] +pub struct TruncateOutputResult { + /// The truncated (or original) text. 
+ pub text: String, + /// Whether any truncation occurred. + pub truncated: bool, + /// Human-readable truncation summary (e.g. "Kept 50 of 1200 lines"). + pub message: Option, +} + +/// Keep the first `max_bytes` worth of complete lines. +/// +/// Returns the original text unchanged when it fits. When truncation is +/// required, the output ends at the last newline boundary that fits within +/// the byte budget. UTF-8 boundaries are respected because we split on `\n` +/// which is always a single byte. +#[napi(js_name = "truncateTail")] +pub fn truncate_tail(text: String, max_bytes: u32) -> TruncateResult { + let max = max_bytes as usize; + let total_bytes = text.len(); + + // Fast path: fits entirely + if total_bytes <= max { + let line_count = memchr::memchr_iter(b'\n', text.as_bytes()).count() + + if text.is_empty() || text.ends_with('\n') { 0 } else { 1 }; + return TruncateResult { + text, + truncated: false, + original_lines: line_count as u32, + kept_lines: line_count as u32, + }; + } + + let bytes = text.as_bytes(); + let original_lines = count_lines(bytes); + + // Find the last newline at or before max_bytes + let cut = find_last_newline_before(bytes, max); + + if cut == 0 { + // First line alone exceeds the budget — keep nothing + return TruncateResult { + text: String::new(), + truncated: true, + original_lines, + kept_lines: 0, + }; + } + + let kept = &bytes[..cut]; + let kept_lines = count_lines(kept); + + TruncateResult { + text: std::str::from_utf8(kept).expect("split at newline boundary preserves UTF-8").to_owned(), + truncated: true, + original_lines, + kept_lines, + } +} + +/// Keep the last `max_bytes` worth of complete lines. +/// +/// The output starts at the first line boundary after skipping enough bytes +/// from the front. UTF-8 boundaries are respected because we only split on +/// `\n`. 
+#[napi(js_name = "truncateHead")] +pub fn truncate_head(text: String, max_bytes: u32) -> TruncateResult { + let max = max_bytes as usize; + let total_bytes = text.len(); + + // Fast path + if total_bytes <= max { + let line_count = memchr::memchr_iter(b'\n', text.as_bytes()).count() + + if text.is_empty() || text.ends_with('\n') { 0 } else { 1 }; + return TruncateResult { + text, + truncated: false, + original_lines: line_count as u32, + kept_lines: line_count as u32, + }; + } + + let bytes = text.as_bytes(); + let original_lines = count_lines(bytes); + + // We need to keep the last `max` bytes. Find the first newline at or + // after (total_bytes - max) so we start on a line boundary. + let skip_to = total_bytes - max; + let start = find_first_newline_after(bytes, skip_to); + + if start >= total_bytes { + // Last line alone exceeds the budget — keep nothing + return TruncateResult { + text: String::new(), + truncated: true, + original_lines, + kept_lines: 0, + }; + } + + let kept = &bytes[start..]; + let kept_lines = count_lines(kept); + + TruncateResult { + text: std::str::from_utf8(kept).expect("split at newline boundary preserves UTF-8").to_owned(), + truncated: true, + original_lines, + kept_lines, + } +} + +/// Main entry point: truncate tool output with head/tail/both modes. 
+/// +/// Modes: +/// - `"tail"` (default): keep the beginning (head truncation removes tail) +/// - `"head"`: keep the end (tail truncation removes head) +/// - `"both"`: keep beginning and end, elide the middle +#[napi(js_name = "truncateOutput")] +pub fn truncate_output( + text: String, + max_bytes: u32, + mode: Option, +) -> TruncateOutputResult { + let max = max_bytes as usize; + + if text.len() <= max { + return TruncateOutputResult { + text, + truncated: false, + message: None, + }; + } + + let mode_str = mode.as_deref().unwrap_or("tail"); + let original_lines = count_lines(text.as_bytes()); + + match mode_str { + "head" => { + let total_bytes = text.len(); + let r = truncate_head(text, max_bytes); + let removed = total_bytes - r.text.len(); + let msg = format!( + "Kept last {} of {} lines ({} bytes truncated from start)", + r.kept_lines, r.original_lines, removed + ); + TruncateOutputResult { + text: r.text, + truncated: true, + message: Some(msg), + } + } + "both" => { + let half = max / 2; + let head_result = truncate_tail(text.clone(), half as u32); + let tail_result = truncate_head(text, (max - half) as u32); + + let marker = format!( + "\n\n... 
[{} lines elided] ...\n\n", + original_lines + .saturating_sub(head_result.kept_lines) + .saturating_sub(tail_result.kept_lines) + ); + let combined = format!("{}{}{}", head_result.text, marker, tail_result.text); + let kept = head_result.kept_lines + tail_result.kept_lines; + let msg = format!( + "Kept {} of {} lines (head {} + tail {})", + kept, original_lines, head_result.kept_lines, tail_result.kept_lines + ); + TruncateOutputResult { + text: combined, + truncated: true, + message: Some(msg), + } + } + _ => { + // "tail" — keep the beginning + let total_bytes = text.len(); + let r = truncate_tail(text, max_bytes); + let removed = total_bytes - r.text.len(); + let msg = format!( + "Kept first {} of {} lines ({} bytes truncated from end)", + r.kept_lines, r.original_lines, removed + ); + TruncateOutputResult { + text: r.text, + truncated: true, + message: Some(msg), + } + } + } +} + +// ── helpers ────────────────────────────────────────────────────────────── + +/// Count lines in a byte slice. A trailing newline does not add an extra line. +#[inline] +fn count_lines(bytes: &[u8]) -> u32 { + if bytes.is_empty() { + return 0; + } + let newlines = memchr::memchr_iter(b'\n', bytes).count() as u32; + if bytes.last() == Some(&b'\n') { + newlines + } else { + newlines + 1 + } +} + +/// Find the byte position just past the last `\n` that is at or before `limit`. +/// Returns 0 if no newline exists before `limit`. +#[inline] +fn find_last_newline_before(bytes: &[u8], limit: usize) -> usize { + let search_end = limit.min(bytes.len()); + // Search backwards for \n + match memchr::memrchr(b'\n', &bytes[..search_end]) { + Some(pos) => pos + 1, // include the newline + None => 0, + } +} + +/// Find the byte position just past the first `\n` at or after `pos`. +/// Returns `bytes.len()` if no newline is found. 
+#[inline] +fn find_first_newline_after(bytes: &[u8], pos: usize) -> usize { + let start = pos.min(bytes.len()); + match memchr::memchr(b'\n', &bytes[start..]) { + Some(offset) => start + offset + 1, // skip past the newline + None => bytes.len(), + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_no_truncation_needed() { + let r = truncate_tail("hello\nworld\n".into(), 100); + assert!(!r.truncated); + assert_eq!(r.original_lines, 2); + assert_eq!(r.kept_lines, 2); + assert_eq!(r.text, "hello\nworld\n"); + } + + #[test] + fn test_tail_truncation_ascii() { + // "hello\nworld\n" = 12 bytes, limit to 7 -> keep "hello\n" + let r = truncate_tail("hello\nworld\n".into(), 7); + assert!(r.truncated); + assert_eq!(r.text, "hello\n"); + assert_eq!(r.kept_lines, 1); + assert_eq!(r.original_lines, 2); + } + + #[test] + fn test_head_truncation_ascii() { + let r = truncate_head("hello\nworld\n".into(), 7); + assert!(r.truncated); + assert_eq!(r.text, "world\n"); + assert_eq!(r.kept_lines, 1); + } + + #[test] + fn test_utf8_multibyte() { + // "cafe\u{0301}\n" = "café\n" where é is e + combining accent (3 bytes for the combining char) + // Actually let's use a simpler case: "日本\n" = 7 bytes (3+3+1) + let input = "日本\nworld\n".to_string(); + assert_eq!(input.len(), 13); // 3+3+1+5+1 + let r = truncate_tail(input.clone(), 8); + assert!(r.truncated); + assert_eq!(r.text, "日本\n"); + assert_eq!(r.kept_lines, 1); + } + + #[test] + fn test_empty_input() { + let r = truncate_tail(String::new(), 100); + assert!(!r.truncated); + assert_eq!(r.original_lines, 0); + assert_eq!(r.kept_lines, 0); + + let r2 = truncate_head(String::new(), 100); + assert!(!r2.truncated); + } + + #[test] + fn test_exact_boundary() { + let input = "abc\ndef\n".to_string(); // 8 bytes + let r = truncate_tail(input.clone(), 8); + assert!(!r.truncated); + assert_eq!(r.text, "abc\ndef\n"); + } + + #[test] + fn test_single_line_exceeding_limit() { + let r = 
truncate_tail("this_is_a_very_long_line".into(), 5); + assert!(r.truncated); + assert_eq!(r.text, ""); + assert_eq!(r.kept_lines, 0); + } + + #[test] + fn test_head_single_line_exceeding() { + let r = truncate_head("this_is_a_very_long_line".into(), 5); + assert!(r.truncated); + assert_eq!(r.text, ""); + assert_eq!(r.kept_lines, 0); + } + + #[test] + fn test_truncate_output_both_mode() { + let mut lines = Vec::new(); + for i in 0..100 { + lines.push(format!("line {i}")); + } + let input = lines.join("\n") + "\n"; + let r = truncate_output(input, 200, Some("both".into())); + assert!(r.truncated); + assert!(r.message.is_some()); + assert!(r.text.contains("... [")); + } + + #[test] + fn test_count_lines() { + assert_eq!(count_lines(b""), 0); + assert_eq!(count_lines(b"a"), 1); + assert_eq!(count_lines(b"a\n"), 1); + assert_eq!(count_lines(b"a\nb"), 2); + assert_eq!(count_lines(b"a\nb\n"), 2); + } + + #[test] + fn test_utf8_emoji() { + // Each emoji is 4 bytes + let input = "😀\n😂\n🎉\n".to_string(); + assert_eq!(input.len(), 15); // 4+1+4+1+4+1 + let r = truncate_tail(input, 6); + assert!(r.truncated); + assert_eq!(r.text, "😀\n"); + assert_eq!(r.kept_lines, 1); + } +} diff --git a/packages/native/src/__tests__/truncate.test.mjs b/packages/native/src/__tests__/truncate.test.mjs new file mode 100644 index 000000000..07a79463e --- /dev/null +++ b/packages/native/src/__tests__/truncate.test.mjs @@ -0,0 +1,145 @@ +import { test, describe } from "node:test"; +import assert from "node:assert/strict"; +import { createRequire } from "node:module"; +import * as path from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const require = createRequire(import.meta.url); + +const addonDir = path.resolve(__dirname, "..", "..", "..", "..", "native", "addon"); +const platformTag = `${process.platform}-${process.arch}`; +const candidates = [ + path.join(addonDir, `gsd_engine.${platformTag}.node`), + 
// Integration tests for the native truncation bindings. They run against the
// compiled addon, so the addon has to be built before this file is executed.
import { test, describe } from "node:test";
import assert from "node:assert/strict";
import { createRequire } from "node:module";
import * as path from "node:path";
import { fileURLToPath } from "node:url";

const here = path.dirname(fileURLToPath(import.meta.url));
const require = createRequire(import.meta.url);

const addonDir = path.resolve(here, "..", "..", "..", "..", "native", "addon");

// Prefer the platform-specific build, fall back to the local dev build.
const addonFiles = [
  `gsd_engine.${process.platform}-${process.arch}.node`,
  "gsd_engine.dev.node",
];

/** Return the first addon candidate that loads, or undefined if none does. */
function loadAddon() {
  for (const file of addonFiles) {
    try {
      return require(path.join(addonDir, file));
    } catch {
      // try next
    }
  }
  return undefined;
}

const native = loadAddon();

if (!native) {
  console.error("Native addon not found. Build first.");
  process.exit(1);
}

/** 100 short lines ("line 0" .. "line 99"), each terminated by a newline. */
function hundredLines() {
  return Array.from({ length: 100 }, (_, i) => `line ${i}`).join("\n") + "\n";
}

// ── truncateTail ─────────────────────────────────────────────────────────

describe("truncateTail", () => {
  test("no truncation when content fits", () => {
    const result = native.truncateTail("hello\nworld\n", 100);
    assert.equal(result.truncated, false);
    assert.equal(result.text, "hello\nworld\n");
    assert.equal(result.originalLines, 2);
    assert.equal(result.keptLines, 2);
  });

  test("truncates at line boundary (ASCII)", () => {
    const result = native.truncateTail("hello\nworld\n", 7);
    assert.equal(result.truncated, true);
    assert.equal(result.text, "hello\n");
    assert.equal(result.keptLines, 1);
  });

  test("empty input", () => {
    const result = native.truncateTail("", 100);
    assert.equal(result.truncated, false);
    assert.equal(result.originalLines, 0);
  });

  test("exact boundary", () => {
    const result = native.truncateTail("abc\ndef\n", 8);
    assert.equal(result.truncated, false);
    assert.equal(result.text, "abc\ndef\n");
  });

  test("single line exceeding limit", () => {
    const result = native.truncateTail("this_is_very_long", 5);
    assert.equal(result.truncated, true);
    assert.equal(result.text, "");
    assert.equal(result.keptLines, 0);
  });

  test("UTF-8 multibyte characters", () => {
    // "日本\n" = 7 bytes (3+3+1)
    const result = native.truncateTail("日本\nworld\n", 8);
    assert.equal(result.truncated, true);
    assert.equal(result.text, "日本\n");
    assert.equal(result.keptLines, 1);
  });

  test("emoji (4-byte UTF-8)", () => {
    // "😀\n" = 5 bytes
    const result = native.truncateTail("😀\n😂\n🎉\n", 6);
    assert.equal(result.truncated, true);
    assert.equal(result.text, "😀\n");
    assert.equal(result.keptLines, 1);
  });
});

// ── truncateHead ─────────────────────────────────────────────────────────

describe("truncateHead", () => {
  test("no truncation when content fits", () => {
    const result = native.truncateHead("hello\nworld\n", 100);
    assert.equal(result.truncated, false);
    assert.equal(result.text, "hello\nworld\n");
  });

  test("keeps last lines (ASCII)", () => {
    const result = native.truncateHead("hello\nworld\n", 7);
    assert.equal(result.truncated, true);
    assert.equal(result.text, "world\n");
    assert.equal(result.keptLines, 1);
  });

  test("empty input", () => {
    const result = native.truncateHead("", 100);
    assert.equal(result.truncated, false);
  });

  test("single line exceeding limit", () => {
    const result = native.truncateHead("this_is_very_long", 5);
    assert.equal(result.truncated, true);
    assert.equal(result.text, "");
    assert.equal(result.keptLines, 0);
  });
});

// ── truncateOutput ───────────────────────────────────────────────────────

describe("truncateOutput", () => {
  test("no truncation when fits", () => {
    const result = native.truncateOutput("small", 100);
    assert.equal(result.truncated, false);
    assert.equal(result.text, "small");
    assert.equal(result.message, null);
  });

  test("tail mode (default)", () => {
    const result = native.truncateOutput(hundredLines(), 200);
    assert.equal(result.truncated, true);
    assert.ok(result.message);
  });

  test("head mode", () => {
    const result = native.truncateOutput(hundredLines(), 200, "head");
    assert.equal(result.truncated, true);
    assert.ok(result.message.includes("start"));
  });

  test("both mode", () => {
    const result = native.truncateOutput(hundredLines(), 200, "both");
    assert.equal(result.truncated, true);
    assert.ok(result.text.includes("... ["));
  });
});
[")); + }); +}); diff --git a/packages/native/src/index.ts b/packages/native/src/index.ts index e66228c9e..cd8613f22 100644 --- a/packages/native/src/index.ts +++ b/packages/native/src/index.ts @@ -123,3 +123,6 @@ export type { ParsedGsdFile, SectionResult, } from "./gsd-parser/index.js"; + +export { truncateTail, truncateHead, truncateOutput } from "./truncate/index.js"; +export type { TruncateResult, TruncateOutputResult } from "./truncate/index.js"; diff --git a/packages/native/src/native.ts b/packages/native/src/native.ts index 2463b9114..944c50c69 100644 --- a/packages/native/src/native.ts +++ b/packages/native/src/native.ts @@ -132,6 +132,9 @@ export const native = loadNative() as { extractAllSections: (content: string, level?: number) => string; batchParseGsdFiles: (directory: string) => unknown; parseRoadmapFile: (content: string) => unknown; + truncateTail: (text: string, maxBytes: number) => unknown; + truncateHead: (text: string, maxBytes: number) => unknown; + truncateOutput: (text: string, maxBytes: number, mode?: string) => unknown; parseJson: (text: string) => unknown; parsePartialJson: (text: string) => unknown; parseStreamingJson: (text: string) => unknown; diff --git a/packages/native/src/truncate/index.ts b/packages/native/src/truncate/index.ts new file mode 100644 index 000000000..a9036219c --- /dev/null +++ b/packages/native/src/truncate/index.ts @@ -0,0 +1,50 @@ +/** + * Line-boundary-aware output truncation (native Rust). + * + * Truncates tool output at line boundaries, counting by UTF-8 bytes. + * Three modes: head (keep end), tail (keep start), both (keep start+end). + */ + +import { native } from "../native.js"; + +export interface TruncateResult { + text: string; + truncated: boolean; + originalLines: number; + keptLines: number; +} + +export interface TruncateOutputResult { + text: string; + truncated: boolean; + message?: string; +} + +/** + * Keep the first `maxBytes` worth of complete lines. 
/**
 * Keep the first `maxBytes` worth of complete lines (the tail is removed).
 *
 * @param text     Text to truncate.
 * @param maxBytes Byte budget, counted in UTF-8 bytes.
 * @returns The kept text together with line counts and a `truncated` flag.
 */
export function truncateTail(text: string, maxBytes: number): TruncateResult {
  // `native` already declares this member (returning `unknown`), so only the
  // result needs narrowing; no cast of `native` itself is required.
  return native.truncateTail(text, maxBytes) as TruncateResult;
}

/**
 * Keep the last `maxBytes` worth of complete lines (the head is removed).
 *
 * @param text     Text to truncate.
 * @param maxBytes Byte budget, counted in UTF-8 bytes.
 * @returns The kept text together with line counts and a `truncated` flag.
 */
export function truncateHead(text: string, maxBytes: number): TruncateResult {
  return native.truncateHead(text, maxBytes) as TruncateResult;
}

/**
 * Main entry point: truncate tool output with head/tail/both modes.
 *
 * @param text     Text to truncate.
 * @param maxBytes Byte budget, counted in UTF-8 bytes.
 * @param mode     `"tail"` keeps the start, `"head"` keeps the end, `"both"`
 *                 keeps start and end. When omitted, the native side decides
 *                 the mode.
 * @returns The resulting text, a `truncated` flag and, when truncation
 *          happened, a human-readable summary message.
 */
export function truncateOutput(
  text: string,
  maxBytes: number,
  mode?: string,
): TruncateOutputResult {
  return native.truncateOutput(text, maxBytes, mode) as TruncateOutputResult;
}