Move the edit tool's hot-path diffing operations from JS to native Rust: - `normalizeForFuzzyMatch`: single-pass Unicode normalization (smart quotes, dashes, special spaces, trailing whitespace) - `fuzzyFindText`: exact-then-fuzzy substring search with UTF-16 index conversion for JS compatibility - `generateDiff`: unified diff generation using the `similar` crate (Myers' algorithm with optimizations) The Rust module at `native/crates/engine/src/diff.rs` exposes three napi functions. The TypeScript wrapper at `packages/native/src/diff/` follows the existing module pattern. `edit-diff.ts` now delegates to native implementations while keeping line-ending handling and file I/O in JS. 18 tests covering normalization, fuzzy matching (including UTF-16 index correctness with emoji/surrogate pairs), and diff generation. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
421 lines
14 KiB
Rust
421 lines
14 KiB
Rust
//! Fuzzy text matching and unified diff generation for the edit tool.
|
|
//!
|
|
//! Replaces the JS `edit-diff.ts` hot path with native Rust:
|
|
//! - `normalizeForFuzzyMatch`: Unicode normalization (smart quotes, dashes, special spaces, trailing whitespace)
|
|
//! - `fuzzyFindText`: exact-then-fuzzy substring search
|
|
//! - `generateDiff`: unified diff with line numbers and context, matching the JS output format
|
|
|
|
use napi_derive::napi;
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// normalizeForFuzzyMatch
|
|
// ---------------------------------------------------------------------------
|
|
|
|
/// Normalize text for fuzzy matching:
|
|
/// - Strip trailing whitespace from each line
|
|
/// - Smart single quotes → '
|
|
/// - Smart double quotes → "
|
|
/// - Various dashes/hyphens → -
|
|
/// - Special Unicode spaces → regular space
|
|
#[napi(js_name = "normalizeForFuzzyMatch")]
|
|
pub fn normalize_for_fuzzy_match(text: String) -> String {
|
|
normalize_impl(&text)
|
|
}
|
|
|
|
fn normalize_impl(text: &str) -> String {
|
|
let mut out = String::with_capacity(text.len());
|
|
|
|
for (i, line) in text.split('\n').enumerate() {
|
|
if i > 0 {
|
|
out.push('\n');
|
|
}
|
|
let trimmed = line.trim_end();
|
|
for ch in trimmed.chars() {
|
|
out.push(normalize_char(ch));
|
|
}
|
|
}
|
|
|
|
out
|
|
}
|
|
|
|
/// Map a single character to its ASCII stand-in for fuzzy matching.
///
/// Characters outside the handled sets are returned unchanged.
#[inline]
fn normalize_char(ch: char) -> char {
    match ch {
        // U+2018..=U+201B: curly / low-9 / reversed single quotes.
        '\u{2018}'..='\u{201B}' => '\'',
        // U+201C..=U+201F: curly / low-9 / reversed double quotes.
        '\u{201C}'..='\u{201F}' => '"',
        // U+2010..=U+2015 (hyphen through horizontal bar) and U+2212 (minus sign).
        '\u{2010}'..='\u{2015}' | '\u{2212}' => '-',
        // No-break space, U+2002..=U+200A (en space through hair space), narrow
        // no-break space, medium mathematical space, ideographic space.
        '\u{00A0}' | '\u{2002}'..='\u{200A}' | '\u{202F}' | '\u{205F}' | '\u{3000}' => ' ',
        other => other,
    }
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// fuzzyFindText
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[napi(object)]
|
|
pub struct FuzzyMatchResult {
|
|
pub found: bool,
|
|
pub index: i32,
|
|
pub match_length: i32,
|
|
pub used_fuzzy_match: bool,
|
|
/// When exact match: original content. When fuzzy match: normalized content.
|
|
pub content_for_replacement: String,
|
|
}
|
|
|
|
/// Translate a UTF-8 byte offset within `s` into the equivalent JS string
/// index, i.e. the number of UTF-16 code units that precede it.
///
/// `byte_offset` must lie on a character boundary of `s`; slicing panics
/// otherwise.
fn byte_offset_to_utf16(s: &str, byte_offset: usize) -> usize {
    let prefix = &s[..byte_offset];
    prefix.encode_utf16().count()
}
|
|
|
|
/// Number of UTF-16 code units needed to encode `s` (what JS reports as
/// `string.length`).
fn utf16_len(s: &str) -> usize {
    s.encode_utf16().count()
}
|
|
|
|
/// Find `old_text` in `content`, trying exact match first, then fuzzy match.
|
|
///
|
|
/// Returns indices and lengths as UTF-16 code unit offsets (compatible with
|
|
/// JS `String.prototype.substring()`).
|
|
///
|
|
/// When fuzzy matching is used, `content_for_replacement` is the normalized
|
|
/// version of `content` (trailing whitespace stripped, Unicode quotes/dashes
|
|
/// normalized to ASCII).
|
|
#[napi(js_name = "fuzzyFindText")]
|
|
pub fn fuzzy_find_text(content: String, old_text: String) -> FuzzyMatchResult {
|
|
// Try exact match first
|
|
if let Some(byte_idx) = content.find(&old_text) {
|
|
return FuzzyMatchResult {
|
|
found: true,
|
|
index: byte_offset_to_utf16(&content, byte_idx) as i32,
|
|
match_length: utf16_len(&old_text) as i32,
|
|
used_fuzzy_match: false,
|
|
content_for_replacement: content,
|
|
};
|
|
}
|
|
|
|
// Try fuzzy match
|
|
let fuzzy_content = normalize_impl(&content);
|
|
let fuzzy_old_text = normalize_impl(&old_text);
|
|
|
|
if let Some(byte_idx) = fuzzy_content.find(&fuzzy_old_text) {
|
|
FuzzyMatchResult {
|
|
found: true,
|
|
index: byte_offset_to_utf16(&fuzzy_content, byte_idx) as i32,
|
|
match_length: utf16_len(&fuzzy_old_text) as i32,
|
|
used_fuzzy_match: true,
|
|
content_for_replacement: fuzzy_content,
|
|
}
|
|
} else {
|
|
FuzzyMatchResult {
|
|
found: false,
|
|
index: -1,
|
|
match_length: 0,
|
|
used_fuzzy_match: false,
|
|
content_for_replacement: content,
|
|
}
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// generateDiff
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[napi(object)]
|
|
pub struct DiffResult {
|
|
pub diff: String,
|
|
pub first_changed_line: Option<i32>,
|
|
}
|
|
|
|
/// Generate a unified diff string with line numbers and context.
|
|
///
|
|
/// Uses the `similar` crate (Myers' diff algorithm with optimizations).
|
|
/// Output format matches the JS `generateDiffString`:
|
|
/// - `+N line` for additions
|
|
/// - `-N line` for removals
|
|
/// - ` N line` for context
|
|
/// - ` ... ` for skipped context
|
|
#[napi(js_name = "generateDiff")]
|
|
pub fn generate_diff(old_content: String, new_content: String, context_lines: Option<u32>) -> DiffResult {
|
|
let context = context_lines.unwrap_or(4) as usize;
|
|
generate_diff_impl(&old_content, &new_content, context)
|
|
}
|
|
|
|
fn generate_diff_impl(old_content: &str, new_content: &str, context_lines: usize) -> DiffResult {
|
|
let old_lines: Vec<&str> = old_content.split('\n').collect();
|
|
let new_lines: Vec<&str> = new_content.split('\n').collect();
|
|
|
|
let max_line_num = old_lines.len().max(new_lines.len());
|
|
let line_num_width = if max_line_num == 0 {
|
|
1
|
|
} else {
|
|
max_line_num.to_string().len()
|
|
};
|
|
|
|
// Use similar crate for diffing
|
|
let diff = similar::TextDiff::configure()
|
|
.algorithm(similar::Algorithm::Myers)
|
|
.diff_lines(old_content, new_content);
|
|
|
|
let mut output: Vec<String> = Vec::new();
|
|
let mut old_line_num: usize = 1;
|
|
let mut new_line_num: usize = 1;
|
|
let mut last_was_change = false;
|
|
let mut first_changed_line: Option<i32> = None;
|
|
|
|
// Build parts from diff ops, matching the JS `diff` npm package structure
|
|
#[derive(Debug)]
|
|
enum PartTag {
|
|
Equal,
|
|
Added,
|
|
Removed,
|
|
}
|
|
|
|
struct Part {
|
|
tag: PartTag,
|
|
lines: Vec<String>,
|
|
}
|
|
|
|
let mut parts: Vec<Part> = Vec::new();
|
|
|
|
for op in diff.ops() {
|
|
match op {
|
|
similar::DiffOp::Equal { old_index, len, .. } => {
|
|
let lines: Vec<String> = old_lines[*old_index..*old_index + *len]
|
|
.iter()
|
|
.map(|s| s.to_string())
|
|
.collect();
|
|
parts.push(Part { tag: PartTag::Equal, lines });
|
|
}
|
|
similar::DiffOp::Delete { old_index, old_len, .. } => {
|
|
let lines: Vec<String> = old_lines[*old_index..*old_index + *old_len]
|
|
.iter()
|
|
.map(|s| s.to_string())
|
|
.collect();
|
|
parts.push(Part { tag: PartTag::Removed, lines });
|
|
}
|
|
similar::DiffOp::Insert { new_index, new_len, .. } => {
|
|
let lines: Vec<String> = new_lines[*new_index..*new_index + *new_len]
|
|
.iter()
|
|
.map(|s| s.to_string())
|
|
.collect();
|
|
parts.push(Part { tag: PartTag::Added, lines });
|
|
}
|
|
similar::DiffOp::Replace {
|
|
old_index, old_len, new_index, new_len, ..
|
|
} => {
|
|
let del_lines: Vec<String> = old_lines[*old_index..*old_index + *old_len]
|
|
.iter()
|
|
.map(|s| s.to_string())
|
|
.collect();
|
|
parts.push(Part { tag: PartTag::Removed, lines: del_lines });
|
|
|
|
let ins_lines: Vec<String> = new_lines[*new_index..*new_index + *new_len]
|
|
.iter()
|
|
.map(|s| s.to_string())
|
|
.collect();
|
|
parts.push(Part { tag: PartTag::Added, lines: ins_lines });
|
|
}
|
|
}
|
|
}
|
|
|
|
for (i, part) in parts.iter().enumerate() {
|
|
let raw = &part.lines;
|
|
|
|
match part.tag {
|
|
PartTag::Added | PartTag::Removed => {
|
|
if first_changed_line.is_none() {
|
|
first_changed_line = Some(new_line_num as i32);
|
|
}
|
|
|
|
for line in raw {
|
|
match part.tag {
|
|
PartTag::Added => {
|
|
let num = format!("{:>width$}", new_line_num, width = line_num_width);
|
|
output.push(format!("+{} {}", num, line));
|
|
new_line_num += 1;
|
|
}
|
|
PartTag::Removed => {
|
|
let num = format!("{:>width$}", old_line_num, width = line_num_width);
|
|
output.push(format!("-{} {}", num, line));
|
|
old_line_num += 1;
|
|
}
|
|
_ => unreachable!(),
|
|
}
|
|
}
|
|
last_was_change = true;
|
|
}
|
|
PartTag::Equal => {
|
|
let next_part_is_change = i < parts.len() - 1
|
|
&& matches!(parts[i + 1].tag, PartTag::Added | PartTag::Removed);
|
|
|
|
if last_was_change || next_part_is_change {
|
|
let mut lines_to_show = raw.as_slice();
|
|
let mut skip_start = 0usize;
|
|
let mut skip_end = 0usize;
|
|
|
|
if !last_was_change {
|
|
// Show only last N lines as leading context
|
|
skip_start = raw.len().saturating_sub(context_lines);
|
|
lines_to_show = &raw[skip_start..];
|
|
}
|
|
|
|
if !next_part_is_change && lines_to_show.len() > context_lines {
|
|
// Show only first N lines as trailing context
|
|
skip_end = lines_to_show.len() - context_lines;
|
|
lines_to_show = &lines_to_show[..context_lines];
|
|
}
|
|
|
|
if skip_start > 0 {
|
|
output.push(format!(
|
|
" {:>width$} ...",
|
|
"",
|
|
width = line_num_width
|
|
));
|
|
old_line_num += skip_start;
|
|
new_line_num += skip_start;
|
|
}
|
|
|
|
for line in lines_to_show {
|
|
let num = format!("{:>width$}", old_line_num, width = line_num_width);
|
|
output.push(format!(" {} {}", num, line));
|
|
old_line_num += 1;
|
|
new_line_num += 1;
|
|
}
|
|
|
|
if skip_end > 0 {
|
|
output.push(format!(
|
|
" {:>width$} ...",
|
|
"",
|
|
width = line_num_width
|
|
));
|
|
old_line_num += skip_end;
|
|
new_line_num += skip_end;
|
|
}
|
|
} else {
|
|
old_line_num += raw.len();
|
|
new_line_num += raw.len();
|
|
}
|
|
|
|
last_was_change = false;
|
|
}
|
|
}
|
|
}
|
|
|
|
DiffResult {
|
|
diff: output.join("\n"),
|
|
first_changed_line,
|
|
}
|
|
}
|
|
|
|
// ---------------------------------------------------------------------------
|
|
// Tests
|
|
// ---------------------------------------------------------------------------
|
|
|
|
#[cfg(test)]
mod tests {
    use super::*;

    // -- normalization ------------------------------------------------------

    #[test]
    fn test_normalize_smart_quotes() {
        let input = "\u{201C}hello\u{201D} \u{2018}world\u{2019}";
        assert_eq!(normalize_impl(input), "\"hello\" 'world'");
    }

    #[test]
    fn test_normalize_dashes() {
        let input = "a\u{2013}b\u{2014}c\u{2212}d";
        assert_eq!(normalize_impl(input), "a-b-c-d");
    }

    #[test]
    fn test_normalize_special_spaces() {
        let input = "a\u{00A0}b\u{2003}c\u{3000}d";
        assert_eq!(normalize_impl(input), "a b c d");
    }

    #[test]
    fn test_normalize_trailing_whitespace() {
        let input = "hello   \nworld  ";
        assert_eq!(normalize_impl(input), "hello\nworld");
    }

    // -- fuzzy find ---------------------------------------------------------

    #[test]
    fn test_fuzzy_find_exact() {
        let result = fuzzy_find_text("hello world".to_string(), "world".to_string());
        assert!(result.found);
        assert_eq!(result.index, 6);
        assert_eq!(result.match_length, 5);
        assert!(!result.used_fuzzy_match);
    }

    #[test]
    fn test_fuzzy_find_with_smart_quotes() {
        let content = "let x = \u{201C}hello\u{201D};".to_string();
        let old_text = "let x = \"hello\";".to_string();
        let result = fuzzy_find_text(content, old_text);
        assert!(result.found);
        assert!(result.used_fuzzy_match);
        // Offsets refer to the normalized content that is handed back.
        assert_eq!(result.index, 0);
        assert_eq!(result.match_length, 16);
        assert_eq!(result.content_for_replacement, "let x = \"hello\";");
    }

    #[test]
    fn test_fuzzy_find_not_found() {
        let result = fuzzy_find_text("hello world".to_string(), "xyz".to_string());
        assert!(!result.found);
        assert_eq!(result.index, -1);
    }

    /// An emoji is 4 bytes in UTF-8 but 2 code units in UTF-16, so the
    /// reported index must be 9 (2 + 1 + 6), not the byte offset 11.
    #[test]
    fn test_fuzzy_find_exact_utf16_index() {
        let result = fuzzy_find_text("\u{1F600} hello world".to_string(), "world".to_string());
        assert!(result.found);
        assert!(!result.used_fuzzy_match);
        assert_eq!(result.index, 9);
        assert_eq!(result.match_length, 5);
    }

    /// Same conversion on the fuzzy path, where offsets are measured in the
    /// normalized content.
    #[test]
    fn test_fuzzy_find_fuzzy_utf16_index() {
        let content = "\u{1F600} \u{201C}hi\u{201D}".to_string();
        let result = fuzzy_find_text(content, "\"hi\"".to_string());
        assert!(result.found);
        assert!(result.used_fuzzy_match);
        assert_eq!(result.index, 3);
        assert_eq!(result.match_length, 4);
        assert_eq!(result.content_for_replacement, "\u{1F600} \"hi\"");
    }

    // -- diff generation ----------------------------------------------------

    #[test]
    fn test_generate_diff_basic() {
        let old = "line1\nline2\nline3";
        let new_text = "line1\nmodified\nline3";
        let result = generate_diff_impl(old, new_text, 4);
        assert!(result.diff.contains("-"));
        assert!(result.diff.contains("+"));
        assert!(result.diff.contains("line2"));
        assert!(result.diff.contains("modified"));
        assert!(result.first_changed_line.is_some());
    }

    #[test]
    fn test_generate_diff_addition() {
        let old = "line1\nline3";
        let new_text = "line1\nline2\nline3";
        let result = generate_diff_impl(old, new_text, 4);
        assert!(result.diff.contains("+"));
        assert!(result.diff.contains("line2"));
    }

    #[test]
    fn test_generate_diff_deletion() {
        let old = "line1\nline2\nline3";
        let new_text = "line1\nline3";
        let result = generate_diff_impl(old, new_text, 4);
        assert!(result.diff.contains("-"));
        assert!(result.diff.contains("line2"));
    }

    #[test]
    fn test_generate_diff_context_ellipsis() {
        let mut old_lines: Vec<String> = (1..=20).map(|i| format!("line{}", i)).collect();
        let old = old_lines.join("\n");
        old_lines[10] = "modified".to_string();
        let new_text = old_lines.join("\n");
        let result = generate_diff_impl(&old, &new_text, 2);
        assert!(result.diff.contains("..."));
    }

    #[test]
    fn test_generate_diff_empty() {
        let result = generate_diff_impl("same", "same", 4);
        assert!(result.diff.is_empty());
        assert!(result.first_changed_line.is_none());
    }
}
|