singularity-forge/rust-engine/crates/engine/src/diff.rs

440 lines
15 KiB
Rust

//! Fuzzy text matching and unified diff generation for the edit tool.
//!
//! Replaces the JS `edit-diff.ts` hot path with native Rust:
//! - `normalizeForFuzzyMatch`: Unicode normalization (smart quotes, dashes, special spaces, trailing whitespace)
//! - `fuzzyFindText`: exact-then-fuzzy substring search
//! - `generateDiff`: unified diff with line numbers and context, matching the JS output format
use napi_derive::napi;
// ---------------------------------------------------------------------------
// normalizeForFuzzyMatch
// ---------------------------------------------------------------------------
/// Normalize text for fuzzy matching:
/// - Strip trailing whitespace from each line
/// - Smart single quotes → '
/// - Smart double quotes → "
/// - Various dashes/hyphens → -
/// - Special Unicode spaces → regular space
#[napi(js_name = "normalizeForFuzzyMatch")]
pub fn normalize_for_fuzzy_match(text: String) -> String {
normalize_impl(&text)
}
fn normalize_impl(text: &str) -> String {
let mut out = String::with_capacity(text.len());
for (i, line) in text.split('\n').enumerate() {
if i > 0 {
out.push('\n');
}
let trimmed = line.trim_end();
for ch in trimmed.chars() {
out.push(normalize_char(ch));
}
}
out
}
#[inline]
fn normalize_char(ch: char) -> char {
match ch {
// Smart single quotes → '
'\u{2018}' | '\u{2019}' | '\u{201A}' | '\u{201B}' => '\'',
// Smart double quotes → "
'\u{201C}' | '\u{201D}' | '\u{201E}' | '\u{201F}' => '"',
// Various dashes/hyphens → -
'\u{2010}' | '\u{2011}' | '\u{2012}' | '\u{2013}' | '\u{2014}' | '\u{2015}'
| '\u{2212}' => '-',
// Special spaces → regular space
'\u{00A0}' | '\u{2002}' | '\u{2003}' | '\u{2004}' | '\u{2005}' | '\u{2006}'
| '\u{2007}' | '\u{2008}' | '\u{2009}' | '\u{200A}' | '\u{202F}' | '\u{205F}'
| '\u{3000}' => ' ',
_ => ch,
}
}
// ---------------------------------------------------------------------------
// fuzzyFindText
// ---------------------------------------------------------------------------
#[napi(object)]
pub struct FuzzyMatchResult {
pub found: bool,
pub index: i32,
pub match_length: i32,
pub used_fuzzy_match: bool,
/// When exact match: original content. When fuzzy match: normalized content.
pub content_for_replacement: String,
}
/// Convert a UTF-8 byte offset to a JS string index (UTF-16 code unit offset).
fn byte_offset_to_utf16(s: &str, byte_offset: usize) -> usize {
s[..byte_offset].chars().map(|c| c.len_utf16()).sum()
}
/// Get the UTF-16 code unit length of a UTF-8 string.
fn utf16_len(s: &str) -> usize {
s.chars().map(|c| c.len_utf16()).sum()
}
/// Find `old_text` in `content`, trying exact match first, then fuzzy match.
///
/// Returns indices and lengths as UTF-16 code unit offsets (compatible with
/// JS `String.prototype.substring()`).
///
/// When fuzzy matching is used, `content_for_replacement` is the normalized
/// version of `content` (trailing whitespace stripped, Unicode quotes/dashes
/// normalized to ASCII).
#[napi(js_name = "fuzzyFindText")]
pub fn fuzzy_find_text(content: String, old_text: String) -> FuzzyMatchResult {
// Try exact match first
if let Some(byte_idx) = content.find(&old_text) {
return FuzzyMatchResult {
found: true,
index: byte_offset_to_utf16(&content, byte_idx) as i32,
match_length: utf16_len(&old_text) as i32,
used_fuzzy_match: false,
content_for_replacement: content,
};
}
// Try fuzzy match
let fuzzy_content = normalize_impl(&content);
let fuzzy_old_text = normalize_impl(&old_text);
if let Some(byte_idx) = fuzzy_content.find(&fuzzy_old_text) {
FuzzyMatchResult {
found: true,
index: byte_offset_to_utf16(&fuzzy_content, byte_idx) as i32,
match_length: utf16_len(&fuzzy_old_text) as i32,
used_fuzzy_match: true,
content_for_replacement: fuzzy_content,
}
} else {
FuzzyMatchResult {
found: false,
index: -1,
match_length: 0,
used_fuzzy_match: false,
content_for_replacement: content,
}
}
}
// ---------------------------------------------------------------------------
// generateDiff
// ---------------------------------------------------------------------------
#[napi(object)]
pub struct DiffResult {
pub diff: String,
pub first_changed_line: Option<i32>,
}
/// Generate a unified diff string with line numbers and context.
///
/// Uses the `similar` crate (Myers' diff algorithm with optimizations).
/// Output format matches the JS `generateDiffString`:
/// - `+N line` for additions
/// - `-N line` for removals
/// - ` N line` for context
/// - ` ... ` for skipped context
#[napi(js_name = "generateDiff")]
pub fn generate_diff(
old_content: String,
new_content: String,
context_lines: Option<u32>,
) -> DiffResult {
let context = context_lines.unwrap_or(4) as usize;
generate_diff_impl(&old_content, &new_content, context)
}
fn generate_diff_impl(old_content: &str, new_content: &str, context_lines: usize) -> DiffResult {
let old_lines: Vec<&str> = old_content.split('\n').collect();
let new_lines: Vec<&str> = new_content.split('\n').collect();
let max_line_num = old_lines.len().max(new_lines.len());
let line_num_width = if max_line_num == 0 {
1
} else {
max_line_num.to_string().len()
};
// Use similar crate for diffing
let diff = similar::TextDiff::configure()
.algorithm(similar::Algorithm::Myers)
.diff_lines(old_content, new_content);
let mut output: Vec<String> = Vec::new();
let mut old_line_num: usize = 1;
let mut new_line_num: usize = 1;
let mut last_was_change = false;
let mut first_changed_line: Option<i32> = None;
// Build parts from diff ops, matching the JS `diff` npm package structure
#[derive(Debug)]
enum PartTag {
Equal,
Added,
Removed,
}
struct Part {
tag: PartTag,
lines: Vec<String>,
}
let mut parts: Vec<Part> = Vec::new();
for op in diff.ops() {
match op {
similar::DiffOp::Equal { old_index, len, .. } => {
let lines: Vec<String> = old_lines[*old_index..*old_index + *len]
.iter()
.map(|s| s.to_string())
.collect();
parts.push(Part {
tag: PartTag::Equal,
lines,
});
}
similar::DiffOp::Delete {
old_index, old_len, ..
} => {
let lines: Vec<String> = old_lines[*old_index..*old_index + *old_len]
.iter()
.map(|s| s.to_string())
.collect();
parts.push(Part {
tag: PartTag::Removed,
lines,
});
}
similar::DiffOp::Insert {
new_index, new_len, ..
} => {
let lines: Vec<String> = new_lines[*new_index..*new_index + *new_len]
.iter()
.map(|s| s.to_string())
.collect();
parts.push(Part {
tag: PartTag::Added,
lines,
});
}
similar::DiffOp::Replace {
old_index,
old_len,
new_index,
new_len,
..
} => {
let del_lines: Vec<String> = old_lines[*old_index..*old_index + *old_len]
.iter()
.map(|s| s.to_string())
.collect();
parts.push(Part {
tag: PartTag::Removed,
lines: del_lines,
});
let ins_lines: Vec<String> = new_lines[*new_index..*new_index + *new_len]
.iter()
.map(|s| s.to_string())
.collect();
parts.push(Part {
tag: PartTag::Added,
lines: ins_lines,
});
}
}
}
for (i, part) in parts.iter().enumerate() {
let raw = &part.lines;
match part.tag {
PartTag::Added | PartTag::Removed => {
if first_changed_line.is_none() {
first_changed_line = Some(new_line_num as i32);
}
for line in raw {
match part.tag {
PartTag::Added => {
let num = format!("{:>width$}", new_line_num, width = line_num_width);
output.push(format!("+{} {}", num, line));
new_line_num += 1;
}
PartTag::Removed => {
let num = format!("{:>width$}", old_line_num, width = line_num_width);
output.push(format!("-{} {}", num, line));
old_line_num += 1;
}
_ => unreachable!(),
}
}
last_was_change = true;
}
PartTag::Equal => {
let next_part_is_change = i < parts.len() - 1
&& matches!(parts[i + 1].tag, PartTag::Added | PartTag::Removed);
if last_was_change || next_part_is_change {
let mut lines_to_show = raw.as_slice();
let mut skip_start = 0usize;
let mut skip_end = 0usize;
if !last_was_change {
// Show only last N lines as leading context
skip_start = raw.len().saturating_sub(context_lines);
lines_to_show = &raw[skip_start..];
}
if !next_part_is_change && lines_to_show.len() > context_lines {
// Show only first N lines as trailing context
skip_end = lines_to_show.len() - context_lines;
lines_to_show = &lines_to_show[..context_lines];
}
if skip_start > 0 {
output.push(format!(" {:>width$} ...", "", width = line_num_width));
old_line_num += skip_start;
new_line_num += skip_start;
}
for line in lines_to_show {
let num = format!("{:>width$}", old_line_num, width = line_num_width);
output.push(format!(" {} {}", num, line));
old_line_num += 1;
new_line_num += 1;
}
if skip_end > 0 {
output.push(format!(" {:>width$} ...", "", width = line_num_width));
old_line_num += skip_end;
new_line_num += skip_end;
}
} else {
old_line_num += raw.len();
new_line_num += raw.len();
}
last_was_change = false;
}
}
}
DiffResult {
diff: output.join("\n"),
first_changed_line,
}
}
// ---------------------------------------------------------------------------
// Tests
// ---------------------------------------------------------------------------
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_normalize_smart_quotes() {
let input = "\u{201C}hello\u{201D} \u{2018}world\u{2019}";
assert_eq!(normalize_impl(input), "\"hello\" 'world'");
}
#[test]
fn test_normalize_dashes() {
let input = "a\u{2013}b\u{2014}c\u{2212}d";
assert_eq!(normalize_impl(input), "a-b-c-d");
}
#[test]
fn test_normalize_special_spaces() {
let input = "a\u{00A0}b\u{2003}c\u{3000}d";
assert_eq!(normalize_impl(input), "a b c d");
}
#[test]
fn test_normalize_trailing_whitespace() {
let input = "hello \nworld ";
assert_eq!(normalize_impl(input), "hello\nworld");
}
#[test]
fn test_fuzzy_find_exact() {
let result = fuzzy_find_text("hello world".to_string(), "world".to_string());
assert!(result.found);
assert_eq!(result.index, 6);
assert_eq!(result.match_length, 5);
assert!(!result.used_fuzzy_match);
}
#[test]
fn test_fuzzy_find_with_smart_quotes() {
let content = "let x = \u{201C}hello\u{201D};".to_string();
let old_text = "let x = \"hello\";".to_string();
let result = fuzzy_find_text(content, old_text);
assert!(result.found);
assert!(result.used_fuzzy_match);
}
#[test]
fn test_fuzzy_find_not_found() {
let result = fuzzy_find_text("hello world".to_string(), "xyz".to_string());
assert!(!result.found);
assert_eq!(result.index, -1);
}
#[test]
fn test_generate_diff_basic() {
let old = "line1\nline2\nline3";
let new_text = "line1\nmodified\nline3";
let result = generate_diff_impl(old, new_text, 4);
assert!(result.diff.contains("-"));
assert!(result.diff.contains("+"));
assert!(result.diff.contains("line2"));
assert!(result.diff.contains("modified"));
assert!(result.first_changed_line.is_some());
}
#[test]
fn test_generate_diff_addition() {
let old = "line1\nline3";
let new_text = "line1\nline2\nline3";
let result = generate_diff_impl(old, new_text, 4);
assert!(result.diff.contains("+"));
assert!(result.diff.contains("line2"));
}
#[test]
fn test_generate_diff_deletion() {
let old = "line1\nline2\nline3";
let new_text = "line1\nline3";
let result = generate_diff_impl(old, new_text, 4);
assert!(result.diff.contains("-"));
assert!(result.diff.contains("line2"));
}
#[test]
fn test_generate_diff_context_ellipsis() {
let mut old_lines: Vec<String> = (1..=20).map(|i| format!("line{}", i)).collect();
let old = old_lines.join("\n");
old_lines[10] = "modified".to_string();
let new_text = old_lines.join("\n");
let result = generate_diff_impl(&old, &new_text, 2);
assert!(result.diff.contains("..."));
}
#[test]
fn test_generate_diff_empty() {
let result = generate_diff_impl("same", "same", 4);
assert!(result.diff.is_empty());
assert!(result.first_changed_line.is_none());
}
}