From 0b288f389fb57b85caa3dbd8b9d93802a6e37bce Mon Sep 17 00:00:00 2001 From: Lex Christopherson Date: Fri, 13 Mar 2026 12:40:22 -0600 Subject: [PATCH] feat: add html-to-markdown native module Port HTML-to-Markdown conversion from Oh My Pi's html module using html-to-markdown-rs. Exposes `htmlToMarkdown()` via N-API with options for content cleaning (strip nav/forms/headers/footers) and image skipping. Co-Authored-By: Claude Opus 4.6 (1M context) --- native/Cargo.lock | 931 ++++++++++++++++++++ native/crates/engine/Cargo.toml | 1 + native/crates/engine/src/html.rs | 44 + native/crates/engine/src/lib.rs | 1 + packages/native/package.json | 6 +- packages/native/src/__tests__/html.test.mjs | 98 +++ packages/native/src/html/index.ts | 24 + packages/native/src/html/types.ts | 7 + packages/native/src/index.ts | 4 + packages/native/src/native.ts | 1 + 10 files changed, 1116 insertions(+), 1 deletion(-) create mode 100644 native/crates/engine/src/html.rs create mode 100644 packages/native/src/__tests__/html.test.mjs create mode 100644 packages/native/src/html/index.ts create mode 100644 packages/native/src/html/types.ts diff --git a/native/Cargo.lock b/native/Cargo.lock index ba8fa03da..be0931d5b 100644 --- a/native/Cargo.lock +++ b/native/Cargo.lock @@ -2,6 +2,19 @@ # It is not intended for manual editing. version = 4 +[[package]] +name = "ahash" +version = "0.8.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a15f179cd60c4584b8a8c596927aadc462e27f2ca70c04e0071964a73ba7a75" +dependencies = [ + "cfg-if", + "const-random", + "once_cell", + "version_check", + "zerocopy", +] + [[package]] name = "aho-corasick" version = "1.1.4" @@ -11,6 +24,54 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "683d7910e743518b0e34f1186f92494becacb047c7b6bf616c96772180fef923" + +[[package]] +name = "ast-grep-core" +version = "0.39.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "057ae90e7256ebf85f840b1638268df0142c9d19467d500b790631fd301acc27" +dependencies = [ + "bit-set", + "regex", + "thiserror", + "tree-sitter", +] + +[[package]] +name = "astral-tl" +version = "0.7.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d90933ffb0f97e2fc2e0de21da9d3f20597b804012d199843a6fe7c2810d28f3" +dependencies = [ + "memchr", +] + +[[package]] +name = "base64" +version = "0.22.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "72b3254f16251a8381aa12e40e3c4d2f0199f8c6508fbecb9d91f575e0fbb8c6" + +[[package]] +name = "bit-set" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "08807e080ed7f9d5433fa9b275196cfc35414f66a0c79d864dc51a0d825231a3" +dependencies = [ + "bit-vec", +] + +[[package]] +name = "bit-vec" +version = "0.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5e764a1d40d510daf35e07be9eb06e75770908c27d411ee6c92109c9840eaaf7" + [[package]] name = "bitflags" version = "2.11.0" @@ -28,12 +89,42 @@ dependencies = [ "serde", ] +[[package]] +name = "cc" +version = "1.2.56" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "aebf35691d1bfb0ac386a69bac2fde4dd276fb618cf8bf4f5318fe285e821bb2" +dependencies = [ + "find-msvc-tools", + "shlex", +] + [[package]] name = "cfg-if" version = "1.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9330f8b2ff13f34540b44e946ef35111825727b38d33286ef986142615121801" +[[package]] +name = "const-random" +version = "0.1.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "87e00182fe74b066627d63b85fd550ac2998d4b0bd86bfed477a0ae4c7c71359" +dependencies = [ + "const-random-macro", +] + +[[package]] +name = "const-random-macro" +version = "0.1.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f9d839f2a20b0aee515dc581a6172f2321f96cab76c1a38a4c584a194955390e" +dependencies = [ + "getrandom", + "once_cell", + "tiny-keccak", +] + [[package]] name = "convert_case" version = "0.6.0" @@ -68,6 +159,12 @@ version = "0.8.21" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28" +[[package]] +name = "crunchy" +version = "0.2.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "460fbee9c2c2f33933d720630a6a0bac33ba7053db5344fac858d4b8952d77d5" + [[package]] name = "ctor" version = "0.2.9" @@ -102,6 +199,41 @@ dependencies = [ "encoding_rs", ] +[[package]] +name = "equivalent" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "877a4ace8713b0bcf2a4e7eec82529c029f1d0619886d18145fea96c3ffe5c0f" + +[[package]] +name = "fastrand" +version = "2.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" + +[[package]] +name = "find-msvc-tools" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5baebc0774151f905a1a2cc41989300b1e6fbb29aff0ceffa1064fdd3088d582" + +[[package]] +name = "foldhash" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77ce24cb58228fbb8aa041425bb1050850ac19177686ea6e0f41a70416f56fdb" + +[[package]] +name = "getrandom" +version = "0.2.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ff2abc00be7fca6ebc474524697ae276ad847ad0a6b3faa4bcb027e9a4614ad0" +dependencies = [ + "cfg-if", + "libc", + "wasi", +] + [[package]] name = "globset" version = "0.4.18" @@ -152,11 +284,62 @@ dependencies = [ "memmap2", ] +[[package]] +name = "gsd-ast" +version = "0.1.0" +dependencies = [ + "ast-grep-core", + "globset", + "ignore", + "napi", + "napi-derive", + "phf", + "tree-sitter", + "tree-sitter-bash", + "tree-sitter-c", + "tree-sitter-c-sharp", + "tree-sitter-cpp", + "tree-sitter-css", + "tree-sitter-diff", + "tree-sitter-elixir", + "tree-sitter-go", + "tree-sitter-haskell", + "tree-sitter-hcl", + "tree-sitter-html", + "tree-sitter-java", + "tree-sitter-javascript", + "tree-sitter-json", + "tree-sitter-julia", + "tree-sitter-kotlin-sg", + "tree-sitter-lua", + "tree-sitter-make", + "tree-sitter-md", + "tree-sitter-nix", + "tree-sitter-objc", + "tree-sitter-odin", + "tree-sitter-php", + "tree-sitter-python", + "tree-sitter-regex", + "tree-sitter-ruby", + "tree-sitter-rust", + "tree-sitter-scala", + "tree-sitter-solidity", + "tree-sitter-starlark", + "tree-sitter-swift", + "tree-sitter-toml-ng", + "tree-sitter-typescript", + "tree-sitter-verilog", + "tree-sitter-xml", + "tree-sitter-yaml", + "tree-sitter-zig", +] + [[package]] name = "gsd-engine" version = "0.1.0" dependencies = [ "gsd-grep", + "html-to-markdown-rs", "napi", "napi-build", "napi-derive", @@ -173,6 +356,53 @@ dependencies = [ "rayon", ] +[[package]] +name = "hashbrown" +version = "0.16.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "841d1cc9bed7f9236f321df977030373f4a4163ae1a7dbfe1a51a2c1a51d9100" +dependencies = [ + "allocator-api2", + "equivalent", + "foldhash", +] + +[[package]] +name = "html-escape" +version = "0.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6d1ad449764d627e22bfd7cd5e8868264fc9236e07c752972b4080cd351cb476" +dependencies = [ + "utf8-width", +] + +[[package]] +name = "html-to-markdown-rs" +version = "2.28.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9377e16af590b764fd98fd176027cf8831c5335f8964f3f643753e38913a4e" +dependencies = [ + "ahash", + "astral-tl", + "base64", + "html-escape", + "html5ever", + "lru", + "once_cell", + "regex", + "thiserror", +] + +[[package]] +name = "html5ever" +version = "0.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1054432bae2f14e0061e33d23402fbaa67a921d319d56adc6bcf887ddad1cbc2" +dependencies = [ + "log", + "markup5ever", +] + [[package]] name = "ignore" version = "0.4.25" @@ -189,6 +419,22 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "indexmap" +version = "2.13.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7714e70437a7dc3ac8eb7e6f8df75fd8eb422675fc7678aff7364301092b1017" +dependencies = [ + "equivalent", + "hashbrown", +] + +[[package]] +name = "itoa" +version = "1.0.17" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "92ecc6618181def0457392ccd0ee51198e065e016d1d527a7ac1b6dc7c1f09d2" + [[package]] name = "libc" version = "0.2.183" @@ -205,12 +451,41 @@ dependencies = [ "windows-link", ] +[[package]] +name = "lock_api" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "224399e74b87b5f3557511d98dff8b14089b3dadafcab6bb93eab67d3aace965" +dependencies = [ + "scopeguard", +] + [[package]] name = "log" version = "0.4.29" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5e5032e24019045c762d3c0f28f5b6b8bbf38563a65908389bf7978758920897" +[[package]] +name = "lru" +version = "0.16.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a1dc47f592c06f33f8e3aea9591776ec7c9f9e4124778ff8a3c3b87159f7e593" +dependencies = [ + "hashbrown", +] + +[[package]] +name = "markup5ever" +version = "0.38.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8983d30f2915feeaaab2d6babdd6bc7e9ed1a00b66b5e6d74df19aa9c0e91862" +dependencies = [ + "log", + "tendril", + "web_atoms", +] + [[package]] name = "memchr" version = "2.8.0" @@ -283,12 +558,100 @@ dependencies = [ "libloading", ] +[[package]] +name = "new_debug_unreachable" +version = "1.0.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" + [[package]] name = "once_cell" version = "1.21.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9f7c3e4beb33f85d45ae3e3a1792185706c8e16d043238c593331cc7cd313b50" +[[package]] +name = "parking_lot" +version = "0.12.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "93857453250e3077bd71ff98b6a65ea6621a19bb0f559a85248955ac12c45a1a" +dependencies = [ + "lock_api", + "parking_lot_core", +] + +[[package]] +name = "parking_lot_core" +version = "0.9.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2621685985a2ebf1c516881c026032ac7deafcda1a2c9b7850dc81e3dfcb64c1" +dependencies = [ + "cfg-if", + "libc", + "redox_syscall", + "smallvec", + "windows-link", +] + +[[package]] +name = "phf" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c1562dc717473dbaa4c1f85a36410e03c047b2e7df7f45ee938fbef64ae7fadf" +dependencies = [ + "phf_macros", + "phf_shared", + "serde", +] + +[[package]] +name = "phf_codegen" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "49aa7f9d80421bca176ca8dbfebe668cc7a2684708594ec9f3c0db0805d5d6e1" +dependencies = [ + "phf_generator", + "phf_shared", +] + +[[package]] +name = "phf_generator" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "135ace3a761e564ec88c03a77317a7c6b80bb7f7135ef2544dbe054243b89737" +dependencies = [ + "fastrand", + "phf_shared", +] + +[[package]] +name = "phf_macros" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "812f032b54b1e759ccd5f8b6677695d5268c588701effba24601f6932f8269ef" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "phf_shared" +version = "0.13.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e57fef6bc5981e38c2ce2d63bfa546861309f875b8a75f092d1d54ae2d64f266" +dependencies = [ + "siphasher", +] + +[[package]] +name = "precomputed-hash" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" + [[package]] name = "proc-macro2" version = "1.0.106" @@ -327,6 +690,15 @@ dependencies = [ "crossbeam-utils", ] +[[package]] +name = "redox_syscall" +version = "0.5.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ed2bf2547551a7053d6fdfafda3f938979645c44812fbfcda098faae3f1a362d" +dependencies = [ + "bitflags", +] + [[package]] name = "regex" version = "1.12.3" @@ -365,6 +737,12 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "scopeguard" +version = "1.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" + [[package]] name = "semver" version = "1.0.27" @@ -400,6 +778,68 @@ dependencies = [ "syn", ] +[[package]] +name = "serde_json" +version = "1.0.149" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "83fc039473c5595ace860d8c4fafa220ff474b3fc6bfdb4293327f1a37e94d86" +dependencies = [ + "indexmap", + "itoa", + "memchr", + "serde", + "serde_core", + "zmij", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + +[[package]] +name = "siphasher" +version = "1.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b2aa850e253778c88a04c3d7323b043aeda9d3e30d5971937c1855769763678e" + +[[package]] +name = "smallvec" +version = "1.15.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67b1b7a3b5fe4f1376887184045fcf45c69e92af734b7aaddc05fb777b6fbd03" + +[[package]] +name = "streaming-iterator" +version = "0.1.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b2231b7c3057d5e4ad0156fb3dc807d900806020c5ffa3ee6ff2c8c76fb8520" + +[[package]] +name = "string_cache" +version = "0.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a18596f8c785a729f2819c0f6a7eae6ebeebdfffbfe4214ae6b087f690e31901" +dependencies = [ + "new_debug_unreachable", + "parking_lot", + "phf_shared", + "precomputed-hash", +] + +[[package]] +name = "string_cache_codegen" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "585635e46db231059f76c5849798146164652513eb9e8ab2685939dd90f29b69" +dependencies = [ + "phf_generator", + "phf_shared", + "proc-macro2", + "quote", +] + [[package]] name = "syn" version = "2.0.117" @@ -411,6 +851,435 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "tendril" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c4790fc369d5a530f4b544b094e31388b9b3a37c0f4652ade4505945f5660d24" +dependencies = [ + "new_debug_unreachable", + "utf-8", +] + +[[package]] +name = "thiserror" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4288b5bcbc7920c07a1149a35cf9590a2aa808e0bc1eafaade0b80947865fbc4" +dependencies = [ + "thiserror-impl", +] + +[[package]] +name = "thiserror-impl" +version = "2.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ebc4ee7f67670e9b64d05fa4253e753e016c6c95ff35b89b7941d6b856dec1d5" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "tiny-keccak" +version = "2.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2c9d3793400a45f954c52e73d068316d76b6f4e36977e3fcebb13a2721e80237" +dependencies = [ + "crunchy", +] + +[[package]] +name = "tree-sitter" +version = "0.25.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "78f873475d258561b06f1c595d93308a7ed124d9977cb26b148c2084a4a3cc87" +dependencies = [ + "cc", + "regex", + "regex-syntax", + "serde_json", + "streaming-iterator", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-bash" +version = "0.25.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9e5ec769279cc91b561d3df0d8a5deb26b0ad40d183127f409494d6d8fc53062" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-c" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1a3aad8f0129083a59fe8596157552d2bb7148c492d44c21558d68ca1c722707" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-c-sharp" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "67f06accca7b45351758663b8215089e643d53bd9a660ce0349314263737fcb0" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-cpp" +version = "0.23.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "df2196ea9d47b4ab4a31b9297eaa5a5d19a0b121dceb9f118f6790ad0ab94743" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-css" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a5cbc5e18f29a2c6d6435891f42569525cf95435a3e01c2f1947abcde178686f" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-diff" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "dfe1e5ca280a65dfe5ba4205c1bcc84edf486464fed315db53dee6da9a335889" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-elixir" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "66dd064a762ed95bfc29857fa3cb7403bb1e5cb88112de0f6341b7e47284ba40" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-go" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c8560a4d2f835cc0d4d2c2e03cbd0dde2f6114b43bc491164238d333e28b16ea" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-haskell" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "977c51e504548cba13fc27cb5a2edab2124cf6716a1934915d07ab99523b05a4" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-hcl" +version = "1.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5a7b2cc3d7121553b84309fab9d11b3ff3d420403eef9ae50f9fd1cd9d9cf012" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-html" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "261b708e5d92061ede329babaaa427b819329a9d427a1d710abb0f67bbef63ee" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-java" +version = "0.23.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0aa6cbcdc8c679b214e616fd3300da67da0e492e066df01bcf5a5921a71e90d6" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68204f2abc0627a90bdf06e605f5c470aa26fdcb2081ea553a04bdad756693f5" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-json" +version = "0.23.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "86a5d6b3ea17e06e7a34aabeadd68f5866c0d0f9359155d432095f8b751865e4" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-julia" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4144731a178812ee867619b1e98b3b91e54c1652304b26e5ebe3175b701de323" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-kotlin-sg" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a0e175b7530765d1e36ad234a7acaa8b2a3316153f239d724376c7ee5e8d8e98" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-language" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "009994f150cc0cd50ff54917d5bc8bffe8cad10ca10d81c34da2ec421ae61782" + +[[package]] +name = "tree-sitter-lua" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5cdb9adf0965fec58e7660cbb3a059dbb12ebeec9459e6dcbae3db004739641e" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-make" +version = "1.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c5998dc7cbcbdab19fae8aefef982bf2d6544513d8d2e69cc44aec4c63810104" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-md" +version = "0.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2efd398be546456c814598ee56c0f51769a77241511b4a58077815d120afa882" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-nix" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4952a9733f3a98f6683a0ccd1035d84ab7a52f7e84eeed58548d86765ad92de3" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-objc" +version = "3.0.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9ca8bb556423fc176f0535e79d525f783a6684d3c9da81bf9d905303c129e1d2" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-odin" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "24db210fe9ba2237c71c5030d7b146c7025420ba72dd8013d13cd822c3a8d77a" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-php" +version = "0.24.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0d8c17c3ab69052c5eeaa7ff5cd972dd1bc25d1b97ee779fec391ad3b5df5592" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-python" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6bf85fd39652e740bf60f46f4cda9492c3a9ad75880575bf14960f775cb74a1c" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-regex" +version = "0.25.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bd8a59be9f0ac131fd8f062eaaba14882b2fa5a6a7882a20134cb1d60df2e625" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-ruby" +version = "0.23.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "be0484ea4ef6bb9c575b4fdabde7e31340a8d2dbc7d52b321ac83da703249f95" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-rust" +version = "0.24.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4b9b18034c684a2420722be8b2a91c9c44f2546b631c039edf575ccba8c61be1" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-scala" +version = "0.24.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3b4f354028b5fcf1d0c77f1c6d84cd5a579f29a1e43cb61551ec6580e9a99229" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-solidity" +version = "1.2.13" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4eacf8875b70879f0cb670c60b233ad0b68752d9e1474e6c3ef168eea8a90b25" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-starlark" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8934f282d085cc4b9ee28aa688aa3fbe8aa3766201c2a6252f411d45b4c3a721" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-swift" +version = "0.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4ef216011c3e3df4fa864736f347cb8d509b1066cf0c8549fb1fd81ac9832e59" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-toml-ng" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e9adc2c898ae49730e857d75be403da3f92bb81d8e37a2f918a08dd10de5ebb1" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-typescript" +version = "0.23.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c5f76ed8d947a75cc446d5fccd8b602ebf0cde64ccf2ffa434d873d7a575eff" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-verilog" +version = "1.0.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d4e7e0360395852f1f6ff5b7b82c72dc6557d181073188df1d60ec469ea69c66" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-xml" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e670041f591d994f54d597ddcd8f4ebc930e282c4c76a42268743b71f0c8b6b3" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-yaml" +version = "0.7.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "53c223db85f05e34794f065454843b0668ebc15d240ada63e2b5939f43ce7c97" +dependencies = [ + "cc", + "tree-sitter-language", +] + +[[package]] +name = "tree-sitter-zig" +version = "1.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ab11fc124851b0db4dd5e55983bbd9631192e93238389dcd44521715e5d53e28" +dependencies = [ + "cc", + "tree-sitter-language", +] + [[package]] name = "unicode-ident" version = "1.0.24" @@ -423,6 +1292,24 @@ version = "1.12.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493" +[[package]] +name = "utf-8" +version = "0.7.6" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" + +[[package]] +name = "utf8-width" +version = "0.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1292c0d970b54115d14f2492fe0170adf21d68a1de108eebc51c1df4f346a091" + +[[package]] +name = "version_check" +version = "0.9.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" + [[package]] name = "walkdir" version = "2.5.0" @@ -433,6 +1320,24 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "wasi" +version = "0.11.1+wasi-snapshot-preview1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccf3ec651a847eb01de73ccad15eb7d99f80485de043efb2f370cd654f4ea44b" + +[[package]] +name = "web_atoms" +version = "0.2.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "57a9779e9f04d2ac1ce317aee707aa2f6b773afba7b931222bff6983843b1576" +dependencies = [ + "phf", + "phf_codegen", + "string_cache", + "string_cache_codegen", +] + [[package]] name = "winapi-util" version = "0.1.11" @@ -456,3 +1361,29 @@ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc" dependencies = [ "windows-link", ] + +[[package]] +name = "zerocopy" +version = "0.8.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f2578b716f8a7a858b7f02d5bd870c14bf4ddbbcf3a4c05414ba6503640505e3" +dependencies = [ + "zerocopy-derive", +] + +[[package]] +name = "zerocopy-derive" +version = "0.8.42" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7e6cc098ea4d3bd6246687de65af3f920c430e236bee1e3bf2e441463f08a02f" +dependencies = [ + "proc-macro2", + "quote", + "syn", +] + +[[package]] +name = "zmij" +version = "1.0.21" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b8848ee67ecc8aedbaf3e4122217aff892639231befc6a1b58d29fff4c2cabaa" diff --git a/native/crates/engine/Cargo.toml b/native/crates/engine/Cargo.toml index dcd61ef0c..a6daaca0a 100644 --- a/native/crates/engine/Cargo.toml +++ b/native/crates/engine/Cargo.toml @@ -12,6 +12,7 @@ crate-type = ["cdylib"] [dependencies] gsd-grep = { path = "../grep" } +html-to-markdown-rs = { version = "2", default-features = false } napi = { version = "2", features = ["napi8"] } napi-derive = "2" diff --git a/native/crates/engine/src/html.rs b/native/crates/engine/src/html.rs new file mode 100644 index 000000000..2cc44c047 --- /dev/null +++ b/native/crates/engine/src/html.rs @@ -0,0 +1,44 @@ +//! HTML to Markdown conversion via N-API. +//! +//! Wraps `html-to-markdown-rs` and exposes it as a JS-callable N-API export. + +use html_to_markdown_rs::{convert, ConversionOptions, PreprocessingOptions, PreprocessingPreset}; +use napi::bindgen_prelude::*; +use napi_derive::napi; + +/// Options for HTML to Markdown conversion. +#[napi(object)] +#[derive(Debug, Default)] +pub struct HtmlToMarkdownOptions { + /// Remove navigation elements, forms, headers, footers. + #[napi(js_name = "cleanContent")] + pub clean_content: Option, + /// Skip images during conversion. + #[napi(js_name = "skipImages")] + pub skip_images: Option, +} + +/// Convert HTML source to Markdown with optional preprocessing. +/// +/// Strips boilerplate (nav, forms, headers, footers) when `cleanContent` is true. +/// Returns the Markdown string. +#[napi(js_name = "htmlToMarkdown")] +pub fn html_to_markdown(html: String, options: Option) -> Result { + let options = options.unwrap_or_default(); + let clean_content = options.clean_content.unwrap_or(false); + let skip_images = options.skip_images.unwrap_or(false); + + let conversion_opts = ConversionOptions { + skip_images, + preprocessing: PreprocessingOptions { + enabled: clean_content, + preset: PreprocessingPreset::Aggressive, + remove_navigation: true, + remove_forms: true, + }, + ..Default::default() + }; + + convert(&html, Some(conversion_opts)) + .map_err(|err| Error::from_reason(format!("HTML conversion error: {err}"))) +} diff --git a/native/crates/engine/src/lib.rs b/native/crates/engine/src/lib.rs index 82985849b..f4dca80dc 100644 --- a/native/crates/engine/src/lib.rs +++ b/native/crates/engine/src/lib.rs @@ -9,3 +9,4 @@ #![allow(clippy::needless_pass_by_value)] mod grep; +mod html; diff --git a/packages/native/package.json b/packages/native/package.json index 84de3dfb3..f81b574d6 100644 --- a/packages/native/package.json +++ b/packages/native/package.json @@ -8,7 +8,7 @@ "scripts": { "build:native": "node ../../native/scripts/build.js", "build:native:dev": "node ../../native/scripts/build.js --dev", - "test": "node --test src/__tests__/grep.test.mjs" + "test": "node --test src/__tests__/grep.test.mjs src/__tests__/html.test.mjs" }, "exports": { ".": { @@ -18,6 +18,10 @@ "./grep": { "types": "./src/grep/index.ts", "import": "./src/grep/index.ts" + }, + "./html": { + "types": "./src/html/index.ts", + "import": "./src/html/index.ts" } }, "files": [ diff --git a/packages/native/src/__tests__/html.test.mjs b/packages/native/src/__tests__/html.test.mjs new file mode 100644 index 000000000..31e21c463 --- /dev/null +++ b/packages/native/src/__tests__/html.test.mjs @@ -0,0 +1,98 @@ +import { test, describe } from "node:test"; +import assert from "node:assert/strict"; +import { createRequire } from "node:module"; +import * as path from "node:path"; +import { fileURLToPath } from "node:url"; + +const __dirname = path.dirname(fileURLToPath(import.meta.url)); +const require = createRequire(import.meta.url); + +const addonDir = path.resolve(__dirname, "..", "..", "..", "..", "native", "addon"); +const platformTag = `${process.platform}-${process.arch}`; +const candidates = [ + path.join(addonDir, `gsd_engine.${platformTag}.node`), + path.join(addonDir, "gsd_engine.dev.node"), +]; + +let native; +for (const candidate of candidates) { + try { + native = require(candidate); + break; + } catch { + // try next + } +} + +if (!native) { + console.error("Native addon not found. Run `npm run build:native -w @gsd/native` first."); + process.exit(1); +} + +describe("native html: htmlToMarkdown()", () => { + test("converts basic HTML to markdown", () => { + const html = "

Hello

World

"; + const result = native.htmlToMarkdown(html); + assert.ok(result.includes("Hello"), "Should contain heading text"); + assert.ok(result.includes("World"), "Should contain paragraph text"); + }); + + test("converts links to markdown links", () => { + const html = '

Visit Example

'; + const result = native.htmlToMarkdown(html); + assert.ok(result.includes("[Example]"), "Should contain markdown link text"); + assert.ok(result.includes("(https://example.com)"), "Should contain markdown link URL"); + }); + + test("converts lists to markdown", () => { + const html = "
  • First
  • Second
  • Third
"; + const result = native.htmlToMarkdown(html); + assert.ok(result.includes("First"), "Should contain first item"); + assert.ok(result.includes("Second"), "Should contain second item"); + assert.ok(result.includes("Third"), "Should contain third item"); + }); + + test("converts bold and italic", () => { + const html = "

bold and italic

"; + const result = native.htmlToMarkdown(html); + assert.ok(result.includes("**bold**") || result.includes("__bold__"), "Should contain bold"); + assert.ok(result.includes("*italic*") || result.includes("_italic_"), "Should contain italic"); + }); + + test("handles empty HTML", () => { + const result = native.htmlToMarkdown(""); + assert.equal(typeof result, "string"); + }); + + test("handles plain text", () => { + const result = native.htmlToMarkdown("Just plain text"); + assert.ok(result.includes("Just plain text"), "Should preserve plain text"); + }); + + test("accepts skipImages option", () => { + const html = '

Title

Content with photo image

'; + const result = native.htmlToMarkdown(html, { skipImages: true }); + assert.ok(result.includes("Title"), "Should contain heading"); + assert.ok(result.includes("Content"), "Should contain paragraph text"); + }); + + test("accepts cleanContent option", () => { + const html = '

Article

Body text.

Copyright
'; + const result = native.htmlToMarkdown(html, { cleanContent: true }); + assert.ok(result.includes("Article") || result.includes("Body text"), "Should contain main content"); + }); + + test("converts code blocks", () => { + const html = "
const x = 1;
"; + const result = native.htmlToMarkdown(html); + assert.ok(result.includes("const x = 1;"), "Should contain code content"); + }); + + test("converts complex nested HTML", () => { + const html = '

Section

Text with bold link.

  • Item one
  • Item two
'; + const result = native.htmlToMarkdown(html); + assert.ok(result.includes("Section"), "Should contain heading"); + assert.ok(result.includes("example.com"), "Should contain link"); + assert.ok(result.includes("one"), "Should contain list items"); + }); +}); diff --git a/packages/native/src/html/index.ts b/packages/native/src/html/index.ts new file mode 100644 index 000000000..28886b7a2 --- /dev/null +++ b/packages/native/src/html/index.ts @@ -0,0 +1,24 @@ +/** + * HTML to Markdown conversion via native Rust bindings. + * + * Uses `html-to-markdown-rs` under the hood for high-performance + * conversion with optional content cleaning (stripping nav, forms, etc.). + */ + +import { native } from "../native.js"; +import type { HtmlToMarkdownOptions } from "./types.js"; + +export type { HtmlToMarkdownOptions }; + +/** + * Convert an HTML string to Markdown. + * + * When `cleanContent` is true, boilerplate elements (nav, forms, headers, + * footers) are stripped before conversion. + */ +export function htmlToMarkdown( + html: string, + options?: HtmlToMarkdownOptions, +): string { + return native.htmlToMarkdown(html, options ?? {}) as string; +} diff --git a/packages/native/src/html/types.ts b/packages/native/src/html/types.ts new file mode 100644 index 000000000..a8984c7a8 --- /dev/null +++ b/packages/native/src/html/types.ts @@ -0,0 +1,7 @@ +/** Options for HTML to Markdown conversion. */ +export interface HtmlToMarkdownOptions { + /** Remove navigation elements, forms, headers, footers. */ + cleanContent?: boolean; + /** Skip images during conversion. */ + skipImages?: boolean; +} diff --git a/packages/native/src/index.ts b/packages/native/src/index.ts index 3c5cfdf83..8cbc7f81d 100644 --- a/packages/native/src/index.ts +++ b/packages/native/src/index.ts @@ -3,6 +3,7 @@ * * Modules: * - grep: ripgrep-backed regex search (content + filesystem) + * - html: HTML to Markdown conversion */ export { searchContent, grep } from "./grep/index.js"; @@ -15,3 +16,6 @@ export type { SearchOptions, SearchResult, } from "./grep/index.js"; + +export { htmlToMarkdown } from "./html/index.js"; +export type { HtmlToMarkdownOptions } from "./html/index.js"; diff --git a/packages/native/src/native.ts b/packages/native/src/native.ts index 93aa1a09d..17a4aa1b2 100644 --- a/packages/native/src/native.ts +++ b/packages/native/src/native.ts @@ -43,4 +43,5 @@ function loadNative(): Record { export const native = loadNative() as { search: (content: Buffer | Uint8Array, options: unknown) => unknown; grep: (options: unknown) => unknown; + htmlToMarkdown: (html: string, options: unknown) => unknown; };