feat: add html-to-markdown native module

Port HTML-to-Markdown conversion from Oh My Pi's html module using
html-to-markdown-rs. Exposes `htmlToMarkdown()` via N-API with options
for content cleaning (strip nav/forms/headers/footers) and image skipping.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Lex Christopherson 2026-03-13 12:40:22 -06:00
parent 0d390688e3
commit a74d2061c1
10 changed files with 1116 additions and 1 deletions

931
native/Cargo.lock generated

File diff suppressed because it is too large Load diff

View file

@ -12,6 +12,7 @@ crate-type = ["cdylib"]
[dependencies]
gsd-grep = { path = "../grep" }
html-to-markdown-rs = { version = "2", default-features = false }
napi = { version = "2", features = ["napi8"] }
napi-derive = "2"

View file

@ -0,0 +1,44 @@
//! HTML to Markdown conversion via N-API.
//!
//! Wraps `html-to-markdown-rs` and exposes it as a JS-callable N-API export.
use html_to_markdown_rs::{convert, ConversionOptions, PreprocessingOptions, PreprocessingPreset};
use napi::bindgen_prelude::*;
use napi_derive::napi;
/// Options for HTML to Markdown conversion.
///
/// Passed from JavaScript as a plain object. Every field is optional; the
/// conversion function treats an omitted field as `false`.
#[napi(object)]
#[derive(Debug, Default)]
pub struct HtmlToMarkdownOptions {
/// Enable the converter's preprocessing pass, which is configured with the
/// aggressive preset plus navigation and form removal.
///
/// NOTE(review): header/footer stripping is presumably part of the
/// aggressive preset in `html-to-markdown-rs` — confirm against the crate.
#[napi(js_name = "cleanContent")]
pub clean_content: Option<bool>,
/// Skip images during conversion. Forwarded to the converter's
/// `skip_images` setting.
#[napi(js_name = "skipImages")]
pub skip_images: Option<bool>,
}
/// Convert HTML source to Markdown with optional preprocessing.
///
/// Strips boilerplate (nav, forms, headers, footers) when `cleanContent` is true.
/// Returns the Markdown string.
#[napi(js_name = "htmlToMarkdown")]
pub fn html_to_markdown(html: String, options: Option<HtmlToMarkdownOptions>) -> Result<String> {
let options = options.unwrap_or_default();
let clean_content = options.clean_content.unwrap_or(false);
let skip_images = options.skip_images.unwrap_or(false);
let conversion_opts = ConversionOptions {
skip_images,
preprocessing: PreprocessingOptions {
enabled: clean_content,
preset: PreprocessingPreset::Aggressive,
remove_navigation: true,
remove_forms: true,
},
..Default::default()
};
convert(&html, Some(conversion_opts))
.map_err(|err| Error::from_reason(format!("HTML conversion error: {err}")))
}

View file

@ -9,3 +9,4 @@
#![allow(clippy::needless_pass_by_value)]
mod grep;
mod html;

View file

@ -8,7 +8,7 @@
"scripts": {
"build:native": "node ../../native/scripts/build.js",
"build:native:dev": "node ../../native/scripts/build.js --dev",
"test": "node --test src/__tests__/grep.test.mjs"
"test": "node --test src/__tests__/grep.test.mjs src/__tests__/html.test.mjs"
},
"exports": {
".": {
@ -18,6 +18,10 @@
"./grep": {
"types": "./src/grep/index.ts",
"import": "./src/grep/index.ts"
},
"./html": {
"types": "./src/html/index.ts",
"import": "./src/html/index.ts"
}
},
"files": [

View file

@ -0,0 +1,98 @@
import { test, describe } from "node:test";
import assert from "node:assert/strict";
import { createRequire } from "node:module";
import * as path from "node:path";
import { fileURLToPath } from "node:url";
// ESM has no __dirname/require; rebuild both from this module's URL.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const require = createRequire(import.meta.url);

// Resolve the `native/addon` directory four levels above this file's directory.
const addonDir = path.resolve(__dirname, "..", "..", "..", "..", "native", "addon");
const platformTag = `${process.platform}-${process.arch}`;

// Try the platform-specific build first, then fall back to the dev build.
const candidates = [
  path.join(addonDir, `gsd_engine.${platformTag}.node`),
  path.join(addonDir, "gsd_engine.dev.node"),
];

let native;
// Keep every load failure so a broken (but present) addon is not misreported
// as simply missing.
const loadFailures = [];
for (const candidate of candidates) {
  try {
    native = require(candidate);
    break;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    loadFailures.push(`${candidate}: ${reason}`);
  }
}

if (!native) {
  console.error("Native addon not found. Run `npm run build:native -w @gsd/native` first.");
  for (const failure of loadFailures) {
    console.error(`  tried ${failure}`);
  }
  process.exit(1);
}
// Exercises the native `htmlToMarkdown` export. Most checks are substring
// based so they do not depend on the converter's exact whitespace or choice
// of Markdown markers.
describe("native html: htmlToMarkdown()", () => {
  test("converts basic HTML to markdown", () => {
    const html = "<h1>Hello</h1><p>World</p>";
    const result = native.htmlToMarkdown(html);
    assert.ok(result.includes("Hello"), "Should contain heading text");
    assert.ok(result.includes("World"), "Should contain paragraph text");
  });

  test("converts links to markdown links", () => {
    const html = '<p>Visit <a href="https://example.com">Example</a></p>';
    const result = native.htmlToMarkdown(html);
    assert.ok(result.includes("[Example]"), "Should contain markdown link text");
    assert.ok(result.includes("(https://example.com)"), "Should contain markdown link URL");
  });

  test("converts lists to markdown", () => {
    const html = "<ul><li>First</li><li>Second</li><li>Third</li></ul>";
    const result = native.htmlToMarkdown(html);
    assert.ok(result.includes("First"), "Should contain first item");
    assert.ok(result.includes("Second"), "Should contain second item");
    assert.ok(result.includes("Third"), "Should contain third item");
  });

  test("converts bold and italic", () => {
    const html = "<p><strong>bold</strong> and <em>italic</em></p>";
    const result = native.htmlToMarkdown(html);
    // Either Markdown emphasis flavour is acceptable.
    assert.ok(result.includes("**bold**") || result.includes("__bold__"), "Should contain bold");
    assert.ok(result.includes("*italic*") || result.includes("_italic_"), "Should contain italic");
  });

  test("handles empty HTML", () => {
    const result = native.htmlToMarkdown("");
    assert.equal(typeof result, "string");
  });

  test("handles plain text", () => {
    const result = native.htmlToMarkdown("Just plain text");
    assert.ok(result.includes("Just plain text"), "Should preserve plain text");
  });

  test("accepts skipImages option", () => {
    const html = '<h1>Title</h1><p>Content with <img src="photo.jpg" alt="photo"> image</p>';
    const result = native.htmlToMarkdown(html, { skipImages: true });
    assert.ok(result.includes("Title"), "Should contain heading");
    assert.ok(result.includes("Content"), "Should contain paragraph text");
    // The option must actually drop the image, not merely be accepted.
    assert.ok(!result.includes("photo.jpg"), "Should not contain the image source");
  });

  test("accepts cleanContent option", () => {
    const html = '<nav><a href="/home">Home</a></nav><main><h1>Article</h1><p>Body text.</p></main><footer>Copyright</footer>';
    const result = native.htmlToMarkdown(html, { cleanContent: true });
    // Only checks that main content survives; it does not assert what is removed.
    assert.ok(result.includes("Article") || result.includes("Body text"), "Should contain main content");
  });

  test("converts code blocks", () => {
    const html = "<pre><code>const x = 1;</code></pre>";
    const result = native.htmlToMarkdown(html);
    assert.ok(result.includes("const x = 1;"), "Should contain code content");
  });

  test("converts complex nested HTML", () => {
    const html = '<div><h2>Section</h2><p>Text with <a href="https://example.com"><strong>bold link</strong></a>.</p><ul><li>Item one</li><li>Item two</li></ul></div>';
    const result = native.htmlToMarkdown(html);
    assert.ok(result.includes("Section"), "Should contain heading");
    assert.ok(result.includes("example.com"), "Should contain link");
    assert.ok(result.includes("one"), "Should contain list items");
  });
});

View file

@ -0,0 +1,24 @@
/**
* HTML to Markdown conversion via native Rust bindings.
*
* Uses `html-to-markdown-rs` under the hood for high-performance
* conversion with optional content cleaning (stripping nav, forms, etc.).
*/
import { native } from "../native.js";
import type { HtmlToMarkdownOptions } from "./types.js";
export type { HtmlToMarkdownOptions };
/**
 * Convert an HTML string to Markdown using the native binding.
 *
 * @param html - HTML source to convert.
 * @param options - Optional conversion flags; an empty object is sent to the
 *   native side when omitted.
 * @returns The Markdown produced by the native converter.
 */
export function htmlToMarkdown(
  html: string,
  options?: HtmlToMarkdownOptions,
): string {
  // The native export is always called with an options object.
  const effectiveOptions = options ?? {};
  const markdown = native.htmlToMarkdown(html, effectiveOptions);
  // The binding is typed as returning `unknown`; narrow it for callers.
  return markdown as string;
}

View file

@ -0,0 +1,7 @@
/**
 * Options for HTML to Markdown conversion.
 *
 * Both flags are optional; the native side treats an omitted flag as `false`.
 */
export interface HtmlToMarkdownOptions {
/**
 * Enable the native converter's preprocessing pass (aggressive preset with
 * navigation and form removal).
 * NOTE(review): header/footer stripping is presumably covered by that
 * preset — confirm against html-to-markdown-rs.
 */
cleanContent?: boolean;
/** Skip images during conversion. */
skipImages?: boolean;
}

View file

@ -3,6 +3,7 @@
*
* Modules:
* - grep: ripgrep-backed regex search (content + filesystem)
* - html: HTML to Markdown conversion
*/
export { searchContent, grep } from "./grep/index.js";
@ -15,3 +16,6 @@ export type {
SearchOptions,
SearchResult,
} from "./grep/index.js";
export { htmlToMarkdown } from "./html/index.js";
export type { HtmlToMarkdownOptions } from "./html/index.js";

View file

@ -43,4 +43,5 @@ function loadNative(): Record<string, unknown> {
// Typed view over the loaded native addon. Option and return types are left
// as `unknown` here; the per-module wrappers narrow them.
export const native = loadNative() as {
// Search over an in-memory buffer.
search: (content: Buffer | Uint8Array, options: unknown) => unknown;
// Search driven entirely by the options object.
grep: (options: unknown) => unknown;
// Convert an HTML string to Markdown; the Rust export returns a string.
htmlToMarkdown: (html: string, options: unknown) => unknown;
};