feat: add html-to-markdown native module
Port HTML-to-Markdown conversion from Oh My Pi's html module using html-to-markdown-rs. Exposes `htmlToMarkdown()` via N-API with options for content cleaning (strip nav/forms/headers/footers) and image skipping. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
c669c6183a
commit
0b288f389f
10 changed files with 1116 additions and 1 deletions
931
native/Cargo.lock
generated
931
native/Cargo.lock
generated
File diff suppressed because it is too large
Load diff
|
|
@ -12,6 +12,7 @@ crate-type = ["cdylib"]
|
|||
|
||||
[dependencies]
|
||||
gsd-grep = { path = "../grep" }
|
||||
html-to-markdown-rs = { version = "2", default-features = false }
|
||||
napi = { version = "2", features = ["napi8"] }
|
||||
napi-derive = "2"
|
||||
|
||||
|
|
|
|||
44
native/crates/engine/src/html.rs
Normal file
44
native/crates/engine/src/html.rs
Normal file
|
|
@ -0,0 +1,44 @@
|
|||
//! HTML to Markdown conversion via N-API.
|
||||
//!
|
||||
//! Wraps `html-to-markdown-rs` and exposes it as a JS-callable N-API export.
|
||||
|
||||
use html_to_markdown_rs::{convert, ConversionOptions, PreprocessingOptions, PreprocessingPreset};
|
||||
use napi::bindgen_prelude::*;
|
||||
use napi_derive::napi;
|
||||
|
||||
/// Options for HTML to Markdown conversion.
|
||||
#[napi(object)]
|
||||
#[derive(Debug, Default)]
|
||||
pub struct HtmlToMarkdownOptions {
|
||||
/// Remove navigation elements, forms, headers, footers.
|
||||
#[napi(js_name = "cleanContent")]
|
||||
pub clean_content: Option<bool>,
|
||||
/// Skip images during conversion.
|
||||
#[napi(js_name = "skipImages")]
|
||||
pub skip_images: Option<bool>,
|
||||
}
|
||||
|
||||
/// Convert HTML source to Markdown with optional preprocessing.
|
||||
///
|
||||
/// Strips boilerplate (nav, forms, headers, footers) when `cleanContent` is true.
|
||||
/// Returns the Markdown string.
|
||||
#[napi(js_name = "htmlToMarkdown")]
|
||||
pub fn html_to_markdown(html: String, options: Option<HtmlToMarkdownOptions>) -> Result<String> {
|
||||
let options = options.unwrap_or_default();
|
||||
let clean_content = options.clean_content.unwrap_or(false);
|
||||
let skip_images = options.skip_images.unwrap_or(false);
|
||||
|
||||
let conversion_opts = ConversionOptions {
|
||||
skip_images,
|
||||
preprocessing: PreprocessingOptions {
|
||||
enabled: clean_content,
|
||||
preset: PreprocessingPreset::Aggressive,
|
||||
remove_navigation: true,
|
||||
remove_forms: true,
|
||||
},
|
||||
..Default::default()
|
||||
};
|
||||
|
||||
convert(&html, Some(conversion_opts))
|
||||
.map_err(|err| Error::from_reason(format!("HTML conversion error: {err}")))
|
||||
}
|
||||
|
|
@ -9,3 +9,4 @@
|
|||
#![allow(clippy::needless_pass_by_value)]
|
||||
|
||||
mod grep;
|
||||
mod html;
|
||||
|
|
|
|||
|
|
@ -8,7 +8,7 @@
|
|||
"scripts": {
|
||||
"build:native": "node ../../native/scripts/build.js",
|
||||
"build:native:dev": "node ../../native/scripts/build.js --dev",
|
||||
"test": "node --test src/__tests__/grep.test.mjs"
|
||||
"test": "node --test src/__tests__/grep.test.mjs src/__tests__/html.test.mjs"
|
||||
},
|
||||
"exports": {
|
||||
".": {
|
||||
|
|
@ -18,6 +18,10 @@
|
|||
"./grep": {
|
||||
"types": "./src/grep/index.ts",
|
||||
"import": "./src/grep/index.ts"
|
||||
},
|
||||
"./html": {
|
||||
"types": "./src/html/index.ts",
|
||||
"import": "./src/html/index.ts"
|
||||
}
|
||||
},
|
||||
"files": [
|
||||
|
|
|
|||
98
packages/native/src/__tests__/html.test.mjs
Normal file
98
packages/native/src/__tests__/html.test.mjs
Normal file
|
|
@ -0,0 +1,98 @@
|
|||
import { test, describe } from "node:test";
|
||||
import assert from "node:assert/strict";
|
||||
import { createRequire } from "node:module";
|
||||
import * as path from "node:path";
|
||||
import { fileURLToPath } from "node:url";
|
||||
|
||||
const __dirname = path.dirname(fileURLToPath(import.meta.url));
const require = createRequire(import.meta.url);

// The addon directory is resolved four directories up from this test file's
// directory, then into native/addon.
const addonDir = path.resolve(__dirname, "..", "..", "..", "..", "native", "addon");
const platformTag = `${process.platform}-${process.arch}`;
// Tried in order: the platform-specific build first, then the dev build.
const candidates = [
  path.join(addonDir, `gsd_engine.${platformTag}.node`),
  path.join(addonDir, "gsd_engine.dev.node"),
];

let native;
// Remember why each candidate failed so a binary that exists but cannot be
// loaded is not misreported as simply missing.
const loadFailures = [];
for (const candidate of candidates) {
  try {
    native = require(candidate);
    break;
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    loadFailures.push(`${candidate}: ${reason}`);
  }
}

if (!native) {
  console.error("Native addon not found. Run `npm run build:native -w @gsd/native` first.");
  for (const failure of loadFailures) {
    console.error(`  ${failure}`);
  }
  process.exit(1);
}
|
||||
|
||||
describe("native html: htmlToMarkdown()", () => {
  // Forward arguments untouched so calls without options stay one-argument calls.
  const toMarkdown = (...args) => native.htmlToMarkdown(...args);

  // Assert that `haystack` contains `needle`, reporting `message` on failure.
  const assertIncludes = (haystack, needle, message) => {
    assert.ok(haystack.includes(needle), message);
  };

  // Assert that `haystack` contains at least one of `needles`.
  const assertIncludesAny = (haystack, needles, message) => {
    assert.ok(needles.some((needle) => haystack.includes(needle)), message);
  };

  test("converts basic HTML to markdown", () => {
    const markdown = toMarkdown("<h1>Hello</h1><p>World</p>");
    assertIncludes(markdown, "Hello", "Should contain heading text");
    assertIncludes(markdown, "World", "Should contain paragraph text");
  });

  test("converts links to markdown links", () => {
    const markdown = toMarkdown('<p>Visit <a href="https://example.com">Example</a></p>');
    assertIncludes(markdown, "[Example]", "Should contain markdown link text");
    assertIncludes(markdown, "(https://example.com)", "Should contain markdown link URL");
  });

  test("converts lists to markdown", () => {
    const markdown = toMarkdown("<ul><li>First</li><li>Second</li><li>Third</li></ul>");
    assertIncludes(markdown, "First", "Should contain first item");
    assertIncludes(markdown, "Second", "Should contain second item");
    assertIncludes(markdown, "Third", "Should contain third item");
  });

  test("converts bold and italic", () => {
    const markdown = toMarkdown("<p><strong>bold</strong> and <em>italic</em></p>");
    // Either emphasis marker style is accepted.
    assertIncludesAny(markdown, ["**bold**", "__bold__"], "Should contain bold");
    assertIncludesAny(markdown, ["*italic*", "_italic_"], "Should contain italic");
  });

  test("handles empty HTML", () => {
    const markdown = toMarkdown("");
    assert.equal(typeof markdown, "string");
  });

  test("handles plain text", () => {
    const markdown = toMarkdown("Just plain text");
    assertIncludes(markdown, "Just plain text", "Should preserve plain text");
  });

  test("accepts skipImages option", () => {
    const source = '<h1>Title</h1><p>Content with <img src="photo.jpg" alt="photo"> image</p>';
    const markdown = toMarkdown(source, { skipImages: true });
    assertIncludes(markdown, "Title", "Should contain heading");
    assertIncludes(markdown, "Content", "Should contain paragraph text");
  });

  test("accepts cleanContent option", () => {
    const source = '<nav><a href="/home">Home</a></nav><main><h1>Article</h1><p>Body text.</p></main><footer>Copyright</footer>';
    const markdown = toMarkdown(source, { cleanContent: true });
    assertIncludesAny(markdown, ["Article", "Body text"], "Should contain main content");
  });

  test("converts code blocks", () => {
    const markdown = toMarkdown("<pre><code>const x = 1;</code></pre>");
    assertIncludes(markdown, "const x = 1;", "Should contain code content");
  });

  test("converts complex nested HTML", () => {
    const source = '<div><h2>Section</h2><p>Text with <a href="https://example.com"><strong>bold link</strong></a>.</p><ul><li>Item one</li><li>Item two</li></ul></div>';
    const markdown = toMarkdown(source);
    assertIncludes(markdown, "Section", "Should contain heading");
    assertIncludes(markdown, "example.com", "Should contain link");
    assertIncludes(markdown, "one", "Should contain list items");
  });
});
|
||||
24
packages/native/src/html/index.ts
Normal file
24
packages/native/src/html/index.ts
Normal file
|
|
@ -0,0 +1,24 @@
|
|||
/**
|
||||
* HTML to Markdown conversion via native Rust bindings.
|
||||
*
|
||||
* Uses `html-to-markdown-rs` under the hood for high-performance
|
||||
* conversion with optional content cleaning (stripping nav, forms, etc.).
|
||||
*/
|
||||
|
||||
import { native } from "../native.js";
|
||||
import type { HtmlToMarkdownOptions } from "./types.js";
|
||||
|
||||
export type { HtmlToMarkdownOptions };
|
||||
|
||||
/**
 * Turn an HTML string into Markdown using the native binding.
 *
 * Setting `cleanContent` to true strips boilerplate elements (nav, forms,
 * headers, footers) before conversion. When `options` is omitted, an empty
 * object is passed to the native function instead.
 */
export function htmlToMarkdown(
  html: string,
  options?: HtmlToMarkdownOptions,
): string {
  const effectiveOptions = options ?? {};
  const markdown = native.htmlToMarkdown(html, effectiveOptions);
  // The binding is typed as returning `unknown`; narrow it for callers.
  return markdown as string;
}
|
||||
7
packages/native/src/html/types.ts
Normal file
7
packages/native/src/html/types.ts
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
/** Settings accepted by the HTML to Markdown converter; every field is optional. */
export interface HtmlToMarkdownOptions {
  /** Strip navigation elements, forms, headers, and footers. */
  cleanContent?: boolean;
  /** Omit images from the converted output. */
  skipImages?: boolean;
}
|
||||
|
|
@ -3,6 +3,7 @@
|
|||
*
|
||||
* Modules:
|
||||
* - grep: ripgrep-backed regex search (content + filesystem)
|
||||
* - html: HTML to Markdown conversion
|
||||
*/
|
||||
|
||||
export { searchContent, grep } from "./grep/index.js";
|
||||
|
|
@ -15,3 +16,6 @@ export type {
|
|||
SearchOptions,
|
||||
SearchResult,
|
||||
} from "./grep/index.js";
|
||||
|
||||
export { htmlToMarkdown } from "./html/index.js";
|
||||
export type { HtmlToMarkdownOptions } from "./html/index.js";
|
||||
|
|
|
|||
|
|
@ -43,4 +43,5 @@ function loadNative(): Record<string, unknown> {
|
|||
export const native = loadNative() as {
|
||||
search: (content: Buffer | Uint8Array, options: unknown) => unknown;
|
||||
grep: (options: unknown) => unknown;
|
||||
htmlToMarkdown: (html: string, options: unknown) => unknown;
|
||||
};
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue