From 8b9cfae9e96f3ac627dff9164d42eca229ea1213 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?T=C3=82CHES?=
Date: Fri, 13 Mar 2026 16:21:58 -0600
Subject: [PATCH] feat: native Rust streaming JSON parser (#266)

* feat: add native Rust streaming JSON parser for LLM tool call argument parsing

Replaces the JS partial-json library with a Rust implementation exposed via
napi-rs. The parser handles incomplete JSON from streaming deltas by closing
unclosed strings, objects, arrays, removing trailing commas, and completing
truncated literals.

Co-Authored-By: Claude Sonnet 4.6

* fix: handle truncated numbers and remove dead partial-json dependency

Adds truncated number recovery (e.g. `{"key": 12`, `{"key": 3.`, `{"key": 1e`)
to the Rust streaming JSON parser, and removes the now-unused `partial-json`
npm dependency from pi-ai.

Co-Authored-By: Claude Opus 4.6 (1M context)

---------

Co-authored-by: Claude Sonnet 4.6
---
 native/Cargo.lock                             |   1 +
 native/crates/engine/Cargo.toml               |   1 +
 native/crates/engine/src/json_parse.rs        | 410 ++++++++++++++++++
 native/crates/engine/src/lib.rs               |   1 +
 .../native/src/__tests__/json-parse.test.mjs  | 158 +++++++
 packages/native/src/index.ts                  |   6 +
 packages/native/src/json-parse/index.ts       |  34 ++
 packages/native/src/native.ts                 |   3 +
 packages/pi-ai/package.json                   |   1 -
 packages/pi-ai/src/utils/json-parse.ts        |  22 +-
 10 files changed, 618 insertions(+), 19 deletions(-)
 create mode 100644 native/crates/engine/src/json_parse.rs
 create mode 100644 packages/native/src/__tests__/json-parse.test.mjs
 create mode 100644 packages/native/src/json-parse/index.ts

diff --git a/native/Cargo.lock b/native/Cargo.lock
index 8da9a0833..ee19d016b 100644
--- a/native/Cargo.lock
+++ b/native/Cargo.lock
@@ -544,6 +544,7 @@ dependencies = [
  "napi-build",
  "napi-derive",
  "regex",
+ "serde_json",
  "similar",
  "smallvec",
  "syntect",
diff --git a/native/crates/engine/Cargo.toml b/native/crates/engine/Cargo.toml
index 9cecd67a9..dc9ae6957 100644
--- a/native/crates/engine/Cargo.toml
+++ b/native/crates/engine/Cargo.toml
@@ -27,6 +27,7 @@ image = { version = "0.25", default-features = false, features = [
 napi = { version = "2", features = ["napi8"] }
 napi-derive = "2"
 regex = "1"
+serde_json = "1"
 similar = "2"
 smallvec = "1"
 syntect = { version = "5", default-features = false, features = ["default-syntaxes", "default-themes", "regex-fancy"] }
diff --git a/native/crates/engine/src/json_parse.rs b/native/crates/engine/src/json_parse.rs
new file mode 100644
index 000000000..7aa0d0fdf
--- /dev/null
+++ b/native/crates/engine/src/json_parse.rs
@@ -0,0 +1,410 @@
+//! Streaming JSON parser via N-API.
+//!
+//! Exposes fast JSON parsing with partial/incomplete JSON recovery
+//! for use during LLM streaming tool call argument parsing.
+
+use napi::bindgen_prelude::*;
+use napi_derive::napi;
+
+/// Parse a complete JSON string. Returns the parsed value or an error.
+#[napi(js_name = "parseJson")]
+pub fn parse_json(env: Env, text: String) -> Result<napi::JsUnknown> {
+    let value: serde_json::Value =
+        serde_json::from_str(&text).map_err(|e| Error::from_reason(format!("{e}")))?;
+    serde_value_to_napi(&env, &value)
+}
+
+/// Parse potentially incomplete JSON by closing unclosed structures.
+#[napi(js_name = "parsePartialJson")]
+pub fn parse_partial_json(env: Env, text: String) -> Result<napi::JsUnknown> {
+    let fixed = fix_partial_json(&text);
+    let value: serde_json::Value =
+        serde_json::from_str(&fixed).map_err(|e| Error::from_reason(format!("{e}")))?;
+    serde_value_to_napi(&env, &value)
+}
+
+/// Try full JSON parse first; fall back to partial parse. Returns `{}` on total failure.
+#[napi(js_name = "parseStreamingJson")]
+pub fn parse_streaming_json(env: Env, text: String) -> Result<napi::JsUnknown> {
+    let trimmed = text.trim();
+
+    // Fast path: the text is already complete JSON. Slow path: repair the
+    // truncated text and parse the repaired copy. Empty input skips both.
+    let parsed = if trimmed.is_empty() {
+        None
+    } else {
+        serde_json::from_str::<serde_json::Value>(trimmed)
+            .or_else(|_| serde_json::from_str::<serde_json::Value>(&fix_partial_json(trimmed)))
+            .ok()
+    };
+
+    match parsed {
+        Some(value) => serde_value_to_napi(&env, &value),
+        // Empty input or total failure: return empty object
+        None => Ok(env.create_object()?.into_unknown()),
+    }
+}
+
+/// Fix incomplete JSON by closing unclosed strings, objects, arrays,
+/// removing trailing commas, and handling truncated values.
+///
+/// Besides closing whatever is still open, the truncation point itself is
+/// repaired: a lone trailing backslash or an unfinished `\uXXXX` escape is
+/// dropped before the string is closed, an object key whose `:` has not
+/// arrived yet is dropped, and the last value is completed by
+/// `handle_truncated_value`. The output is a best effort and is not guaranteed
+/// to be valid JSON; callers still run it through a real parser.
+fn fix_partial_json(input: &str) -> String {
+    let mut result = String::with_capacity(input.len() + 16);
+    let mut stack: Vec<char> = Vec::new(); // closers still owed, innermost last
+    let mut in_string = false;
+    let mut escape_next = false;
+    // Backslash offset and number of hex digits still missing for a `\uXXXX`
+    // escape that has been opened but not completed.
+    let mut pending_unicode: Option<(usize, u8)> = None;
+    // Byte offsets into `result`: where the current/last string opened, and
+    // the (start, end) span of the most recently closed string.
+    let mut string_start = 0;
+    let mut last_string: Option<(usize, usize)> = None;
+
+    for ch in input.chars() {
+        if in_string {
+            if escape_next {
+                escape_next = false;
+                if ch == 'u' {
+                    // `result` currently ends with the one-byte backslash.
+                    pending_unicode = Some((result.len() - 1, 4));
+                }
+            } else {
+                match pending_unicode.take() {
+                    Some((at, missing)) if ch.is_ascii_hexdigit() => {
+                        if missing > 1 {
+                            pending_unicode = Some((at, missing - 1));
+                        }
+                    }
+                    // No escape in progress, or a malformed one that is left
+                    // for the real parser to reject.
+                    _ => match ch {
+                        '\\' => escape_next = true,
+                        '"' => {
+                            in_string = false;
+                            last_string = Some((string_start, result.len() + 1));
+                        }
+                        _ => {}
+                    },
+                }
+            }
+            result.push(ch);
+            continue;
+        }
+
+        // Not in a string
+        match ch {
+            '"' => {
+                in_string = true;
+                string_start = result.len();
+            }
+            '{' => stack.push('}'),
+            '[' => stack.push(']'),
+            '}' | ']' => {
+                // Remove trailing comma before closing
+                remove_trailing_comma(&mut result);
+                if stack.last() == Some(&ch) {
+                    stack.pop();
+                }
+            }
+            _ => {}
+        }
+        result.push(ch);
+    }
+
+    // Close unclosed string, first dropping an escape that was cut in half
+    if in_string {
+        if escape_next {
+            // Lone trailing backslash
+            result.pop();
+        } else if let Some((at, _)) = pending_unicode {
+            // `\u` followed by fewer than four hex digits
+            result.truncate(at);
+        }
+        result.push('"');
+        last_string = Some((string_start, result.len()));
+    }
+
+    // An object key whose `:` never arrived cannot be given a value; drop it
+    // so the members parsed before it survive instead of the whole text
+    // becoming unparseable.
+    if stack.last() == Some(&'}') {
+        if let Some((start, end)) = last_string {
+            let is_last_token = result.trim_end().len() == end;
+            let in_key_position = result.get(..start).map_or(false, |before| {
+                let before = before.trim_end();
+                before.ends_with('{') || before.ends_with(',')
+            });
+            if is_last_token && in_key_position {
+                result.truncate(start);
+            }
+        }
+    }
+
+    // Remove any trailing comma before we close structures
+    remove_trailing_comma(&mut result);
+
+    // Complete a value that was cut off mid-token (`"k":`, `tr`, `3.`, ...)
+    handle_truncated_value(&mut result);
+
+    // Close unclosed structures
+    while let Some(closer) = stack.pop() {
+        remove_trailing_comma(&mut result);
+        result.push(closer);
+    }
+
+    result
+}
+
+/// Remove a trailing comma, together with any whitespace after it, from the
+/// result buffer. Does nothing when the last non-whitespace char is not `,`.
+fn remove_trailing_comma(result: &mut String) {
+    let trimmed_len = result.trim_end().len();
+    if result[..trimmed_len].ends_with(',') {
+        result.truncate(trimmed_len - 1);
+    }
+}
+
+/// Length in bytes of the longest prefix of `token` that is a complete JSON
+/// number (RFC 8259 grammar: `-? int frac? exp?`), or 0 if there is none.
+fn json_number_prefix_len(token: &[u8]) -> usize {
+    // Index just past the run of ASCII digits that starts at `from`.
+    let skip_digits = |from: usize| {
+        let from = from.min(token.len());
+        from + token[from..].iter().take_while(|b| b.is_ascii_digit()).count()
+    };
+
+    let int_start = usize::from(token.first() == Some(&b'-'));
+    // Integer part: a lone `0`, or a non-zero digit followed by more digits.
+    let mut end = match token.get(int_start) {
+        Some(b'0') => int_start + 1,
+        Some(b'1'..=b'9') => skip_digits(int_start),
+        _ => return 0,
+    };
+
+    // Fraction: only counts once at least one digit follows the dot.
+    if token.get(end) == Some(&b'.') {
+        let frac_end = skip_digits(end + 1);
+        if frac_end == end + 1 {
+            return end;
+        }
+        end = frac_end;
+    }
+
+    // Exponent: only counts once at least one digit follows `e`/`E` and sign.
+    if matches!(token.get(end), Some(b'e' | b'E')) {
+        let mut digits_from = end + 1;
+        if matches!(token.get(digits_from), Some(b'+' | b'-')) {
+            digits_from += 1;
+        }
+        let exp_end = skip_digits(digits_from);
+        if exp_end > digits_from {
+            end = exp_end;
+        }
+    }
+
+    end
+}
+
+/// Decide how the tail of `text` has to change so that a value cut off
+/// mid-token becomes complete.
+///
+/// Returns `Some((keep, tail))`, meaning "truncate to `keep` bytes, then
+/// append `tail`", or `None` when nothing needs repairing.
+fn truncated_value_repair(text: &str) -> Option<(usize, &'static str)> {
+    const LITERALS: [&str; 3] = ["true", "false", "null"];
+
+    let trimmed = text.trim_end();
+
+    // If ends with colon, add null
+    if trimmed.ends_with(':') {
+        return Some((text.len(), "null"));
+    }
+
+    let bytes = trimmed.as_bytes();
+    // A token is a value only when it directly follows `:`, `,` or `[`.
+    let at_value_position = |upto: usize| {
+        matches!(
+            trimmed[..upto].trim_end().as_bytes().last(),
+            Some(b':' | b',' | b'[')
+        )
+    };
+
+    // Truncated number: trailing run of digits, sign, dot and exponent chars.
+    let is_number_byte =
+        |b: &u8| b.is_ascii_digit() || matches!(*b, b'.' | b'-' | b'+' | b'e' | b'E');
+    let num_start = bytes
+        .iter()
+        .rposition(|b| !is_number_byte(b))
+        .map_or(0, |i| i + 1);
+    if num_start < bytes.len() && at_value_position(num_start) {
+        let token = &bytes[num_start..];
+        // Validate against the JSON grammar rather than `str::parse`:
+        // Rust's float parser also accepts `3.`, `.5` and `+5`, which JSON
+        // parsers reject.
+        let valid = json_number_prefix_len(token);
+        if valid == token.len() {
+            // Already a complete number, leave it as-is
+            return None;
+        }
+        if !token.iter().any(u8::is_ascii_digit) {
+            // Just a minus or dot with no digits, replace with 0
+            return Some((num_start, "0"));
+        }
+        if valid > 0 && !token[valid..].iter().any(u8::is_ascii_digit) {
+            // Strip the unfinished suffix (e.g. "12." -> "12", "1e" -> "1")
+            return Some((num_start + valid, ""));
+        }
+        // Malformed rather than truncated (e.g. "1-2"): let the parser reject it
+        return None;
+    }
+
+    // Truncated boolean/null literal: trailing run of lowercase letters that
+    // is a proper prefix of `true`, `false` or `null`.
+    let word_start = bytes
+        .iter()
+        .rposition(|b| !b.is_ascii_lowercase())
+        .map_or(0, |i| i + 1);
+    let word = &trimmed[word_start..];
+    if word.is_empty() || !at_value_position(word_start) {
+        return None;
+    }
+    LITERALS
+        .iter()
+        .copied()
+        .find(|literal| literal.len() > word.len() && literal.starts_with(word))
+        .map(|literal| (word_start, literal))
+}
+
+/// Handle truncated values after a colon (e.g., `{"key":` or `{"key": tr`),
+/// and truncated numbers or literals after `:`, `,` or `[`.
+fn handle_truncated_value(result: &mut String) {
+    if let Some((keep, tail)) = truncated_value_repair(result) {
+        result.truncate(keep);
+        result.push_str(tail);
+    }
+}
+
+/// Convert a serde_json::Value to a napi JsUnknown.
+fn serde_value_to_napi(env: &Env, value: &serde_json::Value) -> Result<napi::JsUnknown> {
+    match value {
+        serde_json::Value::Null => {
+            env.get_null().map(|v| v.into_unknown())
+        }
+        serde_json::Value::Bool(b) => {
+            env.get_boolean(*b).map(|v| v.into_unknown())
+        }
+        serde_json::Value::Number(n) => {
+            if let Some(i) = n.as_i64() {
+                // Use i32 if it fits, otherwise f64
+                if i >= i64::from(i32::MIN) && i <= i64::from(i32::MAX) {
+                    env.create_int32(i as i32).map(|v| v.into_unknown())
+                } else {
+                    env.create_double(i as f64).map(|v| v.into_unknown())
+                }
+            } else if let Some(f) = n.as_f64() {
+                env.create_double(f).map(|v| v.into_unknown())
+            } else {
+                env.get_null().map(|v| v.into_unknown())
+            }
+        }
+        serde_json::Value::String(s) => {
+            env.create_string(s).map(|v| v.into_unknown())
+        }
+        serde_json::Value::Array(arr) => {
+            let mut js_arr = env.create_array_with_length(arr.len())?;
+            for (idx, item) in arr.iter().enumerate() {
+                let js_val = serde_value_to_napi(env, item)?;
+                js_arr.set_element(idx as u32, js_val)?;
+            }
+            Ok(js_arr.into_unknown())
+        }
+        serde_json::Value::Object(map) => {
+            let mut obj = env.create_object()?;
+            for (key, val) in map {
+                let js_val = serde_value_to_napi(env, val)?;
+                obj.set_named_property(key, js_val)?;
+            }
+            Ok(obj.into_unknown())
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_fix_complete_json() {
+        let input = r#"{"key": "value", "num": 42}"#;
+        let fixed = fix_partial_json(input);
+        let _: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+    }
+
+    #[test]
+    fn test_fix_unclosed_string() {
+        let input = r#"{"key": "val"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["key"], "val");
+    }
+
+    #[test]
+    fn test_fix_unclosed_object() {
+        let input = r#"{"key": "value""#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["key"], "value");
+    }
+
+    #[test]
+    fn test_fix_unclosed_array() {
+        let input = r#"{"arr": [1, 2, 3"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["arr"].as_array().unwrap().len(), 3);
+    }
+
+    #[test]
+    fn test_fix_trailing_comma() {
+        let input = r#"{"a": 1, "b": 2,}"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["a"], 1);
+        assert_eq!(v["b"], 2);
+    }
+
+    #[test]
+    fn test_fix_truncated_after_colon() {
+        let input = r#"{"key":"#;
+        let fixed = fix_partial_json(input);
+        let _: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+    }
+
+    #[test]
+    fn test_fix_truncated_true() {
+        let input = r#"{"key": tr"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["key"], true);
+    }
+
+    #[test]
+    fn test_fix_truncated_false() {
+        let input = r#"{"key": fal"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["key"], false);
+    }
+
+    #[test]
+    fn test_fix_truncated_null() {
+        let input = r#"{"key": nu"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert!(v["key"].is_null());
+    }
+
+    #[test]
+    fn test_fix_nested_partial() {
+        let input = r#"{"a": {"b": [1, 2"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["a"]["b"].as_array().unwrap().len(), 2);
+    }
+
+    #[test]
+    fn test_empty_input() {
+        let fixed = fix_partial_json("");
+        assert_eq!(fixed, "");
+    }
+
+    #[test]
+    fn test_fix_trailing_comma_in_array() {
+        let input = r#"[1, 2, 3,]"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v.as_array().unwrap().len(), 3);
+    }
+
+    #[test]
+    fn test_fix_truncated_number() {
+        let input = r#"{"key": 12"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["key"], 12);
+    }
+
+    #[test]
+    fn test_fix_truncated_decimal() {
+        let input = r#"{"key": 3."#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["key"], 3);
+    }
+
+    #[test]
+    fn test_fix_truncated_negative_number() {
+        let input = r#"{"key": -"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["key"], 0);
+    }
+
+    #[test]
+    fn test_fix_truncated_exponent() {
+        let input = r#"{"key": 1e"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v["key"], 1);
+    }
+
+    #[test]
+    fn test_fix_truncated_number_in_array() {
+        let input = r#"[1, 42"#;
+        let fixed = fix_partial_json(input);
+        let v: serde_json::Value = serde_json::from_str(&fixed).unwrap();
+        assert_eq!(v[0], 1);
+        assert_eq!(v[1], 42);
+    }
+}
diff --git a/native/crates/engine/src/lib.rs b/native/crates/engine/src/lib.rs
index e4b503e58..605296ba7 100644
--- a/native/crates/engine/src/lib.rs
+++ b/native/crates/engine/src/lib.rs
@@ -23,3 +23,4 @@ mod text;
 mod ttsr;
 mod gsd_parser;
 mod image;
+mod json_parse;
diff --git a/packages/native/src/__tests__/json-parse.test.mjs b/packages/native/src/__tests__/json-parse.test.mjs
new file mode 100644
index 000000000..9fe763723
--- /dev/null
+++ b/packages/native/src/__tests__/json-parse.test.mjs
@@ -0,0 +1,158 @@
+import { test, describe } from "node:test";
+import assert from "node:assert/strict";
+import { createRequire } from "node:module";
+import * as path from "node:path";
+import { fileURLToPath } from "node:url";
+
+const __dirname = path.dirname(fileURLToPath(import.meta.url));
+const require = createRequire(import.meta.url);
+
+const addonDir = path.resolve(__dirname, "..", "..", "..", "..", "native", "addon");
+const platformTag =
`${process.platform}-${process.arch}`;
+const candidates = [
+  path.join(addonDir, `gsd_engine.${platformTag}.node`),
+  path.join(addonDir, "gsd_engine.dev.node"),
+];
+
+let native;
+for (const candidate of candidates) {
+  try {
+    native = require(candidate);
+    break;
+  } catch {
+    // try next
+  }
+}
+
+if (!native) {
+  console.error("Native addon not found. Run `npm run build:native -w @gsd/native` first.");
+  process.exit(1);
+}
+
+describe("native json: parseJson()", () => {
+  test("parses complete JSON object", () => {
+    const result = native.parseJson('{"key": "value", "num": 42}');
+    assert.equal(result.key, "value");
+    assert.equal(result.num, 42);
+  });
+
+  test("parses JSON array", () => {
+    const result = native.parseJson("[1, 2, 3]");
+    assert.deepEqual(result, [1, 2, 3]);
+  });
+
+  test("parses JSON string", () => {
+    const result = native.parseJson('"hello"');
+    assert.equal(result, "hello");
+  });
+
+  test("parses JSON number", () => {
+    const result = native.parseJson("42.5");
+    assert.equal(result, 42.5);
+  });
+
+  test("parses JSON boolean", () => {
+    assert.equal(native.parseJson("true"), true);
+    assert.equal(native.parseJson("false"), false);
+  });
+
+  test("parses JSON null", () => {
+    assert.equal(native.parseJson("null"), null);
+  });
+
+  test("throws on invalid JSON", () => {
+    assert.throws(() => native.parseJson("{invalid}"));
+  });
+});
+
+describe("native json: parsePartialJson()", () => {
+  test("parses complete JSON unchanged", () => {
+    const result = native.parsePartialJson('{"key": "value"}');
+    assert.equal(result.key, "value");
+  });
+
+  test("closes unclosed string", () => {
+    const result = native.parsePartialJson('{"key": "val');
+    assert.equal(result.key, "val");
+  });
+
+  test("closes unclosed object", () => {
+    const result = native.parsePartialJson('{"key": "value"');
+    assert.equal(result.key, "value");
+  });
+
+  test("closes unclosed array", () => {
+    const result = native.parsePartialJson('{"arr": [1, 2, 3');
+    assert.deepEqual(result.arr, [1, 2, 3]);
+  });
+
+  test("removes trailing comma in object", () => {
+    const result = native.parsePartialJson('{"a": 1, "b": 2,}');
+    assert.equal(result.a, 1);
+    assert.equal(result.b, 2);
+  });
+
+  test("removes trailing comma in array", () => {
+    const result = native.parsePartialJson("[1, 2, 3,]");
+    assert.deepEqual(result, [1, 2, 3]);
+  });
+
+  test("handles truncated value after colon", () => {
+    const result = native.parsePartialJson('{"key":');
+    assert.equal(result.key, null);
+  });
+
+  test("handles truncated true", () => {
+    const result = native.parsePartialJson('{"key": tr');
+    assert.equal(result.key, true);
+  });
+
+  test("handles truncated false", () => {
+    const result = native.parsePartialJson('{"key": fal');
+    assert.equal(result.key, false);
+  });
+
+  test("handles truncated null", () => {
+    const result = native.parsePartialJson('{"key": nu');
+    assert.equal(result.key, null);
+  });
+
+  test("handles nested partial structures", () => {
+    const result = native.parsePartialJson('{"a": {"b": [1, 2');
+    assert.deepEqual(result.a.b, [1, 2]);
+  });
+});
+
+describe("native json: parseStreamingJson()", () => {
+  test("returns empty object for empty string", () => {
+    const result = native.parseStreamingJson("");
+    assert.deepEqual(result, {});
+  });
+
+  test("returns empty object for whitespace", () => {
+    const result = native.parseStreamingJson(" ");
+    assert.deepEqual(result, {});
+  });
+
+  test("parses complete JSON", () => {
+    const result = native.parseStreamingJson('{"tool": "search", "args": {"query": "test"}}');
+    assert.equal(result.tool, "search");
+    assert.equal(result.args.query, "test");
+  });
+
+  test("parses partial JSON (streaming scenario)", () => {
+    const result = native.parseStreamingJson('{"tool": "search", "args": {"query": "te');
+    assert.equal(result.tool, "search");
+    assert.equal(result.args.query, "te");
+  });
+
+  test("handles deeply nested partial JSON", () => {
+    const result = native.parseStreamingJson('{"a": {"b": {"c": [1, 2, {"d": "val');
+    assert.equal(result.a.b.c[2].d, "val");
+  });
+
+  test("handles escaped characters in strings", () => {
+    const result = native.parseStreamingJson('{"path": "C:\\\\Users\\\\test');
+    assert.ok(result.path.includes("C:\\Users\\test"));
+  });
+});
diff --git a/packages/native/src/index.ts b/packages/native/src/index.ts
index 14fe0a6a9..2a6db2883 100644
--- a/packages/native/src/index.ts
+++ b/packages/native/src/index.ts
@@ -93,6 +93,12 @@ export type { NativeImageHandle } from "./image/index.js";
 export { ttsrCompileRules, ttsrCheckBuffer, ttsrFreeRules } from "./ttsr/index.js";
 export type { TtsrHandle, TtsrRuleInput } from "./ttsr/index.js";
 
+export {
+  parseJson,
+  parsePartialJson,
+  parseStreamingJson,
+} from "./json-parse/index.js";
+
 export {
   parseFrontmatter,
   extractSection as nativeExtractSection,
diff --git a/packages/native/src/json-parse/index.ts b/packages/native/src/json-parse/index.ts
new file mode 100644
index 000000000..62f21f9dc
--- /dev/null
+++ b/packages/native/src/json-parse/index.ts
@@ -0,0 +1,34 @@
+/**
+ * Streaming JSON parser via native Rust bindings.
+ *
+ * Provides fast JSON parsing with recovery for incomplete/partial JSON,
+ * used during LLM streaming tool call argument parsing.
+ */
+
+import { native } from "../native.js";
+
+/**
+ * Parse a complete JSON string. Throws on invalid JSON.
+ */
+export function parseJson<T = unknown>(text: string): T {
+  return native.parseJson(text) as T;
+}
+
+/**
+ * Parse potentially incomplete JSON by closing unclosed structures.
+ * Handles unclosed strings, objects, arrays, trailing commas, and truncated literals.
+ */
+export function parsePartialJson<T = unknown>(text: string): T {
+  return native.parsePartialJson(text) as T;
+}
+
+/**
+ * Try full JSON parse first; fall back to partial parse.
+ * Returns `{}` on total failure. Drop-in replacement for the JS streaming parser.
+ */
+export function parseStreamingJson<T = unknown>(text: string | undefined): T {
+  if (!text || text.trim() === "") {
+    return {} as T;
+  }
+  return native.parseStreamingJson(text) as T;
+}
diff --git a/packages/native/src/native.ts b/packages/native/src/native.ts
index b4c09cf65..f80210340 100644
--- a/packages/native/src/native.ts
+++ b/packages/native/src/native.ts
@@ -129,4 +129,7 @@ export const native = loadNative() as {
   extractAllSections: (content: string, level?: number) => string;
   batchParseGsdFiles: (directory: string) => unknown;
   parseRoadmapFile: (content: string) => unknown;
+  parseJson: (text: string) => unknown;
+  parsePartialJson: (text: string) => unknown;
+  parseStreamingJson: (text: string) => unknown;
 };
diff --git a/packages/pi-ai/package.json b/packages/pi-ai/package.json
index fa4bdcf8e..4e8f4b94d 100644
--- a/packages/pi-ai/package.json
+++ b/packages/pi-ai/package.json
@@ -32,7 +32,6 @@
     "ajv-formats": "^3.0.1",
     "chalk": "^5.6.2",
     "openai": "6.26.0",
-    "partial-json": "^0.1.7",
     "proxy-agent": "^6.5.0",
     "undici": "^7.19.1",
     "zod-to-json-schema": "^3.24.6"
diff --git a/packages/pi-ai/src/utils/json-parse.ts b/packages/pi-ai/src/utils/json-parse.ts
index feeb32ad1..ad907e8d0 100644
--- a/packages/pi-ai/src/utils/json-parse.ts
+++ b/packages/pi-ai/src/utils/json-parse.ts
@@ -1,28 +1,14 @@
-import { parse as partialParse } from "partial-json";
+import { parseStreamingJson as nativeParseStreamingJson } from "@gsd/native";
 
 /**
  * Attempts to parse potentially incomplete JSON during streaming.
  * Always returns a valid object, even if the JSON is incomplete.
  *
+ * Uses the native Rust streaming JSON parser for performance.
+ *
  * @param partialJson The partial JSON string from streaming
  * @returns Parsed object or empty object if parsing fails
  */
 export function parseStreamingJson<T = any>(partialJson: string | undefined): T {
-  if (!partialJson || partialJson.trim() === "") {
-    return {} as T;
-  }
-
-  // Try standard parsing first (fastest for complete JSON)
-  try {
-    return JSON.parse(partialJson) as T;
-  } catch {
-    // Try partial-json for incomplete JSON
-    try {
-      const result = partialParse(partialJson);
-      return (result ?? {}) as T;
-    } catch {
-      // If all parsing fails, return empty object
-      return {} as T;
-    }
-  }
+  return nativeParseStreamingJson<T>(partialJson);
 }