fix(voice): preserve transcription across pauses

Apple's on-device speech recognition resets bestTranscription after silence gaps, discarding previous text. The Swift recognizer now detects these resets (word count drop / different starting word) and accumulates finalized segments so speech continues appending instead of overwriting. The TS side is simplified to pass through the already-accumulated text from the Swift process.
2026-03-11 16:53:49 -06:00 · 2026-03-11 16:53:49 -06:00 · 8a4572edef
commit 8a4572edef
parent bc0049e51d
2 changed files with 88 additions and 11 deletions
--- a/src/resources/extensions/voice/index.ts
+++ b/src/resources/extensions/voice/index.ts
@ -26,7 +26,6 @@ export default function (pi: ExtensionAPI) {

 	let active = false;
 	let recognizerProcess: ChildProcess | null = null;
-	let finalized = "";
 	let flashOn = true;
 	let flashTimer: ReturnType<typeof setInterval> | null = null;
 	let footerTui: { requestRender: () => void } | null = null;
@ -122,7 +121,6 @@ export default function (pi: ExtensionAPI) {
 		}

 		active = true;
-		finalized = "";
 		setVoiceFooter(ctx, true);
 		await runVoiceSession(ctx);
 	}
@ -161,14 +159,15 @@ export default function (pi: ExtensionAPI) {

 	async function runVoiceSession(ctx: ExtensionContext): Promise<void> {
 		return new Promise<void>((resolve) => {
+			// The Swift recognizer handles accumulation across pause-induced
+			// transcription resets. Both PARTIAL and FINAL messages contain
+			// the full accumulated text, so we just pass them through.
 			startRecognizer(
 				(text) => {
-					const full = finalized + (finalized && text ? " " : "") + text;
-					ctx.ui.setEditorText(full);
+					ctx.ui.setEditorText(text);
 				},
 				(text) => {
-					finalized = (finalized ? finalized + " " : "") + text;
-					ctx.ui.setEditorText(finalized);
+					ctx.ui.setEditorText(text);
 				},
 				(msg) => ctx.ui.notify(`Voice: ${msg}`, "error"),
 				() => {},
--- a/src/resources/extensions/voice/speech-recognizer.swift
+++ b/src/resources/extensions/voice/speech-recognizer.swift
@ -45,15 +45,93 @@ do {
    exit(1)
 }

-var lastText = ""
+// Accumulated finalized text from previous recognition segments.
+// On-device recognition (especially macOS/iOS 18+) can reset
+// bestTranscription.formattedString after a pause, discarding
+// previous text. We detect this by tracking the last partial text
+// and noticing when the new text has far fewer words, or fewer words
+// and a different first word. When that happens we treat the previous
+// text as finalized and start accumulating the new segment on top.
+var accumulated = ""
+var lastPartialText = ""
+var lastEmitted = ""

 recognizer.recognitionTask(with: request) { result, error in
    if let result = result {
        let text = result.bestTranscription.formattedString
-        if text != lastText {
-            lastText = text
-            let prefix = result.isFinal ? "FINAL" : "PARTIAL"
-            print("\(prefix):\(text)")
+
+        if result.isFinal {
+            // True final from the recognizer — commit everything
+            let full: String
+            // Check if the final text already includes accumulated content
+            // (some OS versions give cumulative finals, others reset)
+            if !accumulated.isEmpty && !text.lowercased().hasPrefix(accumulated.lowercased()) {
+                full = accumulated + " " + text
+            } else if !accumulated.isEmpty && text.count < accumulated.count {
+                // Final is shorter than what we accumulated — use accumulated + new
+                full = accumulated + " " + text
+            } else {
+                full = text
+            }
+            accumulated = ""
+            lastPartialText = ""
+            if full != lastEmitted {
+                lastEmitted = full
+                print("FINAL:\(full)")
+            }
+            return
+        }
+
+        // Detect transcription reset: if the new partial text has far fewer
+        // words than what we had, or fewer words and a different first word,
+        // the recognizer has reset after a pause. Finalize what we had.
+        let prevText = lastPartialText
+        if !prevText.isEmpty && !text.isEmpty {
+            let prevWords = prevText.split(separator: " ")
+            let newWords = text.split(separator: " ")
+
+            // Reset detection: either the word count dropped below half of the
+            // previous count, or it dropped at all AND the first word differs
+            // (i.e. it's new speech, not the recognizer revising the last word)
+            let looksLikeReset: Bool
+            if newWords.count < prevWords.count / 2 {
+                // Significant drop in word count — likely a reset
+                looksLikeReset = true
+            } else if newWords.count < prevWords.count &&
+                      !prevWords.isEmpty && !newWords.isEmpty &&
+                      newWords[0] != prevWords[0] {
+                // Different starting word + fewer words — reset
+                looksLikeReset = true
+            } else {
+                looksLikeReset = false
+            }
+
+            if looksLikeReset {
+                // Commit the previous partial text to accumulated
+                if accumulated.isEmpty {
+                    accumulated = prevText
+                } else {
+                    accumulated = accumulated + " " + prevText
+                }
+                // Emit a FINAL for the committed text so the TS side updates
+                print("FINAL:\(accumulated)")
+                lastEmitted = accumulated
+            }
+        }
+
+        lastPartialText = text
+
+        // Build the full display text
+        let displayText: String
+        if accumulated.isEmpty {
+            displayText = text
+        } else {
+            displayText = accumulated + " " + text
+        }
+
+        if displayText != lastEmitted {
+            lastEmitted = displayText
+            print("PARTIAL:\(displayText)")
        }
    }
    if let error = error {