From 8a4572edef02f034e1f35dfe60d3f0a38585d242 Mon Sep 17 00:00:00 2001 From: Lex Christopherson Date: Wed, 11 Mar 2026 16:53:49 -0600 Subject: [PATCH] fix(voice): preserve transcription across pauses Apple's on-device speech recognition resets bestTranscription after silence gaps, discarding previous text. The Swift recognizer now detects these resets (word count drop / different starting word) and accumulates finalized segments so speech continues appending instead of overwriting. TS side simplified to pass through the already-accumulated text from the Swift process. --- src/resources/extensions/voice/index.ts | 11 ++- .../extensions/voice/speech-recognizer.swift | 88 +++++++++++++++++-- 2 files changed, 88 insertions(+), 11 deletions(-) diff --git a/src/resources/extensions/voice/index.ts b/src/resources/extensions/voice/index.ts index bf2be9fb1..39433dbf4 100644 --- a/src/resources/extensions/voice/index.ts +++ b/src/resources/extensions/voice/index.ts @@ -26,7 +26,6 @@ export default function (pi: ExtensionAPI) { let active = false; let recognizerProcess: ChildProcess | null = null; - let finalized = ""; let flashOn = true; let flashTimer: ReturnType | null = null; let footerTui: { requestRender: () => void } | null = null; @@ -122,7 +121,6 @@ export default function (pi: ExtensionAPI) { } active = true; - finalized = ""; setVoiceFooter(ctx, true); await runVoiceSession(ctx); } @@ -161,14 +159,15 @@ export default function (pi: ExtensionAPI) { async function runVoiceSession(ctx: ExtensionContext): Promise { return new Promise((resolve) => { + // The Swift recognizer handles accumulation across pause-induced + // transcription resets. Both PARTIAL and FINAL messages contain + // the full accumulated text, so we just pass them through. startRecognizer( (text) => { - const full = finalized + (finalized && text ? " " : "") + text; - ctx.ui.setEditorText(full); + ctx.ui.setEditorText(text); }, (text) => { - finalized = (finalized ? 
finalized + " " : "") + text; - ctx.ui.setEditorText(finalized); + ctx.ui.setEditorText(text); }, (msg) => ctx.ui.notify(`Voice: ${msg}`, "error"), () => {}, diff --git a/src/resources/extensions/voice/speech-recognizer.swift b/src/resources/extensions/voice/speech-recognizer.swift index 32735ba51..e1408f507 100644 --- a/src/resources/extensions/voice/speech-recognizer.swift +++ b/src/resources/extensions/voice/speech-recognizer.swift @@ -45,15 +45,93 @@ do { exit(1) } -var lastText = "" +// Accumulated finalized text from previous recognition segments. +// On-device recognition (especially macOS/iOS 18+) can reset +// bestTranscription.formattedString after a pause, discarding +// previous text. We detect this by tracking the last known good +// text and noticing when the new text is shorter / doesn't start +// with the previous text. When that happens we treat the previous +// text as finalized and start accumulating the new segment on top. +var accumulated = "" +var lastPartialText = "" +var lastEmitted = "" recognizer.recognitionTask(with: request) { result, error in if let result = result { let text = result.bestTranscription.formattedString - if text != lastText { - lastText = text - let prefix = result.isFinal ? 
"FINAL" : "PARTIAL" - print("\(prefix):\(text)") + + if result.isFinal { + // True final from the recognizer — commit everything + let full: String + // Check if the final text already includes accumulated content + // (some OS versions give cumulative finals, others reset) + if !accumulated.isEmpty && !text.lowercased().hasPrefix(accumulated.lowercased()) { + full = accumulated + " " + text + } else if !accumulated.isEmpty && text.count < accumulated.count { + // Final is shorter than what we accumulated — use accumulated + new + full = accumulated + " " + text + } else { + full = text + } + accumulated = "" + lastPartialText = "" + if full != lastEmitted { + lastEmitted = full + print("FINAL:\(full)") + } + return + } + + // Detect transcription reset: if the new partial text is significantly + // shorter than what we had, or doesn't start with the previous text, + // the recognizer has reset after a pause. Finalize what we had. + let prevText = lastPartialText + if !prevText.isEmpty && !text.isEmpty { + let prevWords = prevText.split(separator: " ") + let newWords = text.split(separator: " ") + + // Reset detection: new text has fewer words than previous AND + // the first few words don't match (i.e. 
it's truly new speech, + // not just the recognizer revising the last word) + let looksLikeReset: Bool + if newWords.count < prevWords.count / 2 { + // Significant drop in word count — likely a reset + looksLikeReset = true + } else if newWords.count < prevWords.count && + !prevWords.isEmpty && !newWords.isEmpty && + newWords[0] != prevWords[0] { + // Different starting word + fewer words — reset + looksLikeReset = true + } else { + looksLikeReset = false + } + + if looksLikeReset { + // Commit the previous partial text to accumulated + if accumulated.isEmpty { + accumulated = prevText + } else { + accumulated = accumulated + " " + prevText + } + // Emit a FINAL for the committed text so the TS side updates + print("FINAL:\(accumulated)") + lastEmitted = accumulated + } + } + + lastPartialText = text + + // Build the full display text + let displayText: String + if accumulated.isEmpty { + displayText = text + } else { + displayText = accumulated + " " + text + } + + if displayText != lastEmitted { + lastEmitted = displayText + print("PARTIAL:\(displayText)") } } if let error = error {