fix(voice): preserve transcription across pauses

Apple's on-device speech recognition resets bestTranscription after
silence gaps, discarding previous text. The Swift recognizer now
detects these resets (word count drop / different starting word) and
accumulates finalized segments so speech continues appending instead
of overwriting. TS side simplified to pass through the already-
accumulated text from the Swift process.
This commit is contained in:
Lex Christopherson 2026-03-11 16:53:49 -06:00
parent bc0049e51d
commit 8a4572edef
2 changed files with 88 additions and 11 deletions

View file

@ -26,7 +26,6 @@ export default function (pi: ExtensionAPI) {
let active = false;
let recognizerProcess: ChildProcess | null = null;
let finalized = "";
let flashOn = true;
let flashTimer: ReturnType<typeof setInterval> | null = null;
let footerTui: { requestRender: () => void } | null = null;
@ -122,7 +121,6 @@ export default function (pi: ExtensionAPI) {
}
active = true;
finalized = "";
setVoiceFooter(ctx, true);
await runVoiceSession(ctx);
}
@ -161,14 +159,15 @@ export default function (pi: ExtensionAPI) {
async function runVoiceSession(ctx: ExtensionContext): Promise<void> {
return new Promise<void>((resolve) => {
// The Swift recognizer handles accumulation across pause-induced
// transcription resets. Both PARTIAL and FINAL messages contain
// the full accumulated text, so we just pass them through.
startRecognizer(
(text) => {
const full = finalized + (finalized && text ? " " : "") + text;
ctx.ui.setEditorText(full);
ctx.ui.setEditorText(text);
},
(text) => {
finalized = (finalized ? finalized + " " : "") + text;
ctx.ui.setEditorText(finalized);
ctx.ui.setEditorText(text);
},
(msg) => ctx.ui.notify(`Voice: ${msg}`, "error"),
() => {},

View file

@ -45,15 +45,93 @@ do {
exit(1)
}
var lastText = ""
// Accumulated finalized text from previous recognition segments.
// On-device recognition (especially macOS/iOS 18+) can reset
// bestTranscription.formattedString after a pause, discarding
// previous text. We detect this by tracking the last partial text
// and noticing when the new text has far fewer words, or has fewer
// words and a different first word. When that happens we treat the
// previous text as finalized and start accumulating the new segment
// on top.
var accumulated = ""
var lastPartialText = ""
var lastEmitted = ""
recognizer.recognitionTask(with: request) { result, error in
if let result = result {
let text = result.bestTranscription.formattedString
if text != lastText {
lastText = text
let prefix = result.isFinal ? "FINAL" : "PARTIAL"
print("\(prefix):\(text)")
if result.isFinal {
// True final from the recognizer — commit everything
let full: String
// Check if the final text already includes accumulated content
// (some OS versions give cumulative finals, others reset)
if !accumulated.isEmpty && !text.lowercased().hasPrefix(accumulated.lowercased()) {
full = accumulated + " " + text
} else if !accumulated.isEmpty && text.count < accumulated.count {
// Final is shorter than what we accumulated — use accumulated + new
full = accumulated + " " + text
} else {
full = text
}
accumulated = ""
lastPartialText = ""
if full != lastEmitted {
lastEmitted = full
print("FINAL:\(full)")
}
return
}
// Detect transcription reset: if the new partial text has far fewer
// words than what we had, or has fewer words and starts with a different
// word, the recognizer has reset after a pause. Finalize what we had.
let prevText = lastPartialText
if !prevText.isEmpty && !text.isEmpty {
let prevWords = prevText.split(separator: " ")
let newWords = text.split(separator: " ")
// Reset detection: the new text has fewer than half as many words as
// the previous text, OR it has fewer words and a different first word
// (i.e. it's truly new speech, not just the recognizer revising the
// last word)
let looksLikeReset: Bool
if newWords.count < prevWords.count / 2 {
// Significant drop in word count — likely a reset
looksLikeReset = true
} else if newWords.count < prevWords.count &&
!prevWords.isEmpty && !newWords.isEmpty &&
newWords[0] != prevWords[0] {
// Different starting word + fewer words — reset
looksLikeReset = true
} else {
looksLikeReset = false
}
if looksLikeReset {
// Commit the previous partial text to accumulated
if accumulated.isEmpty {
accumulated = prevText
} else {
accumulated = accumulated + " " + prevText
}
// Emit a FINAL for the committed text so the TS side updates
print("FINAL:\(accumulated)")
lastEmitted = accumulated
}
}
lastPartialText = text
// Build the full display text
let displayText: String
if accumulated.isEmpty {
displayText = text
} else {
displayText = accumulated + " " + text
}
if displayText != lastEmitted {
lastEmitted = displayText
print("PARTIAL:\(displayText)")
}
}
if let error = error {