fix(voice): preserve transcription across pauses
Apple's on-device speech recognition resets bestTranscription after silence gaps, discarding previous text. The Swift recognizer now detects these resets (word count drop / different starting word) and accumulates finalized segments so speech continues appending instead of overwriting. The TS side is simplified to pass through the already-accumulated text from the Swift process.
This commit is contained in:
parent
bc0049e51d
commit
8a4572edef
2 changed files with 88 additions and 11 deletions
|
|
@ -26,7 +26,6 @@ export default function (pi: ExtensionAPI) {
|
|||
|
||||
let active = false;
|
||||
let recognizerProcess: ChildProcess | null = null;
|
||||
let finalized = "";
|
||||
let flashOn = true;
|
||||
let flashTimer: ReturnType<typeof setInterval> | null = null;
|
||||
let footerTui: { requestRender: () => void } | null = null;
|
||||
|
|
@ -122,7 +121,6 @@ export default function (pi: ExtensionAPI) {
|
|||
}
|
||||
|
||||
active = true;
|
||||
finalized = "";
|
||||
setVoiceFooter(ctx, true);
|
||||
await runVoiceSession(ctx);
|
||||
}
|
||||
|
|
@ -161,14 +159,15 @@ export default function (pi: ExtensionAPI) {
|
|||
|
||||
async function runVoiceSession(ctx: ExtensionContext): Promise<void> {
|
||||
return new Promise<void>((resolve) => {
|
||||
// The Swift recognizer handles accumulation across pause-induced
|
||||
// transcription resets. Both PARTIAL and FINAL messages contain
|
||||
// the full accumulated text, so we just pass them through.
|
||||
startRecognizer(
|
||||
(text) => {
|
||||
const full = finalized + (finalized && text ? " " : "") + text;
|
||||
ctx.ui.setEditorText(full);
|
||||
ctx.ui.setEditorText(text);
|
||||
},
|
||||
(text) => {
|
||||
finalized = (finalized ? finalized + " " : "") + text;
|
||||
ctx.ui.setEditorText(finalized);
|
||||
ctx.ui.setEditorText(text);
|
||||
},
|
||||
(msg) => ctx.ui.notify(`Voice: ${msg}`, "error"),
|
||||
() => {},
|
||||
|
|
|
|||
|
|
@ -45,15 +45,93 @@ do {
|
|||
exit(1)
|
||||
}
|
||||
|
||||
var lastText = ""
|
||||
// Accumulated finalized text from previous recognition segments.
|
||||
// On-device recognition (especially macOS/iOS 18+) can reset
|
||||
// bestTranscription.formattedString after a pause, discarding
|
||||
// previous text. We detect this by tracking the last known good
|
||||
// text and noticing when the new text is shorter / doesn't start
|
||||
// with the previous text. When that happens we treat the previous
|
||||
// text as finalized and start accumulating the new segment on top.
|
||||
var accumulated = ""
|
||||
var lastPartialText = ""
|
||||
var lastEmitted = ""
|
||||
|
||||
recognizer.recognitionTask(with: request) { result, error in
|
||||
if let result = result {
|
||||
let text = result.bestTranscription.formattedString
|
||||
if text != lastText {
|
||||
lastText = text
|
||||
let prefix = result.isFinal ? "FINAL" : "PARTIAL"
|
||||
print("\(prefix):\(text)")
|
||||
|
||||
if result.isFinal {
|
||||
// True final from the recognizer — commit everything
|
||||
let full: String
|
||||
// Check if the final text already includes accumulated content
|
||||
// (some OS versions give cumulative finals, others reset)
|
||||
if !accumulated.isEmpty && !text.lowercased().hasPrefix(accumulated.lowercased()) {
|
||||
full = accumulated + " " + text
|
||||
} else if !accumulated.isEmpty && text.count < accumulated.count {
|
||||
// Final is shorter than what we accumulated — use accumulated + new
|
||||
full = accumulated + " " + text
|
||||
} else {
|
||||
full = text
|
||||
}
|
||||
accumulated = ""
|
||||
lastPartialText = ""
|
||||
if full != lastEmitted {
|
||||
lastEmitted = full
|
||||
print("FINAL:\(full)")
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Detect transcription reset: if the new partial text is significantly
|
||||
// shorter than what we had, or doesn't start with the previous text,
|
||||
// the recognizer has reset after a pause. Finalize what we had.
|
||||
let prevText = lastPartialText
|
||||
if !prevText.isEmpty && !text.isEmpty {
|
||||
let prevWords = prevText.split(separator: " ")
|
||||
let newWords = text.split(separator: " ")
|
||||
|
||||
// Reset detection: either the new text has fewer than half as many words
|
||||
// as the previous text, or it has fewer words AND a different first word
|
||||
// (i.e. it's truly new speech, not just the recognizer revising the last word)
|
||||
let looksLikeReset: Bool
|
||||
if newWords.count < prevWords.count / 2 {
|
||||
// Significant drop in word count — likely a reset
|
||||
looksLikeReset = true
|
||||
} else if newWords.count < prevWords.count &&
|
||||
!prevWords.isEmpty && !newWords.isEmpty &&
|
||||
newWords[0] != prevWords[0] {
|
||||
// Different starting word + fewer words — reset
|
||||
looksLikeReset = true
|
||||
} else {
|
||||
looksLikeReset = false
|
||||
}
|
||||
|
||||
if looksLikeReset {
|
||||
// Commit the previous partial text to accumulated
|
||||
if accumulated.isEmpty {
|
||||
accumulated = prevText
|
||||
} else {
|
||||
accumulated = accumulated + " " + prevText
|
||||
}
|
||||
// Emit a FINAL for the committed text so the TS side updates
|
||||
print("FINAL:\(accumulated)")
|
||||
lastEmitted = accumulated
|
||||
}
|
||||
}
|
||||
|
||||
lastPartialText = text
|
||||
|
||||
// Build the full display text
|
||||
let displayText: String
|
||||
if accumulated.isEmpty {
|
||||
displayText = text
|
||||
} else {
|
||||
displayText = accumulated + " " + text
|
||||
}
|
||||
|
||||
if displayText != lastEmitted {
|
||||
lastEmitted = displayText
|
||||
print("PARTIAL:\(displayText)")
|
||||
}
|
||||
}
|
||||
if let error = error {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue