diff --git a/src/resources/extensions/voice/index.ts b/src/resources/extensions/voice/index.ts
new file mode 100644
index 000000000..c99400767
--- /dev/null
+++ b/src/resources/extensions/voice/index.ts
@@ -0,0 +1,176 @@
+import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent";
+import type { AssistantMessage } from "@mariozechner/pi-ai";
+import { isKeyRelease, Key, matchesKey, truncateToWidth, visibleWidth } from "@mariozechner/pi-tui";
+import { spawn, type ChildProcess } from "node:child_process";
+import * as path from "node:path";
+import * as readline from "node:readline";
+
+const RECOGNIZER_BIN = path.join(__dirname, "speech-recognizer");
+
+export default function (pi: ExtensionAPI) {
+	if (process.platform !== "darwin") return;
+
+	let active = false;
+	let recognizerProcess: ChildProcess | null = null;
+	let finalized = "";
+	let flashOn = true;
+	let flashTimer: ReturnType<typeof setInterval> | null = null;
+	let footerTui: { requestRender: () => void } | null = null;
+
+	function setVoiceFooter(ctx: ExtensionContext, on: boolean) {
+		if (!on) {
+			stopFlash();
+			ctx.ui.setFooter(undefined);
+			return;
+		}
+
+		flashOn = true;
+		flashTimer = setInterval(() => {
+			flashOn = !flashOn;
+			footerTui?.requestRender();
+		}, 500);
+
+		ctx.ui.setFooter((tui, theme, footerData) => {
+			footerTui = tui;
+			const branchUnsub = footerData.onBranchChange(() => tui.requestRender());
+
+			return {
+				dispose: branchUnsub,
+				invalidate() {},
+				render(width: number): string[] {
+					// --- Row 1: pwd (branch) ... ● transcribing ---
+					let pwd = process.cwd();
+					const home = process.env.HOME || process.env.USERPROFILE;
+					if (home && pwd.startsWith(home)) pwd = `~${pwd.slice(home.length)}`;
+					const branch = footerData.getGitBranch();
+					if (branch) pwd = `${pwd} (${branch})`;
+
+					const dot = flashOn ? theme.fg("error", "●") : theme.fg("dim", "●");
+					const voiceTag = `${dot} ${theme.fg("error", "transcribing")}`;
+					const voiceTagWidth = visibleWidth(voiceTag);
+
+					const maxPwdWidth = width - voiceTagWidth - 2;
+					const pwdStr = truncateToWidth(theme.fg("dim", pwd), maxPwdWidth, theme.fg("dim", "..."));
+					const pad1 = " ".repeat(Math.max(1, width - visibleWidth(pwdStr) - voiceTagWidth));
+					const row1 = truncateToWidth(pwdStr + pad1 + voiceTag, width);
+
+					// --- Row 2: stats ... model (replicate default) ---
+					let totalInput = 0, totalOutput = 0, totalCost = 0;
+					for (const entry of ctx.sessionManager.getEntries()) {
+						if (entry.type === "message" && entry.message.role === "assistant") {
+							const m = entry.message as AssistantMessage;
+							totalInput += m.usage.input;
+							totalOutput += m.usage.output;
+							totalCost += m.usage.cost.total;
+						}
+					}
+
+					const fmt = (n: number) => n < 1000 ? `${n}` : n < 10000 ? `${(n / 1000).toFixed(1)}k` : `${Math.round(n / 1000)}k`;
+					const parts: string[] = [];
+					if (totalInput) parts.push(`↑${fmt(totalInput)}`);
+					if (totalOutput) parts.push(`↓${fmt(totalOutput)}`);
+					if (totalCost) parts.push(`$${totalCost.toFixed(3)}`);
+
+					const usage = ctx.getContextUsage();
+					const ctxPct = usage?.percent !== null && usage?.percent !== undefined ? `${usage.percent.toFixed(1)}%` : "?";
+					const ctxWin = usage?.contextWindow ?? ctx.model?.contextWindow ?? 0;
+					parts.push(`${ctxPct}/${fmt(ctxWin)}`);
+
+					const statsLeft = theme.fg("dim", parts.join(" "));
+					const modelRight = theme.fg("dim", ctx.model?.id || "no-model");
+					const statsLeftW = visibleWidth(statsLeft);
+					const modelRightW = visibleWidth(modelRight);
+					const pad2 = " ".repeat(Math.max(2, width - statsLeftW - modelRightW));
+					const row2 = truncateToWidth(statsLeft + pad2 + modelRight, width);
+
+					return [row1, row2];
+				},
+			};
+		});
+	}
+
+	function stopFlash() {
+		if (flashTimer) { clearInterval(flashTimer); flashTimer = null; }
+		footerTui = null;
+	}
+
+	async function toggleVoice(ctx: ExtensionContext) {
+		if (active) {
+			killRecognizer();
+			active = false;
+			setVoiceFooter(ctx, false);
+			return;
+		}
+
+		active = true;
+		finalized = "";
+		setVoiceFooter(ctx, true);
+		await runVoiceSession(ctx);
+	}
+
+	pi.registerCommand("voice", {
+		description: "Toggle voice mode",
+		handler: async (_args, ctx) => toggleVoice(ctx),
+	});
+
+	pi.registerShortcut("ctrl+alt+v", {
+		description: "Toggle voice mode",
+		handler: async (ctx) => toggleVoice(ctx),
+	});
+
+	function killRecognizer() {
+		if (recognizerProcess) { recognizerProcess.kill("SIGTERM"); recognizerProcess = null; }
+	}
+
+	function startRecognizer(
+		onPartial: (text: string) => void,
+		onFinal: (text: string) => void,
+		onError: (msg: string) => void,
+		onReady: () => void,
+	) {
+		recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
+		const rl = readline.createInterface({ input: recognizerProcess.stdout! });
+		rl.on("line", (line: string) => {
+			if (line === "READY") { onReady(); return; }
+			if (line.startsWith("PARTIAL:")) onPartial(line.slice(8));
+			else if (line.startsWith("FINAL:")) onFinal(line.slice(6));
+			else if (line.startsWith("ERROR:")) onError(line.slice(6));
+		});
+		recognizerProcess.on("error", (err) => onError(err.message));
+		recognizerProcess.on("exit", () => { recognizerProcess = null; });
+	}
+
+	async function runVoiceSession(ctx: ExtensionContext): Promise<void> {
+		return new Promise((resolve) => {
+			startRecognizer(
+				(text) => {
+					const full = finalized + (finalized && text ? " " : "") + text;
+					ctx.ui.setEditorText(full);
+				},
+				(text) => {
+					finalized = (finalized ? finalized + " " : "") + text;
+					ctx.ui.setEditorText(finalized);
+				},
+				(msg) => ctx.ui.notify(`Voice: ${msg}`, "error"),
+				() => {},
+			);
+
+			ctx.ui.custom(
+				(_tui, _theme, _kb, done) => ({
+					render(): string[] { return []; },
+					handleInput(data: string) {
+						if (isKeyRelease(data)) return;
+						if (matchesKey(data, Key.escape) || matchesKey(data, Key.enter)) {
+							killRecognizer();
+							active = false;
+							setVoiceFooter(ctx, false);
+							done();
+						}
+					},
+					invalidate() {},
+				}),
+				{ overlay: true, overlayOptions: { anchor: "bottom-center", width: "100%" } },
+			).then(() => resolve());
+		});
+	}
+}
diff --git a/src/resources/extensions/voice/speech-recognizer b/src/resources/extensions/voice/speech-recognizer
new file mode 100755
index 000000000..9251292d9
Binary files /dev/null and b/src/resources/extensions/voice/speech-recognizer differ
diff --git a/src/resources/extensions/voice/speech-recognizer.swift b/src/resources/extensions/voice/speech-recognizer.swift
new file mode 100644
index 000000000..32735ba51
--- /dev/null
+++ b/src/resources/extensions/voice/speech-recognizer.swift
@@ -0,0 +1,76 @@
+import Foundation
+import Speech
+import AVFoundation
+
+// Unbuffered stdout
+setbuf(stdout, nil)
+
+guard SFSpeechRecognizer.authorizationStatus() == .authorized ||
+      SFSpeechRecognizer.authorizationStatus() == .notDetermined else {
+    print("ERROR:Speech recognition not authorized")
+    exit(1)
+}
+
+SFSpeechRecognizer.requestAuthorization { status in
+    guard status == .authorized else {
+        print("ERROR:Speech recognition denied")
+        exit(1)
+    }
+}
+
+let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!
+guard recognizer.isAvailable else {
+    print("ERROR:Speech recognizer not available")
+    exit(1)
+}
+
+let audioEngine = AVAudioEngine()
+let request = SFSpeechAudioBufferRecognitionRequest()
+request.shouldReportPartialResults = true
+request.requiresOnDeviceRecognition = true
+
+let node = audioEngine.inputNode
+let format = node.outputFormat(forBus: 0)
+
+node.installTap(onBus: 0, bufferSize: 1024, format: format) { buffer, _ in
+    request.append(buffer)
+}
+
+audioEngine.prepare()
+do {
+    try audioEngine.start()
+    print("READY")
+} catch {
+    print("ERROR:Failed to start audio engine: \(error.localizedDescription)")
+    exit(1)
+}
+
+var lastText = ""
+
+recognizer.recognitionTask(with: request) { result, error in
+    if let result = result {
+        let text = result.bestTranscription.formattedString
+        if text != lastText {
+            lastText = text
+            let prefix = result.isFinal ? "FINAL" : "PARTIAL"
+            print("\(prefix):\(text)")
+        }
+    }
+    if let error = error {
+        // Task finished errors are normal on kill
+        let nsError = error as NSError
+        if nsError.code != 216 { // kAFAssistantErrorDomain code for cancelled
+            print("ERROR:\(error.localizedDescription)")
+        }
+    }
+}
+
+// Handle SIGTERM/SIGINT gracefully
+signal(SIGTERM) { _ in
+    exit(0)
+}
+signal(SIGINT) { _ in
+    exit(0)
+}
+
+RunLoop.current.run()