feat: add /voice extension for real-time speech-to-text
- macOS-only (SFSpeechRecognizer), no-op on other platforms
- /voice command and Ctrl+Alt+V shortcut to toggle
- Streams partial transcription results directly into editor input (see the protocol sketch below)
- Custom footer with flashing red dot + 'transcribing' indicator on row 1
- Enter to stop and keep text, Esc to cancel
- Ships precompiled Swift binary (60KB)
parent 0c9fb1d1da
commit 8d04971ac1
3 changed files with 252 additions and 0 deletions
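The extension drives the bundled speech-recognizer binary over a line-oriented stdout protocol, parsed in index.ts below: READY once audio capture has started, PARTIAL:<text> for interim hypotheses, FINAL:<text> for committed segments, and ERROR:<message> on failure. An illustrative exchange (not part of the commit):

    READY
    PARTIAL:hello
    PARTIAL:hello world
    FINAL:hello world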
src/resources/extensions/voice/index.ts (new file, 176 lines)
@@ -0,0 +1,176 @@
import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent";
import type { AssistantMessage } from "@mariozechner/pi-ai";
import { isKeyRelease, Key, matchesKey, truncateToWidth, visibleWidth } from "@mariozechner/pi-tui";
import { spawn, type ChildProcess } from "node:child_process";
import * as path from "node:path";
import * as readline from "node:readline";

const RECOGNIZER_BIN = path.join(__dirname, "speech-recognizer");

export default function (pi: ExtensionAPI) {
  if (process.platform !== "darwin") return;

  let active = false;
  let recognizerProcess: ChildProcess | null = null;
  let finalized = "";
  let flashOn = true;
  let flashTimer: ReturnType<typeof setInterval> | null = null;
  let footerTui: { requestRender: () => void } | null = null;

  function setVoiceFooter(ctx: ExtensionContext, on: boolean) {
    if (!on) {
      stopFlash();
      ctx.ui.setFooter(undefined);
      return;
    }

    flashOn = true;
    flashTimer = setInterval(() => {
      flashOn = !flashOn;
      footerTui?.requestRender();
    }, 500);

    ctx.ui.setFooter((tui, theme, footerData) => {
      footerTui = tui;
      const branchUnsub = footerData.onBranchChange(() => tui.requestRender());

      return {
        dispose: branchUnsub,
        invalidate() {},
        render(width: number): string[] {
          // --- Row 1: pwd (branch) ... ● transcribing ---
          let pwd = process.cwd();
          const home = process.env.HOME || process.env.USERPROFILE;
          if (home && pwd.startsWith(home)) pwd = `~${pwd.slice(home.length)}`;
          const branch = footerData.getGitBranch();
          if (branch) pwd = `${pwd} (${branch})`;

          const dot = flashOn ? theme.fg("error", "●") : theme.fg("dim", "●");
          const voiceTag = `${dot} ${theme.fg("error", "transcribing")}`;
          const voiceTagWidth = visibleWidth(voiceTag);

          const maxPwdWidth = width - voiceTagWidth - 2;
          const pwdStr = truncateToWidth(theme.fg("dim", pwd), maxPwdWidth, theme.fg("dim", "..."));
          const pad1 = " ".repeat(Math.max(1, width - visibleWidth(pwdStr) - voiceTagWidth));
          const row1 = truncateToWidth(pwdStr + pad1 + voiceTag, width);

          // --- Row 2: stats ... model (replicate default) ---
          let totalInput = 0, totalOutput = 0, totalCost = 0;
          for (const entry of ctx.sessionManager.getEntries()) {
            if (entry.type === "message" && entry.message.role === "assistant") {
              const m = entry.message as AssistantMessage;
              totalInput += m.usage.input;
              totalOutput += m.usage.output;
              totalCost += m.usage.cost.total;
            }
          }

          const fmt = (n: number) => n < 1000 ? `${n}` : n < 10000 ? `${(n / 1000).toFixed(1)}k` : `${Math.round(n / 1000)}k`;
          const parts: string[] = [];
          if (totalInput) parts.push(`↑${fmt(totalInput)}`);
          if (totalOutput) parts.push(`↓${fmt(totalOutput)}`);
          if (totalCost) parts.push(`$${totalCost.toFixed(3)}`);

          const usage = ctx.getContextUsage();
          const ctxPct = usage?.percent !== null && usage?.percent !== undefined ? `${usage.percent.toFixed(1)}%` : "?";
          const ctxWin = usage?.contextWindow ?? ctx.model?.contextWindow ?? 0;
          parts.push(`${ctxPct}/${fmt(ctxWin)}`);

          const statsLeft = theme.fg("dim", parts.join(" "));
          const modelRight = theme.fg("dim", ctx.model?.id || "no-model");
          const statsLeftW = visibleWidth(statsLeft);
          const modelRightW = visibleWidth(modelRight);
          const pad2 = " ".repeat(Math.max(2, width - statsLeftW - modelRightW));
          const row2 = truncateToWidth(statsLeft + pad2 + modelRight, width);

          return [row1, row2];
        },
      };
    });
  }

  function stopFlash() {
    if (flashTimer) { clearInterval(flashTimer); flashTimer = null; }
    footerTui = null;
  }

  async function toggleVoice(ctx: ExtensionContext) {
    if (active) {
      killRecognizer();
      active = false;
      setVoiceFooter(ctx, false);
      return;
    }

    active = true;
    finalized = "";
    setVoiceFooter(ctx, true);
    await runVoiceSession(ctx);
  }

  pi.registerCommand("voice", {
    description: "Toggle voice mode",
    handler: async (_args, ctx) => toggleVoice(ctx),
  });

  pi.registerShortcut("ctrl+alt+v", {
    description: "Toggle voice mode",
    handler: async (ctx) => toggleVoice(ctx),
  });

  function killRecognizer() {
    if (recognizerProcess) { recognizerProcess.kill("SIGTERM"); recognizerProcess = null; }
  }

  function startRecognizer(
    onPartial: (text: string) => void,
    onFinal: (text: string) => void,
    onError: (msg: string) => void,
    onReady: () => void,
  ) {
    recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
    const rl = readline.createInterface({ input: recognizerProcess.stdout! });
    rl.on("line", (line: string) => {
      if (line === "READY") { onReady(); return; }
      if (line.startsWith("PARTIAL:")) onPartial(line.slice(8));
      else if (line.startsWith("FINAL:")) onFinal(line.slice(6));
      else if (line.startsWith("ERROR:")) onError(line.slice(6));
    });
    recognizerProcess.on("error", (err) => onError(err.message));
    recognizerProcess.on("exit", () => { recognizerProcess = null; });
  }

  async function runVoiceSession(ctx: ExtensionContext): Promise<void> {
    return new Promise<void>((resolve) => {
      startRecognizer(
        (text) => {
          const full = finalized + (finalized && text ? " " : "") + text;
          ctx.ui.setEditorText(full);
        },
        (text) => {
          finalized = (finalized ? finalized + " " : "") + text;
          ctx.ui.setEditorText(finalized);
        },
        (msg) => ctx.ui.notify(`Voice: ${msg}`, "error"),
        () => {},
      );

      // Invisible overlay component: renders nothing, only captures Enter/Esc
      // while transcription is active.
      ctx.ui.custom<void>(
        (_tui, _theme, _kb, done) => ({
          render(): string[] { return []; },
          handleInput(data: string) {
            if (isKeyRelease(data)) return;
            const cancelled = matchesKey(data, Key.escape);
            if (cancelled || matchesKey(data, Key.enter)) {
              killRecognizer();
              active = false;
              setVoiceFooter(ctx, false);
              // Enter stops and keeps the transcribed text in the editor;
              // Esc cancels and discards it (per the commit message).
              if (cancelled) ctx.ui.setEditorText("");
              done();
            }
          },
          invalidate() {},
        }),
        { overlay: true, overlayOptions: { anchor: "bottom-center", width: "100%" } },
      ).then(() => resolve());
    });
  }
}
src/resources/extensions/voice/speech-recognizer (new executable file)
Binary file not shown.
src/resources/extensions/voice/speech-recognizer.swift (new file, 76 lines)
@@ -0,0 +1,76 @@
import Foundation
import Speech
import AVFoundation

// Unbuffered stdout
setbuf(stdout, nil)

guard SFSpeechRecognizer.authorizationStatus() == .authorized ||
      SFSpeechRecognizer.authorizationStatus() == .notDetermined else {
    print("ERROR:Speech recognition not authorized")
    exit(1)
}

SFSpeechRecognizer.requestAuthorization { status in
    guard status == .authorized else {
        print("ERROR:Speech recognition denied")
        exit(1)
    }
}

let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!
guard recognizer.isAvailable else {
    print("ERROR:Speech recognizer not available")
    exit(1)
}

let audioEngine = AVAudioEngine()
let request = SFSpeechAudioBufferRecognitionRequest()
request.shouldReportPartialResults = true
request.requiresOnDeviceRecognition = true

let node = audioEngine.inputNode
let format = node.outputFormat(forBus: 0)

node.installTap(onBus: 0, bufferSize: 1024, format: format) { buffer, _ in
    request.append(buffer)
}

audioEngine.prepare()
do {
    try audioEngine.start()
    print("READY")
} catch {
    print("ERROR:Failed to start audio engine: \(error.localizedDescription)")
    exit(1)
}

var lastText = ""

recognizer.recognitionTask(with: request) { result, error in
    if let result = result {
        let text = result.bestTranscription.formattedString
        if text != lastText {
            lastText = text
            let prefix = result.isFinal ? "FINAL" : "PARTIAL"
            print("\(prefix):\(text)")
        }
    }
    if let error = error {
        // Task finished errors are normal on kill
        let nsError = error as NSError
        if nsError.code != 216 { // kAFAssistantErrorDomain code for cancelled
            print("ERROR:\(error.localizedDescription)")
        }
    }
}

// Handle SIGTERM/SIGINT gracefully
signal(SIGTERM) { _ in
    exit(0)
}
signal(SIGINT) { _ in
    exit(0)
}

RunLoop.current.run()
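The commit ships the 60KB recognizer binary precompiled and does not include a build step; a plain swiftc invocation along these lines should reproduce it (assumed command, not part of the diff):

    swiftc -O speech-recognizer.swift -o speech-recognizer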