feat: add /voice extension for real-time speech-to-text

- macOS-only (SFSpeechRecognizer), no-op on other platforms
- /voice command and Ctrl+Alt+V shortcut to toggle
- Streams partial transcription results directly into editor input
- Custom footer with flashing red dot + 'transcribing' indicator on row 1
- Enter to stop and keep text, Esc to cancel
- Ships precompiled Swift binary (60KB)
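- Recognizer IPC: a line protocol on the binary's stdout; sample session below
  (the utterance text is illustrative only):

    READY
    PARTIAL:add a voice
    PARTIAL:add a voice extension
    FINAL:add a voice extension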
Lex Christopherson 2026-03-11 16:13:49 -06:00
parent 0c9fb1d1da
commit 8d04971ac1
3 changed files with 252 additions and 0 deletions


@@ -0,0 +1,176 @@
import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent";
import type { AssistantMessage } from "@mariozechner/pi-ai";
import { isKeyRelease, Key, matchesKey, truncateToWidth, visibleWidth } from "@mariozechner/pi-tui";
import { spawn, type ChildProcess } from "node:child_process";
import * as path from "node:path";
import * as readline from "node:readline";
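// Precompiled Swift helper shipped next to this file (binary below). It emits
// line-oriented events on stdout: READY, PARTIAL:<text>, FINAL:<text>, ERROR:<msg>.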
const RECOGNIZER_BIN = path.join(__dirname, "speech-recognizer");
export default function (pi: ExtensionAPI) {
  if (process.platform !== "darwin") return;

  let active = false;
  let recognizerProcess: ChildProcess | null = null;
  let finalized = "";
  let flashOn = true;
  let flashTimer: ReturnType<typeof setInterval> | null = null;
  let footerTui: { requestRender: () => void } | null = null;
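
  // While transcribing, swap the default footer for a two-row variant: row 1 shows
  // pwd/branch plus the flashing "transcribing" indicator, row 2 mirrors the default
  // stats/model line.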
  function setVoiceFooter(ctx: ExtensionContext, on: boolean) {
    if (!on) {
      stopFlash();
      ctx.ui.setFooter(undefined);
      return;
    }
    flashOn = true;
    flashTimer = setInterval(() => {
      flashOn = !flashOn;
      footerTui?.requestRender();
    }, 500);
    ctx.ui.setFooter((tui, theme, footerData) => {
      footerTui = tui;
      const branchUnsub = footerData.onBranchChange(() => tui.requestRender());
      return {
        dispose: branchUnsub,
        invalidate() {},
        render(width: number): string[] {
          // --- Row 1: pwd (branch) ... ● transcribing ---
          let pwd = process.cwd();
          const home = process.env.HOME || process.env.USERPROFILE;
          if (home && pwd.startsWith(home)) pwd = `~${pwd.slice(home.length)}`;
          const branch = footerData.getGitBranch();
          if (branch) pwd = `${pwd} (${branch})`;

          const dot = flashOn ? theme.fg("error", "●") : theme.fg("dim", "●");
          const voiceTag = `${dot} ${theme.fg("error", "transcribing")}`;
          const voiceTagWidth = visibleWidth(voiceTag);
          const maxPwdWidth = width - voiceTagWidth - 2;
          const pwdStr = truncateToWidth(theme.fg("dim", pwd), maxPwdWidth, theme.fg("dim", "..."));
          const pad1 = " ".repeat(Math.max(1, width - visibleWidth(pwdStr) - voiceTagWidth));
          const row1 = truncateToWidth(pwdStr + pad1 + voiceTag, width);

          // --- Row 2: stats ... model (replicate default) ---
          let totalInput = 0, totalOutput = 0, totalCost = 0;
          for (const entry of ctx.sessionManager.getEntries()) {
            if (entry.type === "message" && entry.message.role === "assistant") {
              const m = entry.message as AssistantMessage;
              totalInput += m.usage.input;
              totalOutput += m.usage.output;
              totalCost += m.usage.cost.total;
            }
          }
          const fmt = (n: number) => n < 1000 ? `${n}` : n < 10000 ? `${(n / 1000).toFixed(1)}k` : `${Math.round(n / 1000)}k`;
          const parts: string[] = [];
          if (totalInput) parts.push(`${fmt(totalInput)}`);
          if (totalOutput) parts.push(`${fmt(totalOutput)}`);
          if (totalCost) parts.push(`$${totalCost.toFixed(3)}`);
          const usage = ctx.getContextUsage();
          const ctxPct = usage?.percent !== null && usage?.percent !== undefined ? `${usage.percent.toFixed(1)}%` : "?";
          const ctxWin = usage?.contextWindow ?? ctx.model?.contextWindow ?? 0;
          parts.push(`${ctxPct}/${fmt(ctxWin)}`);

          const statsLeft = theme.fg("dim", parts.join(" "));
          const modelRight = theme.fg("dim", ctx.model?.id || "no-model");
          const statsLeftW = visibleWidth(statsLeft);
          const modelRightW = visibleWidth(modelRight);
          const pad2 = " ".repeat(Math.max(2, width - statsLeftW - modelRightW));
          const row2 = truncateToWidth(statsLeft + pad2 + modelRight, width);

          return [row1, row2];
        },
      };
    });
  }
  function stopFlash() {
    if (flashTimer) { clearInterval(flashTimer); flashTimer = null; }
    footerTui = null;
  }
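
  // Toggle voice mode: the first invocation starts a session, the second stops the
  // recognizer and restores the footer, keeping whatever text is already in the editor.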
  async function toggleVoice(ctx: ExtensionContext) {
    if (active) {
      killRecognizer();
      active = false;
      setVoiceFooter(ctx, false);
      return;
    }
    active = true;
    finalized = "";
    setVoiceFooter(ctx, true);
    await runVoiceSession(ctx);
  }
  pi.registerCommand("voice", {
    description: "Toggle voice mode",
    handler: async (_args, ctx) => toggleVoice(ctx),
  });

  pi.registerShortcut("ctrl+alt+v", {
    description: "Toggle voice mode",
    handler: async (ctx) => toggleVoice(ctx),
  });
  function killRecognizer() {
    if (recognizerProcess) { recognizerProcess.kill("SIGTERM"); recognizerProcess = null; }
  }
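
  // Spawn the recognizer binary and translate its stdout line protocol into callbacks.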
  function startRecognizer(
    onPartial: (text: string) => void,
    onFinal: (text: string) => void,
    onError: (msg: string) => void,
    onReady: () => void,
  ) {
    recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
    const rl = readline.createInterface({ input: recognizerProcess.stdout! });
    rl.on("line", (line: string) => {
      if (line === "READY") { onReady(); return; }
      if (line.startsWith("PARTIAL:")) onPartial(line.slice(8));
      else if (line.startsWith("FINAL:")) onFinal(line.slice(6));
      else if (line.startsWith("ERROR:")) onError(line.slice(6));
    });
    recognizerProcess.on("error", (err) => onError(err.message));
    recognizerProcess.on("exit", () => { recognizerProcess = null; });
  }
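
  // One voice session: stream partial results into the editor (appended to text that
  // has already been finalized) and put up an invisible bottom overlay that captures
  // keys, so Enter/Esc end the session instead of reaching the editor.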
  async function runVoiceSession(ctx: ExtensionContext): Promise<void> {
    return new Promise<void>((resolve) => {
      startRecognizer(
        (text) => {
          const full = finalized + (finalized && text ? " " : "") + text;
          ctx.ui.setEditorText(full);
        },
        (text) => {
          finalized = (finalized ? finalized + " " : "") + text;
          ctx.ui.setEditorText(finalized);
        },
        (msg) => ctx.ui.notify(`Voice: ${msg}`, "error"),
        () => {},
      );
      ctx.ui.custom<void>(
        (_tui, _theme, _kb, done) => ({
          render(): string[] { return []; },
          handleInput(data: string) {
            if (isKeyRelease(data)) return;
            if (matchesKey(data, Key.escape) || matchesKey(data, Key.enter)) {
              // Esc cancels: discard the transcription; Enter stops and keeps it.
              if (matchesKey(data, Key.escape)) ctx.ui.setEditorText("");
              killRecognizer();
              active = false;
              setVoiceFooter(ctx, false);
              done();
            }
          },
          invalidate() {},
        }),
        { overlay: true, overlayOptions: { anchor: "bottom-center", width: "100%" } },
      ).then(() => resolve());
    });
  }
}

Binary file not shown.


@@ -0,0 +1,76 @@
import Foundation
import Speech
import AVFoundation
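
// Minimal CLI wrapper around SFSpeechRecognizer: captures microphone audio with
// AVAudioEngine and streams live transcription to stdout using the line protocol the
// extension expects (READY, PARTIAL:<text>, FINAL:<text>, ERROR:<msg>).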
// Unbuffered stdout
setbuf(stdout, nil)
guard SFSpeechRecognizer.authorizationStatus() == .authorized ||
      SFSpeechRecognizer.authorizationStatus() == .notDetermined else {
    print("ERROR:Speech recognition not authorized")
    exit(1)
}
SFSpeechRecognizer.requestAuthorization { status in
    guard status == .authorized else {
        print("ERROR:Speech recognition denied")
        exit(1)
    }
}
let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!
guard recognizer.isAvailable else {
    print("ERROR:Speech recognizer not available")
    exit(1)
}
let audioEngine = AVAudioEngine()
let request = SFSpeechAudioBufferRecognitionRequest()
request.shouldReportPartialResults = true
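// Keep recognition on-device: audio never leaves the machine, at the cost of failing
// where the on-device model is unavailable.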
request.requiresOnDeviceRecognition = true
let node = audioEngine.inputNode
let format = node.outputFormat(forBus: 0)
node.installTap(onBus: 0, bufferSize: 1024, format: format) { buffer, _ in
    request.append(buffer)
}
audioEngine.prepare()
do {
    try audioEngine.start()
    print("READY")
} catch {
    print("ERROR:Failed to start audio engine: \(error.localizedDescription)")
    exit(1)
}
var lastText = ""
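
// Emit PARTIAL for every changed hypothesis and FINAL once the recognizer settles;
// lastText suppresses duplicate prints for unchanged transcriptions.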
recognizer.recognitionTask(with: request) { result, error in
    if let result = result {
        let text = result.bestTranscription.formattedString
        if text != lastText {
            lastText = text
            let prefix = result.isFinal ? "FINAL" : "PARTIAL"
            print("\(prefix):\(text)")
        }
    }
    if let error = error {
        // Task finished errors are normal on kill
        let nsError = error as NSError
        if nsError.code != 216 { // kAFAssistantErrorDomain code for cancelled
            print("ERROR:\(error.localizedDescription)")
        }
    }
}
// Handle SIGTERM/SIGINT gracefully
signal(SIGTERM) { _ in
    exit(0)
}
signal(SIGINT) { _ in
    exit(0)
}
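
// Keep the process alive so the audio tap and recognition callbacks continue to fire;
// the extension ends the session by sending SIGTERM.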
RunLoop.current.run()