feat: add /voice extension for real-time speech-to-text
- macOS-only (SFSpeechRecognizer), no-op on other platforms
- /voice command and Ctrl+Alt+V shortcut to toggle
- Streams partial transcription results directly into editor input (see the protocol sketch below)
- Custom footer with flashing red dot + 'transcribing' indicator on row 1
- Enter to stop and keep text, Esc to cancel
- Ships precompiled Swift binary (60KB)
parent 0c9fb1d1da
commit 8d04971ac1
3 changed files with 252 additions and 0 deletions
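The extension drives the bundled speech-recognizer binary over a line-oriented stdout protocol, parsed in index.ts below: READY once audio capture has started, PARTIAL:<text> for interim hypotheses, FINAL:<text> for committed segments, and ERROR:<message> on failure. An illustrative exchange (not part of the commit):

    READY
    PARTIAL:hello
    PARTIAL:hello world
    FINAL:hello world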
src/resources/extensions/voice/index.ts (new file, 176 lines)
@@ -0,0 +1,176 @@
import type { ExtensionAPI, ExtensionContext } from "@mariozechner/pi-coding-agent";
import type { AssistantMessage } from "@mariozechner/pi-ai";
import { isKeyRelease, Key, matchesKey, truncateToWidth, visibleWidth } from "@mariozechner/pi-tui";
import { spawn, type ChildProcess } from "node:child_process";
import * as path from "node:path";
import * as readline from "node:readline";

const RECOGNIZER_BIN = path.join(__dirname, "speech-recognizer");

export default function (pi: ExtensionAPI) {
  if (process.platform !== "darwin") return;

  let active = false;
  let recognizerProcess: ChildProcess | null = null;
  let finalized = "";
  let flashOn = true;
  let flashTimer: ReturnType<typeof setInterval> | null = null;
  let footerTui: { requestRender: () => void } | null = null;

  function setVoiceFooter(ctx: ExtensionContext, on: boolean) {
    if (!on) {
      stopFlash();
      ctx.ui.setFooter(undefined);
      return;
    }

    flashOn = true;
    flashTimer = setInterval(() => {
      flashOn = !flashOn;
      footerTui?.requestRender();
    }, 500);

    ctx.ui.setFooter((tui, theme, footerData) => {
      footerTui = tui;
      const branchUnsub = footerData.onBranchChange(() => tui.requestRender());

      return {
        dispose: branchUnsub,
        invalidate() {},
        render(width: number): string[] {
          // --- Row 1: pwd (branch) ... ● transcribing ---
          let pwd = process.cwd();
          const home = process.env.HOME || process.env.USERPROFILE;
          if (home && pwd.startsWith(home)) pwd = `~${pwd.slice(home.length)}`;
          const branch = footerData.getGitBranch();
          if (branch) pwd = `${pwd} (${branch})`;

          const dot = flashOn ? theme.fg("error", "●") : theme.fg("dim", "●");
          const voiceTag = `${dot} ${theme.fg("error", "transcribing")}`;
          const voiceTagWidth = visibleWidth(voiceTag);

          const maxPwdWidth = width - voiceTagWidth - 2;
          const pwdStr = truncateToWidth(theme.fg("dim", pwd), maxPwdWidth, theme.fg("dim", "..."));
          const pad1 = " ".repeat(Math.max(1, width - visibleWidth(pwdStr) - voiceTagWidth));
          const row1 = truncateToWidth(pwdStr + pad1 + voiceTag, width);

          // --- Row 2: stats ... model (replicate default) ---
          let totalInput = 0, totalOutput = 0, totalCost = 0;
          for (const entry of ctx.sessionManager.getEntries()) {
            if (entry.type === "message" && entry.message.role === "assistant") {
              const m = entry.message as AssistantMessage;
              totalInput += m.usage.input;
              totalOutput += m.usage.output;
              totalCost += m.usage.cost.total;
            }
          }

          const fmt = (n: number) => n < 1000 ? `${n}` : n < 10000 ? `${(n / 1000).toFixed(1)}k` : `${Math.round(n / 1000)}k`;
          const parts: string[] = [];
          if (totalInput) parts.push(`↑${fmt(totalInput)}`);
          if (totalOutput) parts.push(`↓${fmt(totalOutput)}`);
          if (totalCost) parts.push(`$${totalCost.toFixed(3)}`);

          const usage = ctx.getContextUsage();
          const ctxPct = usage?.percent !== null && usage?.percent !== undefined ? `${usage.percent.toFixed(1)}%` : "?";
          const ctxWin = usage?.contextWindow ?? ctx.model?.contextWindow ?? 0;
          parts.push(`${ctxPct}/${fmt(ctxWin)}`);

          const statsLeft = theme.fg("dim", parts.join(" "));
          const modelRight = theme.fg("dim", ctx.model?.id || "no-model");
          const statsLeftW = visibleWidth(statsLeft);
          const modelRightW = visibleWidth(modelRight);
          const pad2 = " ".repeat(Math.max(2, width - statsLeftW - modelRightW));
          const row2 = truncateToWidth(statsLeft + pad2 + modelRight, width);

          return [row1, row2];
        },
      };
    });
  }

  function stopFlash() {
    if (flashTimer) { clearInterval(flashTimer); flashTimer = null; }
    footerTui = null;
  }

  async function toggleVoice(ctx: ExtensionContext) {
    if (active) {
      killRecognizer();
      active = false;
      setVoiceFooter(ctx, false);
      return;
    }

    active = true;
    finalized = "";
    setVoiceFooter(ctx, true);
    await runVoiceSession(ctx);
  }

  pi.registerCommand("voice", {
    description: "Toggle voice mode",
    handler: async (_args, ctx) => toggleVoice(ctx),
  });

  pi.registerShortcut("ctrl+alt+v", {
    description: "Toggle voice mode",
    handler: async (ctx) => toggleVoice(ctx),
  });

  function killRecognizer() {
    if (recognizerProcess) { recognizerProcess.kill("SIGTERM"); recognizerProcess = null; }
  }

  function startRecognizer(
    onPartial: (text: string) => void,
    onFinal: (text: string) => void,
    onError: (msg: string) => void,
    onReady: () => void,
  ) {
    recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
    const rl = readline.createInterface({ input: recognizerProcess.stdout! });
    rl.on("line", (line: string) => {
      if (line === "READY") { onReady(); return; }
      if (line.startsWith("PARTIAL:")) onPartial(line.slice(8));
      else if (line.startsWith("FINAL:")) onFinal(line.slice(6));
      else if (line.startsWith("ERROR:")) onError(line.slice(6));
    });
    recognizerProcess.on("error", (err) => onError(err.message));
    recognizerProcess.on("exit", () => { recognizerProcess = null; });
  }

  async function runVoiceSession(ctx: ExtensionContext): Promise<void> {
    return new Promise<void>((resolve) => {
      startRecognizer(
        (text) => {
          const full = finalized + (finalized && text ? " " : "") + text;
          ctx.ui.setEditorText(full);
        },
        (text) => {
          finalized = (finalized ? finalized + " " : "") + text;
          ctx.ui.setEditorText(finalized);
        },
        (msg) => ctx.ui.notify(`Voice: ${msg}`, "error"),
        () => {},
      );

      // Invisible overlay component: renders nothing, only captures Enter/Esc
      // while transcription is active.
      ctx.ui.custom<void>(
        (_tui, _theme, _kb, done) => ({
          render(): string[] { return []; },
          handleInput(data: string) {
            if (isKeyRelease(data)) return;
            const cancelled = matchesKey(data, Key.escape);
            if (cancelled || matchesKey(data, Key.enter)) {
              killRecognizer();
              active = false;
              setVoiceFooter(ctx, false);
              // Enter stops and keeps the transcribed text in the editor;
              // Esc cancels and discards it (per the commit message).
              if (cancelled) ctx.ui.setEditorText("");
              done();
            }
          },
          invalidate() {},
        }),
        { overlay: true, overlayOptions: { anchor: "bottom-center", width: "100%" } },
      ).then(() => resolve());
    });
  }
}
src/resources/extensions/voice/speech-recognizer (new executable file)
Binary file not shown.
src/resources/extensions/voice/speech-recognizer.swift (new file, 76 lines)
@@ -0,0 +1,76 @@
import Foundation
import Speech
import AVFoundation

// Unbuffered stdout
setbuf(stdout, nil)

guard SFSpeechRecognizer.authorizationStatus() == .authorized ||
      SFSpeechRecognizer.authorizationStatus() == .notDetermined else {
    print("ERROR:Speech recognition not authorized")
    exit(1)
}

SFSpeechRecognizer.requestAuthorization { status in
    guard status == .authorized else {
        print("ERROR:Speech recognition denied")
        exit(1)
    }
}

let recognizer = SFSpeechRecognizer(locale: Locale(identifier: "en-US"))!
guard recognizer.isAvailable else {
    print("ERROR:Speech recognizer not available")
    exit(1)
}

let audioEngine = AVAudioEngine()
let request = SFSpeechAudioBufferRecognitionRequest()
request.shouldReportPartialResults = true
request.requiresOnDeviceRecognition = true

let node = audioEngine.inputNode
let format = node.outputFormat(forBus: 0)

node.installTap(onBus: 0, bufferSize: 1024, format: format) { buffer, _ in
    request.append(buffer)
}

audioEngine.prepare()
do {
    try audioEngine.start()
    print("READY")
} catch {
    print("ERROR:Failed to start audio engine: \(error.localizedDescription)")
    exit(1)
}

var lastText = ""

recognizer.recognitionTask(with: request) { result, error in
    if let result = result {
        let text = result.bestTranscription.formattedString
        if text != lastText {
            lastText = text
            let prefix = result.isFinal ? "FINAL" : "PARTIAL"
            print("\(prefix):\(text)")
        }
    }
    if let error = error {
        // Task finished errors are normal on kill
        let nsError = error as NSError
        if nsError.code != 216 { // kAFAssistantErrorDomain code for cancelled
            print("ERROR:\(error.localizedDescription)")
        }
    }
}

// Handle SIGTERM/SIGINT gracefully
signal(SIGTERM) { _ in
    exit(0)
}
signal(SIGINT) { _ in
    exit(0)
}

RunLoop.current.run()
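The commit ships the 60KB recognizer binary precompiled and does not include a build step; a plain swiftc invocation along these lines should reproduce it (assumed command, not part of the diff):

    swiftc -O speech-recognizer.swift -o speech-recognizer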