feat(voice): Groq Whisper API backend for Linux voice mode (#366)

This commit is contained in:
jpmarques19 2026-03-14 15:08:36 +00:00 committed by GitHub
parent f24f63f290
commit 6a39f7226b
7 changed files with 611 additions and 7 deletions

View file

@ -6,6 +6,12 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Added
- Linux voice mode: Groq Whisper API backend for fast, accurate speech-to-text (Ctrl+Alt+V toggle)
- Auto-reads `GROQ_API_KEY` from project `.env` file
- Fallback `--backend=local` for offline faster-whisper on CPU
- Venv-aware Python detection (`~/.gsd/voice-venv/bin/python3`)
## [2.10.9] - 2026-03-14
### Added

View file

@ -220,7 +220,7 @@ On first run, GSD launches a branded setup wizard that walks you through LLM pro
| `/gsd migrate` | Migrate a v1 `.planning` directory to `.gsd` format |
| `/gsd doctor` | Validate `.gsd/` integrity, find and fix issues |
| `/worktree` (`/wt`) | Git worktree lifecycle — create, switch, merge, remove |
| `/voice` | Toggle real-time speech-to-text (macOS only) |
| `/voice` | Toggle real-time speech-to-text (macOS, Linux) |
| `/exit` | Graceful shutdown — saves session state before exiting |
| `/kill` | Kill GSD process immediately |
| `/clear` | Start a new session (alias for `/new`) |
@ -348,7 +348,7 @@ GSD ships with 14 extensions, all loaded automatically:
| **Subagent** | Delegated tasks with isolated context windows |
| **Mac Tools** | macOS native app automation via Accessibility APIs |
| **MCPorter** | Lazy on-demand MCP server integration |
| **Voice** | Real-time speech-to-text transcription (macOS) |
| **Voice** | Real-time speech-to-text transcription (macOS, Linux — Ubuntu 22.04+) |
| **Slash Commands** | Custom command creation |
| **LSP** | Language Server Protocol integration — diagnostics, go-to-definition, references, hover, symbols, rename, code actions |
| **Ask User Questions** | Structured user input with single/multi-select |

View file

@ -49,6 +49,12 @@ const TOOL_KEYS: ToolKeyConfig[] = [
label: 'Jina AI',
hint: 'clean web page extraction',
},
{
provider: 'groq',
envVar: 'GROQ_API_KEY',
label: 'Groq',
hint: 'voice transcription — free at console.groq.com',
},
]
/** Known LLM provider IDs that, if authed, mean the user doesn't need onboarding */
@ -764,6 +770,7 @@ export function loadStoredEnvKeys(authStorage: AuthStorage): void {
['jina', 'JINA_API_KEY'],
['slack_bot', 'SLACK_BOT_TOKEN'],
['discord_bot', 'DISCORD_BOT_TOKEN'],
['groq', 'GROQ_API_KEY'],
]
for (const [provider, envVar] of providers) {
if (!process.env[envVar]) {

View file

@ -1 +1,4 @@
speech-recognizer
__pycache__/
*.pyc
.venv/

View file

@ -9,6 +9,23 @@ import * as readline from "node:readline";
const SWIFT_SRC = path.join(__dirname, "speech-recognizer.swift");
const RECOGNIZER_BIN = path.join(__dirname, "speech-recognizer");
const PYTHON_SCRIPT = path.join(__dirname, "speech-recognizer.py");
// Platform flags — voice mode supports macOS (Swift recognizer) and Linux (Python).
const IS_DARWIN = process.platform === "darwin";
const IS_LINUX = process.platform === "linux";
// Interpreter inside the dedicated voice venv (~/.gsd/voice-venv/bin/python3).
// Home directory resolution falls back HOME → USERPROFILE → /tmp.
const VOICE_VENV_PYTHON = path.join(
  process.env.HOME || process.env.USERPROFILE || "/tmp",
  ".gsd",
  "voice-venv",
  "bin",
  "python3",
);
/** Resolve the python3 to spawn on Linux: the dedicated venv interpreter wins over PATH. */
function linuxPython(): string {
  return fs.existsSync(VOICE_VENV_PYTHON) ? VOICE_VENV_PYTHON : "python3";
}
function ensureBinary(): boolean {
if (fs.existsSync(RECOGNIZER_BIN)) return true;
@ -22,8 +39,49 @@ function ensureBinary(): boolean {
}
}
let linuxReady = false;

/**
 * One-time preflight for Linux voice mode.
 *
 * Verifies python3 is on PATH and that `sounddevice` is importable with the
 * interpreter linuxPython() resolves. Missing Python packages are allowed
 * through (the Python script auto-installs them on first run); a missing
 * system library produces an actionable error notification. A successful
 * check is cached in `linuxReady` so it runs at most once per session.
 */
function ensureLinuxReady(ctx: ExtensionContext): boolean {
  if (linuxReady) return true;

  // Check python3 exists
  try {
    execSync("which python3", { stdio: "pipe" });
  } catch {
    ctx.ui.notify("Voice: python3 not found — install with: sudo apt install python3", "error");
    return false;
  }

  // Check that sounddevice is importable
  const py = linuxPython();
  try {
    // BUGFIX: quote the interpreter path — the venv lives under $HOME, which
    // may contain spaces; unquoted it would be word-split by the shell.
    execSync(`"${py}" -c "import sounddevice"`, {
      stdio: "pipe",
      timeout: 10000,
    });
  } catch (err: unknown) {
    const stderr = (err as { stderr?: Buffer })?.stderr?.toString() ?? "";
    // BUGFIX: order matters. A missing module's traceback also contains the
    // substring "sounddevice", so the ModuleNotFoundError case must be tested
    // BEFORE the PortAudio one — the previous ordering misreported missing
    // Python deps as a PortAudio problem and blocked the auto-install path.
    if (stderr.includes("No module") || stderr.includes("ModuleNotFoundError")) {
      // Deps missing — the Python script handles auto-install on first run,
      // so we let it through. The script's own ensure_deps() will pip install.
      ctx.ui.notify("Voice: installing dependencies on first run — this may take a moment", "info");
      linuxReady = true;
      return true;
    }
    if (stderr.includes("PortAudio") || stderr.includes("portaudio")) {
      ctx.ui.notify("Voice: install libportaudio2 with: sudo apt install libportaudio2", "error");
      return false;
    }
    ctx.ui.notify(`Voice: dependency check failed — ${stderr.split("\n")[0] || "unknown error"}`, "error");
    return false;
  }

  linuxReady = true;
  return true;
}
export default function (pi: ExtensionAPI) {
if (process.platform !== "darwin") return;
if (!IS_DARWIN && !IS_LINUX) return;
let active = false;
let recognizerProcess: ChildProcess | null = null;
@ -116,9 +174,15 @@ export default function (pi: ExtensionAPI) {
return;
}
if (!ensureBinary()) {
ctx.ui.notify("Voice: failed to compile speech recognizer (need Xcode CLI tools)", "error");
return;
if (IS_DARWIN) {
if (!ensureBinary()) {
ctx.ui.notify("Voice: failed to compile speech recognizer (need Xcode CLI tools)", "error");
return;
}
} else if (IS_LINUX) {
if (!ensureLinuxReady(ctx)) {
return;
}
}
active = true;
@ -146,7 +210,26 @@ export default function (pi: ExtensionAPI) {
onError: (msg: string) => void,
onReady: () => void,
) {
recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
if (IS_LINUX) {
// Pass GROQ_API_KEY to the Python process — check process.env, then .env file
const spawnEnv = { ...process.env };
if (!spawnEnv.GROQ_API_KEY) {
try {
const envPath = path.join(process.cwd(), ".env");
const envContent = fs.readFileSync(envPath, "utf-8");
const match = envContent.match(/^GROQ_API_KEY=(.+)$/m);
if (match) spawnEnv.GROQ_API_KEY = match[1].trim();
} catch {
// .env not found — Python script will emit ERROR if key needed
}
}
recognizerProcess = spawn(linuxPython(), [PYTHON_SCRIPT], {
stdio: ["pipe", "pipe", "pipe"],
env: spawnEnv,
});
} else {
recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
}
const rl = readline.createInterface({ input: recognizerProcess.stdout! });
rl.on("line", (line: string) => {
if (line === "READY") { onReady(); return; }

View file

@ -0,0 +1,504 @@
#!/usr/bin/env python3
"""
speech-recognizer.py — STT recognizer for Linux.
Emits line protocol on stdout (unbuffered):
READY model loaded, mic active
PARTIAL:<text> partial transcription update (during speech)
FINAL:<text> finalized transcription (after pause/endpoint)
ERROR:<msg> fatal error (human-readable)
Backend: Groq Whisper API (default) or local faster-whisper.
--backend=groq Groq API (fast, accurate, requires GROQ_API_KEY)
--backend=local Local faster-whisper (offline, slower on CPU)
Requires: sounddevice, requests (pip install sounddevice requests)
System dep: libportaudio2 (sudo apt install libportaudio2)
Designed to be spawned by index.ts startRecognizer() and communicate
exclusively via the stdout line protocol above.
"""
import io
import os
import signal
import subprocess
import struct
import sys
import time
import queue
import threading
import wave
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def emit(tag, msg=""):
    """Write one line of the stdout protocol and flush it immediately.

    With a non-empty msg the line is "<tag>:<msg>"; otherwise just the tag.
    """
    line = f"{tag}:{msg}" if msg else tag
    print(line, flush=True)
def _try_pip_install(*packages):
    """Run `pip install` for the given packages with this interpreter.

    Returns a (success, error_detail) pair; error_detail is "" on success.
    Never raises — every failure mode is folded into the return value.
    """
    cmd = [sys.executable, "-m", "pip", "install", *packages, "--quiet"]
    try:
        proc = subprocess.run(cmd, capture_output=True, timeout=300)
    except FileNotFoundError:
        return False, "pip not found"
    except subprocess.TimeoutExpired:
        return False, "install timed out after 300s"
    except Exception as exc:
        return False, str(exc)
    if proc.returncode == 0:
        return True, ""
    return False, proc.stderr.decode("utf-8", errors="replace").strip()
def ensure_deps():
    """Make sure sounddevice and requests are importable, installing on demand.

    Returns True on success. On failure, emits an ERROR: protocol line and
    returns False. Never raises — all failures go through the line protocol.
    """
    try:
        __import__("sounddevice")
        __import__("requests")
        return True
    except ImportError:
        pass
    # Attempt a one-shot install of both runtime deps.
    ok, detail = _try_pip_install("sounddevice", "requests")
    if not ok:
        if "externally-managed" in detail.lower():
            emit(
                "ERROR",
                "Python environment is externally managed (PEP 668). "
                "Create a venv first: python3 -m venv ~/.gsd/voice-venv && "
                "~/.gsd/voice-venv/bin/pip install sounddevice requests",
            )
        elif "pip not found" in detail:
            emit("ERROR", "pip is not available. Install: sudo apt install python3-pip")
        else:
            # BUGFIX: the previous message named only sounddevice even though
            # both packages are installed together.
            emit("ERROR", f"Failed to install sounddevice/requests: {detail}")
        return False
    # Verify the imports actually resolve after the install.
    try:
        __import__("sounddevice")
        __import__("requests")
        return True
    except ImportError as exc:
        emit("ERROR", f"Packages installed but cannot import: {exc}")
        return False
def audio_to_wav_bytes(audio_data, sample_rate=16000):
    """Convert a mono float array in [-1, 1] to 16-bit PCM WAV bytes.

    audio_data: 1-D numpy float array of samples. Values outside [-1, 1]
        are clipped — previously they overflowed int16 and wrapped sign.
    sample_rate: sample rate (Hz) written into the WAV header.
    Returns the complete WAV file as bytes, ready for API upload.
    """
    import numpy as np
    # BUGFIX: clip before scaling — e.g. 1.5 * 32767 would wrap negative
    # when cast to int16, producing a loud click in the uploaded audio.
    clipped = np.clip(audio_data, -1.0, 1.0)
    int16_data = (clipped * 32767).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)  # mono
        wf.setsampwidth(2)  # 16-bit samples
        wf.setframerate(sample_rate)
        wf.writeframes(int16_data.tobytes())
    return buf.getvalue()
# ---------------------------------------------------------------------------
# Audio capture (shared by all backends)
# ---------------------------------------------------------------------------
SAMPLE_RATE = 16000  # capture sample rate in Hz (mono)
BLOCK_DURATION = 0.5  # seconds per audio block
BLOCK_SIZE = int(SAMPLE_RATE * BLOCK_DURATION)  # samples per block (8000)
SILENCE_THRESHOLD = 0.01  # RMS threshold for silence detection
SILENCE_DURATION = 0.8  # seconds of silence before finalizing
MIN_SPEECH_DURATION = 0.3  # minimum speech duration to trigger transcription
def open_mic():
    """Start a mono float32 input stream at SAMPLE_RATE.

    Returns (stream, audio_queue), where the queue receives one 1-D float32
    block per callback. On failure, emits ERROR: and exits the process.
    """
    import sounddevice as sd

    blocks = queue.Queue()

    def _on_block(indata, frames, time_info, status):
        # Copy channel 0 out of the callback buffer before it is reused.
        blocks.put(indata[:, 0].copy())

    try:
        stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype="float32",
            blocksize=BLOCK_SIZE,
            callback=_on_block,
        )
        stream.start()
    except Exception as exc:
        lowered = str(exc).lower()
        if "portaudio" in lowered or "no module" in lowered:
            emit("ERROR", "Audio system not available. Install: sudo apt install libportaudio2")
        else:
            emit("ERROR", f"Failed to initialize microphone: {exc}")
        sys.exit(1)
    return stream, blocks
# ---------------------------------------------------------------------------
# Groq backend
# ---------------------------------------------------------------------------
def run_groq():
    """Groq Whisper API backend — fast cloud transcription.

    Captures mic audio, segments it on RMS-based silence detection, and posts
    each finished segment to Groq's Whisper endpoint. PARTIAL lines are
    produced by a background thread while speech continues; a FINAL line is
    emitted once silence endpoints the segment. Loops until SIGTERM/SIGINT,
    exits 1 on fatal errors.
    """
    import numpy as np
    import requests

    api_key = os.environ.get("GROQ_API_KEY", "")
    if not api_key:
        emit("ERROR", "GROQ_API_KEY not set. Run 'gsd config' to set up, or get a free key at https://console.groq.com")
        sys.exit(1)
    # Model override hook; default is Groq's turbo Whisper variant.
    groq_model = os.environ.get("GSD_GROQ_MODEL", "whisper-large-v3-turbo")
    api_url = "https://api.groq.com/openai/v1/audio/transcriptions"

    # --- Signal handling: flag-based so the capture loop can exit cleanly ---
    shutdown_requested = False

    def _handle_signal(signum, frame):
        nonlocal shutdown_requested
        shutdown_requested = True

    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)

    def transcribe_audio(audio_data):
        """Send audio to Groq API, return transcription text ("" on error)."""
        wav_bytes = audio_to_wav_bytes(audio_data, SAMPLE_RATE)
        try:
            resp = requests.post(
                api_url,
                headers={"Authorization": f"Bearer {api_key}"},
                files={"file": ("audio.wav", wav_bytes, "audio/wav")},
                data={
                    "model": groq_model,
                    "language": "en",
                    "response_format": "json",
                    "temperature": "0.0",
                },
                timeout=10,
            )
            if resp.ok:
                return resp.json().get("text", "").strip()
            else:
                emit("ERROR", f"Groq API error ({resp.status_code}): {resp.text[:200]}")
                return ""
        except requests.exceptions.Timeout:
            emit("ERROR", "Groq API timeout")
            return ""
        except Exception as e:
            emit("ERROR", f"Groq API connection error: {e}")
            return ""

    # --- Open mic ---
    stream, audio_queue = open_mic()
    emit("READY")

    # --- State ---
    completed_lines = []   # finalized segment texts, joined for FINAL output
    speech_buffer = []     # audio blocks of the utterance in progress
    silence_counter = 0.0  # consecutive seconds of silence observed
    in_speech = False      # currently inside an utterance?

    # Background transcription for partials (single worker at a time)
    partial_lock = threading.Lock()
    latest_partial = [None]   # one-slot mailbox written by the worker thread
    partial_thread = None
    last_partial_time = 0.0

    def _full_text(current=""):
        # Completed segments plus the optional in-progress text.
        parts = list(completed_lines)
        if current:
            parts.append(current)
        return " ".join(parts)

    def _transcribe_partial(audio_data):
        # Runs on a daemon thread; failures are silent (partials are best-effort).
        try:
            text = transcribe_audio(audio_data)
            if text:
                with partial_lock:
                    latest_partial[0] = text
        except Exception:
            pass

    try:
        while not shutdown_requested:
            try:
                block = audio_queue.get(timeout=0.2)
            except queue.Empty:
                # No audio right now — still flush any partial produced meanwhile.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                continue
            rms = float(np.sqrt(np.mean(block ** 2)))
            is_speech = rms > SILENCE_THRESHOLD
            if is_speech:
                speech_buffer.append(block)
                silence_counter = 0.0
                if not in_speech:
                    in_speech = True
                # Emit completed partial results
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                # Launch partial every ~2s, non-blocking
                now = time.monotonic()
                speech_duration = len(speech_buffer) * BLOCK_DURATION
                can_partial = (
                    speech_duration >= 1.5
                    and now - last_partial_time >= 2.0
                    and (partial_thread is None or not partial_thread.is_alive())
                )
                if can_partial:
                    # Copy so the worker's view is stable while we keep appending.
                    audio_data = np.concatenate(speech_buffer).copy()
                    partial_thread = threading.Thread(
                        target=_transcribe_partial,
                        args=(audio_data,),
                        daemon=True,
                    )
                    partial_thread.start()
                    last_partial_time = now
            else:
                if in_speech:
                    silence_counter += BLOCK_DURATION
                    if silence_counter >= SILENCE_DURATION:
                        speech_duration = len(speech_buffer) * BLOCK_DURATION
                        if speech_duration >= MIN_SPEECH_DURATION:
                            # Wait for any in-flight partial before finalizing
                            if partial_thread is not None and partial_thread.is_alive():
                                partial_thread.join(timeout=5.0)
                            audio_data = np.concatenate(speech_buffer)
                            text = transcribe_audio(audio_data)
                            if text:
                                completed_lines.append(text)
                                emit("FINAL", _full_text())
                        # Reset segmentation state for the next utterance.
                        speech_buffer.clear()
                        silence_counter = 0.0
                        in_speech = False
    except Exception as exc:
        emit("ERROR", f"Runtime error: {exc}")
        sys.exit(1)
    finally:
        # Best-effort mic teardown on any exit path.
        try:
            stream.stop()
            stream.close()
        except Exception:
            pass
# ---------------------------------------------------------------------------
# Local Whisper backend
# ---------------------------------------------------------------------------
def run_local():
    """Local faster-whisper backend (offline, CPU).

    Same capture/segmentation loop as run_groq, but transcribes with a local
    faster-whisper model. Model size comes from GSD_WHISPER_MODEL (default
    "small"); weights cache under $XDG_CACHE_HOME/gsd/whisper (or
    ~/.cache/gsd/whisper). Loops until SIGTERM/SIGINT, exits 1 on fatal errors.
    """
    import numpy as np
    from faster_whisper import WhisperModel

    # --- Signal handling: flag-based so the capture loop can exit cleanly ---
    shutdown_requested = False

    def _handle_signal(signum, frame):
        nonlocal shutdown_requested
        shutdown_requested = True

    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)

    # --- Load model ---
    model_size = os.environ.get("GSD_WHISPER_MODEL", "small")
    cache_root = os.path.join(
        os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
        "gsd", "whisper",
    )
    try:
        model = WhisperModel(
            model_size,
            device="cpu",
            compute_type="int8",  # int8 quantization keeps CPU latency usable
            download_root=cache_root,
        )
    except Exception as exc:
        emit("ERROR", f"Failed to load Whisper model ({model_size}): {exc}")
        sys.exit(1)

    # --- Open mic ---
    stream, audio_queue = open_mic()
    emit("READY")

    # --- State ---
    completed_lines = []   # finalized segment texts, joined for FINAL output
    speech_buffer = []     # audio blocks of the utterance in progress
    silence_counter = 0.0  # consecutive seconds of silence observed
    in_speech = False      # currently inside an utterance?

    partial_lock = threading.Lock()
    latest_partial = [None]   # one-slot mailbox written by the worker thread
    partial_thread = None
    last_partial_time = 0.0

    def _full_text(current=""):
        # Completed segments plus the optional in-progress text.
        parts = list(completed_lines)
        if current:
            parts.append(current)
        return " ".join(parts)

    def _transcribe_partial(audio_data):
        # Runs on a daemon thread; greedy settings (beam_size=1, no VAD)
        # trade accuracy for latency. Failures are silent — best-effort.
        try:
            segments, _ = model.transcribe(
                audio_data, language="en", beam_size=1,
                vad_filter=False, condition_on_previous_text=False,
            )
            text = " ".join(s.text.strip() for s in segments).strip()
            if text:
                with partial_lock:
                    latest_partial[0] = text
        except Exception:
            pass

    try:
        while not shutdown_requested:
            try:
                block = audio_queue.get(timeout=0.2)
            except queue.Empty:
                # No audio right now — still flush any partial produced meanwhile.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                continue
            rms = float(np.sqrt(np.mean(block ** 2)))
            is_speech = rms > SILENCE_THRESHOLD
            if is_speech:
                speech_buffer.append(block)
                silence_counter = 0.0
                if not in_speech:
                    in_speech = True
                # Flush any partial produced by the worker thread.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                # Launch a partial every ~2s, at most one worker at a time.
                now = time.monotonic()
                speech_duration = len(speech_buffer) * BLOCK_DURATION
                can_partial = (
                    speech_duration >= 1.5
                    and now - last_partial_time >= 2.0
                    and (partial_thread is None or not partial_thread.is_alive())
                )
                if can_partial:
                    # Copy so the worker's view is stable while we keep appending.
                    audio_data = np.concatenate(speech_buffer).copy()
                    partial_thread = threading.Thread(
                        target=_transcribe_partial,
                        args=(audio_data,),
                        daemon=True,
                    )
                    partial_thread.start()
                    last_partial_time = now
            else:
                if in_speech:
                    silence_counter += BLOCK_DURATION
                    if silence_counter >= SILENCE_DURATION:
                        speech_duration = len(speech_buffer) * BLOCK_DURATION
                        if speech_duration >= MIN_SPEECH_DURATION:
                            # Wait for any in-flight partial before finalizing.
                            if partial_thread is not None and partial_thread.is_alive():
                                partial_thread.join(timeout=5.0)
                            audio_data = np.concatenate(speech_buffer)
                            try:
                                # Final pass: wider beam + VAD for accuracy.
                                segments, _ = model.transcribe(
                                    audio_data, language="en", beam_size=5,
                                    vad_filter=True,
                                )
                                text = " ".join(s.text.strip() for s in segments).strip()
                                if text:
                                    completed_lines.append(text)
                                    emit("FINAL", _full_text())
                            except Exception as exc:
                                emit("ERROR", f"Transcription error: {exc}")
                        # Reset segmentation state for the next utterance.
                        speech_buffer.clear()
                        silence_counter = 0.0
                        in_speech = False
    except Exception as exc:
        emit("ERROR", f"Runtime error: {exc}")
        sys.exit(1)
    finally:
        # Best-effort mic teardown on any exit path.
        try:
            stream.stop()
            stream.close()
        except Exception:
            pass
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main():
    """Parse the --backend flag, ensure deps, and dispatch to a backend loop."""
    backend = "groq"
    for arg in sys.argv[1:]:
        if arg == "--backend=local":
            backend = "local"
        elif arg == "--backend=groq":
            backend = "groq"
    if not ensure_deps():
        sys.exit(1)
    if backend != "local":
        run_groq()
        return
    # The local backend additionally needs faster-whisper.
    try:
        __import__("faster_whisper")
    except ImportError:
        ok, detail = _try_pip_install("faster-whisper")
        if not ok:
            if "externally-managed" in detail.lower():
                emit(
                    "ERROR",
                    "Python environment is externally managed (PEP 668). "
                    "Install in your venv: pip install faster-whisper",
                )
            else:
                emit("ERROR", f"Failed to install faster-whisper: {detail}")
            sys.exit(1)
    run_local()

View file

@ -16,6 +16,7 @@ export function loadStoredEnvKeys(authStorage: AuthStorage): void {
['tavily', 'TAVILY_API_KEY'],
['slack_bot', 'SLACK_BOT_TOKEN'],
['discord_bot', 'DISCORD_BOT_TOKEN'],
['groq', 'GROQ_API_KEY'],
]
for (const [provider, envVar] of providers) {
if (!process.env[envVar]) {