feat(voice): Groq Whisper API backend for Linux voice mode (#366)

This commit is contained in:
jpmarques19 2026-03-14 15:08:36 +00:00 committed by GitHub
parent f24f63f290
commit 6a39f7226b
7 changed files with 611 additions and 7 deletions

View file

@ -6,6 +6,12 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
## [Unreleased]
### Added
- Linux voice mode: Groq Whisper API backend for fast, accurate speech-to-text (Ctrl+Alt+V toggle)
- Auto-reads `GROQ_API_KEY` from project `.env` file
- Fallback `--backend=local` for offline faster-whisper on CPU
- Venv-aware Python detection (`~/.gsd/voice-venv/bin/python3`)
## [2.10.9] - 2026-03-14
### Added

View file

@ -220,7 +220,7 @@ On first run, GSD launches a branded setup wizard that walks you through LLM pro
| `/gsd migrate` | Migrate a v1 `.planning` directory to `.gsd` format |
| `/gsd doctor` | Validate `.gsd/` integrity, find and fix issues |
| `/worktree` (`/wt`) | Git worktree lifecycle — create, switch, merge, remove |
| `/voice` | Toggle real-time speech-to-text (macOS only) |
| `/voice` | Toggle real-time speech-to-text (macOS, Linux) |
| `/exit` | Graceful shutdown — saves session state before exiting |
| `/kill` | Kill GSD process immediately |
| `/clear` | Start a new session (alias for `/new`) |
@ -348,7 +348,7 @@ GSD ships with 14 extensions, all loaded automatically:
| **Subagent** | Delegated tasks with isolated context windows |
| **Mac Tools** | macOS native app automation via Accessibility APIs |
| **MCPorter** | Lazy on-demand MCP server integration |
| **Voice** | Real-time speech-to-text transcription (macOS) |
| **Voice** | Real-time speech-to-text transcription (macOS, Linux — Ubuntu 22.04+) |
| **Slash Commands** | Custom command creation |
| **LSP** | Language Server Protocol integration — diagnostics, go-to-definition, references, hover, symbols, rename, code actions |
| **Ask User Questions** | Structured user input with single/multi-select |

View file

@ -49,6 +49,12 @@ const TOOL_KEYS: ToolKeyConfig[] = [
label: 'Jina AI',
hint: 'clean web page extraction',
},
{
provider: 'groq',
envVar: 'GROQ_API_KEY',
label: 'Groq',
hint: 'voice transcription — free at console.groq.com',
},
]
/** Known LLM provider IDs that, if authed, mean the user doesn't need onboarding */
@ -764,6 +770,7 @@ export function loadStoredEnvKeys(authStorage: AuthStorage): void {
['jina', 'JINA_API_KEY'],
['slack_bot', 'SLACK_BOT_TOKEN'],
['discord_bot', 'DISCORD_BOT_TOKEN'],
['groq', 'GROQ_API_KEY'],
]
for (const [provider, envVar] of providers) {
if (!process.env[envVar]) {

View file

@ -1 +1,4 @@
speech-recognizer
__pycache__/
*.pyc
.venv/

View file

@ -9,6 +9,23 @@ import * as readline from "node:readline";
const SWIFT_SRC = path.join(__dirname, "speech-recognizer.swift");
const RECOGNIZER_BIN = path.join(__dirname, "speech-recognizer");
const PYTHON_SCRIPT = path.join(__dirname, "speech-recognizer.py");
// Platform flags — voice mode supports macOS (Swift recognizer) and Linux (Python).
const IS_DARWIN = process.platform === "darwin";
const IS_LINUX = process.platform === "linux";
// Interpreter inside the dedicated voice venv (~/.gsd/voice-venv/bin/python3).
// Home directory resolution falls back HOME → USERPROFILE → /tmp.
const VOICE_VENV_PYTHON = path.join(
  process.env.HOME || process.env.USERPROFILE || "/tmp",
  ".gsd",
  "voice-venv",
  "bin",
  "python3",
);
/** Resolve the python3 to spawn on Linux: the dedicated venv interpreter wins over PATH. */
function linuxPython(): string {
  return fs.existsSync(VOICE_VENV_PYTHON) ? VOICE_VENV_PYTHON : "python3";
}
function ensureBinary(): boolean {
if (fs.existsSync(RECOGNIZER_BIN)) return true;
@ -22,8 +39,49 @@ function ensureBinary(): boolean {
}
}
let linuxReady = false;

/**
 * One-time preflight for Linux voice mode.
 *
 * Verifies python3 is on PATH and that `sounddevice` is importable with the
 * interpreter linuxPython() resolves. Missing Python packages are allowed
 * through (the Python script auto-installs them on first run); a missing
 * system library produces an actionable error notification. A successful
 * check is cached in `linuxReady` so it runs at most once per session.
 */
function ensureLinuxReady(ctx: ExtensionContext): boolean {
  if (linuxReady) return true;

  // Check python3 exists
  try {
    execSync("which python3", { stdio: "pipe" });
  } catch {
    ctx.ui.notify("Voice: python3 not found — install with: sudo apt install python3", "error");
    return false;
  }

  // Check that sounddevice is importable
  const py = linuxPython();
  try {
    // BUGFIX: quote the interpreter path — the venv lives under $HOME, which
    // may contain spaces; unquoted it would be word-split by the shell.
    execSync(`"${py}" -c "import sounddevice"`, {
      stdio: "pipe",
      timeout: 10000,
    });
  } catch (err: unknown) {
    const stderr = (err as { stderr?: Buffer })?.stderr?.toString() ?? "";
    // BUGFIX: order matters. A missing module's traceback also contains the
    // substring "sounddevice", so the ModuleNotFoundError case must be tested
    // BEFORE the PortAudio one — the previous ordering misreported missing
    // Python deps as a PortAudio problem and blocked the auto-install path.
    if (stderr.includes("No module") || stderr.includes("ModuleNotFoundError")) {
      // Deps missing — the Python script handles auto-install on first run,
      // so we let it through. The script's own ensure_deps() will pip install.
      ctx.ui.notify("Voice: installing dependencies on first run — this may take a moment", "info");
      linuxReady = true;
      return true;
    }
    if (stderr.includes("PortAudio") || stderr.includes("portaudio")) {
      ctx.ui.notify("Voice: install libportaudio2 with: sudo apt install libportaudio2", "error");
      return false;
    }
    ctx.ui.notify(`Voice: dependency check failed — ${stderr.split("\n")[0] || "unknown error"}`, "error");
    return false;
  }

  linuxReady = true;
  return true;
}
export default function (pi: ExtensionAPI) {
if (process.platform !== "darwin") return;
if (!IS_DARWIN && !IS_LINUX) return;
let active = false;
let recognizerProcess: ChildProcess | null = null;
@ -116,9 +174,15 @@ export default function (pi: ExtensionAPI) {
return;
}
if (!ensureBinary()) {
ctx.ui.notify("Voice: failed to compile speech recognizer (need Xcode CLI tools)", "error");
return;
if (IS_DARWIN) {
if (!ensureBinary()) {
ctx.ui.notify("Voice: failed to compile speech recognizer (need Xcode CLI tools)", "error");
return;
}
} else if (IS_LINUX) {
if (!ensureLinuxReady(ctx)) {
return;
}
}
active = true;
@ -146,7 +210,26 @@ export default function (pi: ExtensionAPI) {
onError: (msg: string) => void,
onReady: () => void,
) {
recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
if (IS_LINUX) {
// Pass GROQ_API_KEY to the Python process — check process.env, then .env file
const spawnEnv = { ...process.env };
if (!spawnEnv.GROQ_API_KEY) {
try {
const envPath = path.join(process.cwd(), ".env");
const envContent = fs.readFileSync(envPath, "utf-8");
const match = envContent.match(/^GROQ_API_KEY=(.+)$/m);
if (match) spawnEnv.GROQ_API_KEY = match[1].trim();
} catch {
// .env not found — Python script will emit ERROR if key needed
}
}
recognizerProcess = spawn(linuxPython(), [PYTHON_SCRIPT], {
stdio: ["pipe", "pipe", "pipe"],
env: spawnEnv,
});
} else {
recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
}
const rl = readline.createInterface({ input: recognizerProcess.stdout! });
rl.on("line", (line: string) => {
if (line === "READY") { onReady(); return; }

View file

@ -0,0 +1,504 @@
#!/usr/bin/env python3
"""
speech-recognizer.py — STT recognizer for Linux.
Emits line protocol on stdout (unbuffered):
READY model loaded, mic active
PARTIAL:<text> partial transcription update (during speech)
FINAL:<text> finalized transcription (after pause/endpoint)
ERROR:<msg> fatal error (human-readable)
Backend: Groq Whisper API (default) or local faster-whisper.
--backend=groq Groq API (fast, accurate, requires GROQ_API_KEY)
--backend=local Local faster-whisper (offline, slower on CPU)
Requires: sounddevice, requests (pip install sounddevice requests)
System dep: libportaudio2 (sudo apt install libportaudio2)
Designed to be spawned by index.ts startRecognizer() and communicate
exclusively via the stdout line protocol above.
"""
import io
import os
import signal
import subprocess
import struct
import sys
import time
import queue
import threading
import wave
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def emit(tag, msg=""):
    """Write one line of the stdout protocol and flush it immediately.

    With a non-empty msg the line is "<tag>:<msg>"; otherwise just the tag.
    """
    line = f"{tag}:{msg}" if msg else tag
    print(line, flush=True)
def _try_pip_install(*packages):
    """Run `pip install` for the given packages with this interpreter.

    Returns a (success, error_detail) pair; error_detail is "" on success.
    Never raises — every failure mode is folded into the return value.
    """
    cmd = [sys.executable, "-m", "pip", "install", *packages, "--quiet"]
    try:
        proc = subprocess.run(cmd, capture_output=True, timeout=300)
    except FileNotFoundError:
        return False, "pip not found"
    except subprocess.TimeoutExpired:
        return False, "install timed out after 300s"
    except Exception as exc:
        return False, str(exc)
    if proc.returncode == 0:
        return True, ""
    return False, proc.stderr.decode("utf-8", errors="replace").strip()
def ensure_deps():
    """Make sure sounddevice and requests are importable, installing on demand.

    Returns True on success. On failure, emits an ERROR: protocol line and
    returns False. Never raises — all failures go through the line protocol.
    """
    try:
        __import__("sounddevice")
        __import__("requests")
        return True
    except ImportError:
        pass
    # Attempt a one-shot install of both runtime deps.
    ok, detail = _try_pip_install("sounddevice", "requests")
    if not ok:
        if "externally-managed" in detail.lower():
            emit(
                "ERROR",
                "Python environment is externally managed (PEP 668). "
                "Create a venv first: python3 -m venv ~/.gsd/voice-venv && "
                "~/.gsd/voice-venv/bin/pip install sounddevice requests",
            )
        elif "pip not found" in detail:
            emit("ERROR", "pip is not available. Install: sudo apt install python3-pip")
        else:
            # BUGFIX: the previous message named only sounddevice even though
            # both packages are installed together.
            emit("ERROR", f"Failed to install sounddevice/requests: {detail}")
        return False
    # Verify the imports actually resolve after the install.
    try:
        __import__("sounddevice")
        __import__("requests")
        return True
    except ImportError as exc:
        emit("ERROR", f"Packages installed but cannot import: {exc}")
        return False
def audio_to_wav_bytes(audio_data, sample_rate=16000):
    """Convert a mono float array in [-1, 1] to 16-bit PCM WAV bytes.

    audio_data: 1-D numpy float array of samples. Values outside [-1, 1]
        are clipped — previously they overflowed int16 and wrapped sign.
    sample_rate: sample rate (Hz) written into the WAV header.
    Returns the complete WAV file as bytes, ready for API upload.
    """
    import numpy as np
    # BUGFIX: clip before scaling — e.g. 1.5 * 32767 would wrap negative
    # when cast to int16, producing a loud click in the uploaded audio.
    clipped = np.clip(audio_data, -1.0, 1.0)
    int16_data = (clipped * 32767).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)  # mono
        wf.setsampwidth(2)  # 16-bit samples
        wf.setframerate(sample_rate)
        wf.writeframes(int16_data.tobytes())
    return buf.getvalue()
# ---------------------------------------------------------------------------
# Audio capture (shared by all backends)
# ---------------------------------------------------------------------------
SAMPLE_RATE = 16000  # capture sample rate in Hz (mono)
BLOCK_DURATION = 0.5  # seconds per audio block
BLOCK_SIZE = int(SAMPLE_RATE * BLOCK_DURATION)  # samples per block (8000)
SILENCE_THRESHOLD = 0.01  # RMS threshold for silence detection
SILENCE_DURATION = 0.8  # seconds of silence before finalizing
MIN_SPEECH_DURATION = 0.3  # minimum speech duration to trigger transcription
def open_mic():
    """Start a mono float32 input stream at SAMPLE_RATE.

    Returns (stream, audio_queue), where the queue receives one 1-D float32
    block per callback. On failure, emits ERROR: and exits the process.
    """
    import sounddevice as sd

    blocks = queue.Queue()

    def _on_block(indata, frames, time_info, status):
        # Copy channel 0 out of the callback buffer before it is reused.
        blocks.put(indata[:, 0].copy())

    try:
        stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype="float32",
            blocksize=BLOCK_SIZE,
            callback=_on_block,
        )
        stream.start()
    except Exception as exc:
        lowered = str(exc).lower()
        if "portaudio" in lowered or "no module" in lowered:
            emit("ERROR", "Audio system not available. Install: sudo apt install libportaudio2")
        else:
            emit("ERROR", f"Failed to initialize microphone: {exc}")
        sys.exit(1)
    return stream, blocks
# ---------------------------------------------------------------------------
# Groq backend
# ---------------------------------------------------------------------------
def run_groq():
    """Groq Whisper API backend — fast cloud transcription.

    Captures mic audio, segments it on RMS-based silence detection, and posts
    each finished segment to Groq's Whisper endpoint. PARTIAL lines are
    produced by a background thread while speech continues; a FINAL line is
    emitted once silence endpoints the segment. Loops until SIGTERM/SIGINT,
    exits 1 on fatal errors.
    """
    import numpy as np
    import requests

    api_key = os.environ.get("GROQ_API_KEY", "")
    if not api_key:
        emit("ERROR", "GROQ_API_KEY not set. Run 'gsd config' to set up, or get a free key at https://console.groq.com")
        sys.exit(1)
    # Model override hook; default is Groq's turbo Whisper variant.
    groq_model = os.environ.get("GSD_GROQ_MODEL", "whisper-large-v3-turbo")
    api_url = "https://api.groq.com/openai/v1/audio/transcriptions"

    # --- Signal handling: flag-based so the capture loop can exit cleanly ---
    shutdown_requested = False

    def _handle_signal(signum, frame):
        nonlocal shutdown_requested
        shutdown_requested = True

    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)

    def transcribe_audio(audio_data):
        """Send audio to Groq API, return transcription text ("" on error)."""
        wav_bytes = audio_to_wav_bytes(audio_data, SAMPLE_RATE)
        try:
            resp = requests.post(
                api_url,
                headers={"Authorization": f"Bearer {api_key}"},
                files={"file": ("audio.wav", wav_bytes, "audio/wav")},
                data={
                    "model": groq_model,
                    "language": "en",
                    "response_format": "json",
                    "temperature": "0.0",
                },
                timeout=10,
            )
            if resp.ok:
                return resp.json().get("text", "").strip()
            else:
                emit("ERROR", f"Groq API error ({resp.status_code}): {resp.text[:200]}")
                return ""
        except requests.exceptions.Timeout:
            emit("ERROR", "Groq API timeout")
            return ""
        except Exception as e:
            emit("ERROR", f"Groq API connection error: {e}")
            return ""

    # --- Open mic ---
    stream, audio_queue = open_mic()
    emit("READY")

    # --- State ---
    completed_lines = []   # finalized segment texts, joined for FINAL output
    speech_buffer = []     # audio blocks of the utterance in progress
    silence_counter = 0.0  # consecutive seconds of silence observed
    in_speech = False      # currently inside an utterance?

    # Background transcription for partials (single worker at a time)
    partial_lock = threading.Lock()
    latest_partial = [None]   # one-slot mailbox written by the worker thread
    partial_thread = None
    last_partial_time = 0.0

    def _full_text(current=""):
        # Completed segments plus the optional in-progress text.
        parts = list(completed_lines)
        if current:
            parts.append(current)
        return " ".join(parts)

    def _transcribe_partial(audio_data):
        # Runs on a daemon thread; failures are silent (partials are best-effort).
        try:
            text = transcribe_audio(audio_data)
            if text:
                with partial_lock:
                    latest_partial[0] = text
        except Exception:
            pass

    try:
        while not shutdown_requested:
            try:
                block = audio_queue.get(timeout=0.2)
            except queue.Empty:
                # No audio right now — still flush any partial produced meanwhile.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                continue
            rms = float(np.sqrt(np.mean(block ** 2)))
            is_speech = rms > SILENCE_THRESHOLD
            if is_speech:
                speech_buffer.append(block)
                silence_counter = 0.0
                if not in_speech:
                    in_speech = True
                # Emit completed partial results
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                # Launch partial every ~2s, non-blocking
                now = time.monotonic()
                speech_duration = len(speech_buffer) * BLOCK_DURATION
                can_partial = (
                    speech_duration >= 1.5
                    and now - last_partial_time >= 2.0
                    and (partial_thread is None or not partial_thread.is_alive())
                )
                if can_partial:
                    # Copy so the worker's view is stable while we keep appending.
                    audio_data = np.concatenate(speech_buffer).copy()
                    partial_thread = threading.Thread(
                        target=_transcribe_partial,
                        args=(audio_data,),
                        daemon=True,
                    )
                    partial_thread.start()
                    last_partial_time = now
            else:
                if in_speech:
                    silence_counter += BLOCK_DURATION
                    if silence_counter >= SILENCE_DURATION:
                        speech_duration = len(speech_buffer) * BLOCK_DURATION
                        if speech_duration >= MIN_SPEECH_DURATION:
                            # Wait for any in-flight partial before finalizing
                            if partial_thread is not None and partial_thread.is_alive():
                                partial_thread.join(timeout=5.0)
                            audio_data = np.concatenate(speech_buffer)
                            text = transcribe_audio(audio_data)
                            if text:
                                completed_lines.append(text)
                                emit("FINAL", _full_text())
                        # Reset segmentation state for the next utterance.
                        speech_buffer.clear()
                        silence_counter = 0.0
                        in_speech = False
    except Exception as exc:
        emit("ERROR", f"Runtime error: {exc}")
        sys.exit(1)
    finally:
        # Best-effort mic teardown on any exit path.
        try:
            stream.stop()
            stream.close()
        except Exception:
            pass
# ---------------------------------------------------------------------------
# Local Whisper backend
# ---------------------------------------------------------------------------
def run_local():
    """Local faster-whisper backend (offline, CPU).

    Same capture/segmentation loop as run_groq, but transcribes with a local
    faster-whisper model. Model size comes from GSD_WHISPER_MODEL (default
    "small"); weights cache under $XDG_CACHE_HOME/gsd/whisper (or
    ~/.cache/gsd/whisper). Loops until SIGTERM/SIGINT, exits 1 on fatal errors.
    """
    import numpy as np
    from faster_whisper import WhisperModel

    # --- Signal handling: flag-based so the capture loop can exit cleanly ---
    shutdown_requested = False

    def _handle_signal(signum, frame):
        nonlocal shutdown_requested
        shutdown_requested = True

    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)

    # --- Load model ---
    model_size = os.environ.get("GSD_WHISPER_MODEL", "small")
    cache_root = os.path.join(
        os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
        "gsd", "whisper",
    )
    try:
        model = WhisperModel(
            model_size,
            device="cpu",
            compute_type="int8",  # int8 quantization keeps CPU latency usable
            download_root=cache_root,
        )
    except Exception as exc:
        emit("ERROR", f"Failed to load Whisper model ({model_size}): {exc}")
        sys.exit(1)

    # --- Open mic ---
    stream, audio_queue = open_mic()
    emit("READY")

    # --- State ---
    completed_lines = []   # finalized segment texts, joined for FINAL output
    speech_buffer = []     # audio blocks of the utterance in progress
    silence_counter = 0.0  # consecutive seconds of silence observed
    in_speech = False      # currently inside an utterance?

    partial_lock = threading.Lock()
    latest_partial = [None]   # one-slot mailbox written by the worker thread
    partial_thread = None
    last_partial_time = 0.0

    def _full_text(current=""):
        # Completed segments plus the optional in-progress text.
        parts = list(completed_lines)
        if current:
            parts.append(current)
        return " ".join(parts)

    def _transcribe_partial(audio_data):
        # Runs on a daemon thread; greedy settings (beam_size=1, no VAD)
        # trade accuracy for latency. Failures are silent — best-effort.
        try:
            segments, _ = model.transcribe(
                audio_data, language="en", beam_size=1,
                vad_filter=False, condition_on_previous_text=False,
            )
            text = " ".join(s.text.strip() for s in segments).strip()
            if text:
                with partial_lock:
                    latest_partial[0] = text
        except Exception:
            pass

    try:
        while not shutdown_requested:
            try:
                block = audio_queue.get(timeout=0.2)
            except queue.Empty:
                # No audio right now — still flush any partial produced meanwhile.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                continue
            rms = float(np.sqrt(np.mean(block ** 2)))
            is_speech = rms > SILENCE_THRESHOLD
            if is_speech:
                speech_buffer.append(block)
                silence_counter = 0.0
                if not in_speech:
                    in_speech = True
                # Flush any partial produced by the worker thread.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                # Launch a partial every ~2s, at most one worker at a time.
                now = time.monotonic()
                speech_duration = len(speech_buffer) * BLOCK_DURATION
                can_partial = (
                    speech_duration >= 1.5
                    and now - last_partial_time >= 2.0
                    and (partial_thread is None or not partial_thread.is_alive())
                )
                if can_partial:
                    # Copy so the worker's view is stable while we keep appending.
                    audio_data = np.concatenate(speech_buffer).copy()
                    partial_thread = threading.Thread(
                        target=_transcribe_partial,
                        args=(audio_data,),
                        daemon=True,
                    )
                    partial_thread.start()
                    last_partial_time = now
            else:
                if in_speech:
                    silence_counter += BLOCK_DURATION
                    if silence_counter >= SILENCE_DURATION:
                        speech_duration = len(speech_buffer) * BLOCK_DURATION
                        if speech_duration >= MIN_SPEECH_DURATION:
                            # Wait for any in-flight partial before finalizing.
                            if partial_thread is not None and partial_thread.is_alive():
                                partial_thread.join(timeout=5.0)
                            audio_data = np.concatenate(speech_buffer)
                            try:
                                # Final pass: wider beam + VAD for accuracy.
                                segments, _ = model.transcribe(
                                    audio_data, language="en", beam_size=5,
                                    vad_filter=True,
                                )
                                text = " ".join(s.text.strip() for s in segments).strip()
                                if text:
                                    completed_lines.append(text)
                                    emit("FINAL", _full_text())
                            except Exception as exc:
                                emit("ERROR", f"Transcription error: {exc}")
                        # Reset segmentation state for the next utterance.
                        speech_buffer.clear()
                        silence_counter = 0.0
                        in_speech = False
    except Exception as exc:
        emit("ERROR", f"Runtime error: {exc}")
        sys.exit(1)
    finally:
        # Best-effort mic teardown on any exit path.
        try:
            stream.stop()
            stream.close()
        except Exception:
            pass
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main():
    """Parse the --backend flag, ensure deps, and dispatch to a backend loop."""
    backend = "groq"
    for arg in sys.argv[1:]:
        if arg == "--backend=local":
            backend = "local"
        elif arg == "--backend=groq":
            backend = "groq"
    if not ensure_deps():
        sys.exit(1)
    if backend != "local":
        run_groq()
        return
    # The local backend additionally needs faster-whisper.
    try:
        __import__("faster_whisper")
    except ImportError:
        ok, detail = _try_pip_install("faster-whisper")
        if not ok:
            if "externally-managed" in detail.lower():
                emit(
                    "ERROR",
                    "Python environment is externally managed (PEP 668). "
                    "Install in your venv: pip install faster-whisper",
                )
            else:
                emit("ERROR", f"Failed to install faster-whisper: {detail}")
            sys.exit(1)
    run_local()

View file

@ -16,6 +16,7 @@ export function loadStoredEnvKeys(authStorage: AuthStorage): void {
['tavily', 'TAVILY_API_KEY'],
['slack_bot', 'SLACK_BOT_TOKEN'],
['discord_bot', 'DISCORD_BOT_TOKEN'],
['groq', 'GROQ_API_KEY'],
]
for (const [provider, envVar] of providers) {
if (!process.env[envVar]) {