diff --git a/CHANGELOG.md b/CHANGELOG.md index 792de38b8..40425c809 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,12 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). ## [Unreleased] +### Added +- Linux voice mode: Groq Whisper API backend for fast, accurate speech-to-text (Ctrl+Alt+V toggle) +- Auto-reads `GROQ_API_KEY` from project `.env` file +- Fallback `--backend=local` for offline faster-whisper on CPU +- Venv-aware Python detection (`~/.gsd/voice-venv/bin/python3`) + ## [2.10.9] - 2026-03-14 ### Added diff --git a/README.md b/README.md index e5cb93223..0040ca909 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ On first run, GSD launches a branded setup wizard that walks you through LLM pro | `/gsd migrate` | Migrate a v1 `.planning` directory to `.gsd` format | | `/gsd doctor` | Validate `.gsd/` integrity, find and fix issues | | `/worktree` (`/wt`) | Git worktree lifecycle — create, switch, merge, remove | -| `/voice` | Toggle real-time speech-to-text (macOS only) | +| `/voice` | Toggle real-time speech-to-text (macOS, Linux) | | `/exit` | Graceful shutdown — saves session state before exiting | | `/kill` | Kill GSD process immediately | | `/clear` | Start a new session (alias for `/new`) | @@ -348,7 +348,7 @@ GSD ships with 14 extensions, all loaded automatically: | **Subagent** | Delegated tasks with isolated context windows | | **Mac Tools** | macOS native app automation via Accessibility APIs | | **MCPorter** | Lazy on-demand MCP server integration | -| **Voice** | Real-time speech-to-text transcription (macOS) | +| **Voice** | Real-time speech-to-text transcription (macOS, Linux — Ubuntu 22.04+) | | **Slash Commands** | Custom command creation | | **LSP** | Language Server Protocol integration — diagnostics, go-to-definition, references, hover, symbols, rename, code actions | | **Ask User Questions** | Structured user input with single/multi-select | diff --git a/src/onboarding.ts b/src/onboarding.ts index 
665746f98..b7fcb0349 100644 --- a/src/onboarding.ts +++ b/src/onboarding.ts @@ -49,6 +49,12 @@ const TOOL_KEYS: ToolKeyConfig[] = [ label: 'Jina AI', hint: 'clean web page extraction', }, + { + provider: 'groq', + envVar: 'GROQ_API_KEY', + label: 'Groq', + hint: 'voice transcription — free at console.groq.com', + }, ] /** Known LLM provider IDs that, if authed, mean the user doesn't need onboarding */ @@ -764,6 +770,7 @@ export function loadStoredEnvKeys(authStorage: AuthStorage): void { ['jina', 'JINA_API_KEY'], ['slack_bot', 'SLACK_BOT_TOKEN'], ['discord_bot', 'DISCORD_BOT_TOKEN'], + ['groq', 'GROQ_API_KEY'], ] for (const [provider, envVar] of providers) { if (!process.env[envVar]) { diff --git a/src/resources/extensions/voice/.gitignore b/src/resources/extensions/voice/.gitignore index 2c61a071c..8f8f4cca7 100644 --- a/src/resources/extensions/voice/.gitignore +++ b/src/resources/extensions/voice/.gitignore @@ -1 +1,4 @@ speech-recognizer +__pycache__/ +*.pyc +.venv/ diff --git a/src/resources/extensions/voice/index.ts b/src/resources/extensions/voice/index.ts index 4f997ffb9..555350e5c 100644 --- a/src/resources/extensions/voice/index.ts +++ b/src/resources/extensions/voice/index.ts @@ -9,6 +9,23 @@ import * as readline from "node:readline"; const SWIFT_SRC = path.join(__dirname, "speech-recognizer.swift"); const RECOGNIZER_BIN = path.join(__dirname, "speech-recognizer"); +const PYTHON_SCRIPT = path.join(__dirname, "speech-recognizer.py"); + +const IS_DARWIN = process.platform === "darwin"; +const IS_LINUX = process.platform === "linux"; +const VOICE_VENV_PYTHON = path.join( + process.env.HOME || process.env.USERPROFILE || "/tmp", + ".gsd", + "voice-venv", + "bin", + "python3", +); + +/** Return the python3 binary path — prefer venv if it exists, else system. 
*/ +function linuxPython(): string { + if (fs.existsSync(VOICE_VENV_PYTHON)) return VOICE_VENV_PYTHON; + return "python3"; +} function ensureBinary(): boolean { if (fs.existsSync(RECOGNIZER_BIN)) return true; @@ -22,8 +39,49 @@ function ensureBinary(): boolean { } } +let linuxReady = false; + +function ensureLinuxReady(ctx: ExtensionContext): boolean { + if (linuxReady) return true; + + // Check python3 exists + try { + execSync("which python3", { stdio: "pipe" }); + } catch { + ctx.ui.notify("Voice: python3 not found — install with: sudo apt install python3", "error"); + return false; + } + + // Check that sounddevice is importable + const py = linuxPython(); + try { + execSync(`${py} -c "import sounddevice"`, { + stdio: "pipe", + timeout: 10000, + }); + } catch (err: unknown) { + const stderr = (err as { stderr?: Buffer })?.stderr?.toString() ?? ""; + if (stderr.includes("sounddevice") || stderr.includes("PortAudio") || stderr.includes("portaudio")) { + ctx.ui.notify("Voice: install libportaudio2 with: sudo apt install libportaudio2", "error"); + } else if (stderr.includes("No module") || stderr.includes("ModuleNotFoundError")) { + // Deps missing — the Python script handles auto-install on first run, + // so we let it through. The script's own ensure_deps() will pip install. 
+ ctx.ui.notify("Voice: installing dependencies on first run — this may take a moment", "info"); + linuxReady = true; + return true; + } else { + ctx.ui.notify(`Voice: dependency check failed — ${stderr.split("\n")[0] || "unknown error"}`, "error"); + return false; + } + return false; + } + + linuxReady = true; + return true; +} + export default function (pi: ExtensionAPI) { - if (process.platform !== "darwin") return; + if (!IS_DARWIN && !IS_LINUX) return; let active = false; let recognizerProcess: ChildProcess | null = null; @@ -116,9 +174,15 @@ export default function (pi: ExtensionAPI) { return; } - if (!ensureBinary()) { - ctx.ui.notify("Voice: failed to compile speech recognizer (need Xcode CLI tools)", "error"); - return; + if (IS_DARWIN) { + if (!ensureBinary()) { + ctx.ui.notify("Voice: failed to compile speech recognizer (need Xcode CLI tools)", "error"); + return; + } + } else if (IS_LINUX) { + if (!ensureLinuxReady(ctx)) { + return; + } + } active = true; @@ -146,7 +210,26 @@ export default function (pi: ExtensionAPI) { onError: (msg: string) => void, onReady: () => void, ) { - recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] }); + if (IS_LINUX) { + // Pass GROQ_API_KEY to the Python process — check process.env, then .env file + const spawnEnv = { ...process.env }; + if (!spawnEnv.GROQ_API_KEY) { + try { + const envPath = path.join(process.cwd(), ".env"); + const envContent = fs.readFileSync(envPath, "utf-8"); + const match = envContent.match(/^GROQ_API_KEY=(.+)$/m); + // Strip optional surrounding quotes — .env files commonly use KEY="value" + if (match) spawnEnv.GROQ_API_KEY = match[1].trim().replace(/^["']|["']$/g, ""); + } catch { + // .env not found — Python script will emit ERROR if key needed + } + } + recognizerProcess = spawn(linuxPython(), [PYTHON_SCRIPT], { + stdio: ["pipe", "pipe", "pipe"], + env: spawnEnv, + }); + } else { + recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] }); + } const rl = readline.createInterface({ input: recognizerProcess.stdout!
}); rl.on("line", (line: string) => { if (line === "READY") { onReady(); return; } diff --git a/src/resources/extensions/voice/speech-recognizer.py b/src/resources/extensions/voice/speech-recognizer.py new file mode 100644 index 000000000..27e5bb92b --- /dev/null +++ b/src/resources/extensions/voice/speech-recognizer.py @@ -0,0 +1,504 @@ +#!/usr/bin/env python3 +""" +speech-recognizer.py — STT recognizer for Linux. + +Emits line protocol on stdout (unbuffered): + READY — model loaded, mic active + PARTIAL:<text> — partial transcription update (during speech) + FINAL:<text> — finalized transcription (after pause/endpoint) + ERROR:<message> — fatal error (human-readable) + +Backend: Groq Whisper API (default) or local faster-whisper. + --backend=groq → Groq API (fast, accurate, requires GROQ_API_KEY) + --backend=local → Local faster-whisper (offline, slower on CPU) + +Requires: sounddevice (pip install sounddevice) +System dep: libportaudio2 (sudo apt install libportaudio2) + +Designed to be spawned by index.ts startRecognizer() and communicate +exclusively via the stdout line protocol above. +""" + +import io +import os +import signal +import subprocess +import struct +import sys +import time +import queue +import threading +import wave + +# --------------------------------------------------------------------------- +# Helpers +# --------------------------------------------------------------------------- + +def emit(tag, msg=""): + """Emit a single protocol line, flushed immediately.""" + if msg: + print(f"{tag}:{msg}", flush=True) + else: + print(tag, flush=True) + + +def _try_pip_install(*packages): + """Attempt pip install. 
Returns (success, error_detail).""" + try: + result = subprocess.run( + [sys.executable, "-m", "pip", "install", *packages, "--quiet"], + capture_output=True, + timeout=300, + ) + if result.returncode == 0: + return True, "" + stderr = result.stderr.decode("utf-8", errors="replace").strip() + return False, stderr + except FileNotFoundError: + return False, "pip not found" + except subprocess.TimeoutExpired: + return False, "install timed out after 300s" + except Exception as exc: + return False, str(exc) + + +def ensure_deps(): + """Import sounddevice, auto-installing if missing. + + Returns True on success. On failure, emits ERROR: and returns False. + Never raises — all failures go through the line protocol. + """ + try: + __import__("sounddevice") + __import__("requests") + return True + except ImportError: + pass + + # Attempt install + ok, detail = _try_pip_install("sounddevice", "requests") + if not ok: + if "externally-managed" in detail.lower(): + emit( + "ERROR", + "Python environment is externally managed (PEP 668). " + "Create a venv first: python3 -m venv ~/.gsd/voice-venv && " + "~/.gsd/voice-venv/bin/pip install sounddevice requests", + ) + elif "pip not found" in detail: + emit("ERROR", "pip is not available. 
Install: sudo apt install python3-pip") + else: + emit("ERROR", f"Failed to install sounddevice: {detail}") + return False + + # Verify import after install + try: + __import__("sounddevice") + __import__("requests") + return True + except ImportError as exc: + emit("ERROR", f"Packages installed but cannot import: {exc}") + return False + + +def audio_to_wav_bytes(audio_data, sample_rate=16000): + """Convert float32 numpy array to WAV bytes for API upload.""" + import numpy as np + # Clip to [-1, 1] before scaling — overdriven samples would otherwise wrap around in int16 + int16_data = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16) + buf = io.BytesIO() + with wave.open(buf, "wb") as wf: + wf.setnchannels(1) + wf.setsampwidth(2) + wf.setframerate(sample_rate) + wf.writeframes(int16_data.tobytes()) + return buf.getvalue() + + +# --------------------------------------------------------------------------- +# Audio capture (shared by all backends) +# --------------------------------------------------------------------------- + +SAMPLE_RATE = 16000 +BLOCK_DURATION = 0.5 # seconds per audio block +BLOCK_SIZE = int(SAMPLE_RATE * BLOCK_DURATION) +SILENCE_THRESHOLD = 0.01 # RMS threshold for silence detection +SILENCE_DURATION = 0.8 # seconds of silence before finalizing +MIN_SPEECH_DURATION = 0.3 # minimum speech duration to trigger transcription + + +def open_mic(): + """Open mic stream and return (stream, audio_queue).""" + import sounddevice as sd + + audio_queue = queue.Queue() + + def audio_callback(indata, frames, time_info, status): + audio_queue.put(indata[:, 0].copy()) + + try: + stream = sd.InputStream( + samplerate=SAMPLE_RATE, + channels=1, + dtype="float32", + blocksize=BLOCK_SIZE, + callback=audio_callback, + ) + stream.start() + return stream, audio_queue + except Exception as exc: + msg = str(exc).lower() + if "portaudio" in msg or "no module" in msg: + emit("ERROR", "Audio system not available. 
Install: sudo apt install libportaudio2") + else: + emit("ERROR", f"Failed to initialize microphone: {exc}") + sys.exit(1) + + +# --------------------------------------------------------------------------- +# Groq backend +# --------------------------------------------------------------------------- + +def run_groq(): + """Groq Whisper API backend — fast cloud transcription.""" + import numpy as np + import requests + + api_key = os.environ.get("GROQ_API_KEY", "") + if not api_key: + emit("ERROR", "GROQ_API_KEY not set. Run 'gsd config' to set up, or get a free key at https://console.groq.com") + sys.exit(1) + + groq_model = os.environ.get("GSD_GROQ_MODEL", "whisper-large-v3-turbo") + api_url = "https://api.groq.com/openai/v1/audio/transcriptions" + + # --- Signal handling --- + shutdown_requested = False + + def _handle_signal(signum, frame): + nonlocal shutdown_requested + shutdown_requested = True + + signal.signal(signal.SIGTERM, _handle_signal) + signal.signal(signal.SIGINT, _handle_signal) + + def transcribe_audio(audio_data): + """Send audio to Groq API, return transcription text.""" + wav_bytes = audio_to_wav_bytes(audio_data, SAMPLE_RATE) + + try: + resp = requests.post( + api_url, + headers={"Authorization": f"Bearer {api_key}"}, + files={"file": ("audio.wav", wav_bytes, "audio/wav")}, + data={ + "model": groq_model, + "language": "en", + "response_format": "json", + "temperature": "0.0", + }, + timeout=10, + ) + if resp.ok: + return resp.json().get("text", "").strip() + else: + emit("ERROR", f"Groq API error ({resp.status_code}): {resp.text[:200]}") + return "" + except requests.exceptions.Timeout: + emit("ERROR", "Groq API timeout") + return "" + except Exception as e: + emit("ERROR", f"Groq API connection error: {e}") + return "" + + # --- Open mic --- + stream, audio_queue = open_mic() + emit("READY") + + # --- State --- + completed_lines = [] + speech_buffer = [] + silence_counter = 0.0 + in_speech = False + + # Background transcription for partials 
+ partial_lock = threading.Lock() + latest_partial = [None] + partial_thread = None + last_partial_time = 0.0 + + def _full_text(current=""): + parts = list(completed_lines) + if current: + parts.append(current) + return " ".join(parts) + + def _transcribe_partial(audio_data): + try: + text = transcribe_audio(audio_data) + if text: + with partial_lock: + latest_partial[0] = text + except Exception: + pass + + try: + while not shutdown_requested: + try: + block = audio_queue.get(timeout=0.2) + except queue.Empty: + with partial_lock: + if latest_partial[0] is not None: + emit("PARTIAL", _full_text(latest_partial[0])) + latest_partial[0] = None + continue + + rms = float(np.sqrt(np.mean(block ** 2))) + is_speech = rms > SILENCE_THRESHOLD + + if is_speech: + speech_buffer.append(block) + silence_counter = 0.0 + + if not in_speech: + in_speech = True + + # Emit completed partial results + with partial_lock: + if latest_partial[0] is not None: + emit("PARTIAL", _full_text(latest_partial[0])) + latest_partial[0] = None + + # Launch partial every ~2s, non-blocking + now = time.monotonic() + speech_duration = len(speech_buffer) * BLOCK_DURATION + can_partial = ( + speech_duration >= 1.5 + and now - last_partial_time >= 2.0 + and (partial_thread is None or not partial_thread.is_alive()) + ) + if can_partial: + audio_data = np.concatenate(speech_buffer).copy() + partial_thread = threading.Thread( + target=_transcribe_partial, + args=(audio_data,), + daemon=True, + ) + partial_thread.start() + last_partial_time = now + else: + if in_speech: + silence_counter += BLOCK_DURATION + + if silence_counter >= SILENCE_DURATION: + speech_duration = len(speech_buffer) * BLOCK_DURATION + if speech_duration >= MIN_SPEECH_DURATION: + # Wait for any in-flight partial + if partial_thread is not None and partial_thread.is_alive(): + partial_thread.join(timeout=5.0) + + audio_data = np.concatenate(speech_buffer) + text = transcribe_audio(audio_data) + if text: + completed_lines.append(text) + 
emit("FINAL", _full_text()) + + speech_buffer.clear() + silence_counter = 0.0 + in_speech = False + except Exception as exc: + emit("ERROR", f"Runtime error: {exc}") + sys.exit(1) + finally: + try: + stream.stop() + stream.close() + except Exception: + pass + + +# --------------------------------------------------------------------------- +# Local Whisper backend +# --------------------------------------------------------------------------- + +def run_local(): + """Local faster-whisper backend (offline, CPU).""" + import numpy as np + from faster_whisper import WhisperModel + + # --- Signal handling --- + shutdown_requested = False + + def _handle_signal(signum, frame): + nonlocal shutdown_requested + shutdown_requested = True + + signal.signal(signal.SIGTERM, _handle_signal) + signal.signal(signal.SIGINT, _handle_signal) + + # --- Load model --- + model_size = os.environ.get("GSD_WHISPER_MODEL", "small") + cache_root = os.path.join( + os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")), + "gsd", "whisper", + ) + try: + model = WhisperModel( + model_size, + device="cpu", + compute_type="int8", + download_root=cache_root, + ) + except Exception as exc: + emit("ERROR", f"Failed to load Whisper model ({model_size}): {exc}") + sys.exit(1) + + # --- Open mic --- + stream, audio_queue = open_mic() + emit("READY") + + # --- State --- + completed_lines = [] + speech_buffer = [] + silence_counter = 0.0 + in_speech = False + + partial_lock = threading.Lock() + latest_partial = [None] + partial_thread = None + last_partial_time = 0.0 + + def _full_text(current=""): + parts = list(completed_lines) + if current: + parts.append(current) + return " ".join(parts) + + def _transcribe_partial(audio_data): + try: + segments, _ = model.transcribe( + audio_data, language="en", beam_size=1, + vad_filter=False, condition_on_previous_text=False, + ) + text = " ".join(s.text.strip() for s in segments).strip() + if text: + with partial_lock: + latest_partial[0] = text + except 
Exception: + pass + + try: + while not shutdown_requested: + try: + block = audio_queue.get(timeout=0.2) + except queue.Empty: + with partial_lock: + if latest_partial[0] is not None: + emit("PARTIAL", _full_text(latest_partial[0])) + latest_partial[0] = None + continue + + rms = float(np.sqrt(np.mean(block ** 2))) + is_speech = rms > SILENCE_THRESHOLD + + if is_speech: + speech_buffer.append(block) + silence_counter = 0.0 + + if not in_speech: + in_speech = True + + with partial_lock: + if latest_partial[0] is not None: + emit("PARTIAL", _full_text(latest_partial[0])) + latest_partial[0] = None + + now = time.monotonic() + speech_duration = len(speech_buffer) * BLOCK_DURATION + can_partial = ( + speech_duration >= 1.5 + and now - last_partial_time >= 2.0 + and (partial_thread is None or not partial_thread.is_alive()) + ) + if can_partial: + audio_data = np.concatenate(speech_buffer).copy() + partial_thread = threading.Thread( + target=_transcribe_partial, + args=(audio_data,), + daemon=True, + ) + partial_thread.start() + last_partial_time = now + else: + if in_speech: + silence_counter += BLOCK_DURATION + + if silence_counter >= SILENCE_DURATION: + speech_duration = len(speech_buffer) * BLOCK_DURATION + if speech_duration >= MIN_SPEECH_DURATION: + if partial_thread is not None and partial_thread.is_alive(): + partial_thread.join(timeout=5.0) + + audio_data = np.concatenate(speech_buffer) + try: + segments, _ = model.transcribe( + audio_data, language="en", beam_size=5, + vad_filter=True, + ) + text = " ".join(s.text.strip() for s in segments).strip() + if text: + completed_lines.append(text) + emit("FINAL", _full_text()) + except Exception as exc: + emit("ERROR", f"Transcription error: {exc}") + + speech_buffer.clear() + silence_counter = 0.0 + in_speech = False + except Exception as exc: + emit("ERROR", f"Runtime error: {exc}") + sys.exit(1) + finally: + try: + stream.stop() + stream.close() + except Exception: + pass + + +# 
--------------------------------------------------------------------------- +# Entry point +# --------------------------------------------------------------------------- + +def main(): + backend = "groq" + for arg in sys.argv[1:]: + if arg == "--backend=groq": + backend = "groq" + elif arg == "--backend=local": + backend = "local" + + if not ensure_deps(): + sys.exit(1) + + if backend == "local": + # Check for faster-whisper + try: + __import__("faster_whisper") + except ImportError: + ok, detail = _try_pip_install("faster-whisper") + if not ok: + if "externally-managed" in detail.lower(): + emit("ERROR", + "Python environment is externally managed (PEP 668). " + "Install in your venv: pip install faster-whisper") + else: + emit("ERROR", f"Failed to install faster-whisper: {detail}") + sys.exit(1) + run_local() + else: + run_groq() + + +if __name__ == "__main__": + main() diff --git a/src/wizard.ts b/src/wizard.ts index 786d21ef8..19ed3ed2e 100644 --- a/src/wizard.ts +++ b/src/wizard.ts @@ -16,6 +16,7 @@ export function loadStoredEnvKeys(authStorage: AuthStorage): void { ['tavily', 'TAVILY_API_KEY'], ['slack_bot', 'SLACK_BOT_TOKEN'], ['discord_bot', 'DISCORD_BOT_TOKEN'], + ['groq', 'GROQ_API_KEY'], ] for (const [provider, envVar] of providers) { if (!process.env[envVar]) {