feat(voice): Groq Whisper API backend for Linux voice mode (#366)
This commit is contained in:
parent
f24f63f290
commit
6a39f7226b
7 changed files with 611 additions and 7 deletions
|
|
@ -6,6 +6,12 @@ Format based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
|
|||
|
||||
## [Unreleased]
|
||||
|
||||
### Added
|
||||
- Linux voice mode: Groq Whisper API backend for fast, accurate speech-to-text (Ctrl+Alt+V toggle)
|
||||
- Auto-reads `GROQ_API_KEY` from project `.env` file
|
||||
- Fallback `--backend=local` for offline faster-whisper on CPU
|
||||
- Venv-aware Python detection (`~/.gsd/voice-venv/bin/python3`)
|
||||
|
||||
## [2.10.9] - 2026-03-14
|
||||
|
||||
### Added
|
||||
|
|
|
|||
|
|
@ -220,7 +220,7 @@ On first run, GSD launches a branded setup wizard that walks you through LLM pro
|
|||
| `/gsd migrate` | Migrate a v1 `.planning` directory to `.gsd` format |
|
||||
| `/gsd doctor` | Validate `.gsd/` integrity, find and fix issues |
|
||||
| `/worktree` (`/wt`) | Git worktree lifecycle — create, switch, merge, remove |
|
||||
| `/voice` | Toggle real-time speech-to-text (macOS only) |
|
||||
| `/voice` | Toggle real-time speech-to-text (macOS, Linux) |
|
||||
| `/exit` | Graceful shutdown — saves session state before exiting |
|
||||
| `/kill` | Kill GSD process immediately |
|
||||
| `/clear` | Start a new session (alias for `/new`) |
|
||||
|
|
@ -348,7 +348,7 @@ GSD ships with 14 extensions, all loaded automatically:
|
|||
| **Subagent** | Delegated tasks with isolated context windows |
|
||||
| **Mac Tools** | macOS native app automation via Accessibility APIs |
|
||||
| **MCPorter** | Lazy on-demand MCP server integration |
|
||||
| **Voice** | Real-time speech-to-text transcription (macOS) |
|
||||
| **Voice** | Real-time speech-to-text transcription (macOS, Linux — Ubuntu 22.04+) |
|
||||
| **Slash Commands** | Custom command creation |
|
||||
| **LSP** | Language Server Protocol integration — diagnostics, go-to-definition, references, hover, symbols, rename, code actions |
|
||||
| **Ask User Questions** | Structured user input with single/multi-select |
|
||||
|
|
|
|||
|
|
@ -49,6 +49,12 @@ const TOOL_KEYS: ToolKeyConfig[] = [
|
|||
label: 'Jina AI',
|
||||
hint: 'clean web page extraction',
|
||||
},
|
||||
{
|
||||
provider: 'groq',
|
||||
envVar: 'GROQ_API_KEY',
|
||||
label: 'Groq',
|
||||
hint: 'voice transcription — free at console.groq.com',
|
||||
},
|
||||
]
|
||||
|
||||
/** Known LLM provider IDs that, if authed, mean the user doesn't need onboarding */
|
||||
|
|
@ -764,6 +770,7 @@ export function loadStoredEnvKeys(authStorage: AuthStorage): void {
|
|||
['jina', 'JINA_API_KEY'],
|
||||
['slack_bot', 'SLACK_BOT_TOKEN'],
|
||||
['discord_bot', 'DISCORD_BOT_TOKEN'],
|
||||
['groq', 'GROQ_API_KEY'],
|
||||
]
|
||||
for (const [provider, envVar] of providers) {
|
||||
if (!process.env[envVar]) {
|
||||
|
|
|
|||
3
src/resources/extensions/voice/.gitignore
vendored
3
src/resources/extensions/voice/.gitignore
vendored
|
|
@ -1 +1,4 @@
|
|||
speech-recognizer
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.venv/
|
||||
|
|
|
|||
|
|
@ -9,6 +9,23 @@ import * as readline from "node:readline";
|
|||
|
||||
const SWIFT_SRC = path.join(__dirname, "speech-recognizer.swift");
|
||||
const RECOGNIZER_BIN = path.join(__dirname, "speech-recognizer");
|
||||
const PYTHON_SCRIPT = path.join(__dirname, "speech-recognizer.py");
|
||||
|
||||
// Platform flags: the extension supports macOS (Swift recognizer) and
// Linux (Python recognizer); everything else is a no-op.
const IS_DARWIN = process.platform === "darwin";
const IS_LINUX = process.platform === "linux";
// Preferred interpreter on Linux: a dedicated venv at ~/.gsd/voice-venv
// (avoids PEP 668 "externally managed" failures when pip-installing deps).
// HOME fallback chain ends at /tmp so path.join never receives undefined.
const VOICE_VENV_PYTHON = path.join(
  process.env.HOME || process.env.USERPROFILE || "/tmp",
  ".gsd",
  "voice-venv",
  "bin",
  "python3",
);
|
||||
|
||||
/** Return the python3 binary path — prefer venv if it exists, else system. */
|
||||
function linuxPython(): string {
|
||||
if (fs.existsSync(VOICE_VENV_PYTHON)) return VOICE_VENV_PYTHON;
|
||||
return "python3";
|
||||
}
|
||||
|
||||
function ensureBinary(): boolean {
|
||||
if (fs.existsSync(RECOGNIZER_BIN)) return true;
|
||||
|
|
@ -22,8 +39,49 @@ function ensureBinary(): boolean {
|
|||
}
|
||||
}
|
||||
|
||||
let linuxReady = false;
|
||||
|
||||
function ensureLinuxReady(ctx: ExtensionContext): boolean {
|
||||
if (linuxReady) return true;
|
||||
|
||||
// Check python3 exists
|
||||
try {
|
||||
execSync("which python3", { stdio: "pipe" });
|
||||
} catch {
|
||||
ctx.ui.notify("Voice: python3 not found — install with: sudo apt install python3", "error");
|
||||
return false;
|
||||
}
|
||||
|
||||
// Check that sounddevice is importable
|
||||
const py = linuxPython();
|
||||
try {
|
||||
execSync(`${py} -c "import sounddevice"`, {
|
||||
stdio: "pipe",
|
||||
timeout: 10000,
|
||||
});
|
||||
} catch (err: unknown) {
|
||||
const stderr = (err as { stderr?: Buffer })?.stderr?.toString() ?? "";
|
||||
if (stderr.includes("sounddevice") || stderr.includes("PortAudio") || stderr.includes("portaudio")) {
|
||||
ctx.ui.notify("Voice: install libportaudio2 with: sudo apt install libportaudio2", "error");
|
||||
} else if (stderr.includes("No module") || stderr.includes("ModuleNotFoundError")) {
|
||||
// Deps missing — the Python script handles auto-install on first run,
|
||||
// so we let it through. The script's own ensure_deps() will pip install.
|
||||
ctx.ui.notify("Voice: installing dependencies on first run — this may take a moment", "info");
|
||||
linuxReady = true;
|
||||
return true;
|
||||
} else {
|
||||
ctx.ui.notify(`Voice: dependency check failed — ${stderr.split("\n")[0] || "unknown error"}`, "error");
|
||||
return false;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
|
||||
linuxReady = true;
|
||||
return true;
|
||||
}
|
||||
|
||||
export default function (pi: ExtensionAPI) {
|
||||
if (process.platform !== "darwin") return;
|
||||
if (!IS_DARWIN && !IS_LINUX) return;
|
||||
|
||||
let active = false;
|
||||
let recognizerProcess: ChildProcess | null = null;
|
||||
|
|
@ -116,9 +174,15 @@ export default function (pi: ExtensionAPI) {
|
|||
return;
|
||||
}
|
||||
|
||||
if (!ensureBinary()) {
|
||||
ctx.ui.notify("Voice: failed to compile speech recognizer (need Xcode CLI tools)", "error");
|
||||
return;
|
||||
if (IS_DARWIN) {
|
||||
if (!ensureBinary()) {
|
||||
ctx.ui.notify("Voice: failed to compile speech recognizer (need Xcode CLI tools)", "error");
|
||||
return;
|
||||
}
|
||||
} else if (IS_LINUX) {
|
||||
if (!ensureLinuxReady(ctx)) {
|
||||
return;
|
||||
}
|
||||
}
|
||||
|
||||
active = true;
|
||||
|
|
@ -146,7 +210,26 @@ export default function (pi: ExtensionAPI) {
|
|||
onError: (msg: string) => void,
|
||||
onReady: () => void,
|
||||
) {
|
||||
recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
|
||||
if (IS_LINUX) {
|
||||
// Pass GROQ_API_KEY to the Python process — check process.env, then .env file
|
||||
const spawnEnv = { ...process.env };
|
||||
if (!spawnEnv.GROQ_API_KEY) {
|
||||
try {
|
||||
const envPath = path.join(process.cwd(), ".env");
|
||||
const envContent = fs.readFileSync(envPath, "utf-8");
|
||||
const match = envContent.match(/^GROQ_API_KEY=(.+)$/m);
|
||||
if (match) spawnEnv.GROQ_API_KEY = match[1].trim();
|
||||
} catch {
|
||||
// .env not found — Python script will emit ERROR if key needed
|
||||
}
|
||||
}
|
||||
recognizerProcess = spawn(linuxPython(), [PYTHON_SCRIPT], {
|
||||
stdio: ["pipe", "pipe", "pipe"],
|
||||
env: spawnEnv,
|
||||
});
|
||||
} else {
|
||||
recognizerProcess = spawn(RECOGNIZER_BIN, [], { stdio: ["pipe", "pipe", "pipe"] });
|
||||
}
|
||||
const rl = readline.createInterface({ input: recognizerProcess.stdout! });
|
||||
rl.on("line", (line: string) => {
|
||||
if (line === "READY") { onReady(); return; }
|
||||
|
|
|
|||
504
src/resources/extensions/voice/speech-recognizer.py
Normal file
504
src/resources/extensions/voice/speech-recognizer.py
Normal file
|
|
@ -0,0 +1,504 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
speech-recognizer.py — STT recognizer for Linux.
|
||||
|
||||
Emits line protocol on stdout (unbuffered):
|
||||
READY — model loaded, mic active
|
||||
PARTIAL:<text> — partial transcription update (during speech)
|
||||
FINAL:<text> — finalized transcription (after pause/endpoint)
|
||||
ERROR:<msg> — fatal error (human-readable)
|
||||
|
||||
Backend: Groq Whisper API (default) or local faster-whisper.
|
||||
--backend=groq → Groq API (fast, accurate, requires GROQ_API_KEY)
|
||||
--backend=local → Local faster-whisper (offline, slower on CPU)
|
||||
|
||||
Requires: sounddevice (pip install sounddevice)
|
||||
System dep: libportaudio2 (sudo apt install libportaudio2)
|
||||
|
||||
Designed to be spawned by index.ts startRecognizer() and communicate
|
||||
exclusively via the stdout line protocol above.
|
||||
"""
|
||||
|
||||
import io
|
||||
import os
|
||||
import signal
|
||||
import subprocess
|
||||
import struct
|
||||
import sys
|
||||
import time
|
||||
import queue
|
||||
import threading
|
||||
import wave
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def emit(tag, msg=""):
    """Write one protocol line to stdout, flushed so the parent sees it immediately.

    With a message the line is "TAG:msg"; without one it is just "TAG".
    """
    line = f"{tag}:{msg}" if msg else tag
    print(line, flush=True)
|
||||
|
||||
|
||||
def _try_pip_install(*packages):
    """Run `python -m pip install` for the given packages.

    Returns a (success, error_detail) tuple; error_detail is "" on
    success. Never raises — every failure mode is mapped to a
    human-readable string for the ERROR protocol line.
    """
    cmd = [sys.executable, "-m", "pip", "install", *packages, "--quiet"]
    try:
        proc = subprocess.run(cmd, capture_output=True, timeout=300)
        if proc.returncode != 0:
            return False, proc.stderr.decode("utf-8", errors="replace").strip()
        return True, ""
    except FileNotFoundError:
        return False, "pip not found"
    except subprocess.TimeoutExpired:
        return False, "install timed out after 300s"
    except Exception as exc:
        return False, str(exc)
|
||||
|
||||
|
||||
def ensure_deps():
    """Import sounddevice and requests, auto-installing them if missing.

    Returns True on success. On failure, emits ERROR: and returns False.
    Never raises — all failures go through the line protocol.
    """
    # Fast path: both runtime deps already importable.
    try:
        __import__("sounddevice")
        __import__("requests")
        return True
    except ImportError:
        pass

    # Attempt install (both packages in a single pip invocation).
    ok, detail = _try_pip_install("sounddevice", "requests")
    if not ok:
        if "externally-managed" in detail.lower():
            # PEP 668: distro Python refuses system-wide pip installs;
            # point the user at the dedicated voice venv instead.
            emit(
                "ERROR",
                "Python environment is externally managed (PEP 668). "
                "Create a venv first: python3 -m venv ~/.gsd/voice-venv && "
                "~/.gsd/voice-venv/bin/pip install sounddevice requests",
            )
        elif "pip not found" in detail:
            emit("ERROR", "pip is not available. Install: sudo apt install python3-pip")
        else:
            emit("ERROR", f"Failed to install sounddevice: {detail}")
        return False

    # Verify import after install — pip can succeed while the import still
    # fails (e.g. sounddevice needs the native libportaudio2 library).
    try:
        __import__("sounddevice")
        __import__("requests")
        return True
    except ImportError as exc:
        emit("ERROR", f"Packages installed but cannot import: {exc}")
        return False
|
||||
|
||||
|
||||
def audio_to_wav_bytes(audio_data, sample_rate=16000):
    """Encode a mono float array as 16-bit PCM WAV bytes for API upload.

    Args:
        audio_data: 1-D numpy float array of samples, nominally in [-1, 1].
        sample_rate: sample rate (Hz) written to the WAV header.

    Returns:
        bytes of a complete in-memory WAV file.
    """
    import numpy as np
    # Clip before scaling: samples outside [-1, 1] would otherwise wrap
    # around when cast to int16 (e.g. 2.0 * 32767 -> negative), producing
    # loud clicks in the uploaded audio.
    int16_data = (np.clip(audio_data, -1.0, 1.0) * 32767).astype(np.int16)
    buf = io.BytesIO()
    with wave.open(buf, "wb") as wf:
        wf.setnchannels(1)            # mono
        wf.setsampwidth(2)            # 16-bit samples
        wf.setframerate(sample_rate)
        wf.writeframes(int16_data.tobytes())
    return buf.getvalue()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Audio capture (shared by all backends)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Capture/VAD tuning shared by both backends.
SAMPLE_RATE = 16000            # Hz — Whisper's native input rate
BLOCK_DURATION = 0.5           # seconds per audio block
BLOCK_SIZE = int(SAMPLE_RATE * BLOCK_DURATION)  # samples per block
SILENCE_THRESHOLD = 0.01       # RMS threshold for silence detection
SILENCE_DURATION = 0.8         # seconds of silence before finalizing
MIN_SPEECH_DURATION = 0.3      # minimum speech duration to trigger transcription
|
||||
|
||||
|
||||
def open_mic():
    """Open a 16 kHz mono mic stream and return (stream, audio_queue).

    On failure, emits ERROR: and exits the process — callers may assume a
    live stream on return.
    """
    import sounddevice as sd

    # Captured blocks are handed from the PortAudio callback thread to the
    # main loop through this queue.
    audio_queue = queue.Queue()

    def audio_callback(indata, frames, time_info, status):
        # indata is a (frames, channels) float32 buffer; keep channel 0 and
        # copy it, since PortAudio reuses the buffer after the callback.
        audio_queue.put(indata[:, 0].copy())

    try:
        stream = sd.InputStream(
            samplerate=SAMPLE_RATE,
            channels=1,
            dtype="float32",
            blocksize=BLOCK_SIZE,
            callback=audio_callback,
        )
        stream.start()
        return stream, audio_queue
    except Exception as exc:
        # Map the two common failure modes to actionable messages.
        msg = str(exc).lower()
        if "portaudio" in msg or "no module" in msg:
            emit("ERROR", "Audio system not available. Install: sudo apt install libportaudio2")
        else:
            emit("ERROR", f"Failed to initialize microphone: {exc}")
        sys.exit(1)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Groq backend
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_groq():
    """Groq Whisper API backend — fast cloud transcription.

    Main loop: pull 0.5 s audio blocks off the mic queue, buffer while RMS
    indicates speech, launch background partial transcriptions every ~2 s,
    and on 0.8 s of silence transcribe the whole utterance and emit FINAL.
    Runs until SIGTERM/SIGINT flips shutdown_requested.
    """
    import numpy as np
    import requests

    api_key = os.environ.get("GROQ_API_KEY", "")
    if not api_key:
        emit("ERROR", "GROQ_API_KEY not set. Run 'gsd config' to set up, or get a free key at https://console.groq.com")
        sys.exit(1)

    # Model is overridable via env; default is Groq's turbo Whisper variant.
    groq_model = os.environ.get("GSD_GROQ_MODEL", "whisper-large-v3-turbo")
    api_url = "https://api.groq.com/openai/v1/audio/transcriptions"

    # --- Signal handling: flip a flag so the loop exits and the mic is
    # closed via the finally block instead of dying mid-iteration. ---
    shutdown_requested = False

    def _handle_signal(signum, frame):
        nonlocal shutdown_requested
        shutdown_requested = True

    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)

    def transcribe_audio(audio_data):
        """Send audio to Groq API, return transcription text ("" on error).

        NOTE(review): API failures emit ERROR: but return "" so the loop
        keeps running — confirm the TS side treats ERROR as non-fatal here.
        """
        wav_bytes = audio_to_wav_bytes(audio_data, SAMPLE_RATE)

        try:
            resp = requests.post(
                api_url,
                headers={"Authorization": f"Bearer {api_key}"},
                files={"file": ("audio.wav", wav_bytes, "audio/wav")},
                data={
                    "model": groq_model,
                    "language": "en",
                    "response_format": "json",
                    "temperature": "0.0",
                },
                timeout=10,
            )
            if resp.ok:
                return resp.json().get("text", "").strip()
            else:
                emit("ERROR", f"Groq API error ({resp.status_code}): {resp.text[:200]}")
                return ""
        except requests.exceptions.Timeout:
            emit("ERROR", "Groq API timeout")
            return ""
        except Exception as e:
            emit("ERROR", f"Groq API connection error: {e}")
            return ""

    # --- Open mic (exits the process itself on failure) ---
    stream, audio_queue = open_mic()
    emit("READY")

    # --- Utterance state ---
    completed_lines = []    # finalized utterances, joined for FINAL output
    speech_buffer = []      # audio blocks of the current utterance
    silence_counter = 0.0   # seconds of consecutive silence seen
    in_speech = False       # currently inside an utterance?

    # Background transcription for partials: worker writes the newest
    # partial text into latest_partial[0] under partial_lock; the main
    # loop drains and emits it.
    partial_lock = threading.Lock()
    latest_partial = [None]
    partial_thread = None
    last_partial_time = 0.0

    def _full_text(current=""):
        # Full transcript so far, optionally with an in-progress tail.
        parts = list(completed_lines)
        if current:
            parts.append(current)
        return " ".join(parts)

    def _transcribe_partial(audio_data):
        # Runs on a daemon thread; failures are ignored — the final
        # transcription is the authoritative result.
        try:
            text = transcribe_audio(audio_data)
            if text:
                with partial_lock:
                    latest_partial[0] = text
        except Exception:
            pass

    try:
        while not shutdown_requested:
            try:
                block = audio_queue.get(timeout=0.2)
            except queue.Empty:
                # No new audio — still flush any partial the worker produced.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                continue

            # Simple energy-based VAD on the 0.5 s block.
            rms = float(np.sqrt(np.mean(block ** 2)))
            is_speech = rms > SILENCE_THRESHOLD

            if is_speech:
                speech_buffer.append(block)
                silence_counter = 0.0

                if not in_speech:
                    in_speech = True

                # Emit completed partial results
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None

                # Launch partial every ~2s, non-blocking; only one in-flight
                # partial at a time, and only once >=1.5 s has accumulated.
                now = time.monotonic()
                speech_duration = len(speech_buffer) * BLOCK_DURATION
                can_partial = (
                    speech_duration >= 1.5
                    and now - last_partial_time >= 2.0
                    and (partial_thread is None or not partial_thread.is_alive())
                )
                if can_partial:
                    # .copy() so the worker's view is stable while the main
                    # loop keeps appending/clearing speech_buffer.
                    audio_data = np.concatenate(speech_buffer).copy()
                    partial_thread = threading.Thread(
                        target=_transcribe_partial,
                        args=(audio_data,),
                        daemon=True,
                    )
                    partial_thread.start()
                    last_partial_time = now
            else:
                if in_speech:
                    silence_counter += BLOCK_DURATION

                    if silence_counter >= SILENCE_DURATION:
                        # Endpoint reached — finalize if the utterance was
                        # long enough to be worth transcribing.
                        speech_duration = len(speech_buffer) * BLOCK_DURATION
                        if speech_duration >= MIN_SPEECH_DURATION:
                            # Wait for any in-flight partial so it cannot
                            # publish stale text after the FINAL.
                            if partial_thread is not None and partial_thread.is_alive():
                                partial_thread.join(timeout=5.0)

                            audio_data = np.concatenate(speech_buffer)
                            text = transcribe_audio(audio_data)
                            if text:
                                completed_lines.append(text)
                                emit("FINAL", _full_text())

                        speech_buffer.clear()
                        silence_counter = 0.0
                        in_speech = False
    except Exception as exc:
        emit("ERROR", f"Runtime error: {exc}")
        sys.exit(1)
    finally:
        # Best-effort mic teardown on any exit path.
        try:
            stream.stop()
            stream.close()
        except Exception:
            pass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Local Whisper backend
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def run_local():
    """Local faster-whisper backend (offline, CPU).

    Same capture/VAD loop as run_groq, but transcribes in-process with
    faster-whisper instead of calling the Groq API. Partials use a fast
    config (beam_size=1, no VAD filter); finals use a higher-quality one
    (beam_size=5, vad_filter=True).
    """
    import numpy as np
    from faster_whisper import WhisperModel

    # --- Signal handling: flip a flag so the loop exits cleanly ---
    shutdown_requested = False

    def _handle_signal(signum, frame):
        nonlocal shutdown_requested
        shutdown_requested = True

    signal.signal(signal.SIGTERM, _handle_signal)
    signal.signal(signal.SIGINT, _handle_signal)

    # --- Load model (size overridable via env; weights cached under
    # $XDG_CACHE_HOME/gsd/whisper) ---
    model_size = os.environ.get("GSD_WHISPER_MODEL", "small")
    cache_root = os.path.join(
        os.environ.get("XDG_CACHE_HOME", os.path.expanduser("~/.cache")),
        "gsd", "whisper",
    )
    try:
        # int8 compute keeps CPU memory/latency tolerable.
        model = WhisperModel(
            model_size,
            device="cpu",
            compute_type="int8",
            download_root=cache_root,
        )
    except Exception as exc:
        emit("ERROR", f"Failed to load Whisper model ({model_size}): {exc}")
        sys.exit(1)

    # --- Open mic (exits the process itself on failure) ---
    stream, audio_queue = open_mic()
    emit("READY")

    # --- Utterance state (mirrors run_groq) ---
    completed_lines = []    # finalized utterances
    speech_buffer = []      # audio blocks of the current utterance
    silence_counter = 0.0   # seconds of consecutive silence
    in_speech = False

    # Background partial transcription state; worker publishes into
    # latest_partial[0] under partial_lock.
    partial_lock = threading.Lock()
    latest_partial = [None]
    partial_thread = None
    last_partial_time = 0.0

    def _full_text(current=""):
        # Full transcript so far, optionally with an in-progress tail.
        parts = list(completed_lines)
        if current:
            parts.append(current)
        return " ".join(parts)

    def _transcribe_partial(audio_data):
        # Daemon-thread worker: fast, low-quality pass for live feedback.
        try:
            segments, _ = model.transcribe(
                audio_data, language="en", beam_size=1,
                vad_filter=False, condition_on_previous_text=False,
            )
            text = " ".join(s.text.strip() for s in segments).strip()
            if text:
                with partial_lock:
                    latest_partial[0] = text
        except Exception:
            pass

    try:
        while not shutdown_requested:
            try:
                block = audio_queue.get(timeout=0.2)
            except queue.Empty:
                # No new audio — still flush any pending partial result.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None
                continue

            # Energy-based VAD on the 0.5 s block.
            rms = float(np.sqrt(np.mean(block ** 2)))
            is_speech = rms > SILENCE_THRESHOLD

            if is_speech:
                speech_buffer.append(block)
                silence_counter = 0.0

                if not in_speech:
                    in_speech = True

                # Flush any partial produced by the worker thread.
                with partial_lock:
                    if latest_partial[0] is not None:
                        emit("PARTIAL", _full_text(latest_partial[0]))
                        latest_partial[0] = None

                # Launch at most one in-flight partial every ~2 s once
                # >=1.5 s of speech has accumulated.
                now = time.monotonic()
                speech_duration = len(speech_buffer) * BLOCK_DURATION
                can_partial = (
                    speech_duration >= 1.5
                    and now - last_partial_time >= 2.0
                    and (partial_thread is None or not partial_thread.is_alive())
                )
                if can_partial:
                    # .copy() so the worker sees a stable snapshot while
                    # the main loop mutates speech_buffer.
                    audio_data = np.concatenate(speech_buffer).copy()
                    partial_thread = threading.Thread(
                        target=_transcribe_partial,
                        args=(audio_data,),
                        daemon=True,
                    )
                    partial_thread.start()
                    last_partial_time = now
            else:
                if in_speech:
                    silence_counter += BLOCK_DURATION

                    if silence_counter >= SILENCE_DURATION:
                        # Endpoint reached — finalize if long enough.
                        speech_duration = len(speech_buffer) * BLOCK_DURATION
                        if speech_duration >= MIN_SPEECH_DURATION:
                            # Let any in-flight partial finish so it cannot
                            # publish stale text after the FINAL.
                            if partial_thread is not None and partial_thread.is_alive():
                                partial_thread.join(timeout=5.0)

                            audio_data = np.concatenate(speech_buffer)
                            try:
                                # Higher-quality pass for the final result.
                                segments, _ = model.transcribe(
                                    audio_data, language="en", beam_size=5,
                                    vad_filter=True,
                                )
                                text = " ".join(s.text.strip() for s in segments).strip()
                                if text:
                                    completed_lines.append(text)
                                    emit("FINAL", _full_text())
                            except Exception as exc:
                                emit("ERROR", f"Transcription error: {exc}")

                        speech_buffer.clear()
                        silence_counter = 0.0
                        in_speech = False
    except Exception as exc:
        emit("ERROR", f"Runtime error: {exc}")
        sys.exit(1)
    finally:
        # Best-effort mic teardown on any exit path.
        try:
            stream.stop()
            stream.close()
        except Exception:
            pass
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def main():
    """Parse the --backend flag, bootstrap dependencies, and dispatch.

    Defaults to the Groq backend; --backend=local selects offline
    faster-whisper (installed on demand).
    """
    backend = "groq"
    for arg in sys.argv[1:]:
        if arg == "--backend=local":
            backend = "local"
        elif arg == "--backend=groq":
            backend = "groq"

    # Shared deps (sounddevice, requests) must be present for any backend.
    if not ensure_deps():
        sys.exit(1)

    if backend != "local":
        run_groq()
        return

    # Local backend additionally needs faster-whisper; install on demand.
    try:
        __import__("faster_whisper")
    except ImportError:
        ok, detail = _try_pip_install("faster-whisper")
        if not ok:
            if "externally-managed" in detail.lower():
                emit("ERROR",
                     "Python environment is externally managed (PEP 668). "
                     "Install in your venv: pip install faster-whisper")
            else:
                emit("ERROR", f"Failed to install faster-whisper: {detail}")
            sys.exit(1)
    run_local()
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -16,6 +16,7 @@ export function loadStoredEnvKeys(authStorage: AuthStorage): void {
|
|||
['tavily', 'TAVILY_API_KEY'],
|
||||
['slack_bot', 'SLACK_BOT_TOKEN'],
|
||||
['discord_bot', 'DISCORD_BOT_TOKEN'],
|
||||
['groq', 'GROQ_API_KEY'],
|
||||
]
|
||||
for (const [provider, envVar] of providers) {
|
||||
if (!process.env[envVar]) {
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue