- stt.py: WakeWordListener (openWakeWord) + Transcriber (Vosk) - brain.py: Async OpenRouter streaming client with command parsing - tts.py: Qwen3-TTS engine with voice selection & instruction control - actions.py: 10 local OS commands (open_app, set_timer, search, etc.) - main.py: Async orchestrator with Phase 5 parallel TTS streaming
196 lines
6.6 KiB
Python
196 lines
6.6 KiB
Python
"""
stt.py — Speech-To-Text Module (Vosk + PyAudio + openWakeWord)

Responsibilities:
    1. Continuously monitor the microphone for a wake word ("echo").
    2. Once triggered, capture and transcribe the full spoken command.
    3. Return the transcribed text to the orchestrator.

Environment Variables:
    VOSK_MODEL_PATH — path to a Vosk model directory (default: models/vosk-model-small-en-us)
    WAKE_WORD       — wake phrase to listen for (default: "echo")

Dependencies:
    pip install vosk pyaudio openwakeword
"""
|
|
|
|
import json
|
|
import logging
|
|
import queue
|
|
import threading
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import openwakeword
|
|
import pyaudio
|
|
from vosk import KaldiRecognizer, Model, SetLogLevel
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# --- Defaults used when the caller does not supply a value ---
DEFAULT_VOSK_MODEL = "models/vosk-model-small-en-us"
DEFAULT_WAKE_WORD = "echo"

# --- Microphone capture format (shared by wake-word and command capture) ---
FORMAT = pyaudio.paInt16  # 16-bit signed samples
CHANNELS = 1              # mono
RATE = 16000              # sample rate in Hz
CHUNK = 1024              # frames requested per buffer / per read

# --- Command-recording limits ---
RECORD_SECONDS = 10   # upper bound on a voice command's length after the wake word
SILENCE_LIMIT = 1.5   # seconds of continuous silence that end the recording
|
|
|
|
|
|
class WakeWordListener:
    """Background thread that listens for the wake word using openWakeWord.

    Audio is captured through a PyAudio callback stream, queued, and scored
    frame by frame by an openWakeWord model on a daemon worker thread. When a
    score reaches ``DETECTION_THRESHOLD`` the ``on_detected`` callback is
    invoked with the configured wake word.

    Args:
        wake_word: Name of the wake-word model to load; stored lower-cased.
        on_detected: Optional callable invoked as ``on_detected(wake_word)``
            from the listener thread each time a detection occurs.
    """

    # Minimum model score that counts as a detection.
    DETECTION_THRESHOLD = 0.5

    def __init__(self, wake_word: str = DEFAULT_WAKE_WORD, on_detected=None):
        self.wake_word = wake_word.lower()
        self.on_detected = on_detected  # callback(wake_word)
        self._running = False
        self._thread: threading.Thread | None = None
        self._audio_queue: queue.Queue = queue.Queue()

    # ---- audio callback fed to PyAudio stream ----
    def _audio_callback(self, in_data, frame_count, time_info, status):
        """Queue the captured buffer for the worker and keep the stream running."""
        self._audio_queue.put(in_data)
        return (in_data, pyaudio.paContinue)

    def start(self):
        """Start the listener thread; does nothing if it is already running."""
        if self._running:
            return
        self._running = True
        self._thread = threading.Thread(target=self._listen_loop, daemon=True)
        self._thread.start()
        logger.info("WakeWordListener started — listening for '%s'", self.wake_word)

    def stop(self):
        """Ask the listener thread to exit and wait up to 5 seconds for it."""
        self._running = False
        worker = self._thread
        # Joining the current thread raises RuntimeError, which would happen
        # if stop() were called from inside the on_detected callback.
        if worker is not None and worker is not threading.current_thread():
            worker.join(timeout=5)
        self._thread = None
        logger.info("WakeWordListener stopped")

    def _listen_loop(self):
        """Open PyAudio, feed frames to openWakeWord, fire callback on match."""
        # Local import: the file's import block does not bring in NumPy, and
        # openWakeWord's predict() takes a NumPy array rather than raw bytes.
        import numpy as np

        pa = None
        stream = None
        try:
            oww = openwakeword.Model(
                wakeword_models=[self.wake_word],
                inference_framework="onnx",
            )

            pa = pyaudio.PyAudio()
            stream = pa.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK,
                stream_callback=self._audio_callback,
            )
            stream.start_stream()

            while self._running:
                try:
                    frame = self._audio_queue.get(timeout=0.5)
                except queue.Empty:
                    # Wake up periodically so a stop() request is noticed.
                    continue

                # openWakeWord expects 16 kHz mono int16 — matches our format
                samples = np.frombuffer(frame, dtype=np.int16)
                prediction = oww.predict(samples)
                for model_name, score in prediction.items():
                    if score >= self.DETECTION_THRESHOLD:
                        logger.info("Wake word '%s' detected (score=%.2f)", model_name, score)
                        self._notify()
        except Exception:
            # Thread boundary: log the failure and clear the running flag so
            # a later start() is not blocked by a dead worker.
            logger.exception("WakeWordListener loop terminated unexpectedly")
            self._running = False
        finally:
            # Release only what was actually acquired.
            if stream is not None:
                stream.stop_stream()
                stream.close()
            if pa is not None:
                pa.terminate()

    def _notify(self):
        """Invoke the detection callback, keeping the listener alive if it fails."""
        if not self.on_detected:
            return
        try:
            self.on_detected(self.wake_word)
        except Exception:
            logger.exception("on_detected callback raised")
|
|
|
|
|
|
class Transcriber:
    """Captures microphone audio after wake word and transcribes via Vosk.

    Args:
        model_path: Path to a Vosk model directory.

    Raises:
        FileNotFoundError: If ``model_path`` does not exist.
    """

    # RMS amplitude above which a chunk is treated as speech.
    SPEECH_RMS_THRESHOLD = 300

    def __init__(self, model_path: str = DEFAULT_VOSK_MODEL):
        resolved = Path(model_path)
        if not resolved.exists():
            raise FileNotFoundError(
                f"Vosk model not found at {resolved.resolve()}. "
                "Download one from https://alphacephei.com/vosk/models"
            )
        SetLogLevel(-1)  # suppress Vosk internal noise
        self._model = Model(str(resolved))
        self._recognizer = KaldiRecognizer(self._model, RATE)

    def listen_and_transcribe(self) -> str | None:
        """
        Open the mic, record until silence or timeout, and return the
        best-effort transcription of the spoken command.

        Recording stops once speech has been heard and is followed by more
        than ``SILENCE_LIMIT`` seconds of quiet, or once more than
        ``RECORD_SECONDS`` worth of audio has been captured.

        Returns:
            Transcribed text (str) or None if nothing was understood.
        """
        audio = self._record()
        if audio is None:
            logger.info("No speech detected after wake word")
            return None

        # Feed all collected audio to Vosk for final transcription.
        # FinalResult() flushes whatever the decoder still holds, so it is
        # used regardless of what AcceptWaveform() returns.
        self._recognizer.Reset()
        self._recognizer.AcceptWaveform(audio)
        result = json.loads(self._recognizer.FinalResult())
        text = result.get("text", "").strip()

        logger.info("Transcription: '%s'", text)
        return text if text else None

    def _record(self) -> bytes | None:
        """Capture one command from the microphone; None if no speech was heard."""
        max_bytes = RATE * RECORD_SECONDS * 2  # int16 mono: 2 bytes per sample
        chunks: list[bytes] = []
        total_bytes = 0
        silence_start: float | None = None
        started_speaking = False

        pa = pyaudio.PyAudio()
        try:
            stream = pa.open(
                format=FORMAT,
                channels=CHANNELS,
                rate=RATE,
                input=True,
                frames_per_buffer=CHUNK,
            )
            try:
                stream.start_stream()
                logger.info("Recording voice command...")

                while True:
                    data = stream.read(CHUNK, exception_on_overflow=False)
                    chunks.append(data)
                    total_bytes += len(data)

                    # Quick RMS check for silence detection
                    if self._rms(data) > self.SPEECH_RMS_THRESHOLD:
                        started_speaking = True
                        silence_start = None
                    elif started_speaking and silence_start is None:
                        # Monotonic clock: unaffected by wall-clock changes.
                        silence_start = time.monotonic()

                    # Stop on silence timeout or max duration
                    if (
                        silence_start is not None
                        and (time.monotonic() - silence_start) > SILENCE_LIMIT
                    ):
                        logger.debug("Silence detected — ending recording")
                        break
                    if total_bytes > max_bytes:
                        logger.debug("Max recording duration reached")
                        break
            finally:
                stream.stop_stream()
                stream.close()
        finally:
            # Runs even when pa.open() itself fails.
            pa.terminate()

        if not started_speaking:
            return None
        # Join once instead of growing a bytes object on every read.
        return b"".join(chunks)

    @staticmethod
    def _rms(data: bytes) -> float:
        """Compute Root Mean Square of a byte buffer of int16 samples.

        A trailing partial sample (odd byte count) is ignored; an empty
        buffer yields 0.0.
        """
        import array

        samples = array.array("h")
        usable = len(data) - (len(data) % samples.itemsize)
        samples.frombytes(data[:usable])
        if not samples:
            return 0.0
        sum_sq = sum(s * s for s in samples)
        return (sum_sq / len(samples)) ** 0.5
|