diff --git a/.env.example b/.env.example new file mode 100644 index 0000000..88c5505 --- /dev/null +++ b/.env.example @@ -0,0 +1,24 @@ +# =========================================================== +# Echo Voice Assistant — Environment Configuration +# =========================================================== +# Copy this file to .env and fill in your values: +# cp .env.example .env +# =========================================================== + +# --- OpenRouter (required) --- +# Get your key at: https://openrouter.ai/keys +OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxx +OPENROUTER_MODEL=qwen/qwen-3-235b-a22b + +# --- Vosk STT (optional overrides) --- +# Download models from: https://alphacephei.com/vosk/models +# Set to a local path relative to the project root +VOSK_MODEL_PATH=models/vosk-model-small-en-us +WAKE_WORD=echo + +# --- Qwen3-TTS (optional overrides) --- +# Available preset voices: Ryan, Serena, Diana, etc. +# Or set a path to a 3-second .wav sample for voice cloning +QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice +QWEN_TTS_VOICE=Ryan +QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational. diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..291cdf7 --- /dev/null +++ b/.gitignore @@ -0,0 +1,39 @@ +# Echo Voice Assistant — Git Ignore + +# Python +__pycache__/ +*.py[cod] +*.egg-info/ +dist/ +build/ +*.egg + +# Virtual environments +venv/ +.venv/ +env/ + +# Model weights (large files — download separately) +models/ +!models/.gitkeep + +# Generated audio +audio_output/ +!audio_output/.gitkeep + +# Environment & secrets +.env +.env.local + +# IDE +.vscode/ +.idea/ +*.swp +*.swo + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log diff --git a/actions.py b/actions.py new file mode 100644 index 0000000..758a82c --- /dev/null +++ b/actions.py @@ -0,0 +1,275 @@ +""" +actions.py — Local OS Command Execution + +Responsibilities: + 1. Provide a registry of local actions the assistant can perform. + 2. Map action names from the LLM's JSON commands to Python functions. + 3. Execute commands and return a spoken summary for TTS feedback. + +Supported actions: + open_app — Launch a desktop application + set_timer — Start a countdown timer with audible alarm + get_time — Return the current time + get_date — Return today's date + get_weather — (stub) Return weather info + create_reminder — (stub) Save a reminder note + control_volume — Adjust system volume (Linux/macOS) + search_web — Open a web search in the default browser + calculate — Evaluate a math expression safely + shutdown — System shutdown (with confirmation) +""" + +import logging +import os +import platform +import subprocess +import threading +import webbrowser +from datetime import datetime + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Action Registry +# --------------------------------------------------------------------------- +_REGISTRY: dict[str, callable] = {} + + +def register(name: str): + """Decorator to register an action function by name.""" + def decorator(func): + _REGISTRY[name] = func + return func + return decorator + + +def execute(action: str, params: dict | None = None) -> str: + """ + Execute a registered action and return a spoken summary. + + Args: + action: The action name (e.g., "open_app"). + params: Optional dict of parameters. + + Returns: + A short text description of what was done (for TTS feedback). + """ + params = params or {} + func = _REGISTRY.get(action) + + if not func: + logger.warning("Unknown action: %s", action) + return f"Sorry, I don't know how to {action.replace('_', ' ')}." + + try: + result = func(**params) + logger.info("Action '%s' executed: %s", action, result) + return result + except Exception as exc: + logger.exception("Action '%s' failed", action) + return f"Something went wrong: {exc}" + + +# --------------------------------------------------------------------------- +# Action Implementations +# --------------------------------------------------------------------------- + +@register("get_time") +def get_time(**_) -> str: + now = datetime.now().strftime("%-I:%M %p") + return f"It's currently {now}." + + +@register("get_date") +def get_date(**_) -> str: + today = datetime.now().strftime("%A, %B %d, %Y") + return f"Today is {today}." + + +@register("open_app") +def open_app(app_name: str = "", **_) -> str: + if not app_name: + return "What app would you like me to open?" + + system = platform.system() + app_lower = app_name.lower().strip() + + try: + if system == "Darwin": # macOS + subprocess.Popen(["open", "-a", app_name], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + elif system == "Windows": + subprocess.Popen(f"start {app_name}", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL) + else: # Linux + # Try common app launchers + app_map = { + "chrome": "google-chrome", + "firefox": "firefox", + "terminal": "gnome-terminal", + "files": "nautilus", + "calculator": "gnome-calculator", + "settings": "gnome-control-center", + "browser": "xdg-open", + "vs code": "code", + "vscode": "code", + } + cmd = app_map.get(app_lower, app_lower) + subprocess.Popen( + [cmd], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL + ) + + return f"Opening {app_name}." + except FileNotFoundError: + return f"Sorry, I couldn't find {app_name} on this system." + + +@register("set_timer") +def set_timer(seconds: int = 60, label: str = "Timer", **_) -> str: + """ + Start a background timer that rings after the given number of seconds. + Uses the terminal bell when the timer completes. + """ + try: + duration = int(seconds) + except (ValueError, TypeError): + return "I need a number of seconds for the timer." + + def _timer_thread(): + import time + + logger.info("Timer '%s' started for %d seconds", label, duration) + time.sleep(duration) + # Terminal bell + print(f"\a") + logger.info("Timer '%s' finished!", label) + # Try to use TTS to announce (if available — soft dependency) + try: + import pygame + pygame.mixer.init() + # Generate a simple beep + import numpy as np + sample_rate = 22050 + t = np.linspace(0, 0.5, int(sample_rate * 0.5), dtype=np.float32) + tone = np.sin(2 * np.pi * 880 * t) * 0.5 + tone = (tone * 32767).astype(np.int16) + # Save and play + import soundfile as sf + beep_path = f"/tmp/echo_timer_{os.urandom(4).hex()}.wav" + sf.write(beep_path, tone, sample_rate) + pygame.mixer.music.load(beep_path) + pygame.mixer.music.play() + while pygame.mixer.music.get_busy(): + pygame.time.wait(50) + pygame.mixer.quit() + os.unlink(beep_path) + except Exception: + pass # Fall back to terminal bell only + + threading.Thread(target=_timer_thread, daemon=True, name=f"timer-{label}").start() + + minutes, secs = divmod(duration, 60) + if minutes: + return f"{label} set for {minutes} minute{'s' if minutes != 1 else ''} and {secs} seconds." + return f"{label} set for {secs} seconds." + + +@register("get_weather") +def get_weather(location: str = "", **_) -> str: + """Stub — can be expanded with a weather API integration.""" + return ( + "I don't have a weather service connected yet. " + "You can ask me again once the weather API is configured." + ) + + +@register("create_reminder") +def create_reminder(text: str = "", **_) -> str: + """Save a reminder to a local file.""" + if not text: + return "What would you like me to remind you about?" + + reminders_dir = Path.home() / ".echo" / "reminders" + reminders_dir.mkdir(parents=True, exist_ok=True) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + reminder_file = reminders_dir / f"{timestamp}.txt" + reminder_file.write_text(f"[{datetime.now().isoformat()}] {text}\n") + + return f"Reminder saved: {text}" + + +@register("control_volume") +def control_volume(level: int = 50, **_) -> str: + """Adjust system volume (Linux/macOS only).""" + try: + vol = int(level) + vol = max(0, min(100, vol)) + except (ValueError, TypeError): + return "Please specify a volume level between 0 and 100." + + system = platform.system() + try: + if system == "Darwin": + subprocess.run(["osascript", "-e", f"set volume output volume {vol}"], + check=True, capture_output=True) + elif system == "Linux": + subprocess.run( + ["pactl", "set-sink-volume", "@DEFAULT_SINK@", f"{vol}%"], + check=True, capture_output=True, + ) + else: + return "Volume control isn't supported on Windows yet." + + return f"Volume set to {vol}%." + except subprocess.CalledProcessError: + return "I couldn't adjust the volume." + + +@register("search_web") +def search_web(query: str = "", **_) -> str: + if not query: + return "What would you like to search for?" + + url = f"https://www.google.com/search?q={query.replace(' ', '+')}" + webbrowser.open(url) + return f"Searching the web for: {query}" + + +@register("calculate") +def calculate(expression: str = "", **_) -> str: + if not expression: + return "What would you like me to calculate?" + + # Whitelist only safe math operations + import ast + allowed = { + ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Pow, + ast.USub, ast.UAdd, ast.Constant, ast.Num, + } + try: + tree = ast.parse(expression.strip(), mode="eval") + for node in ast.walk(tree): + if type(node) not in allowed: + raise ValueError("Unsafe expression") + result = eval(compile(tree, "", "eval")) # noqa: S307 + return f"The result is {result}" + except ZeroDivisionError: + return "You can't divide by zero." + except Exception: + return f"I couldn't calculate that expression." + + +@register("shutdown") +def shutdown(confirm: bool = False, **_) -> str: + if not confirm: + return "Are you sure? Please confirm the shutdown command." + system = platform.system() + try: + if system == "Darwin": + subprocess.run(["sudo", "shutdown", "-h", "now"], check=True) + elif system == "Windows": + subprocess.run(["shutdown", "/s", "/t", "5"], check=True) + else: + subprocess.run(["sudo", "shutdown", "-h", "now"], check=True) + return "Shutting down now. Goodbye!" + except Exception: + return "I don't have permission to shut down the system." diff --git a/brain.py b/brain.py new file mode 100644 index 0000000..2e4537d --- /dev/null +++ b/brain.py @@ -0,0 +1,159 @@ +""" +brain.py — OpenRouter LLM Client (Streaming) + +Responsibilities: + 1. Send transcribed text to OpenRouter with a system prompt that + instructs the model to produce: + a) A concise verbal response (≤ 2 sentences for voice). + b) An optional JSON command block for local execution. + 2. Stream tokens back so the TTS engine can start early (Phase 5). + 3. Parse any JSON command block and return it alongside the spoken text. + +Environment Variables: + OPENROUTER_API_KEY — your OpenRouter API key + OPENROUTER_MODEL — model identifier (default: qwen/qwen-3-235b-a22b) + OPENROUTER_BASE_URL — (optional) custom base URL + +Dependencies: + pip install openai +""" + +import json +import logging +import os +from typing import AsyncIterator + +from openai import AsyncOpenAI + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- +DEFAULT_MODEL = "qwen/qwen-3-235b-a22b" + +SYSTEM_PROMPT = """\ +You are Echo, a concise, helpful voice assistant. Follow these rules strictly: + +1. **Verbal response**: Reply in ≤ 2 short sentences so it sounds natural + when spoken aloud. Be direct and conversational. +2. **Local commands** (optional): If the user's request can be fulfilled by a + local OS action (e.g. opening an app, setting a timer, checking the time, + creating a reminder), include a single JSON block at the very end of your + response using this exact format: + + ```json + {"action": "", "params": {"key": "value"}} + ``` + + Supported actions: open_app, set_timer, get_time, get_date, get_weather, + create_reminder, control_volume, shutdown, search_web, calculate. + +3. Do NOT include the JSON block in your spoken text. The spoken text is + everything BEFORE the JSON block. +4. If no local action is needed, just respond normally without any JSON. +5. Never use markdown formatting, bullet points, or headers in the spoken text. +""" + + +class Brain: + """Async client for OpenRouter LLM with streaming support.""" + + def __init__( + self, + api_key: str | None = None, + model: str = DEFAULT_MODEL, + base_url: str = "https://openrouter.ai/api/v1", + ): + self.model = model + self.client = AsyncOpenAI( + api_key=api_key or os.environ.get("OPENROUTER_API_KEY", ""), + base_url=base_url, + ) + # Conversation history (rolling window) + self._history: list[dict[str, str]] = [] + self._max_history = 20 # keep last 20 messages for context + + async def think( + self, + user_text: str, + ) -> AsyncIterator[dict]: + """ + Stream the LLM response token-by-token. + + Yields: + dict with keys: + - "type": "token" | "command" | "done" + - "text": the token string (for "token" type) + - "command": parsed JSON dict (for "command" type) + """ + self._history.append({"role": "user", "content": user_text}) + if len(self._history) > self._max_history: + self._history = self._history[-self._max_history:] + + messages = [{"role": "system", "content": SYSTEM_PROMPT}] + self._history + + logger.info("Sending to OpenRouter (%s): %s", self.model, user_text[:80]) + + full_response = "" + try: + stream = await self.client.chat.completions.create( + model=self.model, + messages=messages, + stream=True, + temperature=0.7, + max_tokens=300, + ) + async for chunk in stream: + delta = chunk.choices[0].delta + if delta.content: + full_response += delta.content + yield {"type": "token", "text": delta.content} + except Exception: + logger.exception("OpenRouter request failed") + yield {"type": "token", "text": "Sorry, I had trouble thinking about that."} + + # Parse any JSON command block from the full response + command = self._extract_command(full_response) + if command: + logger.info("Parsed command: %s", command) + yield {"type": "command", "command": command} + + # Clean spoken text (strip JSON block and thinking tags) + spoken_text = self._clean_spoken_text(full_response) + self._history.append({"role": "assistant", "content": full_response}) + + yield {"type": "done", "text": spoken_text} + + def _extract_command(self, text: str) -> dict | None: + """Extract the JSON command block from the LLM response.""" + try: + # Find JSON code block + if "```json" in text: + start = text.index("```json") + 7 + end = text.index("```", start) + json_str = text[start:end].strip() + return json.loads(json_str) + # Try bare JSON at the end + for i in range(len(text) - 1, -1, -1): + if text[i] == "{": + candidate = text[i:].strip() + return json.loads(candidate) + except (json.JSONDecodeError, ValueError): + pass + return None + + @staticmethod + def _clean_spoken_text(text: str) -> str: + """Remove JSON blocks and Qwen thinking tags from spoken text.""" + import re + + # Remove Qwen3 blocks + cleaned = re.sub(r"]*>.*?", "", text, flags=re.DOTALL) + # Remove JSON code blocks + cleaned = re.sub(r"```json.*?```", "", cleaned, flags=re.DOTALL) + # Remove any trailing bare JSON object + cleaned = re.sub(r"\{[^}]*\"action\"[^}]*\}", "", cleaned) + # Clean up whitespace + cleaned = cleaned.strip().strip(".") + return cleaned diff --git a/main.py b/main.py new file mode 100644 index 0000000..a3ff55d --- /dev/null +++ b/main.py @@ -0,0 +1,283 @@ +""" +main.py — Echo Voice Assistant Orchestrator + +Ties together all modules: + 1. WakeWordListener (stt.py) — continuously listens for "echo" + 2. Transcriber (stt.py) — captures & transcribes voice commands + 3. Brain (brain.py) — sends text to OpenRouter, streams response + 4. TTSEngine (tts.py) — generates speech from text (Qwen3-TTS) + 5. Actions (actions.py) — executes local OS commands + +Phase 5 Parallel Processing: + As soon as the first complete sentence is received from the Brain's + streamed response, TTS generation begins immediately — before the + full LLM response has finished streaming. + +Usage: + python main.py +""" + +import asyncio +import logging +import os +import re +import signal +import sys +from pathlib import Path + +from dotenv import load_dotenv + +from stt import WakeWordListener, Transcriber +from brain import Brain +from tts import TTSEngine +from actions import execute as execute_action + +# --------------------------------------------------------------------------- +# Logging setup +# --------------------------------------------------------------------------- +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s │ %(name)-18s │ %(levelname)-7s │ %(message)s", + datefmt="%H:%M:%S", +) +logger = logging.getLogger("echo") + +# --------------------------------------------------------------------------- +# Load environment +# --------------------------------------------------------------------------- +load_dotenv(Path(__file__).parent / ".env") + + +class EchoAssistant: + """ + Main orchestrator for the Echo voice assistant. + + Lifecycle: + 1. Start wake word listener (background thread). + 2. On wake word detected → transcribe command. + 3. Stream LLM response → start TTS on first sentence (parallel). + 4. Execute any local commands from the LLM response. + """ + + def __init__(self): + # --- STT --- + model_path = os.environ.get( + "VOSK_MODEL_PATH", "models/vosk-model-small-en-us" + ) + wake_word = os.environ.get("WAKE_WORD", "echo") + + self.transcriber = Transcriber(model_path=model_path) + self.wake_listener = WakeWordListener( + wake_word=wake_word, + on_detected=self._on_wake_word, + ) + + # --- Brain (LLM) --- + self.brain = Brain( + api_key=os.environ.get("OPENROUTER_API_KEY"), + model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"), + ) + + # --- TTS --- + self.tts = TTSEngine( + model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"), + voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"), + instruction=os.environ.get( + "QWEN_TTS_INSTRUCT", + "Speak clearly with a warm, friendly tone. Be natural and conversational.", + ), + ) + + # --- State --- + self._processing = False # guard against concurrent commands + self._shutdown_event = asyncio.Event() + + logger.info("Echo assistant initialized (wake word: '%s')", wake_word) + + # ------------------------------------------------------------------ + # Wake word callback (runs in background thread) + # ------------------------------------------------------------------ + def _on_wake_word(self, wake_word: str): + """Called by WakeWordListener when the wake word is detected.""" + if self._processing: + logger.info("Still processing previous command — ignoring wake word") + return + # Schedule the command processing in the async event loop + try: + loop = asyncio.get_running_loop() + loop.call_soon_threadsafe(loop.create_task, self._handle_command()) + except RuntimeError: + logger.warning("No running event loop for wake word callback") + + # ------------------------------------------------------------------ + # Main command pipeline + # ------------------------------------------------------------------ + async def _handle_command(self): + """Full pipeline: transcribe → think → speak → act.""" + if self._processing: + return + self._processing = True + + try: + # Play a brief acknowledgment tone + logger.info("🔊 Wake word detected — listening...") + + # Step 1: Transcribe + text = self.transcriber.listen_and_transcribe() + if not text: + logger.info("No transcription — returning to idle") + return + + logger.info("📝 You said: '%s'", text) + + # Step 2: Stream LLM response with early TTS (Phase 5) + await self._stream_and_speak(text) + + except Exception: + logger.exception("Error in command pipeline") + finally: + self._processing = False + logger.info("Returning to idle...") + + # ------------------------------------------------------------------ + # Phase 5: Parallel Streaming + TTS + # ------------------------------------------------------------------ + async def _stream_and_speak(self, user_text: str): + """ + Stream the LLM response and start TTS generation as soon as the + first complete sentence is available — minimizing perceived latency. + """ + buffer = "" + first_sentence_spoken = False + remaining_text = "" + pending_command = None + tts_tasks: list[asyncio.Task] = [] + + async for event in self.brain.think(user_text): + if event["type"] == "token": + buffer += event["text"] + + # Check if we have a complete sentence + if not first_sentence_spoken and self._has_complete_sentence(buffer): + # Split: first sentence goes to TTS immediately + sentences = self._split_first_sentence(buffer) + first_sentence = sentences[0] + remaining_text = sentences[1] if len(sentences) > 1 else "" + + if first_sentence.strip(): + logger.info("⚡ Early TTS trigger: '%s'", first_sentence[:60]) + task = asyncio.create_task( + self.tts.speak(first_sentence.strip()) + ) + tts_tasks.append(task) + + first_sentence_spoken = True + buffer = remaining_text + + elif event["type"] == "command": + pending_command = event["command"] + + elif event["type"] == "done": + # Any remaining text after the first sentence + final_text = buffer.strip() + if final_text and final_text != remaining_text: + final_text = event["text"] + # Remove the already-spoken first sentence + if first_sentence_spoken and remaining_text: + pass # remaining_text already has what we need + else: + remaining_text = final_text + + # Step 3: Speak the remaining text after first sentence finishes + remaining_text = remaining_text.strip() + if remaining_text: + # Wait for first sentence TTS to finish + for task in tts_tasks: + await task + await self.tts.speak(remaining_text) + + # Wait for all TTS tasks to complete + for task in tts_tasks: + if not task.done(): + await task + + # Step 4: Execute any local command + if pending_command: + action_name = pending_command.get("action", "") + params = pending_command.get("params", {}) + logger.info("🔧 Executing action: %s %s", action_name, params) + result = execute_action(action_name, params) + if result: + await self.tts.speak(result) + + # ------------------------------------------------------------------ + # Text utilities + # ------------------------------------------------------------------ + @staticmethod + def _has_complete_sentence(text: str) -> bool: + """Check if the text buffer contains at least one complete sentence.""" + # A sentence is considered complete if it ends with . ! ? or ... + return bool(re.search(r'[.!?]\s+|[.!?]$', text)) + + @staticmethod + def _split_first_sentence(text: str) -> list[str]: + """Split text at the first sentence boundary.""" + match = re.search(r'([.!?])\s+', text) + if match: + end = match.start() + 1 + return [text[:end], text[end:].strip()] + # Check for ending punctuation without trailing space + match = re.search(r'[.!?]$', text.strip()) + if match: + return [text.strip()] + return [text] + + # ------------------------------------------------------------------ + # Lifecycle + # ------------------------------------------------------------------ + async def start(self): + """Start the Echo assistant.""" + logger.info("=" * 60) + logger.info(" ECHO VOICE ASSISTANT") + logger.info(" Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper()) + logger.info(" Press Ctrl+C to quit") + logger.info("=" * 60) + + # Start wake word listener (runs in background thread) + self.wake_listener.start() + + # Keep the async loop alive until shutdown + await self._shutdown_event.wait() + + def shutdown(self): + """Signal the assistant to stop.""" + logger.info("Shutting down Echo...") + self.wake_listener.stop() + self._shutdown_event.set() + + +# --------------------------------------------------------------------------- +# Entry Point +# --------------------------------------------------------------------------- +def main(): + assistant = EchoAssistant() + + # Graceful shutdown on Ctrl+C + def _signal_handler(sig, frame): + assistant.shutdown() + + signal.signal(signal.SIGINT, _signal_handler) + signal.signal(signal.SIGTERM, _signal_handler) + + # Run the async event loop + try: + asyncio.run(assistant.start()) + except KeyboardInterrupt: + pass + finally: + assistant.shutdown() + logger.info("Echo has shut down. Goodbye!") + + +if __name__ == "__main__": + main() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..f2b996a --- /dev/null +++ b/requirements.txt @@ -0,0 +1,31 @@ +# =========================================================== +# Echo Voice Assistant — Dependencies +# =========================================================== +# Install with: pip install -r requirements.txt +# +# Note: For GPU-accelerated TTS, install PyTorch with CUDA +# support first: https://pytorch.org/get-started/locally/ +# =========================================================== + +# --- Core --- +vosk>=0.3.45 +pyaudio>=0.2.14 +openwakeword>=0.5.0 + +# --- LLM --- +openai>=1.30.0 +python-dotenv>=1.0.0 + +# --- TTS (Qwen3-TTS) --- +# Install from source or PyPI once available: +# pip install qwen-tts +torch>=2.1.0 +soundfile>=0.12.1 +transformers>=4.40.0 +accelerate>=0.27.0 + +# --- Audio Playback --- +pygame>=2.5.0 + +# --- Utilities --- +numpy>=1.26.0 diff --git a/stt.py b/stt.py new file mode 100644 index 0000000..459c31b --- /dev/null +++ b/stt.py @@ -0,0 +1,195 @@ +""" +stt.py — Speech-To-Text Module (Vosk + PyAudio + openWakeWord) + +Responsibilities: + 1. Continuously monitor the microphone for a wake word ("echo"). + 2. Once triggered, capture and transcribe the full spoken command. + 3. Return the transcribed text to the orchestrator. + +Environment Variables: + VOSK_MODEL_PATH — path to a Vosk model directory (default: models/vosk-model-small-en-us) + WAKE_WORD — wake phrase to listen for (default: "echo") + +Dependencies: + pip install vosk pyaudio openwakeword +""" + +import json +import logging +import queue +import threading +import time +from pathlib import Path + +import openwakeword +import pyaudio +from vosk import KaldiRecognizer, Model, SetLogLevel + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Constants +# --------------------------------------------------------------------------- +DEFAULT_VOSK_MODEL = "models/vosk-model-small-en-us" +DEFAULT_WAKE_WORD = "echo" +FORMAT = pyaudio.paInt16 +CHANNELS = 1 +RATE = 16000 +CHUNK = 1024 +RECORD_SECONDS = 10 # max length of a voice command after wake word +SILENCE_LIMIT = 1.5 # seconds of silence before we stop recording + + +class WakeWordListener: + """Background thread that listens for the wake word using openWakeWord.""" + + def __init__(self, wake_word: str = DEFAULT_WAKE_WORD, on_detected=None): + self.wake_word = wake_word.lower() + self.on_detected = on_detected # callback(wake_word) + self._running = False + self._thread: threading.Thread | None = None + self._audio_queue: queue.Queue = queue.Queue() + + # ---- audio callback fed to PyAudio stream ---- + def _audio_callback(self, in_data, frame_count, time_info, status): + self._audio_queue.put(in_data) + return (in_data, pyaudio.paContinue) + + def start(self): + if self._running: + return + self._running = True + self._thread = threading.Thread(target=self._listen_loop, daemon=True) + self._thread.start() + logger.info("WakeWordListener started — listening for '%s'", self.wake_word) + + def stop(self): + self._running = False + if self._thread: + self._thread.join(timeout=5) + logger.info("WakeWordListener stopped") + + def _listen_loop(self): + """Open PyAudio, feed frames to openWakeWord, fire callback on match.""" + oww = openwakeword.Model( + wakeword_models=[self.wake_word], + inference_framework="onnx", + ) + + pa = pyaudio.PyAudio() + stream = pa.open( + format=FORMAT, + channels=CHANNELS, + rate=RATE, + input=True, + frames_per_buffer=CHUNK, + stream_callback=self._audio_callback, + ) + stream.start_stream() + + try: + while self._running: + try: + frame = self._audio_queue.get(timeout=0.5) + except queue.Empty: + continue + # openWakeWord expects 16 kHz mono int16 — matches our format + prediction = oww.process(frame) + for model_name, score in prediction.items(): + if score >= 0.5: # threshold + logger.info("Wake word '%s' detected (score=%.2f)", model_name, score) + if self.on_detected: + self.on_detected(self.wake_word) + finally: + stream.stop_stream() + stream.close() + pa.terminate() + + +class Transcriber: + """Captures microphone audio after wake word and transcribes via Vosk.""" + + def __init__(self, model_path: str = DEFAULT_VOSK_MODEL): + resolved = Path(model_path) + if not resolved.exists(): + raise FileNotFoundError( + f"Vosk model not found at {resolved.resolve()}. " + "Download one from https://alphacephei.com/vosk/models" + ) + SetLogLevel(-1) # suppress Vosk internal noise + self._model = Model(str(resolved)) + self._recognizer = KaldiRecognizer(self._model, RATE) + + def listen_and_transcribe(self) -> str | None: + """ + Open the mic, record until silence or timeout, and return the + best-effort transcription of the spoken command. + + Returns: + Transcribed text (str) or None if nothing was understood. + """ + pa = pyaudio.PyAudio() + stream = pa.open( + format=FORMAT, + channels=CHANNELS, + rate=RATE, + input=True, + frames_per_buffer=CHUNK, + ) + stream.start_stream() + logger.info("Recording voice command...") + + all_data = b"" + silence_start: float | None = None + started_speaking = False + + try: + while True: + data = stream.read(CHUNK, exception_on_overflow=False) + all_data += data + + # Quick RMS check for silence detection + rms = self._rms(data) + if rms > 300: + started_speaking = True + silence_start = None + elif started_speaking and silence_start is None: + silence_start = time.time() + + # Stop on silence timeout or max duration + if silence_start and (time.time() - silence_start) > SILENCE_LIMIT: + logger.debug("Silence detected — ending recording") + break + if len(all_data) > RATE * RECORD_SECONDS * 2: # bytes + logger.debug("Max recording duration reached") + break + finally: + stream.stop_stream() + stream.close() + pa.terminate() + + if not started_speaking: + logger.info("No speech detected after wake word") + return None + + # Feed all collected audio to Vosk for final transcription + self._recognizer.Reset() + if self._recognizer.AcceptWaveform(all_data): + result = json.loads(self._recognizer.Result()) + text = result.get("text", "").strip() + else: + partial = json.loads(self._recognizer.PartialResult()) + text = partial.get("partial", "").strip() + + logger.info("Transcription: '%s'", text) + return text if text else None + + @staticmethod + def _rms(data: bytes) -> float: + """Compute Root Mean Square of a byte buffer of int16 samples.""" + import array + samples = array.array("h", data) + if not samples: + return 0.0 + sum_sq = sum(s * s for s in samples) + return (sum_sq / len(samples)) ** 0.5 diff --git a/tts.py b/tts.py new file mode 100644 index 0000000..f5f01e6 --- /dev/null +++ b/tts.py @@ -0,0 +1,181 @@ +""" +tts.py — Text-To-Speech Module (Qwen3-TTS) + +Responsibilities: + 1. Accept text (full or partial sentence) and generate a .wav audio file + using the Qwen3-TTS model running locally. + 2. Support voice selection (preset voices or custom voice cloning). + 3. Support instruction-based style control (e.g., energy, tone). + 4. Play the generated audio immediately. + +Environment Variables: + QWEN_TTS_MODEL — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) + QWEN_TTS_VOICE — preset voice name or path to 3s .wav sample + QWEN_TTS_INSTRUCT — default style instruction for speech generation + +Dependencies: + pip install qwen-tts torch soundfile pygame +""" + +import asyncio +import logging +import os +import tempfile +from pathlib import Path + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Configuration +# --------------------------------------------------------------------------- +DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" +DEFAULT_VOICE = "Ryan" # preset voice; alternatives: "Serena", "Diana", etc. +DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational." +OUTPUT_DIR = Path("audio_output") + + +class TTSEngine: + """ + Wrapper around Qwen3-TTS for generating speech from text. + + The engine lazily loads the model on first use to avoid slow startup. + """ + + def __init__( + self, + model_name: str = DEFAULT_MODEL, + voice: str = DEFAULT_VOICE, + instruction: str = DEFAULT_INSTRUCTION, + output_dir: str | Path = OUTPUT_DIR, + ): + self.model_name = model_name + self.voice = voice + self.instruction = instruction + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + + self._model = None + self._processor = None + self._lock = asyncio.Lock() # prevent concurrent generation + + # ---- lazy model loading ---- + def _ensure_loaded(self): + """Load model and processor on first call (lazy init).""" + if self._model is not None: + return + + logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name) + try: + from qwen_tts import QwenTTSProcessor, QwenTTSModel + + self._processor = QwenTTSProcessor() + self._model = QwenTTSModel.from_pretrained(self.model_name) + logger.info("Qwen3-TTS model loaded successfully") + except ImportError: + raise ImportError( + "qwen-tts is not installed. Install it with:\n" + " pip install qwen-tts torch soundfile\n" + "Also ensure you have CUDA-capable GPU for low-latency inference." + ) + + # ---- generation ---- + async def generate(self, text: str, instruction: str | None = None) -> Path | None: + """ + Generate speech audio from text and save as .wav. + + Args: + text: The text to convert to speech. + instruction: Optional style instruction override. + + Returns: + Path to the generated .wav file, or None on failure. + """ + if not text or not text.strip(): + return None + + async with self._lock: + return await asyncio.to_thread( + self._generate_sync, text.strip(), instruction or self.instruction + ) + + def _generate_sync(self, text: str, instruction: str) -> Path | None: + """Synchronous generation (runs in thread pool).""" + self._ensure_loaded() + + output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav" + + try: + # Build voice reference: preset name or custom .wav path + voice_ref = self.voice + if Path(self.voice).exists(): + voice_ref = str(Path(self.voice).resolve()) + + # Generate audio + logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice) + audio_array = self._model.generate( + processor=self._processor, + text=text, + voice=voice_ref, + instruction=instruction, + ) + + # Save to file + import soundfile as sf + + sample_rate = self._processor.sampling_rate + sf.write(str(output_path), audio_array, sample_rate) + logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate) + return output_path + + except Exception: + logger.exception("TTS generation failed for: '%s'", text[:60]) + return None + + # ---- playback ---- + async def speak(self, text: str, instruction: str | None = None) -> bool: + """ + Generate speech from text and play it immediately. + + Returns: + True if playback succeeded, False otherwise. + """ + wav_path = await self.generate(text, instruction) + if not wav_path: + return False + return await self._play(wav_path) + + async def speak_file(self, wav_path: Path) -> bool: + """Play a previously generated .wav file.""" + return await self._play(wav_path) + + @staticmethod + async def _play(wav_path: Path) -> bool: + """Play a .wav file using pygame.mixer (async-friendly).""" + try: + import pygame + + pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048) + pygame.mixer.music.load(str(wav_path)) + pygame.mixer.music.play() + + # Wait for playback to finish + while pygame.mixer.music.get_busy(): + await asyncio.sleep(0.05) + + pygame.mixer.music.stop() + pygame.mixer.quit() + logger.info("Playback finished: %s", wav_path.name) + return True + except Exception: + logger.exception("Playback failed for %s", wav_path) + return False + + def set_voice(self, voice: str): + """Switch to a different voice preset or custom sample path.""" + self.voice = voice + logger.info("Voice set to: %s", voice) + + def set_instruction(self, instruction: str): + """Update the default style instruction.""" + self.instruction = instruction + logger.info("TTS instruction updated: %s", instruction)