feat: initial Echo voice assistant — Vosk + OpenRouter + Qwen3-TTS

- stt.py: WakeWordListener (openWakeWord) + Transcriber (Vosk)
- brain.py: Async OpenRouter streaming client with command parsing
- tts.py: Qwen3-TTS engine with voice selection & instruction control
- actions.py: 10 local OS commands (open_app, set_timer, search, etc.)
- main.py: Async orchestrator with Phase 5 parallel TTS streaming
2026-03-31 00:08:52 +00:00 · 2026-03-31 00:08:52 +00:00 · d6b64d04d1
commit d6b64d04d1
parent 722596bb09
8 changed files with 1187 additions and 0 deletions
--- a/.env.example
+++ b/.env.example
@ -0,0 +1,24 @@
+# ===========================================================
+# Echo Voice Assistant — Environment Configuration
+# ===========================================================
+# Copy this file to .env and fill in your values:
+#   cp .env.example .env
+# ===========================================================
+
+# --- OpenRouter (required) ---
+# Get your key at: https://openrouter.ai/keys
+OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxx
+OPENROUTER_MODEL=qwen/qwen-3-235b-a22b
+
+# --- Vosk STT (optional overrides) ---
+# Download models from: https://alphacephei.com/vosk/models
+# Set to a local path relative to the project root
+VOSK_MODEL_PATH=models/vosk-model-small-en-us
+WAKE_WORD=echo
+
+# --- Qwen3-TTS (optional overrides) ---
+# Available preset voices: Ryan, Serena, Diana, etc.
+# Or set a path to a 3-second .wav sample for voice cloning
+QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
+QWEN_TTS_VOICE=Ryan
+QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational.
--- a/.gitignore
+++ b/.gitignore
@ -0,0 +1,39 @@
+# Echo Voice Assistant — Git Ignore
+
+# Python
+__pycache__/
+*.py[cod]
+*.egg-info/
+dist/
+build/
+*.egg
+
+# Virtual environments
+venv/
+.venv/
+env/
+
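+# Model weights (large files — download separately)
+# Ignore the directory's contents rather than the directory itself; git
+# cannot re-include a file whose parent directory is excluded.
+models/*
+!models/.gitkeep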
+
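+# Generated audio
+# Ignore the directory's contents rather than the directory itself; git
+# cannot re-include a file whose parent directory is excluded.
+audio_output/*
+!audio_output/.gitkeep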
+
+# Environment & secrets
+.env
+.env.local
+
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+
+# OS
+.DS_Store
+Thumbs.db
+
+# Logs
+*.log
--- a/actions.py
+++ b/actions.py
@ -0,0 +1,275 @@
+"""
+actions.py — Local OS Command Execution
+
+Responsibilities:
+  1. Provide a registry of local actions the assistant can perform.
+  2. Map action names from the LLM's JSON commands to Python functions.
+  3. Execute commands and return a spoken summary for TTS feedback.
+
+Supported actions:
+  open_app         — Launch a desktop application
+  set_timer        — Start a countdown timer with audible alarm
+  get_time         — Return the current time
+  get_date         — Return today's date
+  get_weather      — (stub) Return weather info
+  create_reminder  — (stub) Save a reminder note
+  control_volume   — Adjust system volume (Linux/macOS)
+  search_web       — Open a web search in the default browser
+  calculate        — Evaluate a math expression safely
+  shutdown         — System shutdown (with confirmation)
+"""
+
+import logging
+import os
+import platform
+import subprocess
+import threading
+import webbrowser
+from datetime import datetime
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Action Registry
+# ---------------------------------------------------------------------------
+# Maps an action name (the "action" string the caller passes to execute())
+# to the Python function implementing it. Entries are added by the
+# @register(...) decorator below and looked up by execute().
+_REGISTRY: dict[str, callable] = {}
+
+
+def register(name: str):
+    """Build a decorator that files the decorated function under *name*.
+
+    The function is stored in the module-level ``_REGISTRY`` and handed back
+    unchanged, so the decorated name remains directly callable.
+    """
+    def _bind(handler):
+        _REGISTRY.update({name: handler})
+        return handler
+    return _bind
+
+
+def execute(action: str, params: dict | None = None) -> str:
+    """
+    Look up *action* in the registry, run it, and return its spoken summary.
+
+    Args:
+        action: Registered action name (e.g., "open_app").
+        params: Keyword arguments forwarded to the action; ``None`` or an
+            empty dict means "no arguments".
+
+    Returns:
+        The action's own summary text, an apology when the action is not
+        registered, or a "Something went wrong" message when it raises.
+    """
+    handler = _REGISTRY.get(action)
+    if not handler:
+        logger.warning("Unknown action: %s", action)
+        return f"Sorry, I don't know how to {action.replace('_', ' ')}."
+
+    kwargs = params or {}
+    try:
+        summary = handler(**kwargs)
+        logger.info("Action '%s' executed: %s", action, summary)
+    except Exception as exc:
+        # Any failure inside an action is reported verbally instead of
+        # propagating to the caller.
+        logger.exception("Action '%s' failed", action)
+        return f"Something went wrong: {exc}"
+    return summary
+
+
+# ---------------------------------------------------------------------------
+# Action Implementations
+# ---------------------------------------------------------------------------
+
+@register("get_time")
+def get_time(**_) -> str:
+    """Return the current local time as a short spoken sentence.
+
+    Any keyword arguments are accepted and ignored so the action can be
+    invoked with whatever params the caller supplies.
+
+    Returns:
+        Text such as "It's currently 3:07 PM." (12-hour clock, no leading
+        zero on the hour).
+    """
+    # "%-I" (hour without zero padding) is a platform-specific strftime flag
+    # that raises ValueError on Windows. "%I" is portable; the padding is
+    # removed by hand instead.
+    now = datetime.now().strftime("%I:%M %p").lstrip("0")
+    return f"It's currently {now}."
+
+
+@register("get_date")
+def get_date(**_) -> str:
+    """Return today's local date as a spoken sentence.
+
+    Keyword arguments are accepted and ignored. The date is rendered as
+    weekday, month name, zero-padded day and four-digit year.
+    """
+    stamp = datetime.now()
+    today = stamp.strftime("%A, %B %d, %Y")
+    return f"Today is {today}."
+
+
+@register("open_app")
+def open_app(app_name: str = "", **_) -> str:
+    """Launch a desktop application by name.
+
+    Args:
+        app_name: Human-friendly application name. Surrounding whitespace is
+            ignored; a missing, blank or non-string value yields a prompt
+            asking which app to open.
+
+    Returns:
+        A spoken confirmation, or an apology when the application cannot be
+        found or started.
+    """
+    app_name = app_name.strip() if isinstance(app_name, str) else ""
+    if not app_name:
+        return "What app would you like me to open?"
+
+    system = platform.system()
+    quiet = {"stdout": subprocess.DEVNULL, "stderr": subprocess.DEVNULL}
+
+    try:
+        if system == "Darwin":  # macOS
+            subprocess.Popen(["open", "-a", app_name], **quiet)
+        elif system == "Windows":
+            # app_name originates from model output, so it must never be
+            # interpolated into a shell command line. os.startfile asks the
+            # Windows shell to open the name without going through cmd.exe.
+            os.startfile(app_name)
+        else:  # Linux
+            # Friendly names mapped to common launcher executables; anything
+            # else is tried verbatim (lower-cased) as an executable name.
+            app_map = {
+                "chrome": "google-chrome",
+                "firefox": "firefox",
+                "terminal": "gnome-terminal",
+                "files": "nautilus",
+                "calculator": "gnome-calculator",
+                "settings": "gnome-control-center",
+                "browser": "xdg-open",
+                "vs code": "code",
+                "vscode": "code",
+            }
+            key = app_name.lower()
+            subprocess.Popen([app_map.get(key, key)], **quiet)
+    except FileNotFoundError:
+        return f"Sorry, I couldn't find {app_name} on this system."
+    except OSError:
+        # e.g. the target exists but is not executable / cannot be opened.
+        return f"Sorry, I couldn't open {app_name}."
+
+    return f"Opening {app_name}."
+
+
+@register("set_timer")
+def set_timer(seconds: int = 60, label: str = "Timer", **_) -> str:
+    """
+    Start a background timer that rings after the given number of seconds.
+
+    When the timer expires the terminal bell is printed and, if pygame,
+    numpy and soundfile are importable, a short 880 Hz beep is played.
+
+    Args:
+        seconds: Duration; coerced with ``int()`` and required to be positive.
+        label: Name used in log messages, the thread name and the reply.
+
+    Returns:
+        A spoken confirmation, or a request for a valid duration.
+    """
+    try:
+        duration = int(seconds)
+    except (ValueError, TypeError, OverflowError):
+        return "I need a number of seconds for the timer."
+    if duration <= 0:
+        # time.sleep() rejects negative values, which would otherwise kill
+        # the timer thread after the user was told the timer was set.
+        return "I need a positive number of seconds for the timer."
+
+    def _play_beep():
+        """Best-effort audible alarm; any failure leaves only the bell."""
+        import tempfile
+
+        beep_path = None
+        try:
+            # Soft dependencies: imported lazily so timers still work
+            # (terminal bell only) when they are not installed.
+            import numpy as np
+            import pygame
+            import soundfile as sf
+
+            # Generate a half-second 880 Hz tone as 16-bit samples.
+            sample_rate = 22050
+            t = np.linspace(0, 0.5, int(sample_rate * 0.5), dtype=np.float32)
+            tone = np.sin(2 * np.pi * 880 * t) * 0.5
+            tone = (tone * 32767).astype(np.int16)
+
+            # Use the platform temp directory instead of a hard-coded /tmp.
+            beep_path = os.path.join(
+                tempfile.gettempdir(), f"echo_timer_{os.urandom(4).hex()}.wav"
+            )
+            sf.write(beep_path, tone, sample_rate)
+
+            pygame.mixer.init()
+            try:
+                pygame.mixer.music.load(beep_path)
+                pygame.mixer.music.play()
+                while pygame.mixer.music.get_busy():
+                    pygame.time.wait(50)
+            finally:
+                pygame.mixer.quit()
+        except Exception:
+            # Deliberately best-effort, but leave a trace for debugging.
+            logger.debug("Timer beep unavailable; terminal bell only", exc_info=True)
+        finally:
+            # Remove the temporary file even when playback failed.
+            if beep_path is not None:
+                try:
+                    os.unlink(beep_path)
+                except OSError:
+                    pass
+
+    def _timer_thread():
+        import time
+
+        logger.info("Timer '%s' started for %d seconds", label, duration)
+        time.sleep(duration)
+        # Terminal bell
+        print("\a", flush=True)
+        logger.info("Timer '%s' finished!", label)
+        _play_beep()
+
+    # Daemon thread: a pending timer must not keep the process alive.
+    threading.Thread(target=_timer_thread, daemon=True, name=f"timer-{label}").start()
+
+    minutes, secs = divmod(duration, 60)
+    if minutes:
+        return f"{label} set for {minutes} minute{'s' if minutes != 1 else ''} and {secs} seconds."
+    return f"{label} set for {secs} seconds."
+
+
+@register("get_weather")
+def get_weather(location: str = "", **_) -> str:
+    """Placeholder weather action.
+
+    No weather backend is wired in; *location* and any other keyword
+    arguments are accepted but unused, and a fixed explanation is returned.
+    """
+    parts = (
+        "I don't have a weather service connected yet. ",
+        "You can ask me again once the weather API is configured.",
+    )
+    return "".join(parts)
+
+
+@register("create_reminder")
+def create_reminder(text: str = "", **_) -> str:
+    """Save a reminder to a local file.
+
+    Reminders are written under ``~/.echo/reminders`` in a file named after
+    the current timestamp (second resolution), one line per reminder.
+
+    Args:
+        text: The reminder text; an empty value yields a prompt instead.
+
+    Returns:
+        A spoken confirmation, or a question asking what to remember.
+    """
+    # pathlib is not among this module's top-level imports, so bring Path
+    # into scope here; without it this function fails with NameError.
+    from pathlib import Path
+
+    if not text:
+        return "What would you like me to remind you about?"
+
+    reminders_dir = Path.home() / ".echo" / "reminders"
+    reminders_dir.mkdir(parents=True, exist_ok=True)
+
+    # Take one timestamp so the file name and the entry agree.
+    now = datetime.now()
+    reminder_file = reminders_dir / f"{now.strftime('%Y%m%d_%H%M%S')}.txt"
+    # Append rather than overwrite: two reminders created within the same
+    # second share a file name and must both be kept.
+    with reminder_file.open("a", encoding="utf-8") as handle:
+        handle.write(f"[{now.isoformat()}] {text}\n")
+
+    return f"Reminder saved: {text}"
+
+
+@register("control_volume")
+def control_volume(level: int = 50, **_) -> str:
+    """Set the system output volume to *level* percent (Linux/macOS only).
+
+    Args:
+        level: Target volume; coerced with ``int()`` and clamped to 0–100.
+
+    Returns:
+        A spoken confirmation, or an explanation when the level is not a
+        number, the platform is unsupported, or the volume tool fails.
+    """
+    try:
+        vol = max(0, min(100, int(level)))
+    except (ValueError, TypeError, OverflowError):
+        return "Please specify a volume level between 0 and 100."
+
+    system = platform.system()
+    if system == "Darwin":
+        command = ["osascript", "-e", f"set volume output volume {vol}"]
+    elif system == "Linux":
+        command = ["pactl", "set-sink-volume", "@DEFAULT_SINK@", f"{vol}%"]
+    else:
+        return "Volume control isn't supported on Windows yet."
+
+    try:
+        subprocess.run(command, check=True, capture_output=True)
+    except (subprocess.CalledProcessError, OSError):
+        # OSError covers a missing osascript/pactl executable, which would
+        # otherwise surface as raw exception text read aloud to the user.
+        return "I couldn't adjust the volume."
+
+    return f"Volume set to {vol}%."
+
+
+@register("search_web")
+def search_web(query: str = "", **_) -> str:
+    """Open a Google search for *query* in the default browser.
+
+    Args:
+        query: Free-text search terms; an empty value yields a prompt.
+
+    Returns:
+        A spoken confirmation, or a question asking what to search for.
+    """
+    from urllib.parse import quote_plus
+
+    if not query:
+        return "What would you like to search for?"
+
+    # Percent-encode the whole query: replacing only spaces lets characters
+    # such as "&", "#", "+" or "%" truncate or alter the search terms.
+    url = f"https://www.google.com/search?q={quote_plus(str(query))}"
+    webbrowser.open(url)
+    return f"Searching the web for: {query}"
+
+
+@register("calculate")
+def calculate(expression: str = "", **_) -> str:
+    """Evaluate a basic arithmetic expression without using ``eval``.
+
+    Supports numeric literals, ``+ - * / **`` and unary ``+``/``-`` with
+    parentheses. Anything else (names, calls, strings, ...) is rejected.
+
+    Args:
+        expression: The expression text, e.g. ``"2 + 3 * 4"``.
+
+    Returns:
+        "The result is <value>", a divide-by-zero message, or a generic
+        failure message for invalid or unsupported input.
+    """
+    if not expression:
+        return "What would you like me to calculate?"
+
+    import ast
+    import operator
+
+    binary_ops = {
+        ast.Add: operator.add,
+        ast.Sub: operator.sub,
+        ast.Mult: operator.mul,
+        ast.Div: operator.truediv,
+        ast.Pow: operator.pow,
+    }
+    unary_ops = {ast.USub: operator.neg, ast.UAdd: operator.pos}
+
+    def _evaluate(node):
+        """Recursively compute *node*, raising ValueError on anything unsafe."""
+        if isinstance(node, ast.Expression):
+            return _evaluate(node.body)
+        if isinstance(node, ast.Constant):
+            value = node.value
+            # Only plain numbers: bool is an int subclass, and string
+            # constants would allow e.g. 'a' * 10**9.
+            if isinstance(value, bool) or not isinstance(value, (int, float)):
+                raise ValueError("Unsafe expression")
+            return value
+        if isinstance(node, ast.BinOp) and type(node.op) in binary_ops:
+            left = _evaluate(node.left)
+            right = _evaluate(node.right)
+            if isinstance(node.op, ast.Pow):
+                # Bound the size of the result so a spoken "nine to the
+                # nine to the nine" cannot stall the assistant.
+                base_bits = left.bit_length() if isinstance(left, int) else 64
+                if abs(right) * max(1, base_bits) > 20_000:
+                    raise ValueError("Exponent too large")
+            return binary_ops[type(node.op)](left, right)
+        if isinstance(node, ast.UnaryOp) and type(node.op) in unary_ops:
+            return unary_ops[type(node.op)](_evaluate(node.operand))
+        raise ValueError("Unsafe expression")
+
+    try:
+        tree = ast.parse(str(expression).strip(), mode="eval")
+        result = _evaluate(tree)
+        return f"The result is {result}"
+    except ZeroDivisionError:
+        return "You can't divide by zero."
+    except Exception:
+        # Syntax errors, unsupported nodes, overflow, oversized results.
+        return "I couldn't calculate that expression."
+
+
+@register("shutdown")
+def shutdown(confirm: bool = False, **_) -> str:
+    """Shut the machine down, but only when explicitly confirmed.
+
+    Args:
+        confirm: Must be true to proceed. String values are interpreted
+            ("true"/"yes"/"y"/"1" confirm, anything else does not) so that
+            a textual "false" is never mistaken for consent.
+
+    Returns:
+        A confirmation request, a goodbye message, or a permission notice
+        when the shutdown command cannot be run.
+    """
+    if isinstance(confirm, str):
+        confirm = confirm.strip().lower() in {"true", "yes", "y", "1"}
+    if not confirm:
+        return "Are you sure? Please confirm the shutdown command."
+
+    if platform.system() == "Windows":
+        command = ["shutdown", "/s", "/t", "5"]
+    else:
+        # macOS and Linux use the same command. "-n" makes sudo fail
+        # immediately instead of blocking on a password prompt nobody can
+        # answer in a voice session.
+        command = ["sudo", "-n", "shutdown", "-h", "now"]
+
+    try:
+        subprocess.run(command, check=True)
+    except (subprocess.CalledProcessError, OSError):
+        logger.exception("Shutdown command failed")
+        return "I don't have permission to shut down the system."
+    return "Shutting down now. Goodbye!"
--- a/brain.py
+++ b/brain.py
@ -0,0 +1,159 @@
+"""
+brain.py — OpenRouter LLM Client (Streaming)
+
+Responsibilities:
+  1. Send transcribed text to OpenRouter with a system prompt that
+     instructs the model to produce:
+       a) A concise verbal response (≤ 2 sentences for voice).
+       b) An optional JSON command block for local execution.
+  2. Stream tokens back so the TTS engine can start early (Phase 5).
+  3. Parse any JSON command block and return it alongside the spoken text.
+
+Environment Variables:
+  OPENROUTER_API_KEY  — your OpenRouter API key
+  OPENROUTER_MODEL    — model identifier (default: qwen/qwen-3-235b-a22b)
+  OPENROUTER_BASE_URL — (optional) custom base URL
+
+Dependencies:
+  pip install openai
+"""
+
+import json
+import logging
+import os
+from typing import AsyncIterator
+
+from openai import AsyncOpenAI
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+# Model identifier used when the caller does not pass one to Brain().
+DEFAULT_MODEL = "qwen/qwen-3-235b-a22b"
+
+# Sent as the system message on every request (see Brain.think). The
+# backslash after the opening quotes keeps the prompt from starting with a
+# blank line. The fenced-JSON format described in rule 2 is what
+# Brain._extract_command looks for, and the action names listed there match
+# the "Supported actions" documented in actions.py.
+SYSTEM_PROMPT = """\
+You are Echo, a concise, helpful voice assistant. Follow these rules strictly:
+
+1. **Verbal response**: Reply in ≤ 2 short sentences so it sounds natural
+   when spoken aloud. Be direct and conversational.
+2. **Local commands** (optional): If the user's request can be fulfilled by a
+   local OS action (e.g. opening an app, setting a timer, checking the time,
+   creating a reminder), include a single JSON block at the very end of your
+   response using this exact format:
+
+   ```json
+   {"action": "<command_name>", "params": {"key": "value"}}
+   ```
+
+   Supported actions: open_app, set_timer, get_time, get_date, get_weather,
+   create_reminder, control_volume, shutdown, search_web, calculate.
+
+3. Do NOT include the JSON block in your spoken text. The spoken text is
+   everything BEFORE the JSON block.
+4. If no local action is needed, just respond normally without any JSON.
+5. Never use markdown formatting, bullet points, or headers in the spoken text.
+"""
+
+
+class Brain:
+    """Async client for OpenRouter LLM with streaming support.
+
+    Keeps a rolling window of the conversation and exposes :meth:`think`,
+    an async generator that streams the model's reply and then reports any
+    JSON command found in it.
+    """
+
+    # Spoken (and recorded in the history) when the request fails.
+    _FALLBACK_REPLY = "Sorry, I had trouble thinking about that."
+    _DEFAULT_BASE_URL = "https://openrouter.ai/api/v1"
+
+    def __init__(
+        self,
+        api_key: str | None = None,
+        model: str = DEFAULT_MODEL,
+        base_url: str | None = None,
+    ):
+        """
+        Args:
+            api_key: OpenRouter key; falls back to ``OPENROUTER_API_KEY``.
+            model: Model identifier sent with every request.
+            base_url: API endpoint; falls back to ``OPENROUTER_BASE_URL``
+                and then to the public OpenRouter endpoint.
+        """
+        self.model = model
+        self.client = AsyncOpenAI(
+            api_key=api_key or os.environ.get("OPENROUTER_API_KEY", ""),
+            base_url=(
+                base_url
+                or os.environ.get("OPENROUTER_BASE_URL")
+                or self._DEFAULT_BASE_URL
+            ),
+        )
+        # Conversation history (rolling window)
+        self._history: list[dict[str, str]] = []
+        self._max_history = 20  # keep last 20 messages for context
+
+    async def think(
+        self,
+        user_text: str,
+    ) -> AsyncIterator[dict]:
+        """
+        Stream the LLM response token-by-token.
+
+        Yields:
+            dict with keys:
+              - "type": "token" | "command" | "done"
+              - "text": the token string (for "token" type), or the cleaned
+                spoken text of the whole reply (for "done" type)
+              - "command": parsed JSON dict (for "command" type)
+
+        A failed request does not raise: a single apology token is yielded
+        instead, followed by the usual "done" event.
+        """
+        self._history.append({"role": "user", "content": user_text})
+        if len(self._history) > self._max_history:
+            self._history = self._history[-self._max_history:]
+
+        messages = [{"role": "system", "content": SYSTEM_PROMPT}] + self._history
+
+        logger.info("Sending to OpenRouter (%s): %s", self.model, user_text[:80])
+
+        full_response = ""
+        try:
+            stream = await self.client.chat.completions.create(
+                model=self.model,
+                messages=messages,
+                stream=True,
+                temperature=0.7,
+                max_tokens=300,
+            )
+            async for chunk in stream:
+                # A chunk may arrive with an empty ``choices`` list (e.g. a
+                # usage-only chunk); indexing it would raise IndexError and
+                # abort an otherwise healthy stream.
+                if not chunk.choices:
+                    continue
+                delta = chunk.choices[0].delta
+                if delta.content:
+                    full_response += delta.content
+                    yield {"type": "token", "text": delta.content}
+        except Exception:
+            logger.exception("OpenRouter request failed")
+            yield {"type": "token", "text": self._FALLBACK_REPLY}
+            # Record what was actually said so the history never holds an
+            # empty assistant turn and "done" matches the spoken output.
+            full_response = f"{full_response} {self._FALLBACK_REPLY}".strip()
+
+        # Parse any JSON command block from the full response
+        command = self._extract_command(full_response)
+        if command:
+            logger.info("Parsed command: %s", command)
+            yield {"type": "command", "command": command}
+
+        # Clean spoken text (strip JSON block and thinking tags)
+        spoken_text = self._clean_spoken_text(full_response)
+        self._history.append({"role": "assistant", "content": full_response})
+
+        yield {"type": "done", "text": spoken_text}
+
+    @staticmethod
+    def _parse_command(candidate: str) -> dict | None:
+        """Return *candidate* parsed as a command dict, or None if it is not one."""
+        try:
+            parsed = json.loads(candidate.strip())
+        except ValueError:  # json.JSONDecodeError is a ValueError subclass
+            return None
+        # Callers use dict methods on the result, and a command without an
+        # "action" key cannot be dispatched.
+        if isinstance(parsed, dict) and "action" in parsed:
+            return parsed
+        return None
+
+    def _extract_command(self, text: str) -> dict | None:
+        """Extract the JSON command block from the LLM response.
+
+        A fenced ```json block takes precedence. Otherwise the text is
+        searched for a JSON object that runs to the end of the response.
+
+        Returns:
+            The command dict, or None when no valid command is present.
+        """
+        fence = "```json"
+        if fence in text:
+            start = text.index(fence) + len(fence)
+            end = text.find("```", start)
+            if end == -1:  # unterminated fence
+                return None
+            return self._parse_command(text[start:end])
+
+        # Bare JSON at the end: try every "{" from the right. The last "{"
+        # usually opens the nested "params" object, which does not parse on
+        # its own, so keep moving left until an enclosing object does.
+        pos = text.rfind("{")
+        while pos != -1:
+            command = self._parse_command(text[pos:])
+            if command is not None:
+                return command
+            pos = text.rfind("{", 0, pos)
+        return None
+
+    @staticmethod
+    def _clean_spoken_text(text: str) -> str:
+        """Remove JSON blocks and Qwen thinking tags from spoken text."""
+        import re
+
+        # Remove Qwen3 <think/> blocks
+        cleaned = re.sub(r"<think[^>]*>.*?</think\s*>", "", text, flags=re.DOTALL)
+        # Remove JSON code blocks
+        cleaned = re.sub(r"```json.*?```", "", cleaned, flags=re.DOTALL)
+        # Remove any bare JSON command object. One level of nested braces is
+        # allowed so a command carrying a "params" object is removed whole
+        # instead of leaving a stray "}" behind.
+        cleaned = re.sub(
+            r'\{(?:[^{}]|\{[^{}]*\})*"action"(?:[^{}]|\{[^{}]*\})*\}',
+            "",
+            cleaned,
+        )
+        # Clean up whitespace
+        cleaned = cleaned.strip().strip(".")
+        return cleaned
--- a/main.py
+++ b/main.py
@ -0,0 +1,283 @@
+"""
+main.py — Echo Voice Assistant Orchestrator
+
+Ties together all modules:
+  1. WakeWordListener  (stt.py) — continuously listens for "echo"
+  2. Transcriber       (stt.py) — captures & transcribes voice commands
+  3. Brain             (brain.py) — sends text to OpenRouter, streams response
+  4. TTSEngine         (tts.py) — generates speech from text (Qwen3-TTS)
+  5. Actions           (actions.py) — executes local OS commands
+
+Phase 5 Parallel Processing:
+  As soon as the first complete sentence is received from the Brain's
+  streamed response, TTS generation begins immediately — before the
+  full LLM response has finished streaming.
+
+Usage:
+  python main.py
+"""
+
+import asyncio
+import logging
+import os
+import re
+import signal
+import sys
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+from stt import WakeWordListener, Transcriber
+from brain import Brain
+from tts import TTSEngine
+from actions import execute as execute_action
+
+# ---------------------------------------------------------------------------
+# Logging setup
+# ---------------------------------------------------------------------------
+# Configure logging once at import time: INFO level, with a
+# "time │ logger name │ level │ message" layout.
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s │ %(name)-18s │ %(levelname)-7s │ %(message)s",
+    datefmt="%H:%M:%S",
+)
+# Application-level logger used throughout this module.
+logger = logging.getLogger("echo")
+
+# ---------------------------------------------------------------------------
+# Load environment
+# ---------------------------------------------------------------------------
+# Read settings from the .env file sitting next to this script, before
+# EchoAssistant.__init__ performs its os.environ lookups.
+load_dotenv(Path(__file__).parent / ".env")
+
+
+class EchoAssistant:
+    """
+    Main orchestrator for the Echo voice assistant.
+
+    Lifecycle:
+      1. Start wake word listener (background thread).
+      2. On wake word detected → transcribe command.
+      3. Stream LLM response → start TTS on first sentence (parallel).
+      4. Execute any local commands from the LLM response.
+    """
+
+    def __init__(self):
+        """Build the STT, LLM and TTS components from environment settings."""
+        # --- STT ---
+        model_path = os.environ.get(
+            "VOSK_MODEL_PATH", "models/vosk-model-small-en-us"
+        )
+        wake_word = os.environ.get("WAKE_WORD", "echo")
+
+        self.transcriber = Transcriber(model_path=model_path)
+        self.wake_listener = WakeWordListener(
+            wake_word=wake_word,
+            on_detected=self._on_wake_word,
+        )
+
+        # --- Brain (LLM) ---
+        self.brain = Brain(
+            api_key=os.environ.get("OPENROUTER_API_KEY"),
+            model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
+        )
+
+        # --- TTS ---
+        self.tts = TTSEngine(
+            model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
+            voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"),
+            instruction=os.environ.get(
+                "QWEN_TTS_INSTRUCT",
+                "Speak clearly with a warm, friendly tone. Be natural and conversational.",
+            ),
+        )
+
+        # --- State ---
+        self._processing = False  # guard against concurrent commands
+        self._shutdown_event = asyncio.Event()
+        # Event loop running start(); captured there so that code running
+        # outside the loop (listener thread, signal handler) can reach it.
+        self._loop: asyncio.AbstractEventLoop | None = None
+
+        logger.info("Echo assistant initialized (wake word: '%s')", wake_word)
+
+    # ------------------------------------------------------------------
+    # Wake word callback (runs in background thread)
+    # ------------------------------------------------------------------
+    def _on_wake_word(self, wake_word: str):
+        """Called by WakeWordListener when the wake word is detected."""
+        if self._processing:
+            logger.info("Still processing previous command — ignoring wake word")
+            return
+        # This callback runs in the listener's thread, where
+        # asyncio.get_running_loop() always raises. Hand the coroutine to the
+        # loop captured in start() through the thread-safe API instead.
+        loop = self._loop
+        if loop is None or not loop.is_running():
+            logger.warning("No running event loop for wake word callback")
+            return
+        asyncio.run_coroutine_threadsafe(self._handle_command(), loop)
+
+    # ------------------------------------------------------------------
+    # Main command pipeline
+    # ------------------------------------------------------------------
+    async def _handle_command(self):
+        """Full pipeline: transcribe → think → speak → act."""
+        if self._processing:
+            return
+        self._processing = True
+
+        try:
+            # Play a brief acknowledgment tone
+            logger.info("🔊 Wake word detected — listening...")
+
+            # Step 1: Transcribe. The call is synchronous, so run it in a
+            # worker thread to keep the event loop (and shutdown handling)
+            # responsive while the microphone is being recorded.
+            text = await asyncio.to_thread(self.transcriber.listen_and_transcribe)
+            if not text:
+                logger.info("No transcription — returning to idle")
+                return
+
+            logger.info("📝 You said: '%s'", text)
+
+            # Step 2: Stream LLM response with early TTS (Phase 5)
+            await self._stream_and_speak(text)
+
+        except Exception:
+            logger.exception("Error in command pipeline")
+        finally:
+            self._processing = False
+            logger.info("Returning to idle...")
+
+    # ------------------------------------------------------------------
+    # Phase 5: Parallel Streaming + TTS
+    # ------------------------------------------------------------------
+    async def _stream_and_speak(self, user_text: str):
+        """
+        Stream the LLM response and start TTS generation as soon as the
+        first complete sentence is available — minimizing perceived latency.
+
+        The rest of the reply is spoken once the stream has finished, using
+        the cleaned text from the "done" event minus the sentence that was
+        already spoken. Any command reported by the brain is executed last
+        and its summary is spoken as well.
+        """
+        buffer = ""
+        first_sentence = ""           # text handed to the early TTS task
+        first_sentence_spoken = False
+        first_task: asyncio.Task | None = None
+        final_text: str | None = None
+        pending_command = None
+
+        async for event in self.brain.think(user_text):
+            kind = event["type"]
+            if kind == "token":
+                buffer += event["text"]
+
+                # Check if we have a complete sentence
+                if not first_sentence_spoken and self._has_complete_sentence(buffer):
+                    # Split: first sentence goes to TTS immediately
+                    first_sentence = self._split_first_sentence(buffer)[0].strip()
+                    if first_sentence:
+                        logger.info("⚡ Early TTS trigger: '%s'", first_sentence[:60])
+                        first_task = asyncio.create_task(
+                            self.tts.speak(first_sentence)
+                        )
+                    first_sentence_spoken = True
+
+            elif kind == "command":
+                pending_command = event["command"]
+
+            elif kind == "done":
+                final_text = event["text"]
+
+        # Step 3: Work out what is still unspoken. The "done" text is the
+        # complete cleaned reply, so the early sentence must be removed from
+        # it; a snapshot taken at split time would miss later tokens.
+        if final_text is None:
+            final_text = buffer  # stream ended without a "done" event
+        remaining_text = self._strip_spoken_prefix(final_text, first_sentence)
+
+        # Wait for the first sentence before speaking the remainder
+        if first_task is not None:
+            await first_task
+        if remaining_text:
+            await self.tts.speak(remaining_text)
+
+        # Step 4: Execute any local command
+        if pending_command:
+            action_name = pending_command.get("action", "")
+            params = pending_command.get("params", {})
+            logger.info("🔧 Executing action: %s %s", action_name, params)
+            result = execute_action(action_name, params)
+            if result:
+                await self.tts.speak(result)
+
+    # ------------------------------------------------------------------
+    # Text utilities
+    # ------------------------------------------------------------------
+    @staticmethod
+    def _has_complete_sentence(text: str) -> bool:
+        """Check if the text buffer contains at least one complete sentence."""
+        # Complete means: . ! or ? followed by whitespace, or ending the text
+        return bool(re.search(r'[.!?]\s+|[.!?]$', text))
+
+    @staticmethod
+    def _split_first_sentence(text: str) -> list[str]:
+        """Split text at the first sentence boundary.
+
+        Returns ``[first, rest]`` when a boundary followed by whitespace is
+        found, otherwise a single-element list with the (stripped) text.
+        """
+        match = re.search(r'([.!?])\s+', text)
+        if match:
+            end = match.start() + 1
+            return [text[:end], text[end:].strip()]
+        # Check for ending punctuation without trailing space
+        match = re.search(r'[.!?]$', text.strip())
+        if match:
+            return [text.strip()]
+        return [text]
+
+    @staticmethod
+    def _strip_spoken_prefix(full_text: str, spoken: str) -> str:
+        """Return *full_text* without the already-spoken leading sentence.
+
+        If *spoken* is not a prefix of *full_text* (for example because the
+        cleaned text dropped what was spoken), the full text is returned.
+        """
+        full_text = full_text.strip()
+        spoken = spoken.strip()
+        if not spoken:
+            return full_text
+        if full_text.startswith(spoken):
+            return full_text[len(spoken):].strip()
+        # The cleaned reply may have lost its trailing period, so a reply
+        # consisting only of the spoken sentence no longer matches exactly.
+        if full_text == spoken.rstrip(".!?").strip():
+            return ""
+        return full_text
+
+    # ------------------------------------------------------------------
+    # Lifecycle
+    # ------------------------------------------------------------------
+    async def start(self):
+        """Start the Echo assistant and block until shutdown is requested."""
+        # Remember the loop so the listener thread and signal handlers can
+        # schedule work on it.
+        self._loop = asyncio.get_running_loop()
+
+        logger.info("=" * 60)
+        logger.info("  ECHO VOICE ASSISTANT")
+        logger.info("  Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper())
+        logger.info("  Press Ctrl+C to quit")
+        logger.info("=" * 60)
+
+        # Start wake word listener (runs in background thread)
+        self.wake_listener.start()
+
+        # Keep the async loop alive until shutdown
+        await self._shutdown_event.wait()
+
+    def shutdown(self):
+        """Signal the assistant to stop."""
+        logger.info("Shutting down Echo...")
+        self.wake_listener.stop()
+        loop = self._loop
+        if loop is not None and loop.is_running():
+            # Called from a signal handler or another thread: go through the
+            # thread-safe API so an idle loop is woken and notices the event.
+            loop.call_soon_threadsafe(self._shutdown_event.set)
+        else:
+            self._shutdown_event.set()
+
+
+# ---------------------------------------------------------------------------
+# Entry Point
+# ---------------------------------------------------------------------------
+def main():
+    """Create the assistant, wire up signal handling and run until stopped.
+
+    SIGINT and SIGTERM both request a shutdown. Whatever way the event loop
+    exits, the assistant is shut down and a farewell line is logged.
+    """
+    assistant = EchoAssistant()
+
+    # Graceful shutdown on Ctrl+C / termination request
+    for signum in (signal.SIGINT, signal.SIGTERM):
+        signal.signal(signum, lambda sig, frame: assistant.shutdown())
+
+    # Run the async event loop
+    try:
+        asyncio.run(assistant.start())
+    except KeyboardInterrupt:
+        pass
+    finally:
+        assistant.shutdown()
+        logger.info("Echo has shut down. Goodbye!")
+
+
+if __name__ == "__main__":
+    main()
--- a/requirements.txt
+++ b/requirements.txt
@ -0,0 +1,31 @@
+# ===========================================================
+# Echo Voice Assistant — Dependencies
+# ===========================================================
+# Install with:  pip install -r requirements.txt
+#
+# Note: For GPU-accelerated TTS, install PyTorch with CUDA
+# support first:  https://pytorch.org/get-started/locally/
+# ===========================================================
+
+# --- Core ---
+vosk>=0.3.45
+pyaudio>=0.2.14
+openwakeword>=0.5.0
+
+# --- LLM ---
+openai>=1.30.0
+python-dotenv>=1.0.0
+
+# --- TTS (Qwen3-TTS) ---
+# Install from source or PyPI once available:
+# pip install qwen-tts
+torch>=2.1.0
+soundfile>=0.12.1
+transformers>=4.40.0
+accelerate>=0.27.0
+
+# --- Audio Playback ---
+pygame>=2.5.0
+
+# --- Utilities ---
+numpy>=1.26.0
--- a/stt.py
+++ b/stt.py
@ -0,0 +1,195 @@
+"""
+stt.py — Speech-To-Text Module (Vosk + PyAudio + openWakeWord)
+
+Responsibilities:
+  1. Continuously monitor the microphone for a wake word ("echo").
+  2. Once triggered, capture and transcribe the full spoken command.
+  3. Return the transcribed text to the orchestrator.
+
+Environment Variables:
+  VOSK_MODEL_PATH  — path to a Vosk model directory (default: models/vosk-model-small-en-us)
+  WAKE_WORD        — wake phrase to listen for (default: "echo")
+
+Dependencies:
+  pip install vosk pyaudio openwakeword
+"""
+
+import json
+import logging
+import queue
+import threading
+import time
+from pathlib import Path
+
+import openwakeword
+import pyaudio
+from vosk import KaldiRecognizer, Model, SetLogLevel
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+# Vosk model directory used when Transcriber is given no model_path.
+DEFAULT_VOSK_MODEL = "models/vosk-model-small-en-us"
+# Wake word used when WakeWordListener is given no wake_word.
+DEFAULT_WAKE_WORD = "echo"
+# Microphone stream settings shared by every PyAudio stream in this module:
+# 16-bit integer samples, one channel, RATE samples per second, and CHUNK
+# frames per buffer / per read.
+FORMAT = pyaudio.paInt16
+CHANNELS = 1
+RATE = 16000
+CHUNK = 1024
+RECORD_SECONDS = 10          # max length of a voice command after wake word
+SILENCE_LIMIT = 1.5          # seconds of silence before we stop recording
+
+
+class WakeWordListener:
+    """Background thread that listens for the wake word using openWakeWord.
+
+    A PyAudio input stream pushes raw frames onto a queue from its callback;
+    a daemon thread pulls them off, scores them with openWakeWord and calls
+    ``on_detected(wake_word)`` when a score reaches ``threshold``.
+
+    Args:
+        wake_word: Name passed to openWakeWord as the model to load
+            (lower-cased).
+        on_detected: Optional callback invoked with the wake word. It runs
+            on the listener thread, not on the caller's thread.
+        threshold: Minimum score that counts as a detection.
+        cooldown: Seconds to wait after a detection before firing again, so
+            one utterance that scores high on several consecutive frames
+            triggers the callback once.
+
+    NOTE(review): ``wakeword_models`` receives the bare wake word; confirm
+    that the configured word resolves to a model openWakeWord can load.
+    """
+
+    def __init__(
+        self,
+        wake_word: str = DEFAULT_WAKE_WORD,
+        on_detected=None,
+        threshold: float = 0.5,
+        cooldown: float = 2.0,
+    ):
+        self.wake_word = wake_word.lower()
+        self.on_detected = on_detected  # callback(wake_word)
+        self.threshold = threshold
+        self.cooldown = cooldown
+        self._running = False
+        self._thread: threading.Thread | None = None
+        self._audio_queue: queue.Queue = queue.Queue()
+
+    # ---- audio callback fed to PyAudio stream ----
+    def _audio_callback(self, in_data, frame_count, time_info, status):
+        """Queue each captured buffer for the listener thread."""
+        self._audio_queue.put(in_data)
+        return (in_data, pyaudio.paContinue)
+
+    def start(self):
+        """Start the listener thread; does nothing if already running."""
+        if self._running:
+            return
+        self._running = True
+        self._thread = threading.Thread(target=self._listen_loop, daemon=True)
+        self._thread.start()
+        logger.info("WakeWordListener started — listening for '%s'", self.wake_word)
+
+    def stop(self):
+        """Ask the listener thread to exit and wait up to 5 seconds for it."""
+        self._running = False
+        worker = self._thread
+        # A thread cannot join itself; this matters when stop() is reached
+        # from inside the on_detected callback.
+        if worker is not None and worker is not threading.current_thread():
+            worker.join(timeout=5)
+        logger.info("WakeWordListener stopped")
+
+    def _listen_loop(self):
+        """Open PyAudio, feed frames to openWakeWord, fire callback on match."""
+        import numpy as np
+
+        oww = openwakeword.Model(
+            wakeword_models=[self.wake_word],
+            inference_framework="onnx",
+        )
+
+        pa = pyaudio.PyAudio()
+        try:
+            stream = pa.open(
+                format=FORMAT,
+                channels=CHANNELS,
+                rate=RATE,
+                input=True,
+                frames_per_buffer=CHUNK,
+                stream_callback=self._audio_callback,
+            )
+            try:
+                stream.start_stream()
+                last_fired: float | None = None
+
+                while self._running:
+                    try:
+                        frame = self._audio_queue.get(timeout=0.5)
+                    except queue.Empty:
+                        continue
+
+                    # The stream delivers raw int16 bytes; openWakeWord
+                    # scores a numpy array of int16 samples.
+                    scores = oww.predict(np.frombuffer(frame, dtype=np.int16))
+                    if not scores:
+                        continue
+
+                    model_name, score = max(scores.items(), key=lambda kv: kv[1])
+                    if score < self.threshold:
+                        continue
+
+                    now = time.monotonic()
+                    if last_fired is not None and now - last_fired < self.cooldown:
+                        # Same utterance still scoring high; already reported.
+                        continue
+                    last_fired = now
+
+                    logger.info("Wake word '%s' detected (score=%.2f)", model_name, score)
+                    if self.on_detected:
+                        try:
+                            self.on_detected(self.wake_word)
+                        except Exception:
+                            # Keep listening even if the callback fails.
+                            logger.exception("Wake word callback failed")
+            finally:
+                stream.stop_stream()
+                stream.close()
+        finally:
+            # Released even when opening the stream itself fails.
+            pa.terminate()
+
+
+class Transcriber:
+    """Captures microphone audio after wake word and transcribes via Vosk.
+
+    Args:
+        model_path: Directory of the Vosk model to load.
+        silence_threshold: RMS level above which a buffer counts as speech;
+            buffers at or below it count as silence.
+
+    Raises:
+        FileNotFoundError: If ``model_path`` does not exist.
+    """
+
+    def __init__(
+        self,
+        model_path: str = DEFAULT_VOSK_MODEL,
+        silence_threshold: float = 300,
+    ):
+        resolved = Path(model_path)
+        if not resolved.exists():
+            raise FileNotFoundError(
+                f"Vosk model not found at {resolved.resolve()}. "
+                "Download one from https://alphacephei.com/vosk/models"
+            )
+        SetLogLevel(-1)  # suppress Vosk internal noise
+        self.silence_threshold = silence_threshold
+        self._model = Model(str(resolved))
+        self._recognizer = KaldiRecognizer(self._model, RATE)
+
+    def listen_and_transcribe(self) -> str | None:
+        """
+        Open the mic, record until silence or timeout, and return the
+        best-effort transcription of the spoken command.
+
+        Recording ends once speech has been heard and more than
+        SILENCE_LIMIT seconds of silence follow it, or once more than
+        RECORD_SECONDS of audio has been captured.
+
+        Returns:
+            Transcribed text (str) or None if nothing was understood.
+        """
+        bytes_per_sample = 2  # FORMAT is 16-bit
+        max_bytes = RATE * RECORD_SECONDS * bytes_per_sample
+
+        chunks: list[bytes] = []
+        captured = 0
+        silence_start: float | None = None
+        started_speaking = False
+
+        pa = pyaudio.PyAudio()
+        try:
+            stream = pa.open(
+                format=FORMAT,
+                channels=CHANNELS,
+                rate=RATE,
+                input=True,
+                frames_per_buffer=CHUNK,
+            )
+            try:
+                stream.start_stream()
+                logger.info("Recording voice command...")
+
+                while True:
+                    data = stream.read(CHUNK, exception_on_overflow=False)
+                    chunks.append(data)
+                    captured += len(data)
+
+                    # Quick RMS check for silence detection
+                    if self._rms(data) > self.silence_threshold:
+                        started_speaking = True
+                        silence_start = None
+                    elif started_speaking and silence_start is None:
+                        # Monotonic clock: unaffected by wall-clock changes.
+                        silence_start = time.monotonic()
+
+                    # Stop on silence timeout or max duration
+                    if (
+                        silence_start is not None
+                        and (time.monotonic() - silence_start) > SILENCE_LIMIT
+                    ):
+                        logger.debug("Silence detected — ending recording")
+                        break
+                    if captured > max_bytes:
+                        logger.debug("Max recording duration reached")
+                        break
+            finally:
+                stream.stop_stream()
+                stream.close()
+        finally:
+            # Released even when opening the stream itself fails.
+            pa.terminate()
+
+        if not started_speaking:
+            logger.info("No speech detected after wake word")
+            return None
+
+        # Feed all collected audio to Vosk for final transcription
+        self._recognizer.Reset()
+        if self._recognizer.AcceptWaveform(b"".join(chunks)):
+            payload = self._recognizer.Result()
+        else:
+            # No endpoint was detected, so ask for the final result of what
+            # was fed rather than an in-progress partial hypothesis.
+            payload = self._recognizer.FinalResult()
+        text = json.loads(payload).get("text", "").strip()
+
+        logger.info("Transcription: '%s'", text)
+        return text if text else None
+
+    @staticmethod
+    def _rms(data: bytes) -> float:
+        """Compute Root Mean Square of a byte buffer of int16 samples.
+
+        A trailing odd byte is ignored; an empty buffer yields 0.0.
+        """
+        import array
+
+        usable = len(data) - (len(data) % 2)
+        samples = array.array("h", data[:usable])
+        if not samples:
+            return 0.0
+        sum_sq = sum(s * s for s in samples)
+        return (sum_sq / len(samples)) ** 0.5
--- a/tts.py
+++ b/tts.py
@ -0,0 +1,181 @@
+"""
+tts.py — Text-To-Speech Module (Qwen3-TTS)
+
+Responsibilities:
+  1. Accept text (full or partial sentence) and generate a .wav audio file
+     using the Qwen3-TTS model running locally.
+  2. Support voice selection (preset voices or custom voice cloning).
+  3. Support instruction-based style control (e.g., energy, tone).
+  4. Play the generated audio immediately.
+
+Environment Variables:
+  QWEN_TTS_MODEL    — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
+  QWEN_TTS_VOICE    — preset voice name or path to 3s .wav sample
+  QWEN_TTS_INSTRUCT  — default style instruction for speech generation
+
+Dependencies:
+  pip install qwen-tts torch soundfile pygame
+"""
+
+import asyncio
+import logging
+import os
+import tempfile
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+# ---------------------------------------------------------------------------
+# Configuration
+# ---------------------------------------------------------------------------
+# Model identifier used when TTSEngine is created without model_name.
+DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
+DEFAULT_VOICE = "Ryan"  # preset voice; alternatives: "Serena", "Diana", etc.
+# Style instruction used when a generation call supplies no override.
+DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational."
+# Directory that receives the generated .wav files (relative path).
+OUTPUT_DIR = Path("audio_output")
+
+
+class TTSEngine:
+    """
+    Wrapper around Qwen3-TTS for generating speech from text.
+
+    The engine lazily loads the model on first use to avoid slow startup.
+    Generation runs in a worker thread and is serialised by an asyncio lock;
+    playback goes through pygame.mixer.
+
+    Args:
+        model_name: Model name or local path handed to ``from_pretrained``.
+        voice: Preset voice name, or a path to a voice sample file.
+        instruction: Default style instruction for generation.
+        output_dir: Directory for generated .wav files (created if missing).
+
+    NOTE(review): the ``qwen_tts`` class names and the ``generate`` keyword
+    arguments used here should be confirmed against the installed package.
+    """
+
+    def __init__(
+        self,
+        model_name: str = DEFAULT_MODEL,
+        voice: str = DEFAULT_VOICE,
+        instruction: str = DEFAULT_INSTRUCTION,
+        output_dir: str | Path = OUTPUT_DIR,
+    ):
+        self.model_name = model_name
+        self.voice = voice
+        self.instruction = instruction
+        self.output_dir = Path(output_dir)
+        self.output_dir.mkdir(parents=True, exist_ok=True)
+
+        self._model = None
+        self._processor = None
+        self._lock = asyncio.Lock()  # prevent concurrent generation
+
+    # ---- lazy model loading ----
+    def _ensure_loaded(self):
+        """Load model and processor on first call (lazy init).
+
+        Raises:
+            ImportError: If the ``qwen_tts`` package cannot be imported.
+        """
+        if self._model is not None:
+            return
+
+        logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name)
+        # Only the import is guarded, so an ImportError raised while the
+        # model itself loads is not misreported as a missing package.
+        try:
+            from qwen_tts import QwenTTSProcessor, QwenTTSModel
+        except ImportError as err:
+            raise ImportError(
+                "qwen-tts is not installed. Install it with:\n"
+                "  pip install qwen-tts torch soundfile\n"
+                "Also ensure you have CUDA-capable GPU for low-latency inference."
+            ) from err
+
+        self._processor = QwenTTSProcessor()
+        self._model = QwenTTSModel.from_pretrained(self.model_name)
+        logger.info("Qwen3-TTS model loaded successfully")
+
+    # ---- generation ----
+    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
+        """
+        Generate speech audio from text and save as .wav.
+
+        Args:
+            text: The text to convert to speech.
+            instruction: Optional style instruction override.
+
+        Returns:
+            Path to the generated .wav file, or None on failure
+            (including empty or whitespace-only text).
+        """
+        if not text or not text.strip():
+            return None
+
+        async with self._lock:
+            return await asyncio.to_thread(
+                self._generate_sync, text.strip(), instruction or self.instruction
+            )
+
+    def _generate_sync(self, text: str, instruction: str) -> Path | None:
+        """Synchronous generation (runs in thread pool).
+
+        Returns the path of the written .wav file, or None if generation or
+        saving raised. A failure to load the model is not caught here.
+        """
+        self._ensure_loaded()
+
+        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
+
+        try:
+            # Build voice reference: preset name or custom .wav path
+            voice_ref = self.voice
+            if Path(self.voice).exists():
+                voice_ref = str(Path(self.voice).resolve())
+
+            # Generate audio
+            logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice)
+            audio_array = self._model.generate(
+                processor=self._processor,
+                text=text,
+                voice=voice_ref,
+                instruction=instruction,
+            )
+
+            # Save to file
+            import soundfile as sf
+
+            sample_rate = self._processor.sampling_rate
+            sf.write(str(output_path), audio_array, sample_rate)
+            logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate)
+            return output_path
+
+        except Exception:
+            logger.exception("TTS generation failed for: '%s'", text[:60])
+            return None
+
+    # ---- playback ----
+    async def speak(self, text: str, instruction: str | None = None) -> bool:
+        """
+        Generate speech from text and play it immediately.
+
+        Returns:
+            True if playback succeeded, False otherwise.
+        """
+        wav_path = await self.generate(text, instruction)
+        if not wav_path:
+            return False
+        return await self._play(wav_path)
+
+    async def speak_file(self, wav_path: Path) -> bool:
+        """Play a previously generated .wav file."""
+        return await self._play(wav_path)
+
+    @staticmethod
+    async def _play(wav_path: Path) -> bool:
+        """Play a .wav file using pygame.mixer (async-friendly).
+
+        Returns:
+            True once playback has finished, False if it raised.
+        """
+        try:
+            import pygame
+
+            pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
+            try:
+                pygame.mixer.music.load(str(wav_path))
+                pygame.mixer.music.play()
+
+                # Wait for playback to finish
+                while pygame.mixer.music.get_busy():
+                    await asyncio.sleep(0.05)
+
+                pygame.mixer.music.stop()
+            finally:
+                # Shut the mixer down on errors and on task cancellation
+                # too, not only after a clean playback.
+                pygame.mixer.quit()
+
+            logger.info("Playback finished: %s", wav_path.name)
+            return True
+        except Exception:
+            logger.exception("Playback failed for %s", wav_path)
+            return False
+
+    def set_voice(self, voice: str):
+        """Switch to a different voice preset or custom sample path."""
+        self.voice = voice
+        logger.info("Voice set to: %s", voice)
+
+    def set_instruction(self, instruction: str):
+        """Update the default style instruction."""
+        self.instruction = instruction
+        logger.info("TTS instruction updated: %s", instruction)