""" main.py — Echo Voice Assistant Orchestrator Ties together all modules: 1. WakeWordListener (stt.py) — continuously listens for "echo" 2. Transcriber (stt.py) — captures & transcribes voice commands 3. Brain (brain.py) — sends text to OpenRouter, streams response 4. TTSEngine (tts.py) — generates speech from text (Qwen3-TTS) 5. Actions (actions.py) — executes local OS commands Phase 5 Parallel Processing: As soon as the first complete sentence is received from the Brain's streamed response, TTS generation begins immediately — before the full LLM response has finished streaming. Usage: python main.py """ import asyncio import logging import os import re import signal import sys from pathlib import Path from dotenv import load_dotenv from stt import WakeWordListener, Transcriber from brain import Brain from tts import TTSEngine from actions import execute as execute_action # --------------------------------------------------------------------------- # Logging setup # --------------------------------------------------------------------------- logging.basicConfig( level=logging.INFO, format="%(asctime)s │ %(name)-18s │ %(levelname)-7s │ %(message)s", datefmt="%H:%M:%S", ) logger = logging.getLogger("echo") # --------------------------------------------------------------------------- # Load environment # --------------------------------------------------------------------------- load_dotenv(Path(__file__).parent / ".env") class EchoAssistant: """ Main orchestrator for the Echo voice assistant. Lifecycle: 1. Start wake word listener (background thread). 2. On wake word detected → transcribe command. 3. Stream LLM response → start TTS on first sentence (parallel). 4. Execute any local commands from the LLM response. """ def __init__(self): # --- STT --- model_path = os.environ.get( "VOSK_MODEL_PATH", "models/vosk-model-small-en-us" ) wake_word = os.environ.get("WAKE_WORD", "echo") self.transcriber = Transcriber(model_path=model_path) self.wake_listener = WakeWordListener( wake_word=wake_word, on_detected=self._on_wake_word, ) # --- Brain (LLM) --- self.brain = Brain( api_key=os.environ.get("OPENROUTER_API_KEY"), model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"), ) # --- TTS --- self.tts = TTSEngine( model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"), voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"), instruction=os.environ.get( "QWEN_TTS_INSTRUCT", "Speak clearly with a warm, friendly tone. Be natural and conversational.", ), ) # --- State --- self._processing = False # guard against concurrent commands self._shutdown_event = asyncio.Event() logger.info("Echo assistant initialized (wake word: '%s')", wake_word) # ------------------------------------------------------------------ # Wake word callback (runs in background thread) # ------------------------------------------------------------------ def _on_wake_word(self, wake_word: str): """Called by WakeWordListener when the wake word is detected.""" if self._processing: logger.info("Still processing previous command — ignoring wake word") return # Schedule the command processing in the async event loop try: loop = asyncio.get_running_loop() loop.call_soon_threadsafe(loop.create_task, self._handle_command()) except RuntimeError: logger.warning("No running event loop for wake word callback") # ------------------------------------------------------------------ # Main command pipeline # ------------------------------------------------------------------ async def _handle_command(self): """Full pipeline: transcribe → think → speak → act.""" if self._processing: return self._processing = True try: # Play a brief acknowledgment tone logger.info("🔊 Wake word detected — listening...") # Step 1: Transcribe text = self.transcriber.listen_and_transcribe() if not text: logger.info("No transcription — returning to idle") return logger.info("📝 You said: '%s'", text) # Step 2: Stream LLM response with early TTS (Phase 5) await self._stream_and_speak(text) except Exception: logger.exception("Error in command pipeline") finally: self._processing = False logger.info("Returning to idle...") # ------------------------------------------------------------------ # Phase 5: Parallel Streaming + TTS # ------------------------------------------------------------------ async def _stream_and_speak(self, user_text: str): """ Stream the LLM response and start TTS generation as soon as the first complete sentence is available — minimizing perceived latency. """ buffer = "" first_sentence_spoken = False remaining_text = "" pending_command = None tts_tasks: list[asyncio.Task] = [] async for event in self.brain.think(user_text): if event["type"] == "token": buffer += event["text"] # Check if we have a complete sentence if not first_sentence_spoken and self._has_complete_sentence(buffer): # Split: first sentence goes to TTS immediately sentences = self._split_first_sentence(buffer) first_sentence = sentences[0] remaining_text = sentences[1] if len(sentences) > 1 else "" if first_sentence.strip(): logger.info("⚡ Early TTS trigger: '%s'", first_sentence[:60]) task = asyncio.create_task( self.tts.speak(first_sentence.strip()) ) tts_tasks.append(task) first_sentence_spoken = True buffer = remaining_text elif event["type"] == "command": pending_command = event["command"] elif event["type"] == "done": # Any remaining text after the first sentence final_text = buffer.strip() if final_text and final_text != remaining_text: final_text = event["text"] # Remove the already-spoken first sentence if first_sentence_spoken and remaining_text: pass # remaining_text already has what we need else: remaining_text = final_text # Step 3: Speak the remaining text after first sentence finishes remaining_text = remaining_text.strip() if remaining_text: # Wait for first sentence TTS to finish for task in tts_tasks: await task await self.tts.speak(remaining_text) # Wait for all TTS tasks to complete for task in tts_tasks: if not task.done(): await task # Step 4: Execute any local command if pending_command: action_name = pending_command.get("action", "") params = pending_command.get("params", {}) logger.info("🔧 Executing action: %s %s", action_name, params) result = execute_action(action_name, params) if result: await self.tts.speak(result) # ------------------------------------------------------------------ # Text utilities # ------------------------------------------------------------------ @staticmethod def _has_complete_sentence(text: str) -> bool: """Check if the text buffer contains at least one complete sentence.""" # A sentence is considered complete if it ends with . ! ? or ... return bool(re.search(r'[.!?]\s+|[.!?]$', text)) @staticmethod def _split_first_sentence(text: str) -> list[str]: """Split text at the first sentence boundary.""" match = re.search(r'([.!?])\s+', text) if match: end = match.start() + 1 return [text[:end], text[end:].strip()] # Check for ending punctuation without trailing space match = re.search(r'[.!?]$', text.strip()) if match: return [text.strip()] return [text] # ------------------------------------------------------------------ # Lifecycle # ------------------------------------------------------------------ async def start(self): """Start the Echo assistant.""" logger.info("=" * 60) logger.info(" ECHO VOICE ASSISTANT") logger.info(" Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper()) logger.info(" Press Ctrl+C to quit") logger.info("=" * 60) # Start wake word listener (runs in background thread) self.wake_listener.start() # Keep the async loop alive until shutdown await self._shutdown_event.wait() def shutdown(self): """Signal the assistant to stop.""" logger.info("Shutting down Echo...") self.wake_listener.stop() self._shutdown_event.set() # --------------------------------------------------------------------------- # Entry Point # --------------------------------------------------------------------------- def main(): assistant = EchoAssistant() # Graceful shutdown on Ctrl+C def _signal_handler(sig, frame): assistant.shutdown() signal.signal(signal.SIGINT, _signal_handler) signal.signal(signal.SIGTERM, _signal_handler) # Run the async event loop try: asyncio.run(assistant.start()) except KeyboardInterrupt: pass finally: assistant.shutdown() logger.info("Echo has shut down. Goodbye!") if __name__ == "__main__": main()