moxieTalking/main.py
Echo Assistant d6b64d04d1 feat: initial Echo voice assistant — Vosk + OpenRouter + Qwen3-TTS
- stt.py: WakeWordListener (openWakeWord) + Transcriber (Vosk)
- brain.py: Async OpenRouter streaming client with command parsing
- tts.py: Qwen3-TTS engine with voice selection & instruction control
- actions.py: 10 local OS commands (open_app, set_timer, search, etc.)
- main.py: Async orchestrator with Phase 5 parallel TTS streaming
2026-03-31 00:09:00 +00:00

284 lines
10 KiB
Python

"""
main.py — Echo Voice Assistant Orchestrator
Ties together all modules:
1. WakeWordListener (stt.py) — continuously listens for "echo"
2. Transcriber (stt.py) — captures & transcribes voice commands
3. Brain (brain.py) — sends text to OpenRouter, streams response
4. TTSEngine (tts.py) — generates speech from text (Qwen3-TTS)
5. Actions (actions.py) — executes local OS commands
Phase 5 Parallel Processing:
As soon as the first complete sentence is received from the Brain's
streamed response, TTS generation begins immediately — before the
full LLM response has finished streaming.
Usage:
python main.py
"""
import asyncio
import logging
import os
import re
import signal
import sys
from pathlib import Path
from dotenv import load_dotenv
from stt import WakeWordListener, Transcriber
from brain import Brain
from tts import TTSEngine
from actions import execute as execute_action
# ---------------------------------------------------------------------------
# Logging setup
# ---------------------------------------------------------------------------
# Log line layout: time, logger name (padded to 18), level (padded to 7),
# message. Fields are separated by two spaces so that adjacent columns never
# run together -- a 7-character level such as WARNING fills its column
# exactly and would otherwise be glued to the message.
LOG_FORMAT = "%(asctime)s  %(name)-18s  %(levelname)-7s  %(message)s"
LOG_DATE_FORMAT = "%H:%M:%S"

logging.basicConfig(
    level=logging.INFO,
    format=LOG_FORMAT,
    datefmt=LOG_DATE_FORMAT,
)
# Logger used by the rest of this module.
logger = logging.getLogger("echo")
# ---------------------------------------------------------------------------
# Load environment
# ---------------------------------------------------------------------------
# Read settings from the ".env" file located in the same directory as this module.
load_dotenv(Path(__file__).with_name(".env"))
class EchoAssistant:
    """
    Main orchestrator for the Echo voice assistant.

    Lifecycle:
        1. Start wake word listener (background thread).
        2. On wake word detected → transcribe command.
        3. Stream LLM response → start TTS on first sentence (parallel).
        4. Execute any local commands from the LLM response.
    """

    def __init__(self):
        """Build the STT, LLM and TTS components from environment settings."""
        # --- STT ---
        model_path = os.environ.get(
            "VOSK_MODEL_PATH", "models/vosk-model-small-en-us"
        )
        self._wake_word = os.environ.get("WAKE_WORD", "echo")
        self.transcriber = Transcriber(model_path=model_path)
        self.wake_listener = WakeWordListener(
            wake_word=self._wake_word,
            on_detected=self._on_wake_word,
        )

        # --- Brain (LLM) ---
        self.brain = Brain(
            api_key=os.environ.get("OPENROUTER_API_KEY"),
            model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
        )

        # --- TTS ---
        self.tts = TTSEngine(
            model_name=os.environ.get(
                "QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
            ),
            voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"),
            instruction=os.environ.get(
                "QWEN_TTS_INSTRUCT",
                "Speak clearly with a warm, friendly tone. Be natural and conversational.",
            ),
        )

        # --- State ---
        self._processing = False  # guard against concurrent commands
        self._shutdown_requested = False  # makes shutdown() idempotent
        # The loop and the shutdown event are created in start(), i.e. inside
        # the running event loop. Creating asyncio primitives before the loop
        # exists binds them to a different loop on Python < 3.10.
        self._loop = None
        self._shutdown_event = None
        logger.info("Echo assistant initialized (wake word: '%s')", self._wake_word)

    # ------------------------------------------------------------------
    # Wake word callback (runs in background thread)
    # ------------------------------------------------------------------
    def _on_wake_word(self, wake_word: str):
        """
        Called by WakeWordListener when the wake word is detected.

        Schedules `_handle_command()` on the event loop captured by `start()`.
        The loop cannot be looked up with `asyncio.get_running_loop()` here:
        that call raises RuntimeError in any thread that is not itself running
        the loop, which is exactly the situation of a listener thread.

        Args:
            wake_word: The detected wake word (not used by this handler).
        """
        if self._processing:
            logger.info("Still processing previous command — ignoring wake word")
            return

        loop = self._loop
        if loop is None or loop.is_closed():
            logger.warning("No running event loop for wake word callback")
            return

        coro = self._handle_command()
        try:
            asyncio.run_coroutine_threadsafe(coro, loop)
        except RuntimeError:
            # The loop was closed between the check above and the call.
            coro.close()  # avoid a "coroutine was never awaited" warning
            logger.warning("No running event loop for wake word callback")

    # ------------------------------------------------------------------
    # Main command pipeline
    # ------------------------------------------------------------------
    async def _handle_command(self):
        """
        Full pipeline: transcribe → think → speak → act.

        Only one command is processed at a time; a call made while another
        command is in flight returns immediately. Any exception raised by the
        pipeline is logged and swallowed so the assistant returns to idle.
        """
        if self._processing:
            return
        self._processing = True
        try:
            logger.info("🔊 Wake word detected — listening...")

            # Step 1: Transcribe
            # NOTE(review): this is a synchronous call made from a coroutine,
            # so the event loop is blocked while it runs. Moving it to a
            # worker thread would need the transcriber to be safe to call
            # off the loop thread — confirm before changing.
            text = self.transcriber.listen_and_transcribe()
            if not text:
                logger.info("No transcription — returning to idle")
                return
            logger.info("📝 You said: '%s'", text)

            # Step 2: Stream LLM response with early TTS (Phase 5)
            await self._stream_and_speak(text)
        except Exception:
            logger.exception("Error in command pipeline")
        finally:
            self._processing = False
            logger.info("Returning to idle...")

    # ------------------------------------------------------------------
    # Phase 5: Parallel Streaming + TTS
    # ------------------------------------------------------------------
    async def _stream_and_speak(self, user_text: str):
        """
        Stream the LLM response and start TTS generation as soon as the
        first complete sentence is available — minimizing perceived latency.

        Events yielded by `self.brain.think()` are dicts with a "type" key:
        "token" (with "text"), "command" (with "command") and "done"
        (optionally with "text"). Everything after the first sentence is
        spoken once, after the stream has ended; a pending command is executed
        last and its result, if any, is spoken too.

        Args:
            user_text: The transcribed user request sent to the LLM.
        """
        buffer = ""            # streamed text not yet handed to TTS
        spoken_prefix = ""     # first sentence already handed to TTS
        final_text = None      # "text" of the "done" event, if provided
        pending_command = None
        early_task = None      # TTS task for the first sentence

        try:
            async for event in self.brain.think(user_text):
                kind = event["type"]
                if kind == "token":
                    buffer += event["text"]
                    # Only the first sentence is split off early.
                    if early_task is None and self._has_complete_sentence(buffer):
                        parts = self._split_first_sentence(buffer)
                        first_sentence = parts[0].strip()
                        if first_sentence:
                            logger.info(
                                "⚡ Early TTS trigger: '%s'", first_sentence[:60]
                            )
                            early_task = asyncio.create_task(
                                self.tts.speak(first_sentence)
                            )
                            spoken_prefix = first_sentence
                            buffer = parts[1] if len(parts) > 1 else ""
                elif kind == "command":
                    pending_command = event["command"]
                elif kind == "done":
                    final_text = event.get("text")

            # Step 3: Speak whatever has not been spoken yet, after the first
            # sentence has finished so the two utterances do not overlap.
            remaining_text = self._unspoken_text(buffer, spoken_prefix, final_text)
            if early_task is not None:
                await early_task
            if remaining_text:
                await self.tts.speak(remaining_text)
        finally:
            # If the stream failed part-way, do not leave the early TTS task
            # running after this command has been abandoned.
            if early_task is not None and not early_task.done():
                early_task.cancel()

        # Step 4: Execute any local command
        if pending_command:
            action_name = pending_command.get("action", "")
            params = pending_command.get("params", {})
            logger.info("🔧 Executing action: %s %s", action_name, params)
            result = execute_action(action_name, params)
            if result:
                await self.tts.speak(result)

    # ------------------------------------------------------------------
    # Text utilities
    # ------------------------------------------------------------------
    @staticmethod
    def _unspoken_text(buffer: str, spoken_prefix: str, final_text=None) -> str:
        """
        Return the part of the response that still has to be spoken.

        Args:
            buffer: Streamed text accumulated after the early-spoken sentence.
            spoken_prefix: The sentence already handed to TTS ("" if none).
            final_text: The "text" of the "done" event, or None if the event
                carried none. Presumably the complete response — confirm
                against brain.py.

        Returns:
            The stripped text to speak, or "" when nothing is left.
        """
        tail = buffer.strip()
        if final_text is None:
            return tail
        full = final_text.strip()
        if not spoken_prefix:
            # Nothing was spoken early: use the final text, but only when
            # some text was actually streamed.
            return full if tail else ""
        if full.startswith(spoken_prefix):
            # Drop the sentence that was already spoken so it is not repeated.
            return full[len(spoken_prefix):].strip()
        # The final text does not line up with what was spoken; fall back to
        # the text that was streamed after the first sentence.
        return tail

    @staticmethod
    def _has_complete_sentence(text: str) -> bool:
        """Check if the text buffer contains at least one complete sentence."""
        # A sentence is complete when '.', '!' or '?' is followed by
        # whitespace or sits at the very end of the buffer.
        return bool(re.search(r'[.!?]\s+|[.!?]$', text))

    @staticmethod
    def _split_first_sentence(text: str) -> list[str]:
        """
        Split text at the first sentence boundary.

        Returns:
            `[first_sentence, rest]` when punctuation is followed by
            whitespace, otherwise a single-element list holding the text
            (stripped when it ends with sentence punctuation).
        """
        match = re.search(r'([.!?])\s+', text)
        if match:
            end = match.start() + 1  # keep the punctuation with the sentence
            return [text[:end], text[end:].strip()]
        # Check for ending punctuation without trailing space
        stripped = text.strip()
        if re.search(r'[.!?]$', stripped):
            return [stripped]
        return [text]

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    async def start(self):
        """
        Start the Echo assistant and block until `shutdown()` is called.

        Must run inside the event loop: it records the running loop so that
        the wake word callback and `shutdown()` can reach it from other
        threads or from signal handlers.
        """
        self._shutdown_event = asyncio.Event()
        self._loop = asyncio.get_running_loop()
        if self._shutdown_requested:
            # shutdown() was called before the loop started; nothing to run.
            return

        logger.info("=" * 60)
        logger.info(" ECHO VOICE ASSISTANT")
        logger.info(" Say '%s' to activate", self._wake_word.upper())
        logger.info(" Press Ctrl+C to quit")
        logger.info("=" * 60)

        # Start wake word listener (runs in background thread)
        self.wake_listener.start()

        # Keep the async loop alive until shutdown
        await self._shutdown_event.wait()

    def shutdown(self):
        """
        Signal the assistant to stop.

        Safe to call more than once and from any thread or signal handler:
        the shutdown event is set through `call_soon_threadsafe`, which also
        wakes an event loop that is idle in its selector. Setting the event
        directly would not wake it.
        """
        if self._shutdown_requested:
            return
        self._shutdown_requested = True

        logger.info("Shutting down Echo...")
        self.wake_listener.stop()

        loop, event = self._loop, self._shutdown_event
        if event is None or loop is None or loop.is_closed():
            # start() has not run yet (it checks the flag itself), or the
            # loop is already gone and nothing is waiting on the event.
            return
        try:
            loop.call_soon_threadsafe(event.set)
        except RuntimeError:
            # The loop was closed concurrently; nothing is left to wake.
            pass
# ---------------------------------------------------------------------------
# Entry Point
# ---------------------------------------------------------------------------
def main():
    """
    Create the assistant, run it until SIGINT/SIGTERM, then shut it down.

    Signal handlers are registered through the event loop where supported.
    A handler installed with `signal.signal()` runs outside the loop's
    control and may not wake a loop that is idle in its selector, so the
    shutdown request could go unnoticed.
    """
    assistant = EchoAssistant()
    shutdown_requested = False

    def _request_shutdown(*_args):
        # Accepts (signum, frame) when used as a plain signal handler and no
        # arguments when used as an event-loop callback. Runs shutdown once.
        nonlocal shutdown_requested
        if shutdown_requested:
            return
        shutdown_requested = True
        assistant.shutdown()

    async def _run():
        # Graceful shutdown on Ctrl+C / termination
        loop = asyncio.get_running_loop()
        for sig in (signal.SIGINT, signal.SIGTERM):
            try:
                loop.add_signal_handler(sig, _request_shutdown)
            except NotImplementedError:
                # Event loops without signal support (e.g. on Windows).
                signal.signal(sig, _request_shutdown)
        await assistant.start()

    # Run the async event loop
    try:
        asyncio.run(_run())
    except KeyboardInterrupt:
        # Ctrl+C arrived before the handlers above were installed.
        pass
    finally:
        _request_shutdown()
        logger.info("Echo has shut down. Goodbye!")


if __name__ == "__main__":
    main()