- tts.py: voice_sample param replaces preset voice, add validate_voice_sample(), add record_voice_sample() with CLI (python tts.py record), validate .wav format/duration/channels on init
- main.py: warn at startup if voice sample missing, show voice status in banner
- .env.example: QWEN_TTS_VOICE now points to voices/echo_voice.wav
- .gitignore: voice samples gitignored (personal data)
- voices/README.md: instructions for recording & placing voice samples
295 lines
11 KiB
Python
295 lines
11 KiB
Python
"""
main.py — Echo Voice Assistant Orchestrator

Ties together all modules:
    1. WakeWordListener (stt.py) — continuously listens for "echo"
    2. Transcriber (stt.py) — captures & transcribes voice commands
    3. Brain (brain.py) — sends text to OpenRouter, streams response
    4. TTSEngine (tts.py) — generates speech from text (Qwen3-TTS)
    5. Actions (actions.py) — executes local OS commands

Phase 5 Parallel Processing:
    As soon as the first complete sentence is received from the Brain's
    streamed response, TTS generation begins immediately — before the
    full LLM response has finished streaming.

Usage:
    python main.py
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import re
|
|
import signal
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
from dotenv import load_dotenv
|
|
|
|
from stt import WakeWordListener, Transcriber
|
|
from brain import Brain
|
|
from tts import TTSEngine
|
|
from actions import execute as execute_action
|
|
|
|
# ---------------------------------------------------------------------------
# Logging setup
# ---------------------------------------------------------------------------
# Column-aligned record layout: time │ logger name │ level │ message.
_LOG_FORMAT = "%(asctime)s │ %(name)-18s │ %(levelname)-7s │ %(message)s"
_LOG_DATE_FORMAT = "%H:%M:%S"

logging.basicConfig(
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
    level=logging.INFO,
)

# Logger used by the code in this module.
logger = logging.getLogger("echo")
|
|
|
|
# ---------------------------------------------------------------------------
# Load environment
# ---------------------------------------------------------------------------
# Load variables from the .env file located in this module's directory.
load_dotenv(Path(__file__).with_name(".env"))
|
|
|
|
|
|
class EchoAssistant:
    """
    Main orchestrator for the Echo voice assistant.

    Lifecycle:
        1. Start wake word listener (background thread).
        2. On wake word detected → transcribe command.
        3. Stream LLM response → start TTS on first sentence (parallel).
        4. Execute any local commands from the LLM response.

    Threading model:
        The command pipeline runs as coroutines on the asyncio event loop
        that executes ``start()``. The wake word callback is invoked from the
        listener's background thread, so it only *schedules* work onto that
        loop and never touches asyncio objects directly.
    """

    def __init__(self):
        """Read configuration from the environment and build all components."""
        # --- STT ---
        model_path = os.environ.get(
            "VOSK_MODEL_PATH", "models/vosk-model-small-en-us"
        )
        # Stored once so the startup banner reports the values actually in use.
        self._wake_word = os.environ.get("WAKE_WORD", "echo")

        self.transcriber = Transcriber(model_path=model_path)
        self.wake_listener = WakeWordListener(
            wake_word=self._wake_word,
            on_detected=self._on_wake_word,
        )

        # --- Brain (LLM) ---
        self.brain = Brain(
            api_key=os.environ.get("OPENROUTER_API_KEY"),
            model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
        )

        # --- TTS (Cloned Voice) ---
        self._voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
        self.tts = TTSEngine(
            model_name=os.environ.get(
                "QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
            ),
            voice_sample=self._voice_path,
            instruction=os.environ.get(
                "QWEN_TTS_INSTRUCT",
                "Speak clearly with a warm, friendly tone. Be natural and conversational.",
            ),
        )

        # --- State ---
        self._processing = False  # guard against concurrent commands
        # The loop and the shutdown event are bound in start(): an
        # asyncio.Event created before asyncio.run() can end up attached to a
        # different loop on older Python versions.
        self._loop = None
        self._shutdown_event = None
        self._shutdown_requested = False

        if not Path(self._voice_path).exists():
            logger.warning(
                "No voice sample at '%s' — TTS will not work until you record one. "
                "Run: python tts.py record",
                self._voice_path,
            )

        logger.info("Echo assistant initialized (wake word: '%s')", self._wake_word)

    # ------------------------------------------------------------------
    # Wake word callback (runs in background thread)
    # ------------------------------------------------------------------
    def _on_wake_word(self, wake_word: str) -> None:
        """
        Called by WakeWordListener when the wake word is detected.

        Schedules ``_handle_command()`` on the event loop captured by
        ``start()``. ``asyncio.get_running_loop()`` cannot be used here: it
        only works inside the loop's own thread and raises RuntimeError when
        called from the listener thread.

        Args:
            wake_word: The detected wake word (unused; part of the callback
                signature).
        """
        if self._processing:
            logger.info("Still processing previous command — ignoring wake word")
            return

        loop = self._loop
        if loop is None or loop.is_closed():
            logger.warning("No running event loop for wake word callback")
            return

        coro = self._handle_command()
        try:
            # Thread-safe hand-off of the coroutine to the loop's thread.
            asyncio.run_coroutine_threadsafe(coro, loop)
        except RuntimeError:
            # The loop was closed between the check above and the hand-off.
            coro.close()  # avoid a "coroutine was never awaited" warning
            logger.warning("No running event loop for wake word callback")

    # ------------------------------------------------------------------
    # Main command pipeline
    # ------------------------------------------------------------------
    async def _handle_command(self) -> None:
        """
        Full pipeline: transcribe → think → speak → act.

        Any exception raised by the pipeline is logged and swallowed so that a
        single failed command never stops the assistant; the ``_processing``
        guard is always released.
        """
        if self._processing:
            return
        self._processing = True

        try:
            # Acknowledge the wake word (log only; no tone is played here).
            logger.info("🔊 Wake word detected — listening...")

            # Step 1: Transcribe
            # NOTE(review): this call is synchronous and presumably blocks the
            # event loop while recording — confirm whether Transcriber may be
            # driven from a worker thread before moving it off the loop.
            text = self.transcriber.listen_and_transcribe()
            if not text:
                logger.info("No transcription — returning to idle")
                return

            logger.info("📝 You said: '%s'", text)

            # Step 2: Stream LLM response with early TTS (Phase 5)
            await self._stream_and_speak(text)

        except Exception:
            logger.exception("Error in command pipeline")
        finally:
            self._processing = False
            logger.info("Returning to idle...")

    # ------------------------------------------------------------------
    # Phase 5: Parallel Streaming + TTS
    # ------------------------------------------------------------------
    async def _stream_and_speak(self, user_text: str) -> None:
        """
        Stream the LLM response and start TTS generation as soon as the
        first complete sentence is available — minimizing perceived latency.

        Events yielded by ``self.brain.think()`` are dicts with a ``"type"``
        key: ``"token"`` (with ``"text"``), ``"command"`` (with ``"command"``)
        and ``"done"`` (optionally with ``"text"``).

        Args:
            user_text: The transcribed user request sent to the Brain.
        """
        buffer = ""  # streamed text that has not been handed to TTS yet
        first_sentence_spoken = False
        pending_command = None
        done_text = None
        early_tts = None  # task speaking the first sentence, if any

        async for event in self.brain.think(user_text):
            kind = event["type"]

            if kind == "token":
                buffer += event["text"]

                # Only the first sentence is dispatched early.
                if first_sentence_spoken or not self._has_complete_sentence(buffer):
                    continue

                first_sentence, *rest = self._split_first_sentence(buffer)
                first_sentence = first_sentence.strip()
                if first_sentence:
                    logger.info("⚡ Early TTS trigger: '%s'", first_sentence[:60])
                    early_tts = asyncio.create_task(self.tts.speak(first_sentence))

                first_sentence_spoken = True
                # Keep accumulating after the split point, so the buffer always
                # holds exactly the text that still has to be spoken.
                buffer = rest[0] if rest else ""

            elif kind == "command":
                pending_command = event["command"]

            elif kind == "done":
                done_text = event.get("text")

        # Step 3: Work out what is still unspoken.
        remaining_text = buffer.strip()
        if remaining_text and not first_sentence_spoken and done_text:
            # Nothing was spoken early, so the final text reported by the
            # Brain can be used as-is without repeating anything.
            remaining_text = done_text.strip()

        # The first sentence must finish before anything else is spoken.
        if early_tts is not None:
            await early_tts
        if remaining_text:
            await self.tts.speak(remaining_text)

        # Step 4: Execute any local command
        if pending_command:
            action_name = pending_command.get("action", "")
            params = pending_command.get("params", {})
            logger.info("🔧 Executing action: %s %s", action_name, params)
            result = execute_action(action_name, params)
            if result:
                await self.tts.speak(result)

    # ------------------------------------------------------------------
    # Text utilities
    # ------------------------------------------------------------------
    @staticmethod
    def _has_complete_sentence(text: str) -> bool:
        """Check if the text buffer contains at least one complete sentence."""
        # Complete means: . ! or ? followed by whitespace, or at the very end.
        return bool(re.search(r'[.!?]\s+|[.!?]$', text))

    @staticmethod
    def _split_first_sentence(text: str) -> list[str]:
        """
        Split text at the first sentence boundary.

        Returns:
            ``[first_sentence, remainder]`` when terminal punctuation is
            followed by whitespace; ``[stripped_text]`` when the text merely
            ends with terminal punctuation; ``[text]`` unchanged otherwise.
        """
        match = re.search(r'([.!?])\s+', text)
        if match:
            end = match.start() + 1
            # Only leading whitespace is dropped: trailing whitespace separates
            # the remainder from the next streamed token.
            return [text[:end], text[end:].lstrip()]

        # Check for ending punctuation without trailing space
        stripped = text.strip()
        if re.search(r'[.!?]$', stripped):
            return [stripped]
        return [text]

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    async def start(self) -> None:
        """
        Start the Echo assistant and block until ``shutdown()`` is called.

        Binds the assistant to the running event loop, logs the startup
        banner and starts the wake word listener.
        """
        # Captured so the wake word thread and shutdown() can reach this loop.
        self._loop = asyncio.get_running_loop()
        self._shutdown_event = asyncio.Event()

        logger.info("=" * 60)
        logger.info(" ECHO VOICE ASSISTANT")
        logger.info(
            " Voice: %s (%s)",
            self._voice_path,
            "✅" if Path(self._voice_path).exists() else "❌ missing",
        )
        logger.info(" Say '%s' to activate", self._wake_word.upper())
        logger.info(" Press Ctrl+C to quit")
        logger.info("=" * 60)

        if self._shutdown_requested:
            # shutdown() was called before the loop started; nothing to run.
            return

        # Start wake word listener (runs in background thread)
        self.wake_listener.start()

        # Keep the async loop alive until shutdown
        await self._shutdown_event.wait()

    def shutdown(self) -> None:
        """
        Signal the assistant to stop.

        Idempotent, and safe to call from a signal handler or another thread,
        before ``start()`` or after the event loop has been closed.
        """
        if self._shutdown_requested:
            return
        self._shutdown_requested = True

        logger.info("Shutting down Echo...")
        self.wake_listener.stop()

        event = self._shutdown_event
        if event is None:
            # start() has not run yet; it checks _shutdown_requested itself.
            return

        loop = self._loop
        if loop is not None and not loop.is_closed():
            try:
                # Wakes the loop even if it is currently idle in its selector.
                loop.call_soon_threadsafe(event.set)
                return
            except RuntimeError:
                pass  # loop closed concurrently; fall back to a direct set
        event.set()
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# Entry Point
# ---------------------------------------------------------------------------
def main():
    """Create the assistant, install signal handlers and run until shutdown."""
    assistant = EchoAssistant()

    def _request_stop(signum, frame):
        # Both SIGINT (Ctrl+C) and SIGTERM ask the assistant to stop.
        assistant.shutdown()

    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _request_stop)

    # Drive the assistant on a fresh event loop until start() returns.
    try:
        asyncio.run(assistant.start())
    except KeyboardInterrupt:
        pass
    finally:
        assistant.shutdown()
        logger.info("Echo has shut down. Goodbye!")


if __name__ == "__main__":
    main()
|