# moxieTalking/main.py

"""
main.py — Echo Voice Assistant Orchestrator

Ties together all modules:
  1. WakeWordListener  (stt.py) — continuously listens for "echo"
  2. Transcriber       (stt.py) — captures & transcribes voice commands
  3. Brain             (brain.py) — sends text to OpenRouter, streams response
  4. TTSEngine         (tts.py) — generates speech from text (Qwen3-TTS)
  5. Actions           (actions.py) — executes local OS commands

Phase 5 Parallel Processing:
  As soon as the first complete sentence is received from the Brain's
  streamed response, TTS generation begins immediately — before the
  full LLM response has finished streaming.

Usage:
  python main.py
"""

import asyncio
import logging
import os
import re
import signal
import sys
from pathlib import Path

from dotenv import load_dotenv

from stt import WakeWordListener, Transcriber
from brain import Brain
from tts import TTSEngine
from actions import execute as execute_action

# ---------------------------------------------------------------------------
# Logging setup
# ---------------------------------------------------------------------------
# Configure logging once, at import time: INFO level and a column-aligned
# record layout (time of day, padded logger name, padded level, message).
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s │ %(name)-18s │ %(levelname)-7s │ %(message)s",
    datefmt="%H:%M:%S",
)
# Logger used by everything defined in this module.
logger = logging.getLogger("echo")

# ---------------------------------------------------------------------------
# Load environment
# ---------------------------------------------------------------------------
# Load settings from the ".env" file that sits next to this script. This runs
# at import time, i.e. before EchoAssistant.__init__ reads os.environ.
load_dotenv(Path(__file__).parent / ".env")


class EchoAssistant:
    """
    Main orchestrator for the Echo voice assistant.

    Lifecycle:
      1. Start wake word listener (background thread).
      2. On wake word detected → transcribe command.
      3. Stream LLM response → start TTS on first sentence (parallel).
      4. Execute any local commands from the LLM response.

    Threading model:
      The wake word callback and ``shutdown()`` may be invoked outside the
      event loop's normal flow (listener thread, signal handler), so both
      hand work to the loop captured by ``start()`` via asyncio's
      thread-safe entry points instead of touching loop state directly.
    """

    # Sentence-boundary patterns shared by the text utilities below.
    _SENTENCE_BREAK = re.compile(r'([.!?])\s+')  # terminator followed by whitespace
    _SENTENCE_END = re.compile(r'[.!?]$')        # terminator at the end of the text

    def __init__(self):
        """Build all sub-components from environment configuration."""
        # --- STT ---
        model_path = os.environ.get(
            "VOSK_MODEL_PATH", "models/vosk-model-small-en-us"
        )
        wake_word = os.environ.get("WAKE_WORD", "echo")

        self.transcriber = Transcriber(model_path=model_path)
        self.wake_listener = WakeWordListener(
            wake_word=wake_word,
            on_detected=self._on_wake_word,
        )

        # --- Brain (LLM) ---
        self.brain = Brain(
            api_key=os.environ.get("OPENROUTER_API_KEY"),
            model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
        )

        # --- TTS (Cloned Voice) ---
        voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
        self.tts = TTSEngine(
            model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
            voice_sample=voice_path,
            instruction=os.environ.get(
                "QWEN_TTS_INSTRUCT",
                "Speak clearly with a warm, friendly tone. Be natural and conversational.",
            ),
        )

        # --- State ---
        self._processing = False  # guard against concurrent commands
        self._shutdown_event = asyncio.Event()
        self._loop = None            # event loop running start(); None when not running
        self._command_future = None  # keeps the in-flight command task referenced
        # Remember the resolved settings so start() reports exactly what the
        # components above were configured with.
        self._wake_word = wake_word
        self._voice_path = voice_path

        if not Path(voice_path).exists():
            logger.warning(
                "No voice sample at '%s' — TTS will not work until you record one. "
                "Run: python tts.py record",
                voice_path,
            )

        logger.info("Echo assistant initialized (wake word: '%s')", wake_word)

    # ------------------------------------------------------------------
    # Wake word callback (runs in background thread)
    # ------------------------------------------------------------------
    def _on_wake_word(self, wake_word: str):
        """
        Called by WakeWordListener when the wake word is detected.

        The callback is not running inside the event loop, so
        ``asyncio.get_running_loop()`` cannot be used here; the command
        pipeline is submitted to the loop captured by ``start()``.

        Args:
            wake_word: The detected wake word (unused; part of the callback
                signature).
        """
        if self._processing:
            logger.info("Still processing previous command — ignoring wake word")
            return

        loop = self._loop
        if loop is None or not loop.is_running():
            logger.warning("No running event loop for wake word callback")
            return

        # Schedule the command processing in the async event loop
        coro = self._handle_command()
        try:
            # Keep the returned future so the task is not garbage-collected
            # while it is still running.
            self._command_future = asyncio.run_coroutine_threadsafe(coro, loop)
        except RuntimeError:
            # The loop was closed between the check above and the submit.
            coro.close()  # avoid a "coroutine was never awaited" warning
            logger.warning("No running event loop for wake word callback")

    # ------------------------------------------------------------------
    # Main command pipeline
    # ------------------------------------------------------------------
    async def _handle_command(self):
        """
        Full pipeline: transcribe → think → speak → act.

        Re-entrancy is prevented with ``self._processing``. Any exception
        raised by the pipeline is logged and swallowed so one failed command
        does not take the assistant down.
        """
        if self._processing:
            return
        self._processing = True

        try:
            logger.info("🔊 Wake word detected — listening...")

            # Step 1: Transcribe.
            # listen_and_transcribe() is a plain synchronous call; run it in
            # the default executor so the event loop (and therefore shutdown)
            # stays responsive while it records and decodes.
            loop = asyncio.get_running_loop()
            text = await loop.run_in_executor(
                None, self.transcriber.listen_and_transcribe
            )
            if not text:
                logger.info("No transcription — returning to idle")
                return

            logger.info("📝 You said: '%s'", text)

            # Step 2: Stream LLM response with early TTS (Phase 5)
            await self._stream_and_speak(text)

        except Exception:
            logger.exception("Error in command pipeline")
        finally:
            self._processing = False
            logger.info("Returning to idle...")

    # ------------------------------------------------------------------
    # Phase 5: Parallel Streaming + TTS
    # ------------------------------------------------------------------
    async def _stream_and_speak(self, user_text: str):
        """
        Stream the LLM response and start TTS generation as soon as the
        first complete sentence is available — minimizing perceived latency.

        The first sentence is spoken in a background task while the rest of
        the response keeps streaming; everything after it is spoken in one
        piece once the stream ends. A command reported by the brain is
        executed last, and a non-empty result is spoken as well.

        Args:
            user_text: The transcribed user request passed to the brain.
        """
        buffer = ""                  # streamed text not yet handed to TTS
        final_text = None            # full response text from the "done" event
        first_sentence_spoken = False
        first_sentence_task = None   # background TTS task for the first sentence
        pending_command = None

        async for event in self.brain.think(user_text):
            kind = event["type"]

            if kind == "token":
                buffer += event["text"]

                # Check if we have a complete sentence
                if not first_sentence_spoken and self._has_complete_sentence(buffer):
                    parts = self._split_first_sentence(buffer)
                    first_sentence = parts[0]
                    # Keep the *raw* tail rather than the stripped one returned
                    # by the splitter: a trailing space at a token boundary
                    # must survive, or the next token would be glued to it.
                    tail = buffer[len(first_sentence):] if len(parts) > 1 else ""

                    if first_sentence.strip():
                        logger.info("⚡ Early TTS trigger: '%s'", first_sentence[:60])
                        first_sentence_task = asyncio.create_task(
                            self.tts.speak(first_sentence.strip())
                        )

                    first_sentence_spoken = True
                    buffer = tail

            elif kind == "command":
                pending_command = event["command"]

            elif kind == "done":
                final_text = event.get("text")

        # Work out what is still unspoken.
        if first_sentence_spoken:
            # Everything streamed after the first sentence is in the buffer.
            remaining_text = buffer
        elif buffer.strip():
            # Nothing was spoken early: prefer the brain's final text and
            # fall back to what was streamed.
            remaining_text = final_text or buffer
        else:
            remaining_text = ""
        remaining_text = remaining_text.strip()

        # Step 3: Let the first sentence finish, then speak the rest.
        if first_sentence_task is not None:
            await first_sentence_task
        if remaining_text:
            await self.tts.speak(remaining_text)

        # Step 4: Execute any local command
        if pending_command:
            action_name = pending_command.get("action", "")
            params = pending_command.get("params", {})
            logger.info("🔧 Executing action: %s %s", action_name, params)
            result = execute_action(action_name, params)
            if result:
                await self.tts.speak(result)

    # ------------------------------------------------------------------
    # Text utilities
    # ------------------------------------------------------------------
    @staticmethod
    def _has_complete_sentence(text: str) -> bool:
        """
        Check if the text buffer contains at least one complete sentence.

        A sentence counts as complete when ``.``, ``!`` or ``?`` is followed
        by whitespace, or is the last character of the text.
        """
        return bool(
            EchoAssistant._SENTENCE_BREAK.search(text)
            or EchoAssistant._SENTENCE_END.search(text)
        )

    @staticmethod
    def _split_first_sentence(text: str) -> list[str]:
        """
        Split text at the first sentence boundary.

        Returns:
            ``[first, rest]`` when a terminator followed by whitespace is
            found (``rest`` is stripped and may be empty); ``[stripped_text]``
            when the text merely ends with a terminator; ``[text]`` unchanged
            when no terminator is present.
        """
        match = EchoAssistant._SENTENCE_BREAK.search(text)
        if match:
            end = match.start() + 1  # include the terminator itself
            return [text[:end], text[end:].strip()]
        # Check for ending punctuation without trailing space
        stripped = text.strip()
        if EchoAssistant._SENTENCE_END.search(stripped):
            return [stripped]
        return [text]

    # ------------------------------------------------------------------
    # Lifecycle
    # ------------------------------------------------------------------
    async def start(self):
        """
        Start the Echo assistant and block until ``shutdown()`` is called.

        Captures the running event loop so the wake word callback and
        ``shutdown()`` can reach it from outside the loop.
        """
        self._loop = asyncio.get_running_loop()

        logger.info("=" * 60)
        logger.info("  ECHO VOICE ASSISTANT")
        logger.info("  Voice: %s (%s)", self._voice_path,
                     "✅" if Path(self._voice_path).exists() else "❌ missing")
        logger.info("  Say '%s' to activate", self._wake_word.upper())
        logger.info("  Press Ctrl+C to quit")
        logger.info("=" * 60)

        # Start wake word listener (runs in background thread)
        self.wake_listener.start()

        try:
            # Keep the async loop alive until shutdown
            await self._shutdown_event.wait()
        finally:
            # The loop is about to go away; stop handing work to it.
            self._loop = None

    def shutdown(self):
        """
        Signal the assistant to stop.

        Safe to call from a signal handler or another thread while the loop
        is running, and also after the loop has finished.
        """
        logger.info("Shutting down Echo...")
        self.wake_listener.stop()

        loop = self._loop
        if loop is not None and loop.is_running():
            try:
                # Setting the event directly from a signal handler does not
                # wake a loop that is blocked waiting for I/O;
                # call_soon_threadsafe does.
                loop.call_soon_threadsafe(self._shutdown_event.set)
                return
            except RuntimeError:
                # Loop closed in the meantime — fall through to a direct set.
                pass
        self._shutdown_event.set()


# ---------------------------------------------------------------------------
# Entry Point
# ---------------------------------------------------------------------------
def main():
    """
    Build the assistant, wire up termination signals, and run it to completion.

    SIGINT and SIGTERM both request a graceful shutdown; the assistant is
    also shut down unconditionally once the event loop returns.
    """
    echo = EchoAssistant()

    def _request_shutdown(signum, stack_frame):
        # Signal arguments are irrelevant here: any handled signal means "stop".
        echo.shutdown()

    # Graceful shutdown on Ctrl+C / termination request
    for signum in (signal.SIGINT, signal.SIGTERM):
        signal.signal(signum, _request_shutdown)

    # Run the async event loop until the assistant is told to stop
    try:
        asyncio.run(echo.start())
    except KeyboardInterrupt:
        # Already on the way out; nothing more to do here.
        pass
    finally:
        echo.shutdown()
        logger.info("Echo has shut down. Goodbye!")


# Run the assistant only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()