# moxieTalking/tts.py

"""
tts.py — Text-To-Speech Module (Qwen3-TTS)

Responsibilities:
  1. Accept text (full or partial sentence) and generate a .wav audio file
     using the Qwen3-TTS model running locally.
  2. Support voice selection (preset voices or custom voice cloning).
  3. Support instruction-based style control (e.g., energy, tone).
  4. Play the generated audio immediately.

Environment Variables:
  QWEN_TTS_MODEL    — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
  QWEN_TTS_VOICE    — preset voice name or path to 3s .wav sample
  QWEN_TTS_INSTRUCT  — default style instruction for speech generation

Dependencies:
  pip install qwen-tts torch soundfile pygame
"""

import asyncio
import logging
import os
import tempfile
from pathlib import Path

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Each default below can be overridden through the environment variable listed
# in the module docstring. An unset or blank variable falls back to the
# built-in value, so behaviour is unchanged when nothing is configured.

# Model name or local path handed to the model loader.
DEFAULT_MODEL = (
    os.environ.get("QWEN_TTS_MODEL", "").strip()
    or "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
)
# Preset voice name (alternatives: "Serena", "Diana", etc.) or a path to a
# short .wav sample used as a custom voice reference.
DEFAULT_VOICE = os.environ.get("QWEN_TTS_VOICE", "").strip() or "Ryan"
# Default style instruction passed along with every generation request.
DEFAULT_INSTRUCTION = (
    os.environ.get("QWEN_TTS_INSTRUCT", "").strip()
    or "Speak clearly with a warm, friendly tone. Be natural and conversational."
)
# Directory (relative to the working directory) where generated .wav files go.
OUTPUT_DIR = Path("audio_output")


class TTSEngine:
    """
    Wrapper around Qwen3-TTS for generating speech from text.

    The engine lazily loads the model on first use to avoid slow startup.
    Generation requests are serialized with an asyncio lock and executed in a
    worker thread (``asyncio.to_thread``) so the event loop is not blocked
    while audio is being synthesized.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        voice: str = DEFAULT_VOICE,
        instruction: str = DEFAULT_INSTRUCTION,
        output_dir: str | Path = OUTPUT_DIR,
    ) -> None:
        """
        Store the configuration and make sure the output directory exists.

        Args:
            model_name: Model name or local path passed to the model loader.
            voice: Preset voice name, or path to a .wav sample file.
            instruction: Default style instruction used when a call does not
                supply its own.
            output_dir: Directory that receives generated .wav files; created
                (including parents) if it does not exist yet.
        """
        self.model_name = model_name
        self.voice = voice
        self.instruction = instruction
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Populated by _ensure_loaded() on the first generation request.
        self._model = None
        self._processor = None
        self._lock = asyncio.Lock()  # prevent concurrent generation

    # ---- lazy model loading ----
    def _ensure_loaded(self) -> None:
        """
        Load model and processor on first call (lazy init).

        Does nothing once ``self._model`` is set.

        Raises:
            ImportError: If importing or constructing the qwen-tts objects
                raises ImportError; the original error is chained as the cause.
        """
        if self._model is not None:
            return

        logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name)
        try:
            # NOTE(review): confirm these class names and the generate()
            # signature used below against the installed qwen-tts release.
            from qwen_tts import QwenTTSProcessor, QwenTTSModel

            self._processor = QwenTTSProcessor()
            # _model is assigned last: it is the "already loaded" sentinel.
            self._model = QwenTTSModel.from_pretrained(self.model_name)
            logger.info("Qwen3-TTS model loaded successfully")
        except ImportError as err:
            # Chain the original error so the real missing module stays
            # visible in the traceback.
            raise ImportError(
                "qwen-tts is not installed. Install it with:\n"
                "  pip install qwen-tts torch soundfile\n"
                "Also ensure you have CUDA-capable GPU for low-latency inference."
            ) from err

    # ---- generation ----
    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
        """
        Generate speech audio from text and save as .wav.

        Args:
            text: The text to convert to speech. Leading/trailing whitespace
                is stripped; empty or whitespace-only text is rejected.
            instruction: Optional style instruction override. Falls back to
                the engine's default instruction when omitted or empty.

        Returns:
            Path to the generated .wav file, or None when the text is empty
            or generation fails.

        Raises:
            ImportError: Propagated from model loading (loading happens
                outside the generation error handler).
        """
        if not text or not text.strip():
            return None

        # One generation at a time; the blocking work runs in a worker thread.
        async with self._lock:
            return await asyncio.to_thread(
                self._generate_sync, text.strip(), instruction or self.instruction
            )

    def _generate_sync(self, text: str, instruction: str) -> Path | None:
        """
        Synchronous generation (runs in thread pool).

        Args:
            text: Already-stripped, non-empty text to synthesize.
            instruction: Style instruction to pass to the model.

        Returns:
            Path to the written .wav file, or None if generation or saving
            raised an exception (the exception is logged).
        """
        # Loading errors are intentionally raised to the caller rather than
        # being turned into a None result.
        self._ensure_loaded()

        # Random suffix keeps successive outputs from overwriting each other.
        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"

        try:
            # Build voice reference: preset name or custom .wav path.
            # Only a regular file counts as a custom sample, so a directory
            # that happens to share a preset's name is not mistaken for one.
            voice_ref = self.voice
            sample_path = Path(voice_ref)
            if sample_path.is_file():
                voice_ref = str(sample_path.resolve())

            # Generate audio
            logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice)
            audio_array = self._model.generate(
                processor=self._processor,
                text=text,
                voice=voice_ref,
                instruction=instruction,
            )

            # Save to file (imported lazily, like the model itself)
            import soundfile as sf

            sample_rate = self._processor.sampling_rate
            sf.write(str(output_path), audio_array, sample_rate)
            logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate)
            return output_path

        except Exception:
            logger.exception("TTS generation failed for: '%s'", text[:60])
            # Do not leave a partially written file behind.
            try:
                output_path.unlink(missing_ok=True)
            except OSError:
                logger.debug("Could not remove partial output %s", output_path, exc_info=True)
            return None

    # ---- playback ----
    async def speak(self, text: str, instruction: str | None = None) -> bool:
        """
        Generate speech from text and play it immediately.

        Args:
            text: The text to speak.
            instruction: Optional style instruction override.

        Returns:
            True if playback succeeded, False otherwise (including when no
            audio file could be generated).
        """
        wav_path = await self.generate(text, instruction)
        if not wav_path:
            return False
        return await self._play(wav_path)

    async def speak_file(self, wav_path: Path) -> bool:
        """Play a previously generated .wav file; returns True on success."""
        return await self._play(wav_path)

    @staticmethod
    async def _play(wav_path: str | Path) -> bool:
        """
        Play a .wav file using pygame.mixer (async-friendly).

        The mixer is initialised for this playback and always shut down
        afterwards, also when playback fails or the awaiting task is
        cancelled.

        Args:
            wav_path: File to play; a plain string path is accepted as well.

        Returns:
            True if the file played to the end, False if anything raised
            (the exception is logged).
        """
        # NOTE(review): playback is not serialized here; overlapping calls
        # would init/quit the same pygame mixer — confirm callers never
        # overlap playback.
        wav_path = Path(wav_path)  # tolerate str so .name below cannot fail
        mixer = None
        try:
            import pygame

            mixer = pygame.mixer
            mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
            mixer.music.load(str(wav_path))
            mixer.music.play()

            # Wait for playback to finish, yielding to the event loop
            while mixer.music.get_busy():
                await asyncio.sleep(0.05)

            logger.info("Playback finished: %s", wav_path.name)
            return True
        except Exception:
            logger.exception("Playback failed for %s", wav_path)
            return False
        finally:
            # Runs on success, failure and cancellation alike, so the audio
            # device is always released and no sound keeps playing.
            if mixer is not None:
                try:
                    if mixer.get_init():
                        mixer.music.stop()
                        mixer.quit()
                except Exception:
                    logger.warning("Could not shut down pygame mixer cleanly", exc_info=True)

    def set_voice(self, voice: str) -> None:
        """Switch to a different voice preset or custom sample path."""
        self.voice = voice
        logger.info("Voice set to: %s", voice)

    def set_instruction(self, instruction: str) -> None:
        """Update the default style instruction."""
        self.instruction = instruction
        logger.info("TTS instruction updated: %s", instruction)