# moxieTalking/tts.py

"""
tts.py — Text-To-Speech Module (Qwen3-TTS)

Responsibilities:
  1. Accept text (full or partial sentence) and generate a .wav audio file
     using the Qwen3-TTS model running locally with a **cloned voice**.
  2. Validate the voice sample on init (must be a 2–5 second .wav file).
  3. Provide a built-in recorder so users can create their voice sample
     directly from the assistant.
  4. Support instruction-based style control (e.g., energy, tone).
  5. Play the generated audio immediately.

Cloned Voice Workflow:
  - Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio
    sample (recommended: 3 seconds, clean speech, no background noise).
  - Place your sample at the path specified by QWEN_TTS_VOICE (default:
    voices/echo_voice.wav).
  - Or run `python tts.py record` to record a 3-second sample interactively.

Environment Variables:
  QWEN_TTS_MODEL     — model name or local path
  QWEN_TTS_VOICE     — path to .wav voice sample (required for cloning)
  QWEN_TTS_INSTRUCT  — default style instruction for speech generation

Dependencies:
  pip install qwen-tts torch soundfile pygame pyaudio
"""

import asyncio
import array
import logging
import os
import struct
import wave
from pathlib import Path

import pyaudio

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Defaults used for the TTSEngine constructor arguments.
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
# Relative path; resolved against the current working directory when used.
DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav"
# Style instruction applied when a call does not supply its own.
DEFAULT_INSTRUCTION = (
    "Speak clearly with a warm, friendly tone. Be natural and conversational."
)
# Directory that receives generated .wav files (created by TTSEngine.__init__).
OUTPUT_DIR = Path("audio_output")

# Recording constants
REC_FORMAT = pyaudio.paInt16  # 16-bit samples, matching the 2-byte width written to the .wav
REC_CHANNELS = 1  # mono, as required by validate_voice_sample
REC_RATE = 16000  # Hz — equals the minimum rate accepted by validate_voice_sample
REC_CHUNK = 1024  # frames requested per stream.read() call
REC_DURATION = 3  # seconds — optimal for Qwen3 voice cloning


# ---------------------------------------------------------------------------
# Voice sample validation
# ---------------------------------------------------------------------------
def validate_voice_sample(path: str | Path) -> tuple[bool, str]:
    """
    Check that a voice sample file exists and meets Qwen3-TTS requirements.

    Returns:
        (is_valid, reason)
    """
    p = Path(path)

    if not p.exists():
        return False, f"Voice sample not found at: {p.resolve()}. Record one with `python tts.py`"

    if p.suffix.lower() != ".wav":
        return False, f"Voice sample must be a .wav file, got '{p.suffix}'"

    try:
        with wave.open(str(p), "rb") as wf:
            channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            framerate = wf.getframerate()
            nframes = wf.getnframes()
            duration = nframes / framerate
    except Exception as exc:
        return False, f"Could not read .wav file: {exc}"

    issues = []
    if channels != 1:
        issues.append(f"expected mono (1 channel), got {channels}")
    if framerate < 16000:
        issues.append(f"sample rate {framerate} Hz is too low (min 16000)")
    if duration < 2:
        issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)")
    elif duration > 5:
        issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)")
    if sample_width != 2:
        issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit")

    if issues:
        return False, f"Voice sample issues: {'; '.join(issues)}"

    return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit"


# ---------------------------------------------------------------------------
# Voice sample recorder
# ---------------------------------------------------------------------------
def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
    """
    Record a short voice sample from the microphone for voice cloning.

    A per-second countdown is printed while recording; the user should
    speak naturally for the full duration. The recording is saved as a
    16 kHz mono 16-bit .wav file and then checked with
    ``validate_voice_sample``; the outcome is printed, not raised.

    Args:
        output_path: Where to save the .wav file. Missing parent
            directories are created.
        duration: Recording length in seconds (default 3).

    Returns:
        Path to the saved .wav file.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n🎙️  Recording a {duration}-second voice sample for Echo...")
    print("   Speak naturally in a clear voice. No background noise.")
    print(f"   Saving to: {output_path.resolve()}\n")

    # Capture exactly rate * duration frames. Counting whole chunks instead
    # would truncate the recording (e.g. 46 chunks of 1024 is only ~2.94s),
    # which makes a 2-second request fall below the 2-second validation floor.
    total_frames = int(round(REC_RATE * duration))
    frames = []

    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
        )
        try:
            frames_left = total_frames
            while frames_left > 0:
                # The final read is shortened so the total is exact.
                want = min(REC_CHUNK, frames_left)
                frames.append(stream.read(want, exception_on_overflow=False))

                whole_seconds_before = frames_left // REC_RATE
                frames_left -= want
                whole_seconds_left = frames_left // REC_RATE
                # Announce each time the whole-second count drops.
                if whole_seconds_left != whole_seconds_before:
                    print(f"   ... {whole_seconds_left}s remaining")
        finally:
            # Release the input stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release the audio backend, including when open() fails.
        pa.terminate()

    # Write .wav
    with wave.open(str(output_path), "wb") as wf:
        wf.setnchannels(REC_CHANNELS)
        wf.setsampwidth(2)  # 16-bit
        wf.setframerate(REC_RATE)
        wf.writeframes(b"".join(frames))

    ok, msg = validate_voice_sample(output_path)
    if ok:
        print(f"\n✅ {msg}")
    else:
        print(f"\n⚠️  {msg}")
    print(f"   File saved: {output_path.resolve()}\n")

    return output_path


class TTSEngine:
    """
    Wrapper around Qwen3-TTS for generating speech with a cloned voice.

    The engine lazily loads the model on first use to avoid slow startup.
    Generation requests are serialized with an asyncio lock and executed in
    a worker thread; playback goes through pygame.mixer.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        voice_sample: str | Path = DEFAULT_VOICE_SAMPLE,
        instruction: str = DEFAULT_INSTRUCTION,
        output_dir: str | Path = OUTPUT_DIR,
    ):
        """
        Store configuration, create the output directory and check the voice.

        The model is not loaded here; see ``_ensure_loaded``.

        Args:
            model_name: Model name or local path passed to ``from_pretrained``.
            voice_sample: Path to the .wav file used as the voice reference.
            instruction: Default style instruction for generation.
            output_dir: Directory where generated .wav files are written.
        """
        self.model_name = model_name
        self.voice_sample = Path(voice_sample)
        self.instruction = instruction
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self._model = None
        self._processor = None
        self._lock = asyncio.Lock()  # prevent concurrent generation

        # Validate voice sample on init (problems are logged, not raised)
        self._validate_voice()

    def _validate_voice(self):
        """Check the voice sample and log warnings if it's not usable."""
        ok, msg = validate_voice_sample(self.voice_sample)
        if ok:
            logger.info("🎤 Voice: %s", msg)
        else:
            logger.warning("🎤 Voice sample issue — %s", msg)
            logger.warning(
                "   Record a sample with: python tts.py record"
            )

    # ---- lazy model loading ----
    def _ensure_loaded(self):
        """
        Load model and processor on first call (lazy init).

        Raises:
            ImportError: If importing or constructing the qwen-tts objects
                raises ImportError; the original error is chained as the
                cause.
        """
        if self._model is not None:
            return

        logger.info(
            "Loading Qwen3-TTS model '%s' (this may take a moment)...",
            self.model_name,
        )
        try:
            # NOTE(review): confirm these class names and the generate()
            # call below against the installed qwen-tts release — TODO verify.
            from qwen_tts import QwenTTSModel, QwenTTSProcessor

            self._processor = QwenTTSProcessor()
            # _model is assigned last because it is the "loaded" marker.
            self._model = QwenTTSModel.from_pretrained(self.model_name)
            logger.info("Qwen3-TTS model loaded successfully")
        except ImportError as exc:
            # Chain the cause so the missing module's name is not lost.
            raise ImportError(
                "qwen-tts is not installed. Install it with:\n"
                "  pip install qwen-tts torch soundfile\n"
                "Also ensure you have CUDA-capable GPU for low-latency inference."
            ) from exc

    # ---- generation ----
    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
        """
        Generate speech audio from text using the cloned voice.

        Args:
            text: The text to convert to speech. Surrounding whitespace is
                stripped; empty or whitespace-only text yields ``None``.
            instruction: Optional style instruction override; falls back to
                the engine's default when omitted or empty.

        Returns:
            Path to the generated .wav file, or None on failure.
        """
        if not text or not text.strip():
            return None

        async with self._lock:
            return await asyncio.to_thread(
                self._generate_sync, text.strip(), instruction or self.instruction
            )

    def _generate_sync(self, text: str, instruction: str) -> Path | None:
        """
        Synchronous generation (runs in thread pool).

        Returns:
            Path to the written .wav file, or None when the voice sample is
            missing or generation/writing fails (the failure is logged).
        """
        # Check the voice sample first: it is cheap, and there is no point
        # loading the model if generation cannot proceed.
        if not self.voice_sample.exists():
            logger.error(
                "Voice sample missing at '%s' — cannot generate speech",
                self.voice_sample.resolve(),
            )
            return None

        self._ensure_loaded()

        # Random suffix keeps successive outputs from overwriting each other.
        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"

        try:
            voice_ref = str(self.voice_sample.resolve())

            logger.info(
                "Generating speech: '%s' (voice=%s)",
                text[:60],
                self.voice_sample.name,
            )
            audio_array = self._model.generate(
                processor=self._processor,
                text=text,
                voice=voice_ref,
                instruction=instruction,
            )

            import soundfile as sf

            sample_rate = self._processor.sampling_rate
            sf.write(str(output_path), audio_array, sample_rate)
            logger.info(
                "Audio saved to %s (%.1fs)",
                output_path,
                len(audio_array) / sample_rate,
            )
            return output_path

        except Exception:
            logger.exception("TTS generation failed for: '%s'", text[:60])
            return None

    # ---- playback ----
    async def speak(self, text: str, instruction: str | None = None) -> bool:
        """
        Generate speech from text and play it immediately.

        Returns:
            True if playback succeeded, False otherwise.
        """
        wav_path = await self.generate(text, instruction)
        if not wav_path:
            return False
        return await self._play(wav_path)

    async def speak_file(self, wav_path: str | Path) -> bool:
        """Play a previously generated .wav file."""
        return await self._play(wav_path)

    @staticmethod
    async def _play(wav_path: str | Path) -> bool:
        """
        Play a .wav file using pygame.mixer (async-friendly).

        Polls the mixer with short sleeps so other tasks keep running while
        the audio plays.

        Returns:
            True if playback ran to completion, False if any step failed
            (the failure is logged).
        """
        mixer_ready = False
        try:
            import pygame

            # Accept plain strings too; `.name` below needs a Path.
            wav_path = Path(wav_path)

            pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
            mixer_ready = True
            pygame.mixer.music.load(str(wav_path))
            pygame.mixer.music.play()

            while pygame.mixer.music.get_busy():
                await asyncio.sleep(0.05)

            pygame.mixer.music.stop()
            logger.info("Playback finished: %s", wav_path.name)
            return True
        except Exception:
            logger.exception("Playback failed for %s", wav_path)
            return False
        finally:
            # Shut the mixer down on every exit path, including load/play
            # errors and task cancellation during the polling sleep.
            if mixer_ready:
                pygame.mixer.quit()

    def set_voice_sample(self, path: str | Path):
        """Switch to a different voice sample .wav file and re-validate it."""
        self.voice_sample = Path(path)
        self._validate_voice()
        logger.info("Voice sample set to: %s", self.voice_sample.resolve())

    def set_instruction(self, instruction: str):
        """Update the default style instruction."""
        self.instruction = instruction
        logger.info("TTS instruction updated: %s", instruction)


# ---------------------------------------------------------------------------
# CLI — record a voice sample directly
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    # Everything after the script name; the first item selects the command.
    cli_args = sys.argv[1:]

    if cli_args and cli_args[0] == "record":
        # Optional second argument overrides where the sample is written.
        target = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
        record_voice_sample(target)
    else:
        # Any other invocation just shows how to use the recorder.
        usage_lines = (
            "Usage:",
            f"  python {Path(__file__).name} record [output.wav]",
            "",
            "Records a 3-second voice sample for Qwen3-TTS cloning.",
            f"Default output: {DEFAULT_VOICE_SAMPLE}",
        )
        print("\n".join(usage_lines))