""" tts.py — Text-To-Speech Module (Qwen3-TTS) Responsibilities: 1. Accept text (full or partial sentence) and generate a .wav audio file using the Qwen3-TTS model running locally. 2. Support voice selection (preset voices or custom voice cloning). 3. Support instruction-based style control (e.g., energy, tone). 4. Play the generated audio immediately. Environment Variables: QWEN_TTS_MODEL — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) QWEN_TTS_VOICE — preset voice name or path to 3s .wav sample QWEN_TTS_INSTRUCT — default style instruction for speech generation Dependencies: pip install qwen-tts torch soundfile pygame """ import asyncio import logging import os import tempfile from pathlib import Path logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" DEFAULT_VOICE = "Ryan" # preset voice; alternatives: "Serena", "Diana", etc. DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational." OUTPUT_DIR = Path("audio_output") class TTSEngine: """ Wrapper around Qwen3-TTS for generating speech from text. The engine lazily loads the model on first use to avoid slow startup. """ def __init__( self, model_name: str = DEFAULT_MODEL, voice: str = DEFAULT_VOICE, instruction: str = DEFAULT_INSTRUCTION, output_dir: str | Path = OUTPUT_DIR, ): self.model_name = model_name self.voice = voice self.instruction = instruction self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) self._model = None self._processor = None self._lock = asyncio.Lock() # prevent concurrent generation # ---- lazy model loading ---- def _ensure_loaded(self): """Load model and processor on first call (lazy init).""" if self._model is not None: return logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name) try: from qwen_tts import QwenTTSProcessor, QwenTTSModel self._processor = QwenTTSProcessor() self._model = QwenTTSModel.from_pretrained(self.model_name) logger.info("Qwen3-TTS model loaded successfully") except ImportError: raise ImportError( "qwen-tts is not installed. Install it with:\n" " pip install qwen-tts torch soundfile\n" "Also ensure you have CUDA-capable GPU for low-latency inference." ) # ---- generation ---- async def generate(self, text: str, instruction: str | None = None) -> Path | None: """ Generate speech audio from text and save as .wav. Args: text: The text to convert to speech. instruction: Optional style instruction override. Returns: Path to the generated .wav file, or None on failure. """ if not text or not text.strip(): return None async with self._lock: return await asyncio.to_thread( self._generate_sync, text.strip(), instruction or self.instruction ) def _generate_sync(self, text: str, instruction: str) -> Path | None: """Synchronous generation (runs in thread pool).""" self._ensure_loaded() output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav" try: # Build voice reference: preset name or custom .wav path voice_ref = self.voice if Path(self.voice).exists(): voice_ref = str(Path(self.voice).resolve()) # Generate audio logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice) audio_array = self._model.generate( processor=self._processor, text=text, voice=voice_ref, instruction=instruction, ) # Save to file import soundfile as sf sample_rate = self._processor.sampling_rate sf.write(str(output_path), audio_array, sample_rate) logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate) return output_path except Exception: logger.exception("TTS generation failed for: '%s'", text[:60]) return None # ---- playback ---- async def speak(self, text: str, instruction: str | None = None) -> bool: """ Generate speech from text and play it immediately. Returns: True if playback succeeded, False otherwise. """ wav_path = await self.generate(text, instruction) if not wav_path: return False return await self._play(wav_path) async def speak_file(self, wav_path: Path) -> bool: """Play a previously generated .wav file.""" return await self._play(wav_path) @staticmethod async def _play(wav_path: Path) -> bool: """Play a .wav file using pygame.mixer (async-friendly).""" try: import pygame pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048) pygame.mixer.music.load(str(wav_path)) pygame.mixer.music.play() # Wait for playback to finish while pygame.mixer.music.get_busy(): await asyncio.sleep(0.05) pygame.mixer.music.stop() pygame.mixer.quit() logger.info("Playback finished: %s", wav_path.name) return True except Exception: logger.exception("Playback failed for %s", wav_path) return False def set_voice(self, voice: str): """Switch to a different voice preset or custom sample path.""" self.voice = voice logger.info("Voice set to: %s", voice) def set_instruction(self, instruction: str): """Update the default style instruction.""" self.instruction = instruction logger.info("TTS instruction updated: %s", instruction)