""" tts.py — Text-To-Speech Module (Qwen3-TTS) Responsibilities: 1. Accept text (full or partial sentence) and generate a .wav audio file using the Qwen3-TTS model running locally with a **cloned voice**. 2. Validate the voice sample on init (must be a 2–5 second .wav file). 3. Provide a built-in recorder so users can create their voice sample directly from the assistant. 4. Support instruction-based style control (e.g., energy, tone). 5. Play the generated audio immediately. Cloned Voice Workflow: - Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio sample (recommended: 3 seconds, clean speech, no background noise). - Place your sample at the path specified by QWEN_TTS_VOICE (default: voices/echo_voice.wav). - Or run `python tts.py` to record a 3-second sample interactively. Environment Variables: QWEN_TTS_MODEL — model name or local path QWEN_TTS_VOICE — path to .wav voice sample (required for cloning) QWEN_TTS_INSTRUCT — default style instruction for speech generation Dependencies: pip install qwen-tts torch soundfile pygame pyaudio """ import asyncio import array import logging import os import struct import wave from pathlib import Path import pyaudio logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav" DEFAULT_INSTRUCTION = ( "Speak clearly with a warm, friendly tone. Be natural and conversational." ) OUTPUT_DIR = Path("audio_output") # Recording constants REC_FORMAT = pyaudio.paInt16 REC_CHANNELS = 1 REC_RATE = 16000 REC_CHUNK = 1024 REC_DURATION = 3 # seconds — optimal for Qwen3 voice cloning # --------------------------------------------------------------------------- # Voice sample validation # --------------------------------------------------------------------------- def validate_voice_sample(path: str | Path) -> tuple[bool, str]: """ Check that a voice sample file exists and meets Qwen3-TTS requirements. Returns: (is_valid, reason) """ p = Path(path) if not p.exists(): return False, f"Voice sample not found at: {p.resolve()}. Record one with `python tts.py`" if p.suffix.lower() != ".wav": return False, f"Voice sample must be a .wav file, got '{p.suffix}'" try: with wave.open(str(p), "rb") as wf: channels = wf.getnchannels() sample_width = wf.getsampwidth() framerate = wf.getframerate() nframes = wf.getnframes() duration = nframes / framerate except Exception as exc: return False, f"Could not read .wav file: {exc}" issues = [] if channels != 1: issues.append(f"expected mono (1 channel), got {channels}") if framerate < 16000: issues.append(f"sample rate {framerate} Hz is too low (min 16000)") if duration < 2: issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)") elif duration > 5: issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)") if sample_width != 2: issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit") if issues: return False, f"Voice sample issues: {'; '.join(issues)}" return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit" # --------------------------------------------------------------------------- # Voice sample recorder # --------------------------------------------------------------------------- def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path: """ Record a short voice sample from the microphone for voice cloning. The user will hear a countdown and should speak naturally for the full duration. The recording is saved as a 16 kHz mono 16-bit .wav file. Args: output_path: Where to save the .wav file. duration: Recording length in seconds (default 3). Returns: Path to the saved .wav file. """ output_path = Path(output_path) output_path.parent.mkdir(parents=True, exist_ok=True) print(f"\n🎙️ Recording a {duration}-second voice sample for Echo...") print(f" Speak naturally in a clear voice. No background noise.") print(f" Saving to: {output_path.resolve()}\n") pa = pyaudio.PyAudio() stream = pa.open( format=REC_FORMAT, channels=REC_CHANNELS, rate=REC_RATE, input=True, frames_per_buffer=REC_CHUNK, ) frames = [] for i in range(int(REC_RATE / REC_CHUNK * duration)): frame = stream.read(REC_CHUNK, exception_on_overflow=False) frames.append(frame) remaining = duration - (i + 1) * REC_CHUNK / REC_RATE if int(remaining) != int(remaining + REC_CHUNK / REC_RATE): print(f" ... {int(remaining)}s remaining") stream.stop_stream() stream.close() pa.terminate() # Write .wav with wave.open(str(output_path), "wb") as wf: wf.setnchannels(REC_CHANNELS) wf.setsampwidth(2) # 16-bit wf.setframerate(REC_RATE) wf.writeframes(b"".join(frames)) ok, msg = validate_voice_sample(output_path) if ok: print(f"\n✅ {msg}") else: print(f"\n⚠️ {msg}") print(f" File saved: {output_path.resolve()}\n") return output_path class TTSEngine: """ Wrapper around Qwen3-TTS for generating speech with a cloned voice. The engine lazily loads the model on first use to avoid slow startup. """ def __init__( self, model_name: str = DEFAULT_MODEL, voice_sample: str = DEFAULT_VOICE_SAMPLE, instruction: str = DEFAULT_INSTRUCTION, output_dir: str | Path = OUTPUT_DIR, ): self.model_name = model_name self.voice_sample = Path(voice_sample) self.instruction = instruction self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) self._model = None self._processor = None self._lock = asyncio.Lock() # prevent concurrent generation # Validate voice sample on init self._validate_voice() def _validate_voice(self): """Check the voice sample and log warnings if it's not usable.""" ok, msg = validate_voice_sample(self.voice_sample) if ok: logger.info("🎤 Voice: %s", msg) else: logger.warning("🎤 Voice sample issue — %s", msg) logger.warning( " Record a sample with: python tts.py record" ) # ---- lazy model loading ---- def _ensure_loaded(self): """Load model and processor on first call (lazy init).""" if self._model is not None: return logger.info( "Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name, ) try: from qwen_tts import QwenTTSModel, QwenTTSProcessor self._processor = QwenTTSProcessor() self._model = QwenTTSModel.from_pretrained(self.model_name) logger.info("Qwen3-TTS model loaded successfully") except ImportError: raise ImportError( "qwen-tts is not installed. Install it with:\n" " pip install qwen-tts torch soundfile\n" "Also ensure you have CUDA-capable GPU for low-latency inference." ) # ---- generation ---- async def generate(self, text: str, instruction: str | None = None) -> Path | None: """ Generate speech audio from text using the cloned voice. Args: text: The text to convert to speech. instruction: Optional style instruction override. Returns: Path to the generated .wav file, or None on failure. """ if not text or not text.strip(): return None async with self._lock: return await asyncio.to_thread( self._generate_sync, text.strip(), instruction or self.instruction ) def _generate_sync(self, text: str, instruction: str) -> Path | None: """Synchronous generation (runs in thread pool).""" self._ensure_loaded() # Double-check voice sample before generating if not self.voice_sample.exists(): logger.error( "Voice sample missing at '%s' — cannot generate speech", self.voice_sample.resolve(), ) return None output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav" try: voice_ref = str(self.voice_sample.resolve()) logger.info( "Generating speech: '%s' (voice=%s)", text[:60], self.voice_sample.name, ) audio_array = self._model.generate( processor=self._processor, text=text, voice=voice_ref, instruction=instruction, ) import soundfile as sf sample_rate = self._processor.sampling_rate sf.write(str(output_path), audio_array, sample_rate) logger.info( "Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate, ) return output_path except Exception: logger.exception("TTS generation failed for: '%s'", text[:60]) return None # ---- playback ---- async def speak(self, text: str, instruction: str | None = None) -> bool: """ Generate speech from text and play it immediately. Returns: True if playback succeeded, False otherwise. """ wav_path = await self.generate(text, instruction) if not wav_path: return False return await self._play(wav_path) async def speak_file(self, wav_path: Path) -> bool: """Play a previously generated .wav file.""" return await self._play(wav_path) @staticmethod async def _play(wav_path: Path) -> bool: """Play a .wav file using pygame.mixer (async-friendly).""" try: import pygame pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048) pygame.mixer.music.load(str(wav_path)) pygame.mixer.music.play() while pygame.mixer.music.get_busy(): await asyncio.sleep(0.05) pygame.mixer.music.stop() pygame.mixer.quit() logger.info("Playback finished: %s", wav_path.name) return True except Exception: logger.exception("Playback failed for %s", wav_path) return False def set_voice_sample(self, path: str): """Switch to a different voice sample .wav file.""" self.voice_sample = Path(path) self._validate_voice() logger.info("Voice sample set to: %s", self.voice_sample.resolve()) def set_instruction(self, instruction: str): """Update the default style instruction.""" self.instruction = instruction logger.info("TTS instruction updated: %s", instruction) # --------------------------------------------------------------------------- # CLI — record a voice sample directly # --------------------------------------------------------------------------- if __name__ == "__main__": import sys if len(sys.argv) > 1 and sys.argv[1] == "record": output = sys.argv[2] if len(sys.argv) > 2 else DEFAULT_VOICE_SAMPLE record_voice_sample(output) else: print("Usage:") print(f" python {Path(__file__).name} record [output.wav]") print() print("Records a 3-second voice sample for Qwen3-TTS cloning.") print(f"Default output: {DEFAULT_VOICE_SAMPLE}")