moxieTalking/tts.py
Echo Assistant d6b64d04d1 feat: initial Echo voice assistant — Vosk + OpenRouter + Qwen3-TTS
- stt.py: WakeWordListener (openWakeWord) + Transcriber (Vosk)
- brain.py: Async OpenRouter streaming client with command parsing
- tts.py: Qwen3-TTS engine with voice selection & instruction control
- actions.py: 10 local OS commands (open_app, set_timer, search, etc.)
- main.py: Async orchestrator with Phase 5 parallel TTS streaming
2026-03-31 00:09:00 +00:00

182 lines
6.2 KiB
Python

"""
tts.py — Text-To-Speech Module (Qwen3-TTS)
Responsibilities:
1. Accept text (full or partial sentence) and generate a .wav audio file
using the Qwen3-TTS model running locally.
2. Support voice selection (preset voices or custom voice cloning).
3. Support instruction-based style control (e.g., energy, tone).
4. Play the generated audio immediately.
Environment Variables:
QWEN_TTS_MODEL — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
QWEN_TTS_VOICE — preset voice name or path to 3s .wav sample
QWEN_TTS_INSTRUCT — default style instruction for speech generation
Dependencies:
pip install qwen-tts torch soundfile pygame
"""
import asyncio
import logging
import os
import tempfile
from pathlib import Path
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Each default can be overridden through the environment variable named in
# the module docstring. `or` (rather than a .get() default) is used so that a
# variable that is set but empty also falls back to the built-in value.
DEFAULT_MODEL = (
    os.environ.get("QWEN_TTS_MODEL") or "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
)
# Preset voice name or path to a .wav sample; preset alternatives: "Serena", "Diana", etc.
DEFAULT_VOICE = os.environ.get("QWEN_TTS_VOICE") or "Ryan"
DEFAULT_INSTRUCTION = (
    os.environ.get("QWEN_TTS_INSTRUCT")
    or "Speak clearly with a warm, friendly tone. Be natural and conversational."
)
# Directory (relative to the current working directory) for generated .wav files.
OUTPUT_DIR = Path("audio_output")
class TTSEngine:
    """Wrapper around Qwen3-TTS for generating speech from text.

    The model and processor are loaded lazily on the first generation call
    to avoid slow startup. Generation requests are serialized with an
    asyncio lock and executed in a worker thread via ``asyncio.to_thread``.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        voice: str = DEFAULT_VOICE,
        instruction: str = DEFAULT_INSTRUCTION,
        output_dir: str | Path = OUTPUT_DIR,
    ) -> None:
        """Store the configuration and create the output directory.

        Args:
            model_name: Model name or local path passed to ``from_pretrained``.
            voice: Preset voice name, or path to a .wav voice sample.
            instruction: Default style instruction used when ``generate`` /
                ``speak`` are called without an override.
            output_dir: Directory where generated .wav files are written;
                created (with parents) if it does not exist.
        """
        self.model_name = model_name
        self.voice = voice
        self.instruction = instruction
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._model = None
        self._processor = None
        self._lock = asyncio.Lock()  # prevent concurrent generation

    # ---- lazy model loading ----
    def _ensure_loaded(self) -> None:
        """Load model and processor on first call (lazy init).

        Raises:
            ImportError: If ``qwen_tts`` (or something it imports) is missing.
                The original import error is chained as the cause.
        """
        if self._model is not None:
            return
        logger.info(
            "Loading Qwen3-TTS model '%s' (this may take a moment)...",
            self.model_name,
        )
        try:
            # NOTE(review): class names are taken as-is from the original
            # code; confirm them against the installed qwen-tts version.
            from qwen_tts import QwenTTSProcessor, QwenTTSModel

            processor = QwenTTSProcessor()
            model = QwenTTSModel.from_pretrained(self.model_name)
        except ImportError as err:
            raise ImportError(
                "qwen-tts is not installed. Install it with:\n"
                " pip install qwen-tts torch soundfile\n"
                "Also ensure you have CUDA-capable GPU for low-latency inference."
            ) from err
        # Publish both attributes together so a failed load never leaves a
        # processor without its model.
        self._processor = processor
        self._model = model
        logger.info("Qwen3-TTS model loaded successfully")

    # ---- generation ----
    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
        """Generate speech audio from text and save it as a .wav file.

        Args:
            text: The text to convert to speech. Leading/trailing whitespace
                is stripped before generation.
            instruction: Optional style instruction; falls back to the
                engine's default instruction when omitted or empty.

        Returns:
            Path to the generated .wav file, or None if ``text`` is empty /
            whitespace-only or generation failed.
        """
        if not text or not text.strip():
            return None
        async with self._lock:
            return await asyncio.to_thread(
                self._generate_sync, text.strip(), instruction or self.instruction
            )

    def _generate_sync(self, text: str, instruction: str) -> Path | None:
        """Synchronous generation (runs in a worker thread).

        Model loading happens outside the ``try`` block, so a missing
        ``qwen_tts`` installation surfaces as ImportError instead of being
        swallowed; any error during generation or saving is logged and
        reported as None.
        """
        self._ensure_loaded()
        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
        try:
            # Build voice reference: preset name or custom .wav path.
            # is_file() (not exists()) so that a directory that merely shares
            # a preset's name is not mistaken for a voice sample.
            voice_ref = self.voice
            voice_file = Path(self.voice)
            if voice_file.is_file():
                voice_ref = str(voice_file.resolve())

            # Generate audio
            logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice)
            # NOTE(review): keyword names follow the original call; confirm
            # against the qwen-tts generate() signature.
            audio_array = self._model.generate(
                processor=self._processor,
                text=text,
                voice=voice_ref,
                instruction=instruction,
            )

            # Save to file
            import soundfile as sf

            sample_rate = self._processor.sampling_rate
            sf.write(str(output_path), audio_array, sample_rate)
            logger.info(
                "Audio saved to %s (%.1fs)",
                output_path,
                len(audio_array) / sample_rate,
            )
            return output_path
        except Exception:
            logger.exception("TTS generation failed for: '%s'", text[:60])
            return None

    # ---- playback ----
    async def speak(self, text: str, instruction: str | None = None) -> bool:
        """Generate speech from text and play it immediately.

        Returns:
            True if playback succeeded, False if generation produced no file
            or playback failed.
        """
        wav_path = await self.generate(text, instruction)
        if not wav_path:
            return False
        return await self._play(wav_path)

    async def speak_file(self, wav_path: str | Path) -> bool:
        """Play a previously generated .wav file.

        Returns:
            True if playback succeeded, False otherwise.
        """
        return await self._play(wav_path)

    @staticmethod
    async def _play(wav_path: str | Path) -> bool:
        """Play a .wav file using pygame.mixer, polling without blocking the loop.

        The mixer is always stopped and shut down once it has been
        initialised, including when loading/playing raises or the awaiting
        task is cancelled mid-playback.

        Returns:
            True if playback ran to completion, False if any error occurred
            (the error is logged).
        """
        try:
            import pygame

            # Accept plain strings as well as Path objects.
            audio_file = Path(wav_path)
            pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
            try:
                pygame.mixer.music.load(str(audio_file))
                pygame.mixer.music.play()
                # Wait for playback to finish, yielding to the event loop.
                while pygame.mixer.music.get_busy():
                    await asyncio.sleep(0.05)
            finally:
                # Release the audio device even on error or cancellation.
                pygame.mixer.music.stop()
                pygame.mixer.quit()
            logger.info("Playback finished: %s", audio_file.name)
            return True
        except Exception:
            logger.exception("Playback failed for %s", wav_path)
            return False

    def set_voice(self, voice: str) -> None:
        """Switch to a different voice preset or custom sample path."""
        self.voice = voice
        logger.info("Voice set to: %s", voice)

    def set_instruction(self, instruction: str) -> None:
        """Update the default style instruction."""
        self.instruction = instruction
        logger.info("TTS instruction updated: %s", instruction)