- stt.py: WakeWordListener (openWakeWord) + Transcriber (Vosk) - brain.py: Async OpenRouter streaming client with command parsing - tts.py: Qwen3-TTS engine with voice selection & instruction control - actions.py: 10 local OS commands (open_app, set_timer, search, etc.) - main.py: Async orchestrator with Phase 5 parallel TTS streaming
182 lines
6.2 KiB
Python
182 lines
6.2 KiB
Python
"""
|
|
tts.py — Text-To-Speech Module (Qwen3-TTS)
|
|
|
|
Responsibilities:
|
|
1. Accept text (full or partial sentence) and generate a .wav audio file
|
|
using the Qwen3-TTS model running locally.
|
|
2. Support voice selection (preset voices or custom voice cloning).
|
|
3. Support instruction-based style control (e.g., energy, tone).
|
|
4. Play the generated audio immediately.
|
|
|
|
Environment Variables:
|
|
QWEN_TTS_MODEL — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
|
|
QWEN_TTS_VOICE — preset voice name or path to 3s .wav sample
|
|
QWEN_TTS_INSTRUCT — default style instruction for speech generation
|
|
|
|
Dependencies:
|
|
pip install qwen-tts torch soundfile pygame
|
|
"""
|
|
|
|
import asyncio
|
|
import logging
|
|
import os
|
|
import tempfile
|
|
from pathlib import Path
|
|
|
|
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Each default may be overridden through an environment variable
# (QWEN_TTS_MODEL, QWEN_TTS_VOICE, QWEN_TTS_INSTRUCT). `or` is used instead of
# a .get() default so that a variable set to the empty string also falls back
# to the built-in value.

# Model name or local path handed to the model loader.
DEFAULT_MODEL = os.environ.get("QWEN_TTS_MODEL") or "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"

# Preset voice name, or a path to a reference .wav sample.
DEFAULT_VOICE = os.environ.get("QWEN_TTS_VOICE") or "Ryan"  # alternatives: "Serena", "Diana", etc.

# Style instruction used when a call does not supply its own.
DEFAULT_INSTRUCTION = os.environ.get("QWEN_TTS_INSTRUCT") or (
    "Speak clearly with a warm, friendly tone. Be natural and conversational."
)

# Directory (relative to the working directory) where generated audio is written.
OUTPUT_DIR = Path("audio_output")
|
|
|
|
|
|
class TTSEngine:
    """
    Wrapper around Qwen3-TTS for generating speech from text.

    The engine lazily loads the model on first use to avoid slow startup.

    Attributes:
        model_name: Model name or local path passed to the model loader.
        voice: Preset voice name, or path to a reference .wav file.
        instruction: Default style instruction used when a call gives none.
        output_dir: Directory that receives the generated .wav files
            (created in ``__init__`` if missing).
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        voice: str = DEFAULT_VOICE,
        instruction: str = DEFAULT_INSTRUCTION,
        output_dir: str | Path = OUTPUT_DIR,
    ):
        """Store the configuration and create the output directory.

        The model itself is NOT loaded here; see ``_ensure_loaded``.
        """
        self.model_name = model_name
        self.voice = voice
        self.instruction = instruction
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self._model = None
        self._processor = None
        self._lock = asyncio.Lock()  # prevent concurrent generation

    # ---- lazy model loading ----
    def _ensure_loaded(self):
        """Load model and processor on first call (lazy init).

        Raises:
            ImportError: if the ``qwen_tts`` package cannot be imported.
            Exception: whatever the processor/model constructors raise is
                propagated unchanged.
        """
        if self._model is not None:
            return

        logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name)
        try:
            # NOTE(review): confirm that the installed qwen-tts release exports
            # these names and the generate() signature used below; that cannot
            # be verified from this file.
            from qwen_tts import QwenTTSProcessor, QwenTTSModel
        except ImportError as exc:
            # Only the import is guarded, so the "not installed" hint is not
            # shown for unrelated failures inside model construction.
            raise ImportError(
                "qwen-tts is not installed. Install it with:\n"
                " pip install qwen-tts torch soundfile\n"
                "Also ensure you have CUDA-capable GPU for low-latency inference."
            ) from exc

        # Build both objects before publishing either, so a failure in
        # from_pretrained() does not leave a processor without a model.
        processor = QwenTTSProcessor()
        model = QwenTTSModel.from_pretrained(self.model_name)
        self._processor = processor
        self._model = model
        logger.info("Qwen3-TTS model loaded successfully")

    # ---- generation ----
    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
        """
        Generate speech audio from text and save as .wav.

        Generation runs in a worker thread (``asyncio.to_thread``) and is
        serialized by an internal lock, so concurrent calls run one at a time.

        Args:
            text: The text to convert to speech.
            instruction: Optional style instruction override; falls back to
                ``self.instruction`` when None or empty.

        Returns:
            Path to the generated .wav file, or None when ``text`` is
            empty/whitespace or generation fails.

        Raises:
            ImportError: if the model has to be loaded and ``qwen_tts`` is
                missing (model-loading errors are not converted to None).
        """
        cleaned = text.strip() if text else ""
        if not cleaned:
            return None

        async with self._lock:
            return await asyncio.to_thread(
                self._generate_sync, cleaned, instruction or self.instruction
            )

    def _generate_sync(self, text: str, instruction: str) -> Path | None:
        """Synchronous generation (runs in thread pool).

        Returns the path of the written .wav, or None if generation or the
        file write raised. Errors from ``_ensure_loaded`` propagate.
        """
        self._ensure_loaded()

        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
        # Read the voice once: set_voice() may be called from another thread
        # while this method runs in the worker thread.
        voice = self.voice

        try:
            # Build voice reference: preset name or custom .wav path.
            # is_file() rather than exists(): a directory that happens to share
            # the preset's name (or "" resolving to the cwd) is not a sample.
            voice_ref = voice
            if Path(voice).is_file():
                voice_ref = str(Path(voice).resolve())

            # Generate audio
            logger.info("Generating speech: '%s' (voice=%s)", text[:60], voice)
            audio_array = self._model.generate(
                processor=self._processor,
                text=text,
                voice=voice_ref,
                instruction=instruction,
            )

            # Save to file (imported here so the dependency is only needed
            # once audio is actually produced).
            import soundfile as sf

            sample_rate = self._processor.sampling_rate
            sf.write(str(output_path), audio_array, sample_rate)
            logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate)
            return output_path

        except Exception:
            logger.exception("TTS generation failed for: '%s'", text[:60])
            # Do not leave a partially written file behind.
            try:
                output_path.unlink(missing_ok=True)
            except OSError:
                logger.debug("Could not remove partial output %s", output_path)
            return None

    # ---- playback ----
    async def speak(self, text: str, instruction: str | None = None) -> bool:
        """
        Generate speech from text and play it immediately.

        Returns:
            True if playback succeeded, False if nothing was generated or
            playback failed.
        """
        wav_path = await self.generate(text, instruction)
        if not wav_path:
            return False
        return await self._play(wav_path)

    async def speak_file(self, wav_path: Path) -> bool:
        """Play a previously generated .wav file."""
        return await self._play(wav_path)

    @staticmethod
    async def _play(wav_path: Path) -> bool:
        """Play a .wav file using pygame.mixer (async-friendly).

        The mixer is initialised for this call and shut down afterwards; the
        event loop is yielded to every 50 ms while audio is playing.

        Returns:
            True once playback has finished, False if anything raised.
        """
        try:
            import pygame

            # Accept str as well as Path so the final log line cannot fail
            # after a successful playback.
            wav_file = Path(wav_path)

            # NOTE(review): the mixer is opened at a fixed 22050 Hz mono while
            # the file is written at the processor's sampling rate — confirm
            # the resulting playback quality is acceptable.
            pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
            try:
                pygame.mixer.music.load(str(wav_file))
                pygame.mixer.music.play()

                # Wait for playback to finish
                while pygame.mixer.music.get_busy():
                    await asyncio.sleep(0.05)

                pygame.mixer.music.stop()
            finally:
                # Always release the audio device, including on load errors
                # and when the awaiting task is cancelled mid-playback.
                pygame.mixer.quit()

            logger.info("Playback finished: %s", wav_file.name)
            return True
        except Exception:
            logger.exception("Playback failed for %s", wav_path)
            return False

    def set_voice(self, voice: str):
        """Switch to a different voice preset or custom sample path."""
        self.voice = voice
        logger.info("Voice set to: %s", voice)

    def set_instruction(self, instruction: str):
        """Update the default style instruction."""
        self.instruction = instruction
        logger.info("TTS instruction updated: %s", instruction)