moxieTalking/tts.py
Echo Assistant 19a283ec0f feat: switch TTS to cloned voice mode with sample validation & recorder
- tts.py: voice_sample param replaces preset voice, add validate_voice_sample(),
  add record_voice_sample() with CLI (python tts.py record), validate .wav
  format/duration/channels on init
- main.py: warn at startup if voice sample missing, show voice status in banner
- .env.example: QWEN_TTS_VOICE now points to voices/echo_voice.wav
- .gitignore: voice samples gitignored (personal data)
- voices/README.md: instructions for recording & placing voice samples
2026-03-31 00:31:56 +00:00

357 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""
tts.py — Text-To-Speech Module (Qwen3-TTS)
Responsibilities:
1. Accept text (full or partial sentence) and generate a .wav audio file
using the Qwen3-TTS model running locally with a **cloned voice**.
2. Validate the voice sample on init (must be a 2-5 second .wav file).
3. Provide a built-in recorder so users can create their voice sample
directly from the assistant.
4. Support instruction-based style control (e.g., energy, tone).
5. Play the generated audio immediately.
Cloned Voice Workflow:
- Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio
sample (recommended: 3 seconds, clean speech, no background noise).
- Place your sample at the path specified by QWEN_TTS_VOICE (default:
voices/echo_voice.wav).
- Or run `python tts.py record` to record a 3-second sample interactively.
Environment Variables:
QWEN_TTS_MODEL — model name or local path
QWEN_TTS_VOICE — path to .wav voice sample (required for cloning)
QWEN_TTS_INSTRUCT — default style instruction for speech generation
Dependencies:
pip install qwen-tts torch soundfile pygame pyaudio
"""
import asyncio
import array
import logging
import os
import struct
import wave
from pathlib import Path
import pyaudio
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
# Default model name (or local path) used by TTSEngine when none is supplied.
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
# Default location of the reference .wav file whose voice is cloned.
DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav"
# Default style instruction sent along with each generation request.
DEFAULT_INSTRUCTION = (
    "Speak clearly with a warm, friendly tone. Be natural and conversational."
)
# Directory that receives the generated .wav files.
OUTPUT_DIR = Path("audio_output")
# Recording constants (used by record_voice_sample when opening the microphone)
REC_FORMAT = pyaudio.paInt16  # 16-bit signed integer samples
REC_CHANNELS = 1  # mono
REC_RATE = 16000  # sample rate in Hz
REC_CHUNK = 1024  # frames requested from the input stream per read
REC_DURATION = 3 # seconds — optimal for Qwen3 voice cloning
# ---------------------------------------------------------------------------
# Voice sample validation
# ---------------------------------------------------------------------------
def validate_voice_sample(path: str | Path) -> tuple[bool, str]:
"""
Check that a voice sample file exists and meets Qwen3-TTS requirements.
Returns:
(is_valid, reason)
"""
p = Path(path)
if not p.exists():
return False, f"Voice sample not found at: {p.resolve()}. Record one with `python tts.py`"
if p.suffix.lower() != ".wav":
return False, f"Voice sample must be a .wav file, got '{p.suffix}'"
try:
with wave.open(str(p), "rb") as wf:
channels = wf.getnchannels()
sample_width = wf.getsampwidth()
framerate = wf.getframerate()
nframes = wf.getnframes()
duration = nframes / framerate
except Exception as exc:
return False, f"Could not read .wav file: {exc}"
issues = []
if channels != 1:
issues.append(f"expected mono (1 channel), got {channels}")
if framerate < 16000:
issues.append(f"sample rate {framerate} Hz is too low (min 16000)")
if duration < 2:
issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)")
elif duration > 5:
issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)")
if sample_width != 2:
issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit")
if issues:
return False, f"Voice sample issues: {'; '.join(issues)}"
return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit"
# ---------------------------------------------------------------------------
# Voice sample recorder
# ---------------------------------------------------------------------------
def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
    """
    Record a short voice sample from the microphone for voice cloning.

    A countdown is printed to the console while recording; the user should
    speak naturally for the full duration. The recording is saved as a
    16 kHz mono 16-bit .wav file and then checked with
    ``validate_voice_sample``; the outcome of that check is printed, not
    raised.

    Args:
        output_path: Where to save the .wav file. Missing parent directories
            are created.
        duration: Recording length in seconds (default 3).

    Returns:
        Path to the saved .wav file.

    Raises:
        ValueError: If ``duration`` is not positive.
    """
    if duration <= 0:
        raise ValueError(f"duration must be positive, got {duration}")

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n🎙️ Recording a {duration}-second voice sample for Echo...")
    print(" Speak naturally in a clear voice. No background noise.")
    print(f" Saving to: {output_path.resolve()}\n")

    # Capture exactly duration * rate frames. Counting whole chunks would
    # truncate the recording (e.g. 2 s -> 1.98 s), which can fall below the
    # minimum length the validator accepts.
    total_frames = int(REC_RATE * duration)
    frames = []

    pa = pyaudio.PyAudio()
    try:
        # Derive the byte width from the capture format so the .wav header
        # always matches the recorded data.
        sample_width = pa.get_sample_size(REC_FORMAT)
        stream = pa.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
        )
        try:
            captured = 0
            seconds_left = total_frames // REC_RATE
            while captured < total_frames:
                # The final read may be shorter than a full chunk.
                wanted = min(REC_CHUNK, total_frames - captured)
                frames.append(stream.read(wanted, exception_on_overflow=False))
                captured += wanted
                # Announce each time the whole-second countdown ticks down.
                now_left = (total_frames - captured) // REC_RATE
                if now_left != seconds_left:
                    seconds_left = now_left
                    print(f" ... {seconds_left}s remaining")
        finally:
            # Release the input stream even if a read fails part-way.
            stream.stop_stream()
            stream.close()
    finally:
        # Always hand the audio backend back, including when open() fails.
        pa.terminate()

    # Write .wav
    with wave.open(str(output_path), "wb") as wf:
        wf.setnchannels(REC_CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(REC_RATE)
        wf.writeframes(b"".join(frames))

    ok, msg = validate_voice_sample(output_path)
    if ok:
        print(f"\n{msg}")
    else:
        print(f"\n⚠️ {msg}")
    print(f" File saved: {output_path.resolve()}\n")
    return output_path
class TTSEngine:
    """
    Wrapper around Qwen3-TTS for generating speech with a cloned voice.

    The engine lazily loads the model on first use to avoid slow startup.
    Generation requests are serialized with an asyncio lock and executed in
    a worker thread so the event loop stays responsive.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        voice_sample: str | Path = DEFAULT_VOICE_SAMPLE,
        instruction: str = DEFAULT_INSTRUCTION,
        output_dir: str | Path = OUTPUT_DIR,
    ) -> None:
        """
        Configure the engine; the model itself is not loaded here.

        Args:
            model_name: Model name or local path passed to the model loader.
            voice_sample: Path to the .wav reference used for cloning.
            instruction: Default style instruction for generation.
            output_dir: Directory for generated .wav files (created if
                missing).
        """
        self.model_name = model_name
        self.voice_sample = Path(voice_sample)
        self.instruction = instruction
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        self._model = None
        self._processor = None
        self._lock = asyncio.Lock()  # prevent concurrent generation
        # Validate voice sample on init (problems are logged, not raised)
        self._validate_voice()

    def _validate_voice(self) -> None:
        """Check the voice sample and log warnings if it's not usable."""
        ok, msg = validate_voice_sample(self.voice_sample)
        if ok:
            logger.info("🎤 Voice: %s", msg)
        else:
            logger.warning("🎤 Voice sample issue — %s", msg)
            logger.warning(
                " Record a sample with: python tts.py record"
            )

    # ---- lazy model loading ----
    def _ensure_loaded(self) -> None:
        """
        Load model and processor on first call (lazy init).

        Raises:
            ImportError: If the TTS package (or one of its imports) is not
                available; the original error is chained as the cause.
        """
        if self._model is not None:
            return
        logger.info(
            "Loading Qwen3-TTS model '%s' (this may take a moment)...",
            self.model_name,
        )
        try:
            # NOTE(review): class names are kept exactly as the original
            # code used them; confirm against the installed qwen-tts version.
            from qwen_tts import QwenTTSModel, QwenTTSProcessor
            processor = QwenTTSProcessor()
            model = QwenTTSModel.from_pretrained(self.model_name)
        except ImportError as exc:
            raise ImportError(
                "qwen-tts is not installed. Install it with:\n"
                " pip install qwen-tts torch soundfile\n"
                "Also ensure you have CUDA-capable GPU for low-latency inference."
            ) from exc
        # Publish both together so a failed load never leaves the engine
        # with a processor but no model.
        self._processor = processor
        self._model = model
        logger.info("Qwen3-TTS model loaded successfully")

    # ---- generation ----
    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
        """
        Generate speech audio from text using the cloned voice.

        Args:
            text: The text to convert to speech.
            instruction: Optional style instruction override.

        Returns:
            Path to the generated .wav file, or None if the text is empty
            or generation fails.
        """
        if not text or not text.strip():
            return None
        async with self._lock:
            return await asyncio.to_thread(
                self._generate_sync, text.strip(), instruction or self.instruction
            )

    def _generate_sync(self, text: str, instruction: str) -> Path | None:
        """
        Synchronous generation (runs in thread pool).

        Returns:
            Path to the written .wav file, or None when the voice sample is
            missing or generation/writing fails (the failure is logged).
        """
        # Cheap check first: without a voice sample there is no point in
        # paying for the model load.
        if not self.voice_sample.exists():
            logger.error(
                "Voice sample missing at '%s' — cannot generate speech",
                self.voice_sample.resolve(),
            )
            return None
        self._ensure_loaded()
        # Random suffix keeps successive outputs from overwriting each other.
        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
        try:
            voice_ref = str(self.voice_sample.resolve())
            logger.info(
                "Generating speech: '%s' (voice=%s)",
                text[:60],
                self.voice_sample.name,
            )
            audio_array = self._model.generate(
                processor=self._processor,
                text=text,
                voice=voice_ref,
                instruction=instruction,
            )
            import soundfile as sf
            sample_rate = self._processor.sampling_rate
            sf.write(str(output_path), audio_array, sample_rate)
            logger.info(
                "Audio saved to %s (%.1fs)",
                output_path,
                len(audio_array) / sample_rate,
            )
            return output_path
        except Exception:
            logger.exception("TTS generation failed for: '%s'", text[:60])
            return None

    # ---- playback ----
    async def speak(self, text: str, instruction: str | None = None) -> bool:
        """
        Generate speech from text and play it immediately.

        Returns:
            True if playback succeeded, False otherwise.
        """
        wav_path = await self.generate(text, instruction)
        if not wav_path:
            return False
        return await self._play(wav_path)

    async def speak_file(self, wav_path: str | Path) -> bool:
        """Play a previously generated .wav file."""
        return await self._play(Path(wav_path))

    @staticmethod
    async def _play(wav_path: Path) -> bool:
        """
        Play a .wav file using pygame.mixer (async-friendly).

        The mixer is initialized for this call and always shut down again,
        including when loading/playing fails or the task is cancelled.

        Returns:
            True if playback ran to completion, False if it failed.
        """
        mixer = None
        try:
            import pygame
            pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
            mixer = pygame.mixer
            mixer.music.load(str(wav_path))
            mixer.music.play()
            # Poll instead of blocking so other tasks keep running.
            while mixer.music.get_busy():
                await asyncio.sleep(0.05)
            logger.info("Playback finished: %s", wav_path.name)
            return True
        except Exception:
            logger.exception("Playback failed for %s", wav_path)
            return False
        finally:
            # Only tear down what was actually initialized.
            if mixer is not None:
                mixer.music.stop()
                mixer.quit()

    def set_voice_sample(self, path: str | Path) -> None:
        """Switch to a different voice sample .wav file."""
        self.voice_sample = Path(path)
        self._validate_voice()
        logger.info("Voice sample set to: %s", self.voice_sample.resolve())

    def set_instruction(self, instruction: str) -> None:
        """Update the default style instruction."""
        self.instruction = instruction
        logger.info("TTS instruction updated: %s", instruction)
# ---------------------------------------------------------------------------
# CLI — record a voice sample directly
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    # Everything after the script name; the first item selects the command.
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == "record":
        # Optional second argument overrides where the sample is written.
        target = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
        record_voice_sample(target)
    else:
        # Any other invocation prints the usage text.
        usage_lines = (
            "Usage:",
            f" python {Path(__file__).name} record [output.wav]",
            "",
            "Records a 3-second voice sample for Qwen3-TTS cloning.",
            f"Default output: {DEFAULT_VOICE_SAMPLE}",
        )
        for usage_line in usage_lines:
            print(usage_line)