- tts.py: voice_sample param replaces preset voice, add validate_voice_sample(), add record_voice_sample() with CLI (python tts.py record), validate .wav format/duration/channels on init - main.py: warn at startup if voice sample missing, show voice status in banner - .env.example: QWEN_TTS_VOICE now points to voices/echo_voice.wav - .gitignore: voice samples gitignored (personal data) - voices/README.md: instructions for recording & placing voice samples
357 lines
12 KiB
Python
357 lines
12 KiB
Python
"""
|
||
tts.py — Text-To-Speech Module (Qwen3-TTS)
|
||
|
||
Responsibilities:
|
||
1. Accept text (full or partial sentence) and generate a .wav audio file
|
||
using the Qwen3-TTS model running locally with a **cloned voice**.
|
||
2. Validate the voice sample on init (must be a 2–5 second .wav file).
|
||
3. Provide a built-in recorder so users can create their voice sample
|
||
directly from the assistant.
|
||
4. Support instruction-based style control (e.g., energy, tone).
|
||
5. Play the generated audio immediately.
|
||
|
||
Cloned Voice Workflow:
|
||
- Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio
|
||
sample (recommended: 3 seconds, clean speech, no background noise).
|
||
- Place your sample at the path specified by QWEN_TTS_VOICE (default:
|
||
voices/echo_voice.wav).
|
||
- Or run `python tts.py record` to record a 3-second sample interactively.
|
||
|
||
Environment Variables:
|
||
QWEN_TTS_MODEL — model name or local path
|
||
QWEN_TTS_VOICE — path to .wav voice sample (required for cloning)
|
||
QWEN_TTS_INSTRUCT — default style instruction for speech generation
|
||
|
||
Dependencies:
|
||
pip install qwen-tts torch soundfile pygame pyaudio
|
||
"""
|
||
|
||
import asyncio
|
||
import array
|
||
import logging
|
||
import os
|
||
import struct
|
||
import wave
|
||
from pathlib import Path
|
||
|
||
import pyaudio
|
||
|
||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Configuration defaults
# ---------------------------------------------------------------------------
# Model identifier (or local path) used when none is supplied.
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"

# Relative path of the .wav file used as the voice-cloning reference.
DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav"

# Style instruction applied when a caller does not provide its own.
DEFAULT_INSTRUCTION = (
    "Speak clearly with a warm, friendly tone. "
    "Be natural and conversational."
)

# Directory that receives generated audio files.
OUTPUT_DIR = Path("audio_output")
|
||
|
||
# Recording constants — microphone capture settings used by the voice-sample
# recorder. They match what the voice-sample validator accepts (mono,
# 16-bit, at least 16 kHz).
REC_FORMAT = pyaudio.paInt16  # 16-bit samples
REC_CHANNELS = 1  # mono
REC_RATE = 16000  # sample rate in Hz
REC_CHUNK = 1024  # frames requested per stream read
REC_DURATION = 3  # seconds — optimal for Qwen3 voice cloning
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Voice sample validation
|
||
# ---------------------------------------------------------------------------
|
||
def validate_voice_sample(path: str | Path) -> tuple[bool, str]:
|
||
"""
|
||
Check that a voice sample file exists and meets Qwen3-TTS requirements.
|
||
|
||
Returns:
|
||
(is_valid, reason)
|
||
"""
|
||
p = Path(path)
|
||
|
||
if not p.exists():
|
||
return False, f"Voice sample not found at: {p.resolve()}. Record one with `python tts.py`"
|
||
|
||
if p.suffix.lower() != ".wav":
|
||
return False, f"Voice sample must be a .wav file, got '{p.suffix}'"
|
||
|
||
try:
|
||
with wave.open(str(p), "rb") as wf:
|
||
channels = wf.getnchannels()
|
||
sample_width = wf.getsampwidth()
|
||
framerate = wf.getframerate()
|
||
nframes = wf.getnframes()
|
||
duration = nframes / framerate
|
||
except Exception as exc:
|
||
return False, f"Could not read .wav file: {exc}"
|
||
|
||
issues = []
|
||
if channels != 1:
|
||
issues.append(f"expected mono (1 channel), got {channels}")
|
||
if framerate < 16000:
|
||
issues.append(f"sample rate {framerate} Hz is too low (min 16000)")
|
||
if duration < 2:
|
||
issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)")
|
||
elif duration > 5:
|
||
issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)")
|
||
if sample_width != 2:
|
||
issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit")
|
||
|
||
if issues:
|
||
return False, f"Voice sample issues: {'; '.join(issues)}"
|
||
|
||
return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Voice sample recorder
|
||
# ---------------------------------------------------------------------------
|
||
def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
    """
    Record a short voice sample from the microphone for voice cloning.

    A countdown is printed while recording; the user should speak naturally
    for the full duration. The recording is saved as a 16 kHz mono 16-bit
    .wav file and then checked with ``validate_voice_sample``; the outcome
    of that check is printed, not raised.

    Args:
        output_path: Where to save the .wav file. Missing parent
            directories are created.
        duration: Recording length in seconds (default 3).

    Returns:
        Path to the saved .wav file.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n🎙️ Recording a {duration}-second voice sample for Echo...")
    print(" Speak naturally in a clear voice. No background noise.")
    print(f" Saving to: {output_path.resolve()}\n")

    # Capture exactly `duration` seconds of audio. Counting whole chunks and
    # truncating would come up short (a 2 s request yields about 1.98 s,
    # which then fails the 2 s minimum enforced by the validator).
    frames_left = int(REC_RATE * duration)
    frames = []

    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
        )
        try:
            while frames_left > 0:
                # The final read may be smaller than a full chunk.
                n_frames = min(REC_CHUNK, frames_left)
                frames.append(stream.read(n_frames, exception_on_overflow=False))

                secs_before = frames_left // REC_RATE
                frames_left -= n_frames
                secs_after = frames_left // REC_RATE
                # Print once each time a whole-second boundary is crossed.
                if secs_after != secs_before:
                    print(f" ... {secs_after}s remaining")
        finally:
            # Release the input stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        # Always release the PyAudio instance, including when open() fails.
        pa.terminate()

    # Write .wav
    with wave.open(str(output_path), "wb") as wf:
        wf.setnchannels(REC_CHANNELS)
        wf.setsampwidth(2)  # 16-bit, matching REC_FORMAT (paInt16)
        wf.setframerate(REC_RATE)
        wf.writeframes(b"".join(frames))

    ok, msg = validate_voice_sample(output_path)
    if ok:
        print(f"\n✅ {msg}")
    else:
        print(f"\n⚠️ {msg}")
    print(f" File saved: {output_path.resolve()}\n")

    return output_path
|
||
|
||
|
||
class TTSEngine:
    """
    Wrapper around Qwen3-TTS for generating speech with a cloned voice.

    The engine lazily loads the model on first use to avoid slow startup.
    Generation requests are serialised with an asyncio lock, and the blocking
    model call runs in a worker thread via ``asyncio.to_thread``.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        voice_sample: str | Path = DEFAULT_VOICE_SAMPLE,
        instruction: str = DEFAULT_INSTRUCTION,
        output_dir: str | Path = OUTPUT_DIR,
    ):
        """
        Configure the engine; the model itself is not loaded here.

        Args:
            model_name: Model name or local path passed to the model loader.
            voice_sample: Path to the .wav file used as the voice reference.
            instruction: Default style instruction for generated speech.
            output_dir: Directory for generated .wav files (created if
                missing).
        """
        self.model_name = model_name
        self.voice_sample = Path(voice_sample)
        self.instruction = instruction
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        self._model = None
        self._processor = None
        self._lock = asyncio.Lock()  # prevent concurrent generation

        # Validate voice sample on init (logs a warning; never raises)
        self._validate_voice()

    def _validate_voice(self):
        """Check the voice sample and log warnings if it's not usable."""
        ok, msg = validate_voice_sample(self.voice_sample)
        if ok:
            logger.info("🎤 Voice: %s", msg)
        else:
            logger.warning("🎤 Voice sample issue — %s", msg)
            logger.warning(
                " Record a sample with: python tts.py record"
            )

    # ---- lazy model loading ----
    def _ensure_loaded(self):
        """
        Load model and processor on first call (lazy init).

        Raises:
            ImportError: If the qwen-tts package (or something it imports)
                is unavailable. The original error is chained as the cause.
        """
        if self._model is not None:
            return

        logger.info(
            "Loading Qwen3-TTS model '%s' (this may take a moment)...",
            self.model_name,
        )
        try:
            # NOTE(review): class names are used as found in this file —
            # confirm they match the installed qwen-tts release.
            from qwen_tts import QwenTTSModel, QwenTTSProcessor

            processor = QwenTTSProcessor()
            model = QwenTTSModel.from_pretrained(self.model_name)
        except ImportError as exc:
            raise ImportError(
                "qwen-tts is not installed. Install it with:\n"
                " pip install qwen-tts torch soundfile\n"
                "Also ensure you have CUDA-capable GPU for low-latency inference."
            ) from exc

        # Publish both together so a failed load never leaves a processor
        # behind without its model.
        self._processor = processor
        self._model = model
        logger.info("Qwen3-TTS model loaded successfully")

    # ---- generation ----
    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
        """
        Generate speech audio from text using the cloned voice.

        Args:
            text: The text to convert to speech.
            instruction: Optional style instruction override; falls back to
                the engine's default instruction when omitted or empty.

        Returns:
            Path to the generated .wav file, or None when the text is blank,
            the voice sample is missing, or generation fails.

        Raises:
            ImportError: If the model cannot be loaded because qwen-tts is
                not installed (other model-loading errors also propagate).
        """
        if not text or not text.strip():
            return None

        async with self._lock:
            return await asyncio.to_thread(
                self._generate_sync, text.strip(), instruction or self.instruction
            )

    def _generate_sync(self, text: str, instruction: str) -> Path | None:
        """
        Synchronous generation (runs in thread pool).

        Returns the path of the written .wav file, or None if the voice
        sample is missing or generation/writing fails. Errors raised while
        loading the model are not caught here.
        """
        # Cheap check first: don't pay for loading the model when there is
        # no voice sample to clone from.
        if not self.voice_sample.exists():
            logger.error(
                "Voice sample missing at '%s' — cannot generate speech",
                self.voice_sample.resolve(),
            )
            return None

        self._ensure_loaded()

        # Random suffix keeps successive outputs from overwriting each other.
        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"

        try:
            voice_ref = str(self.voice_sample.resolve())

            logger.info(
                "Generating speech: '%s' (voice=%s)",
                text[:60],
                self.voice_sample.name,
            )
            audio_array = self._model.generate(
                processor=self._processor,
                text=text,
                voice=voice_ref,
                instruction=instruction,
            )

            import soundfile as sf

            sample_rate = self._processor.sampling_rate
            sf.write(str(output_path), audio_array, sample_rate)
            logger.info(
                "Audio saved to %s (%.1fs)",
                output_path,
                len(audio_array) / sample_rate,
            )
            return output_path

        except Exception:
            # Boundary for the worker thread: log with traceback and report
            # failure through the None return value.
            logger.exception("TTS generation failed for: '%s'", text[:60])
            return None

    # ---- playback ----
    async def speak(self, text: str, instruction: str | None = None) -> bool:
        """
        Generate speech from text and play it immediately.

        Returns:
            True if playback succeeded, False otherwise.
        """
        wav_path = await self.generate(text, instruction)
        if not wav_path:
            return False
        return await self._play(wav_path)

    async def speak_file(self, wav_path: Path) -> bool:
        """Play a previously generated .wav file."""
        return await self._play(wav_path)

    @staticmethod
    async def _play(wav_path: Path) -> bool:
        """
        Play a .wav file using pygame.mixer (async-friendly).

        Polls the mixer with short sleeps so other tasks on the event loop
        can keep running during playback.

        Returns:
            True if playback finished, False if anything went wrong.
        """
        # Accept plain strings too; `.name` is used for logging below.
        wav_path = Path(wav_path)
        try:
            import pygame

            pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
            try:
                pygame.mixer.music.load(str(wav_path))
                pygame.mixer.music.play()

                while pygame.mixer.music.get_busy():
                    await asyncio.sleep(0.05)

                pygame.mixer.music.stop()
            finally:
                # Shut the mixer down on every exit path, including load
                # errors and task cancellation.
                pygame.mixer.quit()

            logger.info("Playback finished: %s", wav_path.name)
            return True
        except Exception:
            logger.exception("Playback failed for %s", wav_path)
            return False

    def set_voice_sample(self, path: str | Path):
        """Switch to a different voice sample .wav file and re-validate it."""
        self.voice_sample = Path(path)
        self._validate_voice()
        logger.info("Voice sample set to: %s", self.voice_sample.resolve())

    def set_instruction(self, instruction: str):
        """Update the default style instruction."""
        self.instruction = instruction
        logger.info("TTS instruction updated: %s", instruction)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# CLI — record a voice sample directly
|
||
# ---------------------------------------------------------------------------
|
||
if __name__ == "__main__":
    import sys

    # Everything after the script name.
    cli_args = sys.argv[1:]

    if cli_args[:1] == ["record"]:
        # Optional second argument overrides the default output location.
        target = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
        record_voice_sample(target)
    else:
        # Any other invocation prints usage help.
        usage_lines = (
            "Usage:",
            f" python {Path(__file__).name} record [output.wav]",
            "",
            "Records a 3-second voice sample for Qwen3-TTS cloning.",
            f"Default output: {DEFAULT_VOICE_SAMPLE}",
        )
        print("\n".join(usage_lines))
|