feat: switch TTS to cloned voice mode with sample validation & recorder

- tts.py: voice_sample param replaces preset voice, add validate_voice_sample(),
  add record_voice_sample() with CLI (python tts.py record), validate .wav
  format/duration/channels on init
- main.py: warn at startup if voice sample missing, show voice status in banner
- .env.example: QWEN_TTS_VOICE now points to voices/echo_voice.wav
- .gitignore: voice samples gitignored (personal data)
- voices/README.md: instructions for recording & placing voice samples
This commit is contained in:
Echo Assistant 2026-03-31 00:31:56 +00:00
parent d6b64d04d1
commit 19a283ec0f
4 changed files with 226 additions and 35 deletions

View File

@ -16,9 +16,9 @@ OPENROUTER_MODEL=qwen/qwen-3-235b-a22b
VOSK_MODEL_PATH=models/vosk-model-small-en-us
WAKE_WORD=echo
# --- Qwen3-TTS (optional overrides) ---
# Available preset voices: Ryan, Serena, Diana, etc.
# Or set a path to a 3-second .wav sample for voice cloning
# --- Qwen3-TTS — Cloned Voice (required) ---
# Path to your 3-second .wav voice sample (16 kHz mono, 16-bit).
# Record one with: python tts.py record
QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
QWEN_TTS_VOICE=Ryan
QWEN_TTS_VOICE=voices/echo_voice.wav
QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational.

5
.gitignore vendored
View File

@ -21,6 +21,11 @@ models/
audio_output/
!audio_output/.gitkeep
# Voice samples (personal — keep local)
voices/
!voices/.gitkeep
!voices/README.md
# Environment & secrets
.env
.env.local

15
main.py
View File

@ -78,10 +78,11 @@ class EchoAssistant:
model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
)
# --- TTS ---
# --- TTS (Cloned Voice) ---
voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
self.tts = TTSEngine(
model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"),
voice_sample=voice_path,
instruction=os.environ.get(
"QWEN_TTS_INSTRUCT",
"Speak clearly with a warm, friendly tone. Be natural and conversational.",
@ -92,6 +93,13 @@ class EchoAssistant:
self._processing = False # guard against concurrent commands
self._shutdown_event = asyncio.Event()
if not Path(voice_path).exists():
logger.warning(
"No voice sample at '%s' — TTS will not work until you record one. "
"Run: python tts.py record",
voice_path,
)
logger.info("Echo assistant initialized (wake word: '%s')", wake_word)
# ------------------------------------------------------------------
@ -238,7 +246,10 @@ class EchoAssistant:
async def start(self):
"""Start the Echo assistant."""
logger.info("=" * 60)
voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
logger.info(" ECHO VOICE ASSISTANT")
logger.info(" Voice: %s (%s)", voice_path,
"" if Path(voice_path).exists() else "❌ missing")
logger.info(" Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper())
logger.info(" Press Ctrl+C to quit")
logger.info("=" * 60)

233
tts.py
View File

@ -3,40 +3,170 @@ tts.py — Text-To-Speech Module (Qwen3-TTS)
Responsibilities:
1. Accept text (full or partial sentence) and generate a .wav audio file
using the Qwen3-TTS model running locally.
2. Support voice selection (preset voices or custom voice cloning).
3. Support instruction-based style control (e.g., energy, tone).
4. Play the generated audio immediately.
using the Qwen3-TTS model running locally with a **cloned voice**.
2. Validate the voice sample on init (must be a 2–5 second .wav file).
3. Provide a built-in recorder so users can create their voice sample
directly from the assistant.
4. Support instruction-based style control (e.g., energy, tone).
5. Play the generated audio immediately.
Cloned Voice Workflow:
- Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio
sample (recommended: 3 seconds, clean speech, no background noise).
- Place your sample at the path specified by QWEN_TTS_VOICE (default:
voices/echo_voice.wav).
- Or run `python tts.py record` to record a 3-second sample interactively.
Environment Variables:
QWEN_TTS_MODEL model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
QWEN_TTS_VOICE preset voice name or path to 3s .wav sample
QWEN_TTS_MODEL model name or local path
QWEN_TTS_VOICE path to .wav voice sample (required for cloning)
QWEN_TTS_INSTRUCT default style instruction for speech generation
Dependencies:
pip install qwen-tts torch soundfile pygame
pip install qwen-tts torch soundfile pygame pyaudio
"""
import asyncio
import array
import logging
import os
import tempfile
import struct
import wave
from pathlib import Path
import pyaudio
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
DEFAULT_VOICE = "Ryan" # preset voice; alternatives: "Serena", "Diana", etc.
DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational."
DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav"
DEFAULT_INSTRUCTION = (
"Speak clearly with a warm, friendly tone. Be natural and conversational."
)
OUTPUT_DIR = Path("audio_output")
# Recording constants
REC_FORMAT = pyaudio.paInt16
REC_CHANNELS = 1
REC_RATE = 16000
REC_CHUNK = 1024
REC_DURATION = 3 # seconds — optimal for Qwen3 voice cloning
# ---------------------------------------------------------------------------
# Voice sample validation
# ---------------------------------------------------------------------------
def validate_voice_sample(path: str | Path) -> tuple[bool, str]:
"""
Check that a voice sample file exists and meets Qwen3-TTS requirements.
Returns:
(is_valid, reason)
"""
p = Path(path)
if not p.exists():
return False, f"Voice sample not found at: {p.resolve()}. Record one with `python tts.py`"
if p.suffix.lower() != ".wav":
return False, f"Voice sample must be a .wav file, got '{p.suffix}'"
try:
with wave.open(str(p), "rb") as wf:
channels = wf.getnchannels()
sample_width = wf.getsampwidth()
framerate = wf.getframerate()
nframes = wf.getnframes()
duration = nframes / framerate
except Exception as exc:
return False, f"Could not read .wav file: {exc}"
issues = []
if channels != 1:
issues.append(f"expected mono (1 channel), got {channels}")
if framerate < 16000:
issues.append(f"sample rate {framerate} Hz is too low (min 16000)")
if duration < 2:
issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)")
elif duration > 5:
issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)")
if sample_width != 2:
issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit")
if issues:
return False, f"Voice sample issues: {'; '.join(issues)}"
return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit"
# ---------------------------------------------------------------------------
# Voice sample recorder
# ---------------------------------------------------------------------------
def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
    """
    Record a short voice sample from the microphone for voice cloning.

    A countdown is printed while recording; the user should speak naturally
    for the full duration. The recording is saved as a 16 kHz mono 16-bit
    .wav file and then checked with ``validate_voice_sample``; the result of
    that check is printed but does not affect the return value.

    Args:
        output_path: Where to save the .wav file. Parent directories are
            created if they do not exist.
        duration: Recording length in seconds (default 3).

    Returns:
        Path to the saved .wav file.

    Raises:
        ValueError: If ``duration`` does not amount to at least one frame.
    """
    # Work in frames, not whole chunks: truncating the chunk count made a
    # 2-second request come out at ~1.98s, which then failed validation.
    total_frames = int(REC_RATE * duration)
    if total_frames <= 0:
        raise ValueError(f"duration must be positive, got {duration!r}")

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n🎙️  Recording a {duration}-second voice sample for Echo...")
    print(" Speak naturally in a clear voice. No background noise.")
    print(f" Saving to: {output_path.resolve()}\n")

    frames = []
    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
        )
        try:
            recorded = 0
            announced = 0  # whole seconds already reported to the user
            while recorded < total_frames:
                # The final read may be shorter than a full chunk.
                wanted = min(REC_CHUNK, total_frames - recorded)
                frames.append(stream.read(wanted, exception_on_overflow=False))
                recorded += wanted

                elapsed = recorded // REC_RATE
                if elapsed > announced and recorded < total_frames:
                    announced = elapsed
                    # Ceiling division so "1s remaining" means up to a second
                    # is left, rather than flooring to a misleading 0.
                    left = -(-(total_frames - recorded) // REC_RATE)
                    print(f" ... {left}s remaining")
        finally:
            # Release the input device even if a read fails.
            stream.stop_stream()
            stream.close()
    finally:
        pa.terminate()

    # Write .wav
    with wave.open(str(output_path), "wb") as wf:
        wf.setnchannels(REC_CHANNELS)
        wf.setsampwidth(2)  # 16-bit, matching REC_FORMAT (paInt16)
        wf.setframerate(REC_RATE)
        wf.writeframes(b"".join(frames))

    ok, msg = validate_voice_sample(output_path)
    if ok:
        print(f"\n{msg}")
    else:
        print(f"\n⚠️ {msg}")

    print(f" File saved: {output_path.resolve()}\n")
    return output_path
class TTSEngine:
"""
Wrapper around Qwen3-TTS for generating speech from text.
Wrapper around Qwen3-TTS for generating speech with a cloned voice.
The engine lazily loads the model on first use to avoid slow startup.
"""
@ -44,12 +174,12 @@ class TTSEngine:
def __init__(
self,
model_name: str = DEFAULT_MODEL,
voice: str = DEFAULT_VOICE,
voice_sample: str = DEFAULT_VOICE_SAMPLE,
instruction: str = DEFAULT_INSTRUCTION,
output_dir: str | Path = OUTPUT_DIR,
):
self.model_name = model_name
self.voice = voice
self.voice_sample = Path(voice_sample)
self.instruction = instruction
self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True)
@ -58,15 +188,32 @@ class TTSEngine:
self._processor = None
self._lock = asyncio.Lock() # prevent concurrent generation
# Validate voice sample on init
self._validate_voice()
def _validate_voice(self):
    """Run the voice-sample check and log the outcome (never raises on a bad sample)."""
    is_valid, detail = validate_voice_sample(self.voice_sample)
    if not is_valid:
        # Unusable sample: report the problem and point at the recorder.
        logger.warning("🎤 Voice sample issue — %s", detail)
        logger.warning(" Record a sample with: python tts.py record")
        return
    logger.info("🎤 Voice: %s", detail)
# ---- lazy model loading ----
def _ensure_loaded(self):
"""Load model and processor on first call (lazy init)."""
if self._model is not None:
return
logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name)
logger.info(
"Loading Qwen3-TTS model '%s' (this may take a moment)...",
self.model_name,
)
try:
from qwen_tts import QwenTTSProcessor, QwenTTSModel
from qwen_tts import QwenTTSModel, QwenTTSProcessor
self._processor = QwenTTSProcessor()
self._model = QwenTTSModel.from_pretrained(self.model_name)
@ -81,7 +228,7 @@ class TTSEngine:
# ---- generation ----
async def generate(self, text: str, instruction: str | None = None) -> Path | None:
"""
Generate speech audio from text and save as .wav.
Generate speech audio from text using the cloned voice.
Args:
text: The text to convert to speech.
@ -102,16 +249,24 @@ class TTSEngine:
"""Synchronous generation (runs in thread pool)."""
self._ensure_loaded()
# Double-check voice sample before generating
if not self.voice_sample.exists():
logger.error(
"Voice sample missing at '%s' — cannot generate speech",
self.voice_sample.resolve(),
)
return None
output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
try:
# Build voice reference: preset name or custom .wav path
voice_ref = self.voice
if Path(self.voice).exists():
voice_ref = str(Path(self.voice).resolve())
voice_ref = str(self.voice_sample.resolve())
# Generate audio
logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice)
logger.info(
"Generating speech: '%s' (voice=%s)",
text[:60],
self.voice_sample.name,
)
audio_array = self._model.generate(
processor=self._processor,
text=text,
@ -119,12 +274,15 @@ class TTSEngine:
instruction=instruction,
)
# Save to file
import soundfile as sf
sample_rate = self._processor.sampling_rate
sf.write(str(output_path), audio_array, sample_rate)
logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate)
logger.info(
"Audio saved to %s (%.1fs)",
output_path,
len(audio_array) / sample_rate,
)
return output_path
except Exception:
@ -158,7 +316,6 @@ class TTSEngine:
pygame.mixer.music.load(str(wav_path))
pygame.mixer.music.play()
# Wait for playback to finish
while pygame.mixer.music.get_busy():
await asyncio.sleep(0.05)
@ -170,12 +327,30 @@ class TTSEngine:
logger.exception("Playback failed for %s", wav_path)
return False
def set_voice(self, voice: str):
"""Switch to a different voice preset or custom sample path."""
self.voice = voice
logger.info("Voice set to: %s", voice)
def set_voice_sample(self, path: str):
    """Point the engine at another .wav voice sample and re-validate it."""
    new_sample = Path(path)
    self.voice_sample = new_sample
    # Validation only logs; the new sample is kept even if it has issues.
    self._validate_voice()
    logger.info("Voice sample set to: %s", new_sample.resolve())
def set_instruction(self, instruction: str):
    """Replace the stored default style instruction and log the new value."""
    self.instruction = instruction
    logger.info("TTS instruction updated: %s", self.instruction)
# ---------------------------------------------------------------------------
# CLI — record a voice sample directly
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == "record":
        # An optional second argument overrides the default output location.
        target = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
        record_voice_sample(target)
    else:
        # Anything other than the `record` subcommand just shows usage.
        usage_lines = (
            "Usage:",
            f" python {Path(__file__).name} record [output.wav]",
            "",
            "Records a 3-second voice sample for Qwen3-TTS cloning.",
            f"Default output: {DEFAULT_VOICE_SAMPLE}",
        )
        for usage_line in usage_lines:
            print(usage_line)