feat: switch TTS to cloned voice mode with sample validation & recorder

- tts.py: voice_sample param replaces preset voice, add validate_voice_sample(),
  add record_voice_sample() with CLI (python tts.py record), validate .wav
  format/duration/channels on init
- main.py: warn at startup if voice sample missing, show voice status in banner
- .env.example: QWEN_TTS_VOICE now points to voices/echo_voice.wav
- .gitignore: voice samples gitignored (personal data)
- voices/README.md: instructions for recording & placing voice samples
This commit is contained in:
Echo Assistant 2026-03-31 00:31:56 +00:00
parent d6b64d04d1
commit 19a283ec0f
4 changed files with 226 additions and 35 deletions

View File

@ -16,9 +16,9 @@ OPENROUTER_MODEL=qwen/qwen-3-235b-a22b
VOSK_MODEL_PATH=models/vosk-model-small-en-us VOSK_MODEL_PATH=models/vosk-model-small-en-us
WAKE_WORD=echo WAKE_WORD=echo
# --- Qwen3-TTS (optional overrides) --- # --- Qwen3-TTS — Cloned Voice (required) ---
# Available preset voices: Ryan, Serena, Diana, etc. # Path to your 3-second .wav voice sample (16 kHz mono, 16-bit).
# Or set a path to a 3-second .wav sample for voice cloning # Record one with: python tts.py record
QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
QWEN_TTS_VOICE=Ryan QWEN_TTS_VOICE=voices/echo_voice.wav
QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational. QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational.

5
.gitignore vendored
View File

@ -21,6 +21,11 @@ models/
audio_output/ audio_output/
!audio_output/.gitkeep !audio_output/.gitkeep
# Voice samples (personal — keep local)
voices/
!voices/.gitkeep
!voices/README.md
# Environment & secrets # Environment & secrets
.env .env
.env.local .env.local

15
main.py
View File

@ -78,10 +78,11 @@ class EchoAssistant:
model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"), model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
) )
# --- TTS --- # --- TTS (Cloned Voice) ---
voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
self.tts = TTSEngine( self.tts = TTSEngine(
model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"), model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"), voice_sample=voice_path,
instruction=os.environ.get( instruction=os.environ.get(
"QWEN_TTS_INSTRUCT", "QWEN_TTS_INSTRUCT",
"Speak clearly with a warm, friendly tone. Be natural and conversational.", "Speak clearly with a warm, friendly tone. Be natural and conversational.",
@ -92,6 +93,13 @@ class EchoAssistant:
self._processing = False # guard against concurrent commands self._processing = False # guard against concurrent commands
self._shutdown_event = asyncio.Event() self._shutdown_event = asyncio.Event()
if not Path(voice_path).exists():
logger.warning(
"No voice sample at '%s' — TTS will not work until you record one. "
"Run: python tts.py record",
voice_path,
)
logger.info("Echo assistant initialized (wake word: '%s')", wake_word) logger.info("Echo assistant initialized (wake word: '%s')", wake_word)
# ------------------------------------------------------------------ # ------------------------------------------------------------------
@ -238,7 +246,10 @@ class EchoAssistant:
async def start(self): async def start(self):
"""Start the Echo assistant.""" """Start the Echo assistant."""
logger.info("=" * 60) logger.info("=" * 60)
voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
logger.info(" ECHO VOICE ASSISTANT") logger.info(" ECHO VOICE ASSISTANT")
logger.info(" Voice: %s (%s)", voice_path,
"" if Path(voice_path).exists() else "❌ missing")
logger.info(" Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper()) logger.info(" Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper())
logger.info(" Press Ctrl+C to quit") logger.info(" Press Ctrl+C to quit")
logger.info("=" * 60) logger.info("=" * 60)

233
tts.py
View File

@ -3,40 +3,170 @@ tts.py — Text-To-Speech Module (Qwen3-TTS)
Responsibilities: Responsibilities:
1. Accept text (full or partial sentence) and generate a .wav audio file 1. Accept text (full or partial sentence) and generate a .wav audio file
using the Qwen3-TTS model running locally. using the Qwen3-TTS model running locally with a **cloned voice**.
2. Support voice selection (preset voices or custom voice cloning). 2. Validate the voice sample on init (must be a 2–5 second .wav file).
3. Support instruction-based style control (e.g., energy, tone). 3. Provide a built-in recorder so users can create their voice sample
4. Play the generated audio immediately. directly from the assistant.
4. Support instruction-based style control (e.g., energy, tone).
5. Play the generated audio immediately.
Cloned Voice Workflow:
- Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio
sample (recommended: 3 seconds, clean speech, no background noise).
- Place your sample at the path specified by QWEN_TTS_VOICE (default:
voices/echo_voice.wav).
- Or run `python tts.py record` to record a 3-second sample interactively.
Environment Variables: Environment Variables:
QWEN_TTS_MODEL model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) QWEN_TTS_MODEL model name or local path
QWEN_TTS_VOICE preset voice name or path to 3s .wav sample QWEN_TTS_VOICE path to .wav voice sample (required for cloning)
QWEN_TTS_INSTRUCT default style instruction for speech generation QWEN_TTS_INSTRUCT default style instruction for speech generation
Dependencies: Dependencies:
pip install qwen-tts torch soundfile pygame pip install qwen-tts torch soundfile pygame pyaudio
""" """
import asyncio import asyncio
import array
import logging import logging
import os import os
import tempfile import struct
import wave
from pathlib import Path from pathlib import Path
import pyaudio
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Configuration # Configuration
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
DEFAULT_VOICE = "Ryan" # preset voice; alternatives: "Serena", "Diana", etc. DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav"
DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational." DEFAULT_INSTRUCTION = (
"Speak clearly with a warm, friendly tone. Be natural and conversational."
)
OUTPUT_DIR = Path("audio_output") OUTPUT_DIR = Path("audio_output")
# Recording constants
# These values are passed to the PyAudio input stream opened by
# record_voice_sample() and are reused when writing the .wav header.
# Sample format handed to PyAudio (paInt16); the recorder writes the
# resulting frames with a 2-byte sample width.
REC_FORMAT = pyaudio.paInt16
# Channel count; validate_voice_sample() rejects anything but 1 (mono).
REC_CHANNELS = 1
# Sample rate in Hz; validate_voice_sample() requires at least 16000.
REC_RATE = 16000
# Frames requested per stream.read() call / PyAudio buffer.
REC_CHUNK = 1024
REC_DURATION = 3 # seconds — optimal for Qwen3 voice cloning
# ---------------------------------------------------------------------------
# Voice sample validation
# ---------------------------------------------------------------------------
def validate_voice_sample(path: str | Path) -> tuple[bool, str]:
"""
Check that a voice sample file exists and meets Qwen3-TTS requirements.
Returns:
(is_valid, reason)
"""
p = Path(path)
if not p.exists():
return False, f"Voice sample not found at: {p.resolve()}. Record one with `python tts.py`"
if p.suffix.lower() != ".wav":
return False, f"Voice sample must be a .wav file, got '{p.suffix}'"
try:
with wave.open(str(p), "rb") as wf:
channels = wf.getnchannels()
sample_width = wf.getsampwidth()
framerate = wf.getframerate()
nframes = wf.getnframes()
duration = nframes / framerate
except Exception as exc:
return False, f"Could not read .wav file: {exc}"
issues = []
if channels != 1:
issues.append(f"expected mono (1 channel), got {channels}")
if framerate < 16000:
issues.append(f"sample rate {framerate} Hz is too low (min 16000)")
if duration < 2:
issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)")
elif duration > 5:
issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)")
if sample_width != 2:
issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit")
if issues:
return False, f"Voice sample issues: {'; '.join(issues)}"
return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit"
# ---------------------------------------------------------------------------
# Voice sample recorder
# ---------------------------------------------------------------------------
def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
    """
    Record a short voice sample from the microphone for voice cloning.

    A countdown of the whole seconds remaining is printed while recording;
    the user should speak naturally for the full duration. The recording is
    saved as a 16 kHz mono 16-bit .wav file and then checked with
    validate_voice_sample(); the outcome is printed, not raised.

    Args:
        output_path: Where to save the .wav file. Missing parent
            directories are created.
        duration: Recording length in seconds (default 3).

    Returns:
        Path to the saved .wav file.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n🎙️ Recording a {duration}-second voice sample for Echo...")
    print(" Speak naturally in a clear voice. No background noise.")
    print(f" Saving to: {output_path.resolve()}\n")

    # Only whole chunks are read, so the capture may be a fraction of a
    # chunk shorter than `duration`.
    total_chunks = int(REC_RATE / REC_CHUNK * duration)
    frames = []

    pa = pyaudio.PyAudio()
    try:
        # Derive the byte width from the capture format so the .wav header
        # always matches the data actually recorded.
        sample_width = pa.get_sample_size(REC_FORMAT)
        stream = pa.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
        )
        try:
            last_shown = int(duration)
            for i in range(total_chunks):
                frames.append(stream.read(REC_CHUNK, exception_on_overflow=False))
                # Print once each time the whole-second remainder changes.
                whole_left = int(duration - (i + 1) * REC_CHUNK / REC_RATE)
                if whole_left != last_shown:
                    print(f" ... {whole_left}s remaining")
                    last_shown = whole_left
        finally:
            # Release the input device even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        pa.terminate()

    # Write .wav
    with wave.open(str(output_path), "wb") as wf:
        wf.setnchannels(REC_CHANNELS)
        wf.setsampwidth(sample_width)
        wf.setframerate(REC_RATE)
        wf.writeframes(b"".join(frames))

    ok, msg = validate_voice_sample(output_path)
    if ok:
        print(f"\n{msg}")
    else:
        print(f"\n⚠️ {msg}")
    print(f" File saved: {output_path.resolve()}\n")
    return output_path
class TTSEngine: class TTSEngine:
""" """
Wrapper around Qwen3-TTS for generating speech from text. Wrapper around Qwen3-TTS for generating speech with a cloned voice.
The engine lazily loads the model on first use to avoid slow startup. The engine lazily loads the model on first use to avoid slow startup.
""" """
@ -44,12 +174,12 @@ class TTSEngine:
def __init__( def __init__(
self, self,
model_name: str = DEFAULT_MODEL, model_name: str = DEFAULT_MODEL,
voice: str = DEFAULT_VOICE, voice_sample: str = DEFAULT_VOICE_SAMPLE,
instruction: str = DEFAULT_INSTRUCTION, instruction: str = DEFAULT_INSTRUCTION,
output_dir: str | Path = OUTPUT_DIR, output_dir: str | Path = OUTPUT_DIR,
): ):
self.model_name = model_name self.model_name = model_name
self.voice = voice self.voice_sample = Path(voice_sample)
self.instruction = instruction self.instruction = instruction
self.output_dir = Path(output_dir) self.output_dir = Path(output_dir)
self.output_dir.mkdir(parents=True, exist_ok=True) self.output_dir.mkdir(parents=True, exist_ok=True)
@ -58,15 +188,32 @@ class TTSEngine:
self._processor = None self._processor = None
self._lock = asyncio.Lock() # prevent concurrent generation self._lock = asyncio.Lock() # prevent concurrent generation
# Validate voice sample on init
self._validate_voice()
def _validate_voice(self):
    """Validate ``self.voice_sample`` and log the outcome.

    Logs the summary at INFO level when the sample is usable; otherwise
    logs the reported problem and a recording hint at WARNING level.
    Nothing is raised and nothing is returned.
    """
    is_usable, detail = validate_voice_sample(self.voice_sample)
    if not is_usable:
        logger.warning("🎤 Voice sample issue — %s", detail)
        logger.warning(" Record a sample with: python tts.py record")
        return
    logger.info("🎤 Voice: %s", detail)
# ---- lazy model loading ---- # ---- lazy model loading ----
def _ensure_loaded(self): def _ensure_loaded(self):
"""Load model and processor on first call (lazy init).""" """Load model and processor on first call (lazy init)."""
if self._model is not None: if self._model is not None:
return return
logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name) logger.info(
"Loading Qwen3-TTS model '%s' (this may take a moment)...",
self.model_name,
)
try: try:
from qwen_tts import QwenTTSProcessor, QwenTTSModel from qwen_tts import QwenTTSModel, QwenTTSProcessor
self._processor = QwenTTSProcessor() self._processor = QwenTTSProcessor()
self._model = QwenTTSModel.from_pretrained(self.model_name) self._model = QwenTTSModel.from_pretrained(self.model_name)
@ -81,7 +228,7 @@ class TTSEngine:
# ---- generation ---- # ---- generation ----
async def generate(self, text: str, instruction: str | None = None) -> Path | None: async def generate(self, text: str, instruction: str | None = None) -> Path | None:
""" """
Generate speech audio from text and save as .wav. Generate speech audio from text using the cloned voice.
Args: Args:
text: The text to convert to speech. text: The text to convert to speech.
@ -102,16 +249,24 @@ class TTSEngine:
"""Synchronous generation (runs in thread pool).""" """Synchronous generation (runs in thread pool)."""
self._ensure_loaded() self._ensure_loaded()
# Double-check voice sample before generating
if not self.voice_sample.exists():
logger.error(
"Voice sample missing at '%s' — cannot generate speech",
self.voice_sample.resolve(),
)
return None
output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav" output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
try: try:
# Build voice reference: preset name or custom .wav path voice_ref = str(self.voice_sample.resolve())
voice_ref = self.voice
if Path(self.voice).exists():
voice_ref = str(Path(self.voice).resolve())
# Generate audio logger.info(
logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice) "Generating speech: '%s' (voice=%s)",
text[:60],
self.voice_sample.name,
)
audio_array = self._model.generate( audio_array = self._model.generate(
processor=self._processor, processor=self._processor,
text=text, text=text,
@ -119,12 +274,15 @@ class TTSEngine:
instruction=instruction, instruction=instruction,
) )
# Save to file
import soundfile as sf import soundfile as sf
sample_rate = self._processor.sampling_rate sample_rate = self._processor.sampling_rate
sf.write(str(output_path), audio_array, sample_rate) sf.write(str(output_path), audio_array, sample_rate)
logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate) logger.info(
"Audio saved to %s (%.1fs)",
output_path,
len(audio_array) / sample_rate,
)
return output_path return output_path
except Exception: except Exception:
@ -158,7 +316,6 @@ class TTSEngine:
pygame.mixer.music.load(str(wav_path)) pygame.mixer.music.load(str(wav_path))
pygame.mixer.music.play() pygame.mixer.music.play()
# Wait for playback to finish
while pygame.mixer.music.get_busy(): while pygame.mixer.music.get_busy():
await asyncio.sleep(0.05) await asyncio.sleep(0.05)
@ -170,12 +327,30 @@ class TTSEngine:
logger.exception("Playback failed for %s", wav_path) logger.exception("Playback failed for %s", wav_path)
return False return False
def set_voice(self, voice: str): def set_voice_sample(self, path: str):
"""Switch to a different voice preset or custom sample path.""" """Switch to a different voice sample .wav file."""
self.voice = voice self.voice_sample = Path(path)
logger.info("Voice set to: %s", voice) self._validate_voice()
logger.info("Voice sample set to: %s", self.voice_sample.resolve())
def set_instruction(self, instruction: str): def set_instruction(self, instruction: str):
"""Update the default style instruction.""" """Update the default style instruction."""
self.instruction = instruction self.instruction = instruction
logger.info("TTS instruction updated: %s", instruction) logger.info("TTS instruction updated: %s", instruction)
# ---------------------------------------------------------------------------
# CLI — record a voice sample directly
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import sys

    # Everything after the script name: `record [output.wav]`.
    cli_args = sys.argv[1:]
    wants_recording = bool(cli_args) and cli_args[0] == "record"

    if not wants_recording:
        # Unknown or missing sub-command: show usage and do nothing else.
        script_name = Path(__file__).name
        print("Usage:")
        print(f" python {script_name} record [output.wav]")
        print()
        print("Records a 3-second voice sample for Qwen3-TTS cloning.")
        print(f"Default output: {DEFAULT_VOICE_SAMPLE}")
    else:
        # Optional second argument overrides the default sample path.
        destination = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
        record_voice_sample(destination)