feat: switch TTS to cloned voice mode with sample validation & recorder

- tts.py: voice_sample param replaces preset voice; add validate_voice_sample() and record_voice_sample() with a CLI (python tts.py record); validate .wav format/duration/channels on init
- main.py: warn at startup if voice sample missing, show voice status in banner
- .env.example: QWEN_TTS_VOICE now points to voices/echo_voice.wav
- .gitignore: voice samples gitignored (personal data)
- voices/README.md: instructions for recording & placing voice samples
2026-03-31 00:31:56 +00:00 · 2026-03-31 00:31:56 +00:00 · 19a283ec0f
commit 19a283ec0f
parent d6b64d04d1
4 changed files with 226 additions and 35 deletions
--- a/.env.example
+++ b/.env.example
@ -16,9 +16,9 @@ OPENROUTER_MODEL=qwen/qwen-3-235b-a22b
 VOSK_MODEL_PATH=models/vosk-model-small-en-us
 WAKE_WORD=echo
-# --- Qwen3-TTS (optional overrides) ---
+# --- Qwen3-TTS — Cloned Voice (required) ---
-# Available preset voices: Ryan, Serena, Diana, etc.
+# Path to your 3-second .wav voice sample (16 kHz mono, 16-bit).
-# Or set a path to a 3-second .wav sample for voice cloning
+# Record one with: python tts.py record
 QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
-QWEN_TTS_VOICE=Ryan
+QWEN_TTS_VOICE=voices/echo_voice.wav
 QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational.
--- a/.gitignore
+++ b/.gitignore
@ -21,6 +21,11 @@ models/
 audio_output/
 !audio_output/.gitkeep
 # Voice samples (personal — keep local)
 # Ignore the directory's contents rather than the directory itself: git
 # cannot re-include a file whose parent directory is excluded, so the
 # negated patterns below only work with `voices/*`.
 voices/*
 !voices/.gitkeep
 !voices/README.md
 # Environment & secrets
 .env
 .env.local
--- a/main.py
+++ b/main.py
@ -78,10 +78,11 @@ class EchoAssistant:
            model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
        )
-        # --- TTS ---
+        # --- TTS (Cloned Voice) ---
        voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
        self.tts = TTSEngine(
            model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
-            voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"),
+            voice_sample=voice_path,
            instruction=os.environ.get(
                "QWEN_TTS_INSTRUCT",
                "Speak clearly with a warm, friendly tone. Be natural and conversational.",
@ -92,6 +93,13 @@ class EchoAssistant:
        self._processing = False  # guard against concurrent commands
        self._shutdown_event = asyncio.Event()
        if not Path(voice_path).exists():
            logger.warning(
                "No voice sample at '%s' — TTS will not work until you record one. "
                "Run: python tts.py record",
                voice_path,
            )
        logger.info("Echo assistant initialized (wake word: '%s')", wake_word)
    # ------------------------------------------------------------------
@ -238,7 +246,10 @@ class EchoAssistant:
    async def start(self):
        """Start the Echo assistant."""
        logger.info("=" * 60)
        voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
        logger.info("  ECHO VOICE ASSISTANT")
        logger.info("  Voice: %s (%s)", voice_path,
                     "✅" if Path(voice_path).exists() else "❌ missing")
        logger.info("  Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper())
        logger.info("  Press Ctrl+C to quit")
        logger.info("=" * 60)
--- a/tts.py
+++ b/tts.py
@ -3,40 +3,170 @@ tts.py — Text-To-Speech Module (Qwen3-TTS)
 Responsibilities:
  1. Accept text (full or partial sentence) and generate a .wav audio file
-     using the Qwen3-TTS model running locally.
+     using the Qwen3-TTS model running locally with a **cloned voice**.
-  2. Support voice selection (preset voices or custom voice cloning).
+  2. Validate the voice sample on init (must be a 2–5 second .wav file).
-  3. Support instruction-based style control (e.g., energy, tone).
+  3. Provide a built-in recorder so users can create their voice sample
-  4. Play the generated audio immediately.
+     directly from the assistant.
  4. Support instruction-based style control (e.g., energy, tone).
  5. Play the generated audio immediately.
 Cloned Voice Workflow:
  - Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio
    sample (recommended: 3 seconds, clean speech, no background noise).
  - Place your sample at the path specified by QWEN_TTS_VOICE (default:
    voices/echo_voice.wav).
  - Or run `python tts.py record` to record a 3-second sample interactively.
 Environment Variables:
-  QWEN_TTS_MODEL    — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
+  QWEN_TTS_MODEL     — model name or local path
-  QWEN_TTS_VOICE    — preset voice name or path to 3s .wav sample
+  QWEN_TTS_VOICE     — path to .wav voice sample (required for cloning)
  QWEN_TTS_INSTRUCT  — default style instruction for speech generation
 Dependencies:
-  pip install qwen-tts torch soundfile pygame
+  pip install qwen-tts torch soundfile pygame pyaudio
 """
 import asyncio
 import array
 import logging
 import os
-import tempfile
+import struct
 import wave
 from pathlib import Path
 import pyaudio
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
 DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
-DEFAULT_VOICE = "Ryan"  # preset voice; alternatives: "Serena", "Diana", etc.
+DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav"
-DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational."
+DEFAULT_INSTRUCTION = (
    "Speak clearly with a warm, friendly tone. Be natural and conversational."
 )
 OUTPUT_DIR = Path("audio_output")
 # Recording constants (used by record_voice_sample when opening the
 # microphone stream and writing the .wav header)
 REC_FORMAT = pyaudio.paInt16  # PyAudio sample format for the input stream
 REC_CHANNELS = 1  # channel count for the stream and the saved .wav
 REC_RATE = 16000  # sample rate in Hz for the stream and the saved .wav
 REC_CHUNK = 1024  # frames per buffer / frames read per stream.read() call
 REC_DURATION = 3  # seconds — optimal for Qwen3 voice cloning
 # ---------------------------------------------------------------------------
 # Voice sample validation
 # ---------------------------------------------------------------------------
 def validate_voice_sample(path: str | Path) -> tuple[bool, str]:
    """
    Check that a voice sample file exists and meets Qwen3-TTS requirements.
    Returns:
        (is_valid, reason)
    """
    p = Path(path)
    if not p.exists():
        return False, f"Voice sample not found at: {p.resolve()}. Record one with `python tts.py`"
    if p.suffix.lower() != ".wav":
        return False, f"Voice sample must be a .wav file, got '{p.suffix}'"
    try:
        with wave.open(str(p), "rb") as wf:
            channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            framerate = wf.getframerate()
            nframes = wf.getnframes()
            duration = nframes / framerate
    except Exception as exc:
        return False, f"Could not read .wav file: {exc}"
    issues = []
    if channels != 1:
        issues.append(f"expected mono (1 channel), got {channels}")
    if framerate < 16000:
        issues.append(f"sample rate {framerate} Hz is too low (min 16000)")
    if duration < 2:
        issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)")
    elif duration > 5:
        issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)")
    if sample_width != 2:
        issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit")
    if issues:
        return False, f"Voice sample issues: {'; '.join(issues)}"
    return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit"
 # ---------------------------------------------------------------------------
 # Voice sample recorder
 # ---------------------------------------------------------------------------
 def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
    """
    Record a short voice sample from the microphone for voice cloning.

    Progress is printed to the console once per elapsed second. The
    recording is saved as a 16 kHz mono 16-bit .wav file, then checked
    with validate_voice_sample(); the check result is printed and the
    file is kept either way.

    Args:
        output_path: Where to save the .wav file. Missing parent
            directories are created.
        duration: Recording length in seconds (default 3).

    Returns:
        Path to the saved .wav file.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    print(f"\n🎙️  Recording a {duration}-second voice sample for Echo...")
    print("   Speak naturally in a clear voice. No background noise.")
    print(f"   Saving to: {output_path.resolve()}\n")
    # Capture exactly duration * REC_RATE frames. Counting whole chunks
    # only would truncate the recording (e.g. 1.98s for a 2s request),
    # which then fails the 2-second minimum in validate_voice_sample().
    total_frames = int(REC_RATE * duration)
    frames = []
    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
        )
        try:
            captured = 0
            while captured < total_frames:
                # The final read may be shorter than a full chunk.
                want = min(REC_CHUNK, total_frames - captured)
                frames.append(stream.read(want, exception_on_overflow=False))
                seconds_before = captured // REC_RATE
                captured += want
                seconds_done = captured // REC_RATE
                # Report once each time a whole second has elapsed, but
                # not at the very end (nothing remains to count down).
                if seconds_done != seconds_before and captured < total_frames:
                    print(f"   ... {duration - seconds_done}s remaining")
        finally:
            # Release the audio device even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        pa.terminate()
    # Write .wav
    with wave.open(str(output_path), "wb") as wf:
        wf.setnchannels(REC_CHANNELS)
        wf.setsampwidth(2)  # 16-bit, matching REC_FORMAT (paInt16)
        wf.setframerate(REC_RATE)
        wf.writeframes(b"".join(frames))
    ok, msg = validate_voice_sample(output_path)
    if ok:
        print(f"\n✅ {msg}")
    else:
        print(f"\n⚠️  {msg}")
    print(f"   File saved: {output_path.resolve()}\n")
    return output_path
 class TTSEngine:
    """
-    Wrapper around Qwen3-TTS for generating speech from text.
+    Wrapper around Qwen3-TTS for generating speech with a cloned voice.
    The engine lazily loads the model on first use to avoid slow startup.
    """
@ -44,12 +174,12 @@ class TTSEngine:
    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
-        voice: str = DEFAULT_VOICE,
+        voice_sample: str = DEFAULT_VOICE_SAMPLE,
        instruction: str = DEFAULT_INSTRUCTION,
        output_dir: str | Path = OUTPUT_DIR,
    ):
        self.model_name = model_name
-        self.voice = voice
+        self.voice_sample = Path(voice_sample)
        self.instruction = instruction
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
@ -58,15 +188,32 @@ class TTSEngine:
        self._processor = None
        self._lock = asyncio.Lock()  # prevent concurrent generation
        # Validate voice sample on init
        self._validate_voice()
    def _validate_voice(self):
        """Run the voice-sample check and log the outcome.

        Logs the summary at INFO level when the sample passes; otherwise
        logs the problem and a recording hint at WARNING level. Never
        raises on an invalid sample.
        """
        is_valid, detail = validate_voice_sample(self.voice_sample)
        if not is_valid:
            logger.warning("🎤 Voice sample issue — %s", detail)
            logger.warning("   Record a sample with: python tts.py record")
            return
        logger.info("🎤 Voice: %s", detail)
    # ---- lazy model loading ----
    def _ensure_loaded(self):
        """Load model and processor on first call (lazy init)."""
        if self._model is not None:
            return
-        logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name)
+        logger.info(
            "Loading Qwen3-TTS model '%s' (this may take a moment)...",
            self.model_name,
        )
        try:
-            from qwen_tts import QwenTTSProcessor, QwenTTSModel
+            from qwen_tts import QwenTTSModel, QwenTTSProcessor
            self._processor = QwenTTSProcessor()
            self._model = QwenTTSModel.from_pretrained(self.model_name)
@ -81,7 +228,7 @@ class TTSEngine:
    # ---- generation ----
    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
        """
-        Generate speech audio from text and save as .wav.
+        Generate speech audio from text using the cloned voice.
        Args:
            text: The text to convert to speech.
@ -102,16 +249,24 @@ class TTSEngine:
        """Synchronous generation (runs in thread pool)."""
        self._ensure_loaded()
        # Double-check voice sample before generating
        if not self.voice_sample.exists():
            logger.error(
                "Voice sample missing at '%s' — cannot generate speech",
                self.voice_sample.resolve(),
            )
            return None
        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
        try:
-            # Build voice reference: preset name or custom .wav path
+            voice_ref = str(self.voice_sample.resolve())
            voice_ref = self.voice
            if Path(self.voice).exists():
                voice_ref = str(Path(self.voice).resolve())
-            # Generate audio
+            logger.info(
-            logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice)
+                "Generating speech: '%s' (voice=%s)",
                text[:60],
                self.voice_sample.name,
            )
            audio_array = self._model.generate(
                processor=self._processor,
                text=text,
@ -119,12 +274,15 @@ class TTSEngine:
                instruction=instruction,
            )
            # Save to file
            import soundfile as sf
            sample_rate = self._processor.sampling_rate
            sf.write(str(output_path), audio_array, sample_rate)
-            logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate)
+            logger.info(
                "Audio saved to %s (%.1fs)",
                output_path,
                len(audio_array) / sample_rate,
            )
            return output_path
        except Exception:
@ -158,7 +316,6 @@ class TTSEngine:
            pygame.mixer.music.load(str(wav_path))
            pygame.mixer.music.play()
            # Wait for playback to finish
            while pygame.mixer.music.get_busy():
                await asyncio.sleep(0.05)
@ -170,12 +327,30 @@ class TTSEngine:
            logger.exception("Playback failed for %s", wav_path)
            return False
-    def set_voice(self, voice: str):
+    def set_voice_sample(self, path: str):
-        """Switch to a different voice preset or custom sample path."""
+        """Switch to a different voice sample .wav file."""
-        self.voice = voice
+        self.voice_sample = Path(path)
-        logger.info("Voice set to: %s", voice)
+        self._validate_voice()
        logger.info("Voice sample set to: %s", self.voice_sample.resolve())
    def set_instruction(self, instruction: str) -> None:
        """
        Update the default style instruction.

        Args:
            instruction: New style instruction text; it is stored on the
                engine and written to the log.
        """
        self.instruction = instruction
        logger.info("TTS instruction updated: %s", instruction)
 # ---------------------------------------------------------------------------
 # CLI — record a voice sample directly
 # ---------------------------------------------------------------------------
 if __name__ == "__main__":
    import sys

    # Everything after the script name: [command, optional output path].
    cli_args = sys.argv[1:]
    if cli_args and cli_args[0] == "record":
        # Fall back to the default sample path when none is given.
        target = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
        record_voice_sample(target)
    else:
        # No command, or an unrecognised one: show usage.
        print("Usage:")
        print(f"  python {Path(__file__).name} record [output.wav]")
        print()
        print("Records a 3-second voice sample for Qwen3-TTS cloning.")
        print(f"Default output: {DEFAULT_VOICE_SAMPLE}")