feat: switch TTS to cloned voice mode with sample validation & recorder

- tts.py: voice_sample param replaces preset voice, add validate_voice_sample(), add record_voice_sample() with CLI (python tts.py record), validate .wav format/duration/channels on init
- main.py: warn at startup if voice sample missing, show voice status in banner
- .env.example: QWEN_TTS_VOICE now points to voices/echo_voice.wav
- .gitignore: voice samples gitignored (personal data)
- voices/README.md: instructions for recording & placing voice samples
2026-03-31 00:31:56 +00:00 · 2026-03-31 00:31:56 +00:00 · 19a283ec0f
commit 19a283ec0f
parent d6b64d04d1
4 changed files with 226 additions and 35 deletions
--- a/.env.example
+++ b/.env.example
@ -16,9 +16,9 @@ OPENROUTER_MODEL=qwen/qwen-3-235b-a22b
 VOSK_MODEL_PATH=models/vosk-model-small-en-us
 WAKE_WORD=echo

-# --- Qwen3-TTS (optional overrides) ---
-# Available preset voices: Ryan, Serena, Diana, etc.
-# Or set a path to a 3-second .wav sample for voice cloning
+# --- Qwen3-TTS — Cloned Voice (required) ---
+# Path to your 3-second .wav voice sample (16 kHz mono, 16-bit).
+# Record one with: python tts.py record
 QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
-QWEN_TTS_VOICE=Ryan
+QWEN_TTS_VOICE=voices/echo_voice.wav
 QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational.
--- a/.gitignore
+++ b/.gitignore
@ -21,6 +21,11 @@ models/
 audio_output/
 !audio_output/.gitkeep

+# Voice samples (personal — keep local)
+# Ignore the directory's contents rather than the directory itself: git never
+# descends into an excluded directory, so the `!` re-includes below would
+# otherwise have no effect.
+voices/*
+!voices/.gitkeep
+!voices/README.md
+
 # Environment & secrets
 .env
 .env.local
--- a/main.py
+++ b/main.py
@ -78,10 +78,11 @@ class EchoAssistant:
            model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
        )

-        # --- TTS ---
+        # --- TTS (Cloned Voice) ---
+        voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
        self.tts = TTSEngine(
            model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
-            voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"),
+            voice_sample=voice_path,
            instruction=os.environ.get(
                "QWEN_TTS_INSTRUCT",
                "Speak clearly with a warm, friendly tone. Be natural and conversational.",
@ -92,6 +93,13 @@ class EchoAssistant:
        self._processing = False  # guard against concurrent commands
        self._shutdown_event = asyncio.Event()

+        if not Path(voice_path).exists():
+            logger.warning(
+                "No voice sample at '%s' — TTS will not work until you record one. "
+                "Run: python tts.py record",
+                voice_path,
+            )
+
        logger.info("Echo assistant initialized (wake word: '%s')", wake_word)

    # ------------------------------------------------------------------
@ -238,7 +246,10 @@ class EchoAssistant:
    async def start(self):
        """Start the Echo assistant."""
        logger.info("=" * 60)
+        voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
        logger.info("  ECHO VOICE ASSISTANT")
+        logger.info("  Voice: %s (%s)", voice_path,
+                     "✅" if Path(voice_path).exists() else "❌ missing")
        logger.info("  Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper())
        logger.info("  Press Ctrl+C to quit")
        logger.info("=" * 60)
--- a/tts.py
+++ b/tts.py
@ -3,40 +3,170 @@ tts.py — Text-To-Speech Module (Qwen3-TTS)

 Responsibilities:
  1. Accept text (full or partial sentence) and generate a .wav audio file
-     using the Qwen3-TTS model running locally.
-  2. Support voice selection (preset voices or custom voice cloning).
-  3. Support instruction-based style control (e.g., energy, tone).
-  4. Play the generated audio immediately.
+     using the Qwen3-TTS model running locally with a **cloned voice**.
+  2. Validate the voice sample on init (must be a 2–5 second .wav file).
+  3. Provide a built-in recorder so users can create their voice sample
+     directly from the assistant.
+  4. Support instruction-based style control (e.g., energy, tone).
+  5. Play the generated audio immediately.
+
+Cloned Voice Workflow:
+  - Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio
+    sample (recommended: 3 seconds, clean speech, no background noise).
+  - Place your sample at the path specified by QWEN_TTS_VOICE (default:
+    voices/echo_voice.wav).
+  - Or run `python tts.py record` to record a 3-second sample interactively.

 Environment Variables:
-  QWEN_TTS_MODEL    — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
-  QWEN_TTS_VOICE    — preset voice name or path to 3s .wav sample
+  QWEN_TTS_MODEL     — model name or local path
+  QWEN_TTS_VOICE     — path to .wav voice sample (required for cloning)
  QWEN_TTS_INSTRUCT  — default style instruction for speech generation

 Dependencies:
-  pip install qwen-tts torch soundfile pygame
+  pip install qwen-tts torch soundfile pygame pyaudio
 """

 import asyncio
+import array
 import logging
 import os
-import tempfile
+import struct
+import wave
 from pathlib import Path

+import pyaudio
+
 logger = logging.getLogger(__name__)

 # ---------------------------------------------------------------------------
 # Configuration
 # ---------------------------------------------------------------------------
 DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
-DEFAULT_VOICE = "Ryan"  # preset voice; alternatives: "Serena", "Diana", etc.
-DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational."
+DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav"
+DEFAULT_INSTRUCTION = (
+    "Speak clearly with a warm, friendly tone. Be natural and conversational."
+)
 OUTPUT_DIR = Path("audio_output")

+# Recording constants — parameters used when capturing a voice sample from
+# the microphone.
+REC_FORMAT = pyaudio.paInt16  # 16-bit signed integer samples
+REC_CHANNELS = 1  # mono
+REC_RATE = 16000  # sample rate in Hz
+REC_CHUNK = 1024  # frames per buffer / per read from the input stream
+REC_DURATION = 3  # seconds — optimal for Qwen3 voice cloning
+
+
+# ---------------------------------------------------------------------------
+# Voice sample validation
+# ---------------------------------------------------------------------------
+def validate_voice_sample(path: str | Path) -> tuple[bool, str]:
+    """
+    Check that a voice sample file exists and meets Qwen3-TTS requirements.
+
+    A usable sample is a readable .wav file that is mono, 16-bit, sampled at
+    16 kHz or higher, and between 2 and 5 seconds long. Every failed check is
+    reported, not just the first one. This function never raises for a bad
+    file; problems are described in the returned reason.
+
+    Args:
+        path: Location of the candidate voice sample.
+
+    Returns:
+        (is_valid, reason) — ``reason`` summarises the sample when it is
+        valid, or describes what is wrong with it otherwise.
+    """
+    p = Path(path)
+
+    if not p.exists():
+        # The recorder only runs with the explicit `record` sub-command;
+        # plain `python tts.py` just prints usage.
+        return (
+            False,
+            f"Voice sample not found at: {p.resolve()}. "
+            "Record one with `python tts.py record`",
+        )
+
+    if p.suffix.lower() != ".wav":
+        return False, f"Voice sample must be a .wav file, got '{p.suffix}'"
+
+    try:
+        with wave.open(str(p), "rb") as wf:
+            channels = wf.getnchannels()
+            sample_width = wf.getsampwidth()
+            framerate = wf.getframerate()
+            nframes = wf.getnframes()
+    except Exception as exc:
+        # Best-effort validation: any failure to parse the file is reported
+        # to the caller instead of propagating.
+        return False, f"Could not read .wav file: {exc}"
+
+    # A header with a zero sample rate must not crash the check; treat the
+    # duration as 0 so the rate and length checks below both report it.
+    duration = nframes / framerate if framerate > 0 else 0.0
+
+    issues = []
+    if channels != 1:
+        issues.append(f"expected mono (1 channel), got {channels}")
+    if framerate < 16000:
+        issues.append(f"sample rate {framerate} Hz is too low (min 16000)")
+    if duration < 2:
+        issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)")
+    elif duration > 5:
+        issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)")
+    if sample_width != 2:
+        issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit")
+
+    if issues:
+        return False, f"Voice sample issues: {'; '.join(issues)}"
+
+    return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit"
+
+
+# ---------------------------------------------------------------------------
+# Voice sample recorder
+# ---------------------------------------------------------------------------
+def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
+    """
+    Record a short voice sample from the microphone for voice cloning.
+
+    A countdown is printed to stdout while recording; the user should speak
+    naturally for the full duration. The recording is saved as a 16 kHz mono
+    16-bit .wav file and then checked with validate_voice_sample(); the
+    result of that check is printed, not raised.
+
+    Args:
+        output_path: Where to save the .wav file. Missing parent directories
+            are created.
+        duration: Recording length in seconds (default 3).
+
+    Returns:
+        Path to the saved .wav file.
+    """
+    output_path = Path(output_path)
+    output_path.parent.mkdir(parents=True, exist_ok=True)
+
+    print(f"\n🎙️  Recording a {duration}-second voice sample for Echo...")
+    print("   Speak naturally in a clear voice. No background noise.")
+    print(f"   Saving to: {output_path.resolve()}\n")
+
+    # Capture exactly the requested number of frames. Counting whole chunks
+    # only would truncate the recording (e.g. a 2-second request would come
+    # out just under 2 s and then fail validation).
+    frames_left = int(REC_RATE * duration)
+    seconds_shown = (frames_left + REC_RATE - 1) // REC_RATE  # ceil
+
+    frames = []
+    pa = pyaudio.PyAudio()
+    try:
+        stream = pa.open(
+            format=REC_FORMAT,
+            channels=REC_CHANNELS,
+            rate=REC_RATE,
+            input=True,
+            frames_per_buffer=REC_CHUNK,
+        )
+        try:
+            while frames_left > 0:
+                # The final read may be shorter than a full chunk.
+                to_read = min(REC_CHUNK, frames_left)
+                frames.append(stream.read(to_read, exception_on_overflow=False))
+                frames_left -= to_read
+
+                # Report each whole second once, rounded up so the countdown
+                # never claims less time than is actually left.
+                seconds_left = (frames_left + REC_RATE - 1) // REC_RATE
+                if 0 < seconds_left < seconds_shown:
+                    print(f"   ... {seconds_left}s remaining")
+                    seconds_shown = seconds_left
+        finally:
+            # Release the input stream even if a read fails.
+            stream.stop_stream()
+            stream.close()
+    finally:
+        # Always release PortAudio, including when the stream fails to open.
+        pa.terminate()
+
+    # Write .wav
+    with wave.open(str(output_path), "wb") as wf:
+        wf.setnchannels(REC_CHANNELS)
+        wf.setsampwidth(2)  # 16-bit
+        wf.setframerate(REC_RATE)
+        wf.writeframes(b"".join(frames))
+
+    ok, msg = validate_voice_sample(output_path)
+    if ok:
+        print(f"\n✅ {msg}")
+    else:
+        print(f"\n⚠️  {msg}")
+    print(f"   File saved: {output_path.resolve()}\n")
+
+    return output_path
+

 class TTSEngine:
    """
-    Wrapper around Qwen3-TTS for generating speech from text.
+    Wrapper around Qwen3-TTS for generating speech with a cloned voice.

    The engine lazily loads the model on first use to avoid slow startup.
    """
@ -44,12 +174,12 @@ class TTSEngine:
    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
-        voice: str = DEFAULT_VOICE,
+        voice_sample: str = DEFAULT_VOICE_SAMPLE,
        instruction: str = DEFAULT_INSTRUCTION,
        output_dir: str | Path = OUTPUT_DIR,
    ):
        self.model_name = model_name
-        self.voice = voice
+        self.voice_sample = Path(voice_sample)
        self.instruction = instruction
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
@ -58,15 +188,32 @@ class TTSEngine:
        self._processor = None
        self._lock = asyncio.Lock()  # prevent concurrent generation

+        # Validate voice sample on init
+        self._validate_voice()
+
+    def _validate_voice(self):
+        """Validate the configured voice sample and log the outcome.
+
+        Never raises for an unusable sample: a problem is logged as a warning
+        together with a hint on how to record a new sample.
+        """
+        is_usable, detail = validate_voice_sample(self.voice_sample)
+        if not is_usable:
+            logger.warning("🎤 Voice sample issue — %s", detail)
+            logger.warning("   Record a sample with: python tts.py record")
+            return
+        logger.info("🎤 Voice: %s", detail)
+
    # ---- lazy model loading ----
    def _ensure_loaded(self):
        """Load model and processor on first call (lazy init)."""
        if self._model is not None:
            return

-        logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name)
+        logger.info(
+            "Loading Qwen3-TTS model '%s' (this may take a moment)...",
+            self.model_name,
+        )
        try:
-            from qwen_tts import QwenTTSProcessor, QwenTTSModel
+            from qwen_tts import QwenTTSModel, QwenTTSProcessor

            self._processor = QwenTTSProcessor()
            self._model = QwenTTSModel.from_pretrained(self.model_name)
@ -81,7 +228,7 @@ class TTSEngine:
    # ---- generation ----
    async def generate(self, text: str, instruction: str | None = None) -> Path | None:
        """
-        Generate speech audio from text and save as .wav.
+        Generate speech audio from text using the cloned voice.

        Args:
            text: The text to convert to speech.
@ -102,16 +249,24 @@ class TTSEngine:
        """Synchronous generation (runs in thread pool)."""
        self._ensure_loaded()

+        # Double-check voice sample before generating
+        if not self.voice_sample.exists():
+            logger.error(
+                "Voice sample missing at '%s' — cannot generate speech",
+                self.voice_sample.resolve(),
+            )
+            return None
+
        output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"

        try:
-            # Build voice reference: preset name or custom .wav path
-            voice_ref = self.voice
-            if Path(self.voice).exists():
-                voice_ref = str(Path(self.voice).resolve())
+            voice_ref = str(self.voice_sample.resolve())

-            # Generate audio
-            logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice)
+            logger.info(
+                "Generating speech: '%s' (voice=%s)",
+                text[:60],
+                self.voice_sample.name,
+            )
            audio_array = self._model.generate(
                processor=self._processor,
                text=text,
@ -119,12 +274,15 @@ class TTSEngine:
                instruction=instruction,
            )

-            # Save to file
            import soundfile as sf

            sample_rate = self._processor.sampling_rate
            sf.write(str(output_path), audio_array, sample_rate)
-            logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate)
+            logger.info(
+                "Audio saved to %s (%.1fs)",
+                output_path,
+                len(audio_array) / sample_rate,
+            )
            return output_path

        except Exception:
@ -158,7 +316,6 @@ class TTSEngine:
            pygame.mixer.music.load(str(wav_path))
            pygame.mixer.music.play()

-            # Wait for playback to finish
            while pygame.mixer.music.get_busy():
                await asyncio.sleep(0.05)

@ -170,12 +327,30 @@ class TTSEngine:
            logger.exception("Playback failed for %s", wav_path)
            return False

-    def set_voice(self, voice: str):
-        """Switch to a different voice preset or custom sample path."""
-        self.voice = voice
-        logger.info("Voice set to: %s", voice)
+    def set_voice_sample(self, path: str):
+        """Point the engine at another .wav voice sample and re-validate it."""
+        new_sample = Path(path)
+        self.voice_sample = new_sample
+        # Validation only logs; the new sample is kept even if it has issues.
+        self._validate_voice()
+        logger.info("Voice sample set to: %s", new_sample.resolve())

    def set_instruction(self, instruction: str):
        """Update the default style instruction."""
        self.instruction = instruction
        logger.info("TTS instruction updated: %s", instruction)
+
+
+# ---------------------------------------------------------------------------
+# CLI — record a voice sample directly
+# ---------------------------------------------------------------------------
+if __name__ == "__main__":
+    import sys
+
+    # Everything after the script name: an optional sub-command followed by
+    # an optional output path.
+    cli_args = sys.argv[1:]
+
+    if cli_args and cli_args[0] == "record":
+        target = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
+        record_voice_sample(target)
+    else:
+        # Any other invocation just prints usage.
+        usage_lines = (
+            "Usage:",
+            f"  python {Path(__file__).name} record [output.wav]",
+            "",
+            "Records a 3-second voice sample for Qwen3-TTS cloning.",
+            f"Default output: {DEFAULT_VOICE_SAMPLE}",
+        )
+        for usage_line in usage_lines:
+            print(usage_line)