diff --git a/.env.example b/.env.example index 88c5505..4f1d57c 100644 --- a/.env.example +++ b/.env.example @@ -16,9 +16,9 @@ OPENROUTER_MODEL=qwen/qwen-3-235b-a22b VOSK_MODEL_PATH=models/vosk-model-small-en-us WAKE_WORD=echo -# --- Qwen3-TTS (optional overrides) --- -# Available preset voices: Ryan, Serena, Diana, etc. -# Or set a path to a 3-second .wav sample for voice cloning +# --- Qwen3-TTS — Cloned Voice (required) --- +# Path to your 3-second .wav voice sample (16 kHz mono, 16-bit). +# Record one with: python tts.py record QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice -QWEN_TTS_VOICE=Ryan +QWEN_TTS_VOICE=voices/echo_voice.wav QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational. diff --git a/.gitignore b/.gitignore index 291cdf7..c88c732 100644 --- a/.gitignore +++ b/.gitignore @@ -21,6 +21,11 @@ models/ audio_output/ !audio_output/.gitkeep +# Voice samples (personal — keep local) +voices/ +!voices/.gitkeep +!voices/README.md + # Environment & secrets .env .env.local diff --git a/main.py b/main.py index a3ff55d..4f0e62f 100644 --- a/main.py +++ b/main.py @@ -78,10 +78,11 @@ class EchoAssistant: model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"), ) - # --- TTS --- + # --- TTS (Cloned Voice) --- + voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav") self.tts = TTSEngine( model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"), - voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"), + voice_sample=voice_path, instruction=os.environ.get( "QWEN_TTS_INSTRUCT", "Speak clearly with a warm, friendly tone. Be natural and conversational.", @@ -92,6 +93,13 @@ class EchoAssistant: self._processing = False # guard against concurrent commands self._shutdown_event = asyncio.Event() + if not Path(voice_path).exists(): + logger.warning( + "No voice sample at '%s' — TTS will not work until you record one. 
" + "Run: python tts.py record", + voice_path, + ) + logger.info("Echo assistant initialized (wake word: '%s')", wake_word) # ------------------------------------------------------------------ @@ -238,7 +246,10 @@ class EchoAssistant: async def start(self): """Start the Echo assistant.""" logger.info("=" * 60) + voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav") logger.info(" ECHO VOICE ASSISTANT") + logger.info(" Voice: %s (%s)", voice_path, + "✅" if Path(voice_path).exists() else "❌ missing") logger.info(" Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper()) logger.info(" Press Ctrl+C to quit") logger.info("=" * 60) diff --git a/tts.py b/tts.py index f5f01e6..fb3590a 100644 --- a/tts.py +++ b/tts.py @@ -3,40 +3,170 @@ tts.py — Text-To-Speech Module (Qwen3-TTS) Responsibilities: 1. Accept text (full or partial sentence) and generate a .wav audio file - using the Qwen3-TTS model running locally. - 2. Support voice selection (preset voices or custom voice cloning). - 3. Support instruction-based style control (e.g., energy, tone). - 4. Play the generated audio immediately. + using the Qwen3-TTS model running locally with a **cloned voice**. + 2. Validate the voice sample on init (must be a 2–5 second .wav file). + 3. Provide a built-in recorder so users can create their voice sample + directly from the assistant. + 4. Support instruction-based style control (e.g., energy, tone). + 5. Play the generated audio immediately. + +Cloned Voice Workflow: + - Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio + sample (recommended: 3 seconds, clean speech, no background noise). + - Place your sample at the path specified by QWEN_TTS_VOICE (default: + voices/echo_voice.wav). + - Or run `python tts.py` to record a 3-second sample interactively. 
Environment Variables: - QWEN_TTS_MODEL — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice) - QWEN_TTS_VOICE — preset voice name or path to 3s .wav sample + QWEN_TTS_MODEL — model name or local path + QWEN_TTS_VOICE — path to .wav voice sample (required for cloning) QWEN_TTS_INSTRUCT — default style instruction for speech generation Dependencies: - pip install qwen-tts torch soundfile pygame + pip install qwen-tts torch soundfile pygame pyaudio """ import asyncio +import array import logging import os -import tempfile +import struct +import wave from pathlib import Path +import pyaudio + logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Configuration # --------------------------------------------------------------------------- DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice" -DEFAULT_VOICE = "Ryan" # preset voice; alternatives: "Serena", "Diana", etc. -DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational." +DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav" +DEFAULT_INSTRUCTION = ( + "Speak clearly with a warm, friendly tone. Be natural and conversational." +) OUTPUT_DIR = Path("audio_output") +# Recording constants +REC_FORMAT = pyaudio.paInt16 +REC_CHANNELS = 1 +REC_RATE = 16000 +REC_CHUNK = 1024 +REC_DURATION = 3 # seconds — optimal for Qwen3 voice cloning + + +# --------------------------------------------------------------------------- +# Voice sample validation +# --------------------------------------------------------------------------- +def validate_voice_sample(path: str | Path) -> tuple[bool, str]: + """ + Check that a voice sample file exists and meets Qwen3-TTS requirements. + + Returns: + (is_valid, reason) + """ + p = Path(path) + + if not p.exists(): + return False, f"Voice sample not found at: {p.resolve()}. 
Record one with `python tts.py`" + + if p.suffix.lower() != ".wav": + return False, f"Voice sample must be a .wav file, got '{p.suffix}'" + + try: + with wave.open(str(p), "rb") as wf: + channels = wf.getnchannels() + sample_width = wf.getsampwidth() + framerate = wf.getframerate() + nframes = wf.getnframes() + duration = nframes / framerate + except Exception as exc: + return False, f"Could not read .wav file: {exc}" + + issues = [] + if channels != 1: + issues.append(f"expected mono (1 channel), got {channels}") + if framerate < 16000: + issues.append(f"sample rate {framerate} Hz is too low (min 16000)") + if duration < 2: + issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)") + elif duration > 5: + issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)") + if sample_width != 2: + issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit") + + if issues: + return False, f"Voice sample issues: {'; '.join(issues)}" + + return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit" + + +# --------------------------------------------------------------------------- +# Voice sample recorder +# --------------------------------------------------------------------------- +def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path: + """ + Record a short voice sample from the microphone for voice cloning. + + The user will hear a countdown and should speak naturally for the + full duration. The recording is saved as a 16 kHz mono 16-bit .wav file. + + Args: + output_path: Where to save the .wav file. + duration: Recording length in seconds (default 3). + + Returns: + Path to the saved .wav file. + """ + output_path = Path(output_path) + output_path.parent.mkdir(parents=True, exist_ok=True) + + print(f"\n🎙️ Recording a {duration}-second voice sample for Echo...") + print(f" Speak naturally in a clear voice. 
def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
    """
    Record a short voice sample from the microphone for voice cloning.

    Progress is printed to stdout while recording. The capture is written as
    a mono 16-bit .wav file at ``REC_RATE`` Hz, then checked with
    ``validate_voice_sample`` and the result is printed. The file is kept
    even if validation reports problems.

    Args:
        output_path: Where to save the .wav file (parent directories are
            created if needed).
        duration: Recording length in seconds (default ``REC_DURATION``).

    Returns:
        Path to the saved .wav file.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n🎙️ Recording a {duration}-second voice sample for Echo...")
    print(f" Speak naturally in a clear voice. No background noise.")
    print(f" Saving to: {output_path.resolve()}\n")

    # Capture exactly `duration` seconds worth of frames. Counting whole
    # chunks only would round the recording down (e.g. 2 s -> ~1.98 s),
    # which then fails the 2-second minimum enforced by validation.
    total_frames = int(REC_RATE * duration)
    captured = 0
    frames = []

    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
        )
        try:
            while captured < total_frames:
                # The last read may be shorter than a full chunk.
                want = min(REC_CHUNK, total_frames - captured)
                frames.append(stream.read(want, exception_on_overflow=False))

                secs_before = (total_frames - captured) // REC_RATE
                captured += want
                secs_after = (total_frames - captured) // REC_RATE
                # Announce each time the whole-second remainder drops.
                if secs_after != secs_before:
                    print(f" ... {secs_after}s remaining")
        finally:
            # Release the input stream even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        # Always shut PortAudio down, including when opening the stream fails.
        pa.terminate()

    # Write .wav
    with wave.open(str(output_path), "wb") as wf:
        wf.setnchannels(REC_CHANNELS)
        wf.setsampwidth(2)  # 16-bit, matches REC_FORMAT (paInt16)
        wf.setframerate(REC_RATE)
        wf.writeframes(b"".join(frames))

    ok, msg = validate_voice_sample(output_path)
    if ok:
        print(f"\n✅ {msg}")
    else:
        print(f"\n⚠️ {msg}")
    print(f" File saved: {output_path.resolve()}\n")

    return output_path
""" @@ -44,12 +174,12 @@ class TTSEngine: def __init__( self, model_name: str = DEFAULT_MODEL, - voice: str = DEFAULT_VOICE, + voice_sample: str = DEFAULT_VOICE_SAMPLE, instruction: str = DEFAULT_INSTRUCTION, output_dir: str | Path = OUTPUT_DIR, ): self.model_name = model_name - self.voice = voice + self.voice_sample = Path(voice_sample) self.instruction = instruction self.output_dir = Path(output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) @@ -58,15 +188,32 @@ class TTSEngine: self._processor = None self._lock = asyncio.Lock() # prevent concurrent generation + # Validate voice sample on init + self._validate_voice() + + def _validate_voice(self): + """Check the voice sample and log warnings if it's not usable.""" + ok, msg = validate_voice_sample(self.voice_sample) + if ok: + logger.info("🎤 Voice: %s", msg) + else: + logger.warning("🎤 Voice sample issue — %s", msg) + logger.warning( + " Record a sample with: python tts.py record" + ) + # ---- lazy model loading ---- def _ensure_loaded(self): """Load model and processor on first call (lazy init).""" if self._model is not None: return - logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name) + logger.info( + "Loading Qwen3-TTS model '%s' (this may take a moment)...", + self.model_name, + ) try: - from qwen_tts import QwenTTSProcessor, QwenTTSModel + from qwen_tts import QwenTTSModel, QwenTTSProcessor self._processor = QwenTTSProcessor() self._model = QwenTTSModel.from_pretrained(self.model_name) @@ -81,7 +228,7 @@ class TTSEngine: # ---- generation ---- async def generate(self, text: str, instruction: str | None = None) -> Path | None: """ - Generate speech audio from text and save as .wav. + Generate speech audio from text using the cloned voice. Args: text: The text to convert to speech. 
@@ -102,16 +249,24 @@ class TTSEngine: """Synchronous generation (runs in thread pool).""" self._ensure_loaded() + # Double-check voice sample before generating + if not self.voice_sample.exists(): + logger.error( + "Voice sample missing at '%s' — cannot generate speech", + self.voice_sample.resolve(), + ) + return None + output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav" try: - # Build voice reference: preset name or custom .wav path - voice_ref = self.voice - if Path(self.voice).exists(): - voice_ref = str(Path(self.voice).resolve()) + voice_ref = str(self.voice_sample.resolve()) - # Generate audio - logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice) + logger.info( + "Generating speech: '%s' (voice=%s)", + text[:60], + self.voice_sample.name, + ) audio_array = self._model.generate( processor=self._processor, text=text, @@ -119,12 +274,15 @@ class TTSEngine: instruction=instruction, ) - # Save to file import soundfile as sf sample_rate = self._processor.sampling_rate sf.write(str(output_path), audio_array, sample_rate) - logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate) + logger.info( + "Audio saved to %s (%.1fs)", + output_path, + len(audio_array) / sample_rate, + ) return output_path except Exception: @@ -158,7 +316,6 @@ class TTSEngine: pygame.mixer.music.load(str(wav_path)) pygame.mixer.music.play() - # Wait for playback to finish while pygame.mixer.music.get_busy(): await asyncio.sleep(0.05) @@ -170,12 +327,30 @@ class TTSEngine: logger.exception("Playback failed for %s", wav_path) return False - def set_voice(self, voice: str): - """Switch to a different voice preset or custom sample path.""" - self.voice = voice - logger.info("Voice set to: %s", voice) + def set_voice_sample(self, path: str): + """Switch to a different voice sample .wav file.""" + self.voice_sample = Path(path) + self._validate_voice() + logger.info("Voice sample set to: %s", self.voice_sample.resolve()) def 
if __name__ == "__main__":
    import sys

    # Command-line entry point: `record [output.wav]` captures a new voice
    # sample; any other invocation prints the usage text.
    cli_args = sys.argv[1:]

    if cli_args[:1] == ["record"]:
        # Optional second argument overrides the default sample location.
        target = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
        record_voice_sample(target)
    else:
        usage_lines = (
            "Usage:",
            f" python {Path(__file__).name} record [output.wav]",
            "",
            "Records a 3-second voice sample for Qwen3-TTS cloning.",
            f"Default output: {DEFAULT_VOICE_SAMPLE}",
        )
        print("\n".join(usage_lines))