feat: switch TTS to cloned voice mode with sample validation & recorder
- tts.py: voice_sample param replaces preset voice, add validate_voice_sample(), add record_voice_sample() with CLI (python tts.py record), validate .wav format/duration/channels on init - main.py: warn at startup if voice sample missing, show voice status in banner - .env.example: QWEN_TTS_VOICE now points to voices/echo_voice.wav - .gitignore: voice samples gitignored (personal data) - voices/README.md: instructions for recording & placing voice samples
This commit is contained in:
parent
d6b64d04d1
commit
19a283ec0f
@ -16,9 +16,9 @@ OPENROUTER_MODEL=qwen/qwen-3-235b-a22b
|
||||
VOSK_MODEL_PATH=models/vosk-model-small-en-us
|
||||
WAKE_WORD=echo
|
||||
|
||||
# --- Qwen3-TTS (optional overrides) ---
|
||||
# Available preset voices: Ryan, Serena, Diana, etc.
|
||||
# Or set a path to a 3-second .wav sample for voice cloning
|
||||
# --- Qwen3-TTS — Cloned Voice (required) ---
|
||||
# Path to your 3-second .wav voice sample (16 kHz mono, 16-bit).
|
||||
# Record one with: python tts.py record
|
||||
QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
|
||||
QWEN_TTS_VOICE=Ryan
|
||||
QWEN_TTS_VOICE=voices/echo_voice.wav
|
||||
QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational.
|
||||
|
||||
5
.gitignore
vendored
5
.gitignore
vendored
@ -21,6 +21,11 @@ models/
|
||||
audio_output/
|
||||
!audio_output/.gitkeep
|
||||
|
||||
# Voice samples (personal — keep local)
|
||||
voices/
|
||||
!voices/.gitkeep
|
||||
!voices/README.md
|
||||
|
||||
# Environment & secrets
|
||||
.env
|
||||
.env.local
|
||||
|
||||
15
main.py
15
main.py
@ -78,10 +78,11 @@ class EchoAssistant:
|
||||
model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
|
||||
)
|
||||
|
||||
# --- TTS ---
|
||||
# --- TTS (Cloned Voice) ---
|
||||
voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
|
||||
self.tts = TTSEngine(
|
||||
model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
|
||||
voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"),
|
||||
voice_sample=voice_path,
|
||||
instruction=os.environ.get(
|
||||
"QWEN_TTS_INSTRUCT",
|
||||
"Speak clearly with a warm, friendly tone. Be natural and conversational.",
|
||||
@ -92,6 +93,13 @@ class EchoAssistant:
|
||||
self._processing = False # guard against concurrent commands
|
||||
self._shutdown_event = asyncio.Event()
|
||||
|
||||
if not Path(voice_path).exists():
|
||||
logger.warning(
|
||||
"No voice sample at '%s' — TTS will not work until you record one. "
|
||||
"Run: python tts.py record",
|
||||
voice_path,
|
||||
)
|
||||
|
||||
logger.info("Echo assistant initialized (wake word: '%s')", wake_word)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
@ -238,7 +246,10 @@ class EchoAssistant:
|
||||
async def start(self):
|
||||
"""Start the Echo assistant."""
|
||||
logger.info("=" * 60)
|
||||
voice_path = os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")
|
||||
logger.info(" ECHO VOICE ASSISTANT")
|
||||
logger.info(" Voice: %s (%s)", voice_path,
|
||||
"✅" if Path(voice_path).exists() else "❌ missing")
|
||||
logger.info(" Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper())
|
||||
logger.info(" Press Ctrl+C to quit")
|
||||
logger.info("=" * 60)
|
||||
|
||||
233
tts.py
233
tts.py
@ -3,40 +3,170 @@ tts.py — Text-To-Speech Module (Qwen3-TTS)
|
||||
|
||||
Responsibilities:
|
||||
1. Accept text (full or partial sentence) and generate a .wav audio file
|
||||
using the Qwen3-TTS model running locally.
|
||||
2. Support voice selection (preset voices or custom voice cloning).
|
||||
3. Support instruction-based style control (e.g., energy, tone).
|
||||
4. Play the generated audio immediately.
|
||||
using the Qwen3-TTS model running locally with a **cloned voice**.
|
||||
2. Validate the voice sample on init (must be a 2–5 second .wav file).
|
||||
3. Provide a built-in recorder so users can create their voice sample
|
||||
directly from the assistant.
|
||||
4. Support instruction-based style control (e.g., energy, tone).
|
||||
5. Play the generated audio immediately.
|
||||
|
||||
Cloned Voice Workflow:
|
||||
- Qwen3-TTS-12Hz-1.7B-CustomVoice can clone a voice from a short audio
|
||||
sample (recommended: 3 seconds, clean speech, no background noise).
|
||||
- Place your sample at the path specified by QWEN_TTS_VOICE (default:
|
||||
voices/echo_voice.wav).
|
||||
- Or run `python tts.py record` to record a 3-second sample interactively.
|
||||
|
||||
Environment Variables:
|
||||
QWEN_TTS_MODEL — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
|
||||
QWEN_TTS_VOICE — preset voice name or path to 3s .wav sample
|
||||
QWEN_TTS_MODEL — model name or local path
|
||||
QWEN_TTS_VOICE — path to .wav voice sample (required for cloning)
|
||||
QWEN_TTS_INSTRUCT — default style instruction for speech generation
|
||||
|
||||
Dependencies:
|
||||
pip install qwen-tts torch soundfile pygame
|
||||
pip install qwen-tts torch soundfile pygame pyaudio
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import array
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
import struct
|
||||
import wave
|
||||
from pathlib import Path
|
||||
|
||||
import pyaudio
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
|
||||
DEFAULT_VOICE = "Ryan" # preset voice; alternatives: "Serena", "Diana", etc.
|
||||
DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational."
|
||||
DEFAULT_VOICE_SAMPLE = "voices/echo_voice.wav"
|
||||
DEFAULT_INSTRUCTION = (
|
||||
"Speak clearly with a warm, friendly tone. Be natural and conversational."
|
||||
)
|
||||
OUTPUT_DIR = Path("audio_output")
|
||||
|
||||
# Recording constants
|
||||
REC_FORMAT = pyaudio.paInt16
|
||||
REC_CHANNELS = 1
|
||||
REC_RATE = 16000
|
||||
REC_CHUNK = 1024
|
||||
REC_DURATION = 3 # seconds — optimal for Qwen3 voice cloning
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Voice sample validation
|
||||
# ---------------------------------------------------------------------------
|
||||
def validate_voice_sample(path: str | Path) -> tuple[bool, str]:
|
||||
"""
|
||||
Check that a voice sample file exists and meets Qwen3-TTS requirements.
|
||||
|
||||
Returns:
|
||||
(is_valid, reason)
|
||||
"""
|
||||
p = Path(path)
|
||||
|
||||
if not p.exists():
|
||||
return False, f"Voice sample not found at: {p.resolve()}. Record one with `python tts.py`"
|
||||
|
||||
if p.suffix.lower() != ".wav":
|
||||
return False, f"Voice sample must be a .wav file, got '{p.suffix}'"
|
||||
|
||||
try:
|
||||
with wave.open(str(p), "rb") as wf:
|
||||
channels = wf.getnchannels()
|
||||
sample_width = wf.getsampwidth()
|
||||
framerate = wf.getframerate()
|
||||
nframes = wf.getnframes()
|
||||
duration = nframes / framerate
|
||||
except Exception as exc:
|
||||
return False, f"Could not read .wav file: {exc}"
|
||||
|
||||
issues = []
|
||||
if channels != 1:
|
||||
issues.append(f"expected mono (1 channel), got {channels}")
|
||||
if framerate < 16000:
|
||||
issues.append(f"sample rate {framerate} Hz is too low (min 16000)")
|
||||
if duration < 2:
|
||||
issues.append(f"sample is {duration:.1f}s — too short (min 2s, recommended 3s)")
|
||||
elif duration > 5:
|
||||
issues.append(f"sample is {duration:.1f}s — too long (max 5s, recommended 3s)")
|
||||
if sample_width != 2:
|
||||
issues.append(f"expected 16-bit audio, got {sample_width * 8}-bit")
|
||||
|
||||
if issues:
|
||||
return False, f"Voice sample issues: {'; '.join(issues)}"
|
||||
|
||||
return True, f"Voice sample OK: {duration:.1f}s, {framerate} Hz, mono, 16-bit"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Voice sample recorder
|
||||
# ---------------------------------------------------------------------------
|
||||
def record_voice_sample(output_path: str | Path, duration: int = REC_DURATION) -> Path:
    """
    Record a short voice sample from the microphone for voice cloning.

    A per-second countdown is printed while recording; the user should speak
    naturally for the full duration. The capture uses the module's REC_*
    settings and is written as a 16-bit .wav file, then checked with
    validate_voice_sample() and the outcome is printed.

    Args:
        output_path: Where to save the .wav file (parent dirs are created).
        duration: Recording length in seconds (default REC_DURATION).

    Returns:
        Path to the saved .wav file.
    """
    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)

    print(f"\n🎙️  Recording a {duration}-second voice sample for Echo...")
    print(" Speak naturally in a clear voice. No background noise.")
    print(f" Saving to: {output_path.resolve()}\n")

    # Capture an exact number of frames. Counting whole chunks instead would
    # truncate the recording (e.g. a 2 s request yields ~1.98 s) and make it
    # fail the minimum-duration check.
    total_frames = int(REC_RATE * duration)
    frames = []

    pa = pyaudio.PyAudio()
    try:
        stream = pa.open(
            format=REC_FORMAT,
            channels=REC_CHANNELS,
            rate=REC_RATE,
            input=True,
            frames_per_buffer=REC_CHUNK,
        )
        try:
            frames_left = total_frames
            while frames_left > 0:
                n = min(REC_CHUNK, frames_left)  # final read may be partial
                frames.append(stream.read(n, exception_on_overflow=False))
                frames_left -= n
                # Announce whenever the remaining time crosses a whole second.
                if frames_left // REC_RATE != (frames_left + n) // REC_RATE:
                    print(f" ... {frames_left // REC_RATE}s remaining")
        finally:
            # Release the input device even if a read fails mid-recording.
            stream.stop_stream()
            stream.close()
    finally:
        pa.terminate()

    # Write .wav
    with wave.open(str(output_path), "wb") as wf:
        wf.setnchannels(REC_CHANNELS)
        wf.setsampwidth(2)  # 16-bit, matching REC_FORMAT (paInt16)
        wf.setframerate(REC_RATE)
        wf.writeframes(b"".join(frames))

    ok, msg = validate_voice_sample(output_path)
    if ok:
        print(f"\n✅  {msg}")
    else:
        print(f"\n⚠️  {msg}")
    print(f" File saved: {output_path.resolve()}\n")

    return output_path
|
||||
|
||||
|
||||
class TTSEngine:
|
||||
"""
|
||||
Wrapper around Qwen3-TTS for generating speech from text.
|
||||
Wrapper around Qwen3-TTS for generating speech with a cloned voice.
|
||||
|
||||
The engine lazily loads the model on first use to avoid slow startup.
|
||||
"""
|
||||
@ -44,12 +174,12 @@ class TTSEngine:
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str = DEFAULT_MODEL,
|
||||
voice: str = DEFAULT_VOICE,
|
||||
voice_sample: str = DEFAULT_VOICE_SAMPLE,
|
||||
instruction: str = DEFAULT_INSTRUCTION,
|
||||
output_dir: str | Path = OUTPUT_DIR,
|
||||
):
|
||||
self.model_name = model_name
|
||||
self.voice = voice
|
||||
self.voice_sample = Path(voice_sample)
|
||||
self.instruction = instruction
|
||||
self.output_dir = Path(output_dir)
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
@ -58,15 +188,32 @@ class TTSEngine:
|
||||
self._processor = None
|
||||
self._lock = asyncio.Lock() # prevent concurrent generation
|
||||
|
||||
# Validate voice sample on init
|
||||
self._validate_voice()
|
||||
|
||||
def _validate_voice(self):
|
||||
"""Check the voice sample and log warnings if it's not usable."""
|
||||
ok, msg = validate_voice_sample(self.voice_sample)
|
||||
if ok:
|
||||
logger.info("🎤 Voice: %s", msg)
|
||||
else:
|
||||
logger.warning("🎤 Voice sample issue — %s", msg)
|
||||
logger.warning(
|
||||
" Record a sample with: python tts.py record"
|
||||
)
|
||||
|
||||
# ---- lazy model loading ----
|
||||
def _ensure_loaded(self):
|
||||
"""Load model and processor on first call (lazy init)."""
|
||||
if self._model is not None:
|
||||
return
|
||||
|
||||
logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name)
|
||||
logger.info(
|
||||
"Loading Qwen3-TTS model '%s' (this may take a moment)...",
|
||||
self.model_name,
|
||||
)
|
||||
try:
|
||||
from qwen_tts import QwenTTSProcessor, QwenTTSModel
|
||||
from qwen_tts import QwenTTSModel, QwenTTSProcessor
|
||||
|
||||
self._processor = QwenTTSProcessor()
|
||||
self._model = QwenTTSModel.from_pretrained(self.model_name)
|
||||
@ -81,7 +228,7 @@ class TTSEngine:
|
||||
# ---- generation ----
|
||||
async def generate(self, text: str, instruction: str | None = None) -> Path | None:
|
||||
"""
|
||||
Generate speech audio from text and save as .wav.
|
||||
Generate speech audio from text using the cloned voice.
|
||||
|
||||
Args:
|
||||
text: The text to convert to speech.
|
||||
@ -102,16 +249,24 @@ class TTSEngine:
|
||||
"""Synchronous generation (runs in thread pool)."""
|
||||
self._ensure_loaded()
|
||||
|
||||
# Double-check voice sample before generating
|
||||
if not self.voice_sample.exists():
|
||||
logger.error(
|
||||
"Voice sample missing at '%s' — cannot generate speech",
|
||||
self.voice_sample.resolve(),
|
||||
)
|
||||
return None
|
||||
|
||||
output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
|
||||
|
||||
try:
|
||||
# Build voice reference: preset name or custom .wav path
|
||||
voice_ref = self.voice
|
||||
if Path(self.voice).exists():
|
||||
voice_ref = str(Path(self.voice).resolve())
|
||||
voice_ref = str(self.voice_sample.resolve())
|
||||
|
||||
# Generate audio
|
||||
logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice)
|
||||
logger.info(
|
||||
"Generating speech: '%s' (voice=%s)",
|
||||
text[:60],
|
||||
self.voice_sample.name,
|
||||
)
|
||||
audio_array = self._model.generate(
|
||||
processor=self._processor,
|
||||
text=text,
|
||||
@ -119,12 +274,15 @@ class TTSEngine:
|
||||
instruction=instruction,
|
||||
)
|
||||
|
||||
# Save to file
|
||||
import soundfile as sf
|
||||
|
||||
sample_rate = self._processor.sampling_rate
|
||||
sf.write(str(output_path), audio_array, sample_rate)
|
||||
logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate)
|
||||
logger.info(
|
||||
"Audio saved to %s (%.1fs)",
|
||||
output_path,
|
||||
len(audio_array) / sample_rate,
|
||||
)
|
||||
return output_path
|
||||
|
||||
except Exception:
|
||||
@ -158,7 +316,6 @@ class TTSEngine:
|
||||
pygame.mixer.music.load(str(wav_path))
|
||||
pygame.mixer.music.play()
|
||||
|
||||
# Wait for playback to finish
|
||||
while pygame.mixer.music.get_busy():
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
@ -170,12 +327,30 @@ class TTSEngine:
|
||||
logger.exception("Playback failed for %s", wav_path)
|
||||
return False
|
||||
|
||||
def set_voice(self, voice: str):
|
||||
"""Switch to a different voice preset or custom sample path."""
|
||||
self.voice = voice
|
||||
logger.info("Voice set to: %s", voice)
|
||||
def set_voice_sample(self, path: str):
|
||||
"""Switch to a different voice sample .wav file."""
|
||||
self.voice_sample = Path(path)
|
||||
self._validate_voice()
|
||||
logger.info("Voice sample set to: %s", self.voice_sample.resolve())
|
||||
|
||||
def set_instruction(self, instruction: str):
|
||||
"""Update the default style instruction."""
|
||||
self.instruction = instruction
|
||||
logger.info("TTS instruction updated: %s", instruction)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI — record a voice sample directly
|
||||
# ---------------------------------------------------------------------------
|
||||
if __name__ == "__main__":
    import sys

    # Everything after the script name; the first item selects the action.
    cli_args = sys.argv[1:]
    wants_recording = bool(cli_args) and cli_args[0] == "record"

    if not wants_recording:
        # Any other invocation (including no arguments) just shows the help text.
        print("Usage:")
        print(f" python {Path(__file__).name} record [output.wav]")
        print()
        print("Records a 3-second voice sample for Qwen3-TTS cloning.")
        print(f"Default output: {DEFAULT_VOICE_SAMPLE}")
    else:
        # An optional second argument overrides the default destination.
        target = cli_args[1] if len(cli_args) > 1 else DEFAULT_VOICE_SAMPLE
        record_voice_sample(target)
|
||||
|
||||
Loading…
Reference in New Issue
Block a user