feat: initial Echo voice assistant — Vosk + OpenRouter + Qwen3-TTS
- stt.py: WakeWordListener (openWakeWord) + Transcriber (Vosk) - brain.py: Async OpenRouter streaming client with command parsing - tts.py: Qwen3-TTS engine with voice selection & instruction control - actions.py: 10 local OS commands (open_app, set_timer, search, etc.) - main.py: Async orchestrator with Phase 5 parallel TTS streaming
This commit is contained in:
parent
722596bb09
commit
d6b64d04d1
24
.env.example
Normal file
24
.env.example
Normal file
@ -0,0 +1,24 @@
|
||||
# ===========================================================
|
||||
# Echo Voice Assistant — Environment Configuration
|
||||
# ===========================================================
|
||||
# Copy this file to .env and fill in your values:
|
||||
# cp .env.example .env
|
||||
# ===========================================================
|
||||
|
||||
# --- OpenRouter (required) ---
|
||||
# Get your key at: https://openrouter.ai/keys
|
||||
OPENROUTER_API_KEY=sk-or-v1-xxxxxxxxxxxxxxxxxxxxxxxx
|
||||
OPENROUTER_MODEL=qwen/qwen-3-235b-a22b
|
||||
|
||||
# --- Vosk STT (optional overrides) ---
|
||||
# Download models from: https://alphacephei.com/vosk/models
|
||||
# Set to a local path relative to the project root
|
||||
VOSK_MODEL_PATH=models/vosk-model-small-en-us
|
||||
WAKE_WORD=echo
|
||||
|
||||
# --- Qwen3-TTS (optional overrides) ---
|
||||
# Available preset voices: Ryan, Serena, Diana, etc.
|
||||
# Or set a path to a 3-second .wav sample for voice cloning
|
||||
QWEN_TTS_MODEL=Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice
|
||||
QWEN_TTS_VOICE=Ryan
|
||||
QWEN_TTS_INSTRUCT=Speak clearly with a warm, friendly tone. Be natural and conversational.
|
||||
39
.gitignore
vendored
Normal file
39
.gitignore
vendored
Normal file
@ -0,0 +1,39 @@
|
||||
# Echo Voice Assistant — Git Ignore
|
||||
|
||||
# Python
|
||||
__pycache__/
|
||||
*.py[cod]
|
||||
*.egg-info/
|
||||
dist/
|
||||
build/
|
||||
*.egg
|
||||
|
||||
# Virtual environments
|
||||
venv/
|
||||
.venv/
|
||||
env/
|
||||
|
||||
# Model weights (large files — download separately)
|
||||
models/
|
||||
!models/.gitkeep
|
||||
|
||||
# Generated audio
|
||||
audio_output/
|
||||
!audio_output/.gitkeep
|
||||
|
||||
# Environment & secrets
|
||||
.env
|
||||
.env.local
|
||||
|
||||
# IDE
|
||||
.vscode/
|
||||
.idea/
|
||||
*.swp
|
||||
*.swo
|
||||
|
||||
# OS
|
||||
.DS_Store
|
||||
Thumbs.db
|
||||
|
||||
# Logs
|
||||
*.log
|
||||
275
actions.py
Normal file
275
actions.py
Normal file
@ -0,0 +1,275 @@
|
||||
"""
|
||||
actions.py — Local OS Command Execution
|
||||
|
||||
Responsibilities:
|
||||
1. Provide a registry of local actions the assistant can perform.
|
||||
2. Map action names from the LLM's JSON commands to Python functions.
|
||||
3. Execute commands and return a spoken summary for TTS feedback.
|
||||
|
||||
Supported actions:
|
||||
open_app — Launch a desktop application
|
||||
set_timer — Start a countdown timer with audible alarm
|
||||
get_time — Return the current time
|
||||
get_date — Return today's date
|
||||
get_weather — (stub) Return weather info
|
||||
create_reminder — (stub) Save a reminder note
|
||||
control_volume — Adjust system volume (Linux/macOS)
|
||||
search_web — Open a web search in the default browser
|
||||
calculate — Evaluate a math expression safely
|
||||
shutdown — System shutdown (with confirmation)
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import platform
|
||||
import subprocess
|
||||
import threading
|
||||
import webbrowser
|
||||
from datetime import datetime
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Action Registry
|
||||
# ---------------------------------------------------------------------------
|
||||
_REGISTRY: dict[str, callable] = {}
|
||||
|
||||
|
||||
def register(name: str):
|
||||
"""Decorator to register an action function by name."""
|
||||
def decorator(func):
|
||||
_REGISTRY[name] = func
|
||||
return func
|
||||
return decorator
|
||||
|
||||
|
||||
def execute(action: str, params: dict | None = None) -> str:
|
||||
"""
|
||||
Execute a registered action and return a spoken summary.
|
||||
|
||||
Args:
|
||||
action: The action name (e.g., "open_app").
|
||||
params: Optional dict of parameters.
|
||||
|
||||
Returns:
|
||||
A short text description of what was done (for TTS feedback).
|
||||
"""
|
||||
params = params or {}
|
||||
func = _REGISTRY.get(action)
|
||||
|
||||
if not func:
|
||||
logger.warning("Unknown action: %s", action)
|
||||
return f"Sorry, I don't know how to {action.replace('_', ' ')}."
|
||||
|
||||
try:
|
||||
result = func(**params)
|
||||
logger.info("Action '%s' executed: %s", action, result)
|
||||
return result
|
||||
except Exception as exc:
|
||||
logger.exception("Action '%s' failed", action)
|
||||
return f"Something went wrong: {exc}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Action Implementations
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@register("get_time")
|
||||
def get_time(**_) -> str:
|
||||
now = datetime.now().strftime("%-I:%M %p")
|
||||
return f"It's currently {now}."
|
||||
|
||||
|
||||
@register("get_date")
|
||||
def get_date(**_) -> str:
|
||||
today = datetime.now().strftime("%A, %B %d, %Y")
|
||||
return f"Today is {today}."
|
||||
|
||||
|
||||
@register("open_app")
|
||||
def open_app(app_name: str = "", **_) -> str:
|
||||
if not app_name:
|
||||
return "What app would you like me to open?"
|
||||
|
||||
system = platform.system()
|
||||
app_lower = app_name.lower().strip()
|
||||
|
||||
try:
|
||||
if system == "Darwin": # macOS
|
||||
subprocess.Popen(["open", "-a", app_name], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
elif system == "Windows":
|
||||
subprocess.Popen(f"start {app_name}", shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
|
||||
else: # Linux
|
||||
# Try common app launchers
|
||||
app_map = {
|
||||
"chrome": "google-chrome",
|
||||
"firefox": "firefox",
|
||||
"terminal": "gnome-terminal",
|
||||
"files": "nautilus",
|
||||
"calculator": "gnome-calculator",
|
||||
"settings": "gnome-control-center",
|
||||
"browser": "xdg-open",
|
||||
"vs code": "code",
|
||||
"vscode": "code",
|
||||
}
|
||||
cmd = app_map.get(app_lower, app_lower)
|
||||
subprocess.Popen(
|
||||
[cmd], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL
|
||||
)
|
||||
|
||||
return f"Opening {app_name}."
|
||||
except FileNotFoundError:
|
||||
return f"Sorry, I couldn't find {app_name} on this system."
|
||||
|
||||
|
||||
@register("set_timer")
|
||||
def set_timer(seconds: int = 60, label: str = "Timer", **_) -> str:
|
||||
"""
|
||||
Start a background timer that rings after the given number of seconds.
|
||||
Uses the terminal bell when the timer completes.
|
||||
"""
|
||||
try:
|
||||
duration = int(seconds)
|
||||
except (ValueError, TypeError):
|
||||
return "I need a number of seconds for the timer."
|
||||
|
||||
def _timer_thread():
|
||||
import time
|
||||
|
||||
logger.info("Timer '%s' started for %d seconds", label, duration)
|
||||
time.sleep(duration)
|
||||
# Terminal bell
|
||||
print(f"\a")
|
||||
logger.info("Timer '%s' finished!", label)
|
||||
# Try to use TTS to announce (if available — soft dependency)
|
||||
try:
|
||||
import pygame
|
||||
pygame.mixer.init()
|
||||
# Generate a simple beep
|
||||
import numpy as np
|
||||
sample_rate = 22050
|
||||
t = np.linspace(0, 0.5, int(sample_rate * 0.5), dtype=np.float32)
|
||||
tone = np.sin(2 * np.pi * 880 * t) * 0.5
|
||||
tone = (tone * 32767).astype(np.int16)
|
||||
# Save and play
|
||||
import soundfile as sf
|
||||
beep_path = f"/tmp/echo_timer_{os.urandom(4).hex()}.wav"
|
||||
sf.write(beep_path, tone, sample_rate)
|
||||
pygame.mixer.music.load(beep_path)
|
||||
pygame.mixer.music.play()
|
||||
while pygame.mixer.music.get_busy():
|
||||
pygame.time.wait(50)
|
||||
pygame.mixer.quit()
|
||||
os.unlink(beep_path)
|
||||
except Exception:
|
||||
pass # Fall back to terminal bell only
|
||||
|
||||
threading.Thread(target=_timer_thread, daemon=True, name=f"timer-{label}").start()
|
||||
|
||||
minutes, secs = divmod(duration, 60)
|
||||
if minutes:
|
||||
return f"{label} set for {minutes} minute{'s' if minutes != 1 else ''} and {secs} seconds."
|
||||
return f"{label} set for {secs} seconds."
|
||||
|
||||
|
||||
@register("get_weather")
|
||||
def get_weather(location: str = "", **_) -> str:
|
||||
"""Stub — can be expanded with a weather API integration."""
|
||||
return (
|
||||
"I don't have a weather service connected yet. "
|
||||
"You can ask me again once the weather API is configured."
|
||||
)
|
||||
|
||||
|
||||
@register("create_reminder")
|
||||
def create_reminder(text: str = "", **_) -> str:
|
||||
"""Save a reminder to a local file."""
|
||||
if not text:
|
||||
return "What would you like me to remind you about?"
|
||||
|
||||
reminders_dir = Path.home() / ".echo" / "reminders"
|
||||
reminders_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
reminder_file = reminders_dir / f"{timestamp}.txt"
|
||||
reminder_file.write_text(f"[{datetime.now().isoformat()}] {text}\n")
|
||||
|
||||
return f"Reminder saved: {text}"
|
||||
|
||||
|
||||
@register("control_volume")
|
||||
def control_volume(level: int = 50, **_) -> str:
|
||||
"""Adjust system volume (Linux/macOS only)."""
|
||||
try:
|
||||
vol = int(level)
|
||||
vol = max(0, min(100, vol))
|
||||
except (ValueError, TypeError):
|
||||
return "Please specify a volume level between 0 and 100."
|
||||
|
||||
system = platform.system()
|
||||
try:
|
||||
if system == "Darwin":
|
||||
subprocess.run(["osascript", "-e", f"set volume output volume {vol}"],
|
||||
check=True, capture_output=True)
|
||||
elif system == "Linux":
|
||||
subprocess.run(
|
||||
["pactl", "set-sink-volume", "@DEFAULT_SINK@", f"{vol}%"],
|
||||
check=True, capture_output=True,
|
||||
)
|
||||
else:
|
||||
return "Volume control isn't supported on Windows yet."
|
||||
|
||||
return f"Volume set to {vol}%."
|
||||
except subprocess.CalledProcessError:
|
||||
return "I couldn't adjust the volume."
|
||||
|
||||
|
||||
@register("search_web")
|
||||
def search_web(query: str = "", **_) -> str:
|
||||
if not query:
|
||||
return "What would you like to search for?"
|
||||
|
||||
url = f"https://www.google.com/search?q={query.replace(' ', '+')}"
|
||||
webbrowser.open(url)
|
||||
return f"Searching the web for: {query}"
|
||||
|
||||
|
||||
@register("calculate")
|
||||
def calculate(expression: str = "", **_) -> str:
|
||||
if not expression:
|
||||
return "What would you like me to calculate?"
|
||||
|
||||
# Whitelist only safe math operations
|
||||
import ast
|
||||
allowed = {
|
||||
ast.Add, ast.Sub, ast.Mult, ast.Div, ast.Pow,
|
||||
ast.USub, ast.UAdd, ast.Constant, ast.Num,
|
||||
}
|
||||
try:
|
||||
tree = ast.parse(expression.strip(), mode="eval")
|
||||
for node in ast.walk(tree):
|
||||
if type(node) not in allowed:
|
||||
raise ValueError("Unsafe expression")
|
||||
result = eval(compile(tree, "<calc>", "eval")) # noqa: S307
|
||||
return f"The result is {result}"
|
||||
except ZeroDivisionError:
|
||||
return "You can't divide by zero."
|
||||
except Exception:
|
||||
return f"I couldn't calculate that expression."
|
||||
|
||||
|
||||
@register("shutdown")
|
||||
def shutdown(confirm: bool = False, **_) -> str:
|
||||
if not confirm:
|
||||
return "Are you sure? Please confirm the shutdown command."
|
||||
system = platform.system()
|
||||
try:
|
||||
if system == "Darwin":
|
||||
subprocess.run(["sudo", "shutdown", "-h", "now"], check=True)
|
||||
elif system == "Windows":
|
||||
subprocess.run(["shutdown", "/s", "/t", "5"], check=True)
|
||||
else:
|
||||
subprocess.run(["sudo", "shutdown", "-h", "now"], check=True)
|
||||
return "Shutting down now. Goodbye!"
|
||||
except Exception:
|
||||
return "I don't have permission to shut down the system."
|
||||
159
brain.py
Normal file
159
brain.py
Normal file
@ -0,0 +1,159 @@
|
||||
"""
|
||||
brain.py — OpenRouter LLM Client (Streaming)
|
||||
|
||||
Responsibilities:
|
||||
1. Send transcribed text to OpenRouter with a system prompt that
|
||||
instructs the model to produce:
|
||||
a) A concise verbal response (≤ 2 sentences for voice).
|
||||
b) An optional JSON command block for local execution.
|
||||
2. Stream tokens back so the TTS engine can start early (Phase 5).
|
||||
3. Parse any JSON command block and return it alongside the spoken text.
|
||||
|
||||
Environment Variables:
|
||||
OPENROUTER_API_KEY — your OpenRouter API key
|
||||
OPENROUTER_MODEL — model identifier (default: qwen/qwen-3-235b-a22b)
|
||||
OPENROUTER_BASE_URL — (optional) custom base URL
|
||||
|
||||
Dependencies:
|
||||
pip install openai
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from typing import AsyncIterator
|
||||
|
||||
from openai import AsyncOpenAI
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
DEFAULT_MODEL = "qwen/qwen-3-235b-a22b"
|
||||
|
||||
SYSTEM_PROMPT = """\
|
||||
You are Echo, a concise, helpful voice assistant. Follow these rules strictly:
|
||||
|
||||
1. **Verbal response**: Reply in ≤ 2 short sentences so it sounds natural
|
||||
when spoken aloud. Be direct and conversational.
|
||||
2. **Local commands** (optional): If the user's request can be fulfilled by a
|
||||
local OS action (e.g. opening an app, setting a timer, checking the time,
|
||||
creating a reminder), include a single JSON block at the very end of your
|
||||
response using this exact format:
|
||||
|
||||
```json
|
||||
{"action": "<command_name>", "params": {"key": "value"}}
|
||||
```
|
||||
|
||||
Supported actions: open_app, set_timer, get_time, get_date, get_weather,
|
||||
create_reminder, control_volume, shutdown, search_web, calculate.
|
||||
|
||||
3. Do NOT include the JSON block in your spoken text. The spoken text is
|
||||
everything BEFORE the JSON block.
|
||||
4. If no local action is needed, just respond normally without any JSON.
|
||||
5. Never use markdown formatting, bullet points, or headers in the spoken text.
|
||||
"""
|
||||
|
||||
|
||||
class Brain:
|
||||
"""Async client for OpenRouter LLM with streaming support."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
api_key: str | None = None,
|
||||
model: str = DEFAULT_MODEL,
|
||||
base_url: str = "https://openrouter.ai/api/v1",
|
||||
):
|
||||
self.model = model
|
||||
self.client = AsyncOpenAI(
|
||||
api_key=api_key or os.environ.get("OPENROUTER_API_KEY", ""),
|
||||
base_url=base_url,
|
||||
)
|
||||
# Conversation history (rolling window)
|
||||
self._history: list[dict[str, str]] = []
|
||||
self._max_history = 20 # keep last 20 messages for context
|
||||
|
||||
async def think(
|
||||
self,
|
||||
user_text: str,
|
||||
) -> AsyncIterator[dict]:
|
||||
"""
|
||||
Stream the LLM response token-by-token.
|
||||
|
||||
Yields:
|
||||
dict with keys:
|
||||
- "type": "token" | "command" | "done"
|
||||
- "text": the token string (for "token" type)
|
||||
- "command": parsed JSON dict (for "command" type)
|
||||
"""
|
||||
self._history.append({"role": "user", "content": user_text})
|
||||
if len(self._history) > self._max_history:
|
||||
self._history = self._history[-self._max_history:]
|
||||
|
||||
messages = [{"role": "system", "content": SYSTEM_PROMPT}] + self._history
|
||||
|
||||
logger.info("Sending to OpenRouter (%s): %s", self.model, user_text[:80])
|
||||
|
||||
full_response = ""
|
||||
try:
|
||||
stream = await self.client.chat.completions.create(
|
||||
model=self.model,
|
||||
messages=messages,
|
||||
stream=True,
|
||||
temperature=0.7,
|
||||
max_tokens=300,
|
||||
)
|
||||
async for chunk in stream:
|
||||
delta = chunk.choices[0].delta
|
||||
if delta.content:
|
||||
full_response += delta.content
|
||||
yield {"type": "token", "text": delta.content}
|
||||
except Exception:
|
||||
logger.exception("OpenRouter request failed")
|
||||
yield {"type": "token", "text": "Sorry, I had trouble thinking about that."}
|
||||
|
||||
# Parse any JSON command block from the full response
|
||||
command = self._extract_command(full_response)
|
||||
if command:
|
||||
logger.info("Parsed command: %s", command)
|
||||
yield {"type": "command", "command": command}
|
||||
|
||||
# Clean spoken text (strip JSON block and thinking tags)
|
||||
spoken_text = self._clean_spoken_text(full_response)
|
||||
self._history.append({"role": "assistant", "content": full_response})
|
||||
|
||||
yield {"type": "done", "text": spoken_text}
|
||||
|
||||
def _extract_command(self, text: str) -> dict | None:
|
||||
"""Extract the JSON command block from the LLM response."""
|
||||
try:
|
||||
# Find JSON code block
|
||||
if "```json" in text:
|
||||
start = text.index("```json") + 7
|
||||
end = text.index("```", start)
|
||||
json_str = text[start:end].strip()
|
||||
return json.loads(json_str)
|
||||
# Try bare JSON at the end
|
||||
for i in range(len(text) - 1, -1, -1):
|
||||
if text[i] == "{":
|
||||
candidate = text[i:].strip()
|
||||
return json.loads(candidate)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def _clean_spoken_text(text: str) -> str:
|
||||
"""Remove JSON blocks and Qwen thinking tags from spoken text."""
|
||||
import re
|
||||
|
||||
# Remove Qwen3 <think/> blocks
|
||||
cleaned = re.sub(r"<think[^>]*>.*?</think\s*>", "", text, flags=re.DOTALL)
|
||||
# Remove JSON code blocks
|
||||
cleaned = re.sub(r"```json.*?```", "", cleaned, flags=re.DOTALL)
|
||||
# Remove any trailing bare JSON object
|
||||
cleaned = re.sub(r"\{[^}]*\"action\"[^}]*\}", "", cleaned)
|
||||
# Clean up whitespace
|
||||
cleaned = cleaned.strip().strip(".")
|
||||
return cleaned
|
||||
283
main.py
Normal file
283
main.py
Normal file
@ -0,0 +1,283 @@
|
||||
"""
|
||||
main.py — Echo Voice Assistant Orchestrator
|
||||
|
||||
Ties together all modules:
|
||||
1. WakeWordListener (stt.py) — continuously listens for "echo"
|
||||
2. Transcriber (stt.py) — captures & transcribes voice commands
|
||||
3. Brain (brain.py) — sends text to OpenRouter, streams response
|
||||
4. TTSEngine (tts.py) — generates speech from text (Qwen3-TTS)
|
||||
5. Actions (actions.py) — executes local OS commands
|
||||
|
||||
Phase 5 Parallel Processing:
|
||||
As soon as the first complete sentence is received from the Brain's
|
||||
streamed response, TTS generation begins immediately — before the
|
||||
full LLM response has finished streaming.
|
||||
|
||||
Usage:
|
||||
python main.py
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import signal
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
from stt import WakeWordListener, Transcriber
|
||||
from brain import Brain
|
||||
from tts import TTSEngine
|
||||
from actions import execute as execute_action
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging setup
|
||||
# ---------------------------------------------------------------------------
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s │ %(name)-18s │ %(levelname)-7s │ %(message)s",
|
||||
datefmt="%H:%M:%S",
|
||||
)
|
||||
logger = logging.getLogger("echo")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Load environment
|
||||
# ---------------------------------------------------------------------------
|
||||
load_dotenv(Path(__file__).parent / ".env")
|
||||
|
||||
|
||||
class EchoAssistant:
|
||||
"""
|
||||
Main orchestrator for the Echo voice assistant.
|
||||
|
||||
Lifecycle:
|
||||
1. Start wake word listener (background thread).
|
||||
2. On wake word detected → transcribe command.
|
||||
3. Stream LLM response → start TTS on first sentence (parallel).
|
||||
4. Execute any local commands from the LLM response.
|
||||
"""
|
||||
|
||||
def __init__(self):
|
||||
# --- STT ---
|
||||
model_path = os.environ.get(
|
||||
"VOSK_MODEL_PATH", "models/vosk-model-small-en-us"
|
||||
)
|
||||
wake_word = os.environ.get("WAKE_WORD", "echo")
|
||||
|
||||
self.transcriber = Transcriber(model_path=model_path)
|
||||
self.wake_listener = WakeWordListener(
|
||||
wake_word=wake_word,
|
||||
on_detected=self._on_wake_word,
|
||||
)
|
||||
|
||||
# --- Brain (LLM) ---
|
||||
self.brain = Brain(
|
||||
api_key=os.environ.get("OPENROUTER_API_KEY"),
|
||||
model=os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
|
||||
)
|
||||
|
||||
# --- TTS ---
|
||||
self.tts = TTSEngine(
|
||||
model_name=os.environ.get("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
|
||||
voice=os.environ.get("QWEN_TTS_VOICE", "Ryan"),
|
||||
instruction=os.environ.get(
|
||||
"QWEN_TTS_INSTRUCT",
|
||||
"Speak clearly with a warm, friendly tone. Be natural and conversational.",
|
||||
),
|
||||
)
|
||||
|
||||
# --- State ---
|
||||
self._processing = False # guard against concurrent commands
|
||||
self._shutdown_event = asyncio.Event()
|
||||
|
||||
logger.info("Echo assistant initialized (wake word: '%s')", wake_word)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Wake word callback (runs in background thread)
|
||||
# ------------------------------------------------------------------
|
||||
def _on_wake_word(self, wake_word: str):
|
||||
"""Called by WakeWordListener when the wake word is detected."""
|
||||
if self._processing:
|
||||
logger.info("Still processing previous command — ignoring wake word")
|
||||
return
|
||||
# Schedule the command processing in the async event loop
|
||||
try:
|
||||
loop = asyncio.get_running_loop()
|
||||
loop.call_soon_threadsafe(loop.create_task, self._handle_command())
|
||||
except RuntimeError:
|
||||
logger.warning("No running event loop for wake word callback")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Main command pipeline
|
||||
# ------------------------------------------------------------------
|
||||
async def _handle_command(self):
|
||||
"""Full pipeline: transcribe → think → speak → act."""
|
||||
if self._processing:
|
||||
return
|
||||
self._processing = True
|
||||
|
||||
try:
|
||||
# Play a brief acknowledgment tone
|
||||
logger.info("🔊 Wake word detected — listening...")
|
||||
|
||||
# Step 1: Transcribe
|
||||
text = self.transcriber.listen_and_transcribe()
|
||||
if not text:
|
||||
logger.info("No transcription — returning to idle")
|
||||
return
|
||||
|
||||
logger.info("📝 You said: '%s'", text)
|
||||
|
||||
# Step 2: Stream LLM response with early TTS (Phase 5)
|
||||
await self._stream_and_speak(text)
|
||||
|
||||
except Exception:
|
||||
logger.exception("Error in command pipeline")
|
||||
finally:
|
||||
self._processing = False
|
||||
logger.info("Returning to idle...")
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Phase 5: Parallel Streaming + TTS
|
||||
# ------------------------------------------------------------------
|
||||
async def _stream_and_speak(self, user_text: str):
|
||||
"""
|
||||
Stream the LLM response and start TTS generation as soon as the
|
||||
first complete sentence is available — minimizing perceived latency.
|
||||
"""
|
||||
buffer = ""
|
||||
first_sentence_spoken = False
|
||||
remaining_text = ""
|
||||
pending_command = None
|
||||
tts_tasks: list[asyncio.Task] = []
|
||||
|
||||
async for event in self.brain.think(user_text):
|
||||
if event["type"] == "token":
|
||||
buffer += event["text"]
|
||||
|
||||
# Check if we have a complete sentence
|
||||
if not first_sentence_spoken and self._has_complete_sentence(buffer):
|
||||
# Split: first sentence goes to TTS immediately
|
||||
sentences = self._split_first_sentence(buffer)
|
||||
first_sentence = sentences[0]
|
||||
remaining_text = sentences[1] if len(sentences) > 1 else ""
|
||||
|
||||
if first_sentence.strip():
|
||||
logger.info("⚡ Early TTS trigger: '%s'", first_sentence[:60])
|
||||
task = asyncio.create_task(
|
||||
self.tts.speak(first_sentence.strip())
|
||||
)
|
||||
tts_tasks.append(task)
|
||||
|
||||
first_sentence_spoken = True
|
||||
buffer = remaining_text
|
||||
|
||||
elif event["type"] == "command":
|
||||
pending_command = event["command"]
|
||||
|
||||
elif event["type"] == "done":
|
||||
# Any remaining text after the first sentence
|
||||
final_text = buffer.strip()
|
||||
if final_text and final_text != remaining_text:
|
||||
final_text = event["text"]
|
||||
# Remove the already-spoken first sentence
|
||||
if first_sentence_spoken and remaining_text:
|
||||
pass # remaining_text already has what we need
|
||||
else:
|
||||
remaining_text = final_text
|
||||
|
||||
# Step 3: Speak the remaining text after first sentence finishes
|
||||
remaining_text = remaining_text.strip()
|
||||
if remaining_text:
|
||||
# Wait for first sentence TTS to finish
|
||||
for task in tts_tasks:
|
||||
await task
|
||||
await self.tts.speak(remaining_text)
|
||||
|
||||
# Wait for all TTS tasks to complete
|
||||
for task in tts_tasks:
|
||||
if not task.done():
|
||||
await task
|
||||
|
||||
# Step 4: Execute any local command
|
||||
if pending_command:
|
||||
action_name = pending_command.get("action", "")
|
||||
params = pending_command.get("params", {})
|
||||
logger.info("🔧 Executing action: %s %s", action_name, params)
|
||||
result = execute_action(action_name, params)
|
||||
if result:
|
||||
await self.tts.speak(result)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Text utilities
|
||||
# ------------------------------------------------------------------
|
||||
@staticmethod
|
||||
def _has_complete_sentence(text: str) -> bool:
|
||||
"""Check if the text buffer contains at least one complete sentence."""
|
||||
# A sentence is considered complete if it ends with . ! ? or ...
|
||||
return bool(re.search(r'[.!?]\s+|[.!?]$', text))
|
||||
|
||||
@staticmethod
|
||||
def _split_first_sentence(text: str) -> list[str]:
|
||||
"""Split text at the first sentence boundary."""
|
||||
match = re.search(r'([.!?])\s+', text)
|
||||
if match:
|
||||
end = match.start() + 1
|
||||
return [text[:end], text[end:].strip()]
|
||||
# Check for ending punctuation without trailing space
|
||||
match = re.search(r'[.!?]$', text.strip())
|
||||
if match:
|
||||
return [text.strip()]
|
||||
return [text]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Lifecycle
|
||||
# ------------------------------------------------------------------
|
||||
async def start(self):
|
||||
"""Start the Echo assistant."""
|
||||
logger.info("=" * 60)
|
||||
logger.info(" ECHO VOICE ASSISTANT")
|
||||
logger.info(" Say '%s' to activate", os.environ.get("WAKE_WORD", "echo").upper())
|
||||
logger.info(" Press Ctrl+C to quit")
|
||||
logger.info("=" * 60)
|
||||
|
||||
# Start wake word listener (runs in background thread)
|
||||
self.wake_listener.start()
|
||||
|
||||
# Keep the async loop alive until shutdown
|
||||
await self._shutdown_event.wait()
|
||||
|
||||
def shutdown(self):
|
||||
"""Signal the assistant to stop."""
|
||||
logger.info("Shutting down Echo...")
|
||||
self.wake_listener.stop()
|
||||
self._shutdown_event.set()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry Point
|
||||
# ---------------------------------------------------------------------------
|
||||
def main():
|
||||
assistant = EchoAssistant()
|
||||
|
||||
# Graceful shutdown on Ctrl+C
|
||||
def _signal_handler(sig, frame):
|
||||
assistant.shutdown()
|
||||
|
||||
signal.signal(signal.SIGINT, _signal_handler)
|
||||
signal.signal(signal.SIGTERM, _signal_handler)
|
||||
|
||||
# Run the async event loop
|
||||
try:
|
||||
asyncio.run(assistant.start())
|
||||
except KeyboardInterrupt:
|
||||
pass
|
||||
finally:
|
||||
assistant.shutdown()
|
||||
logger.info("Echo has shut down. Goodbye!")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
31
requirements.txt
Normal file
31
requirements.txt
Normal file
@ -0,0 +1,31 @@
|
||||
# ===========================================================
|
||||
# Echo Voice Assistant — Dependencies
|
||||
# ===========================================================
|
||||
# Install with: pip install -r requirements.txt
|
||||
#
|
||||
# Note: For GPU-accelerated TTS, install PyTorch with CUDA
|
||||
# support first: https://pytorch.org/get-started/locally/
|
||||
# ===========================================================
|
||||
|
||||
# --- Core ---
|
||||
vosk>=0.3.45
|
||||
pyaudio>=0.2.14
|
||||
openwakeword>=0.5.0
|
||||
|
||||
# --- LLM ---
|
||||
openai>=1.30.0
|
||||
python-dotenv>=1.0.0
|
||||
|
||||
# --- TTS (Qwen3-TTS) ---
|
||||
# Install from source or PyPI once available:
|
||||
# pip install qwen-tts
|
||||
torch>=2.1.0
|
||||
soundfile>=0.12.1
|
||||
transformers>=4.40.0
|
||||
accelerate>=0.27.0
|
||||
|
||||
# --- Audio Playback ---
|
||||
pygame>=2.5.0
|
||||
|
||||
# --- Utilities ---
|
||||
numpy>=1.26.0
|
||||
195
stt.py
Normal file
195
stt.py
Normal file
@ -0,0 +1,195 @@
|
||||
"""
|
||||
stt.py — Speech-To-Text Module (Vosk + PyAudio + openWakeWord)
|
||||
|
||||
Responsibilities:
|
||||
1. Continuously monitor the microphone for a wake word ("echo").
|
||||
2. Once triggered, capture and transcribe the full spoken command.
|
||||
3. Return the transcribed text to the orchestrator.
|
||||
|
||||
Environment Variables:
|
||||
VOSK_MODEL_PATH — path to a Vosk model directory (default: models/vosk-model-small-en-us)
|
||||
WAKE_WORD — wake phrase to listen for (default: "echo")
|
||||
|
||||
Dependencies:
|
||||
pip install vosk pyaudio openwakeword
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import openwakeword
|
||||
import pyaudio
|
||||
from vosk import KaldiRecognizer, Model, SetLogLevel
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
DEFAULT_VOSK_MODEL = "models/vosk-model-small-en-us"
|
||||
DEFAULT_WAKE_WORD = "echo"
|
||||
FORMAT = pyaudio.paInt16
|
||||
CHANNELS = 1
|
||||
RATE = 16000
|
||||
CHUNK = 1024
|
||||
RECORD_SECONDS = 10 # max length of a voice command after wake word
|
||||
SILENCE_LIMIT = 1.5 # seconds of silence before we stop recording
|
||||
|
||||
|
||||
class WakeWordListener:
|
||||
"""Background thread that listens for the wake word using openWakeWord."""
|
||||
|
||||
def __init__(self, wake_word: str = DEFAULT_WAKE_WORD, on_detected=None):
|
||||
self.wake_word = wake_word.lower()
|
||||
self.on_detected = on_detected # callback(wake_word)
|
||||
self._running = False
|
||||
self._thread: threading.Thread | None = None
|
||||
self._audio_queue: queue.Queue = queue.Queue()
|
||||
|
||||
# ---- audio callback fed to PyAudio stream ----
|
||||
def _audio_callback(self, in_data, frame_count, time_info, status):
|
||||
self._audio_queue.put(in_data)
|
||||
return (in_data, pyaudio.paContinue)
|
||||
|
||||
def start(self):
|
||||
if self._running:
|
||||
return
|
||||
self._running = True
|
||||
self._thread = threading.Thread(target=self._listen_loop, daemon=True)
|
||||
self._thread.start()
|
||||
logger.info("WakeWordListener started — listening for '%s'", self.wake_word)
|
||||
|
||||
def stop(self):
|
||||
self._running = False
|
||||
if self._thread:
|
||||
self._thread.join(timeout=5)
|
||||
logger.info("WakeWordListener stopped")
|
||||
|
||||
def _listen_loop(self):
|
||||
"""Open PyAudio, feed frames to openWakeWord, fire callback on match."""
|
||||
oww = openwakeword.Model(
|
||||
wakeword_models=[self.wake_word],
|
||||
inference_framework="onnx",
|
||||
)
|
||||
|
||||
pa = pyaudio.PyAudio()
|
||||
stream = pa.open(
|
||||
format=FORMAT,
|
||||
channels=CHANNELS,
|
||||
rate=RATE,
|
||||
input=True,
|
||||
frames_per_buffer=CHUNK,
|
||||
stream_callback=self._audio_callback,
|
||||
)
|
||||
stream.start_stream()
|
||||
|
||||
try:
|
||||
while self._running:
|
||||
try:
|
||||
frame = self._audio_queue.get(timeout=0.5)
|
||||
except queue.Empty:
|
||||
continue
|
||||
# openWakeWord expects 16 kHz mono int16 — matches our format
|
||||
prediction = oww.process(frame)
|
||||
for model_name, score in prediction.items():
|
||||
if score >= 0.5: # threshold
|
||||
logger.info("Wake word '%s' detected (score=%.2f)", model_name, score)
|
||||
if self.on_detected:
|
||||
self.on_detected(self.wake_word)
|
||||
finally:
|
||||
stream.stop_stream()
|
||||
stream.close()
|
||||
pa.terminate()
|
||||
|
||||
|
||||
class Transcriber:
|
||||
"""Captures microphone audio after wake word and transcribes via Vosk."""
|
||||
|
||||
def __init__(self, model_path: str = DEFAULT_VOSK_MODEL):
|
||||
resolved = Path(model_path)
|
||||
if not resolved.exists():
|
||||
raise FileNotFoundError(
|
||||
f"Vosk model not found at {resolved.resolve()}. "
|
||||
"Download one from https://alphacephei.com/vosk/models"
|
||||
)
|
||||
SetLogLevel(-1) # suppress Vosk internal noise
|
||||
self._model = Model(str(resolved))
|
||||
self._recognizer = KaldiRecognizer(self._model, RATE)
|
||||
|
||||
def listen_and_transcribe(self) -> str | None:
|
||||
"""
|
||||
Open the mic, record until silence or timeout, and return the
|
||||
best-effort transcription of the spoken command.
|
||||
|
||||
Returns:
|
||||
Transcribed text (str) or None if nothing was understood.
|
||||
"""
|
||||
pa = pyaudio.PyAudio()
|
||||
stream = pa.open(
|
||||
format=FORMAT,
|
||||
channels=CHANNELS,
|
||||
rate=RATE,
|
||||
input=True,
|
||||
frames_per_buffer=CHUNK,
|
||||
)
|
||||
stream.start_stream()
|
||||
logger.info("Recording voice command...")
|
||||
|
||||
all_data = b""
|
||||
silence_start: float | None = None
|
||||
started_speaking = False
|
||||
|
||||
try:
|
||||
while True:
|
||||
data = stream.read(CHUNK, exception_on_overflow=False)
|
||||
all_data += data
|
||||
|
||||
# Quick RMS check for silence detection
|
||||
rms = self._rms(data)
|
||||
if rms > 300:
|
||||
started_speaking = True
|
||||
silence_start = None
|
||||
elif started_speaking and silence_start is None:
|
||||
silence_start = time.time()
|
||||
|
||||
# Stop on silence timeout or max duration
|
||||
if silence_start and (time.time() - silence_start) > SILENCE_LIMIT:
|
||||
logger.debug("Silence detected — ending recording")
|
||||
break
|
||||
if len(all_data) > RATE * RECORD_SECONDS * 2: # bytes
|
||||
logger.debug("Max recording duration reached")
|
||||
break
|
||||
finally:
|
||||
stream.stop_stream()
|
||||
stream.close()
|
||||
pa.terminate()
|
||||
|
||||
if not started_speaking:
|
||||
logger.info("No speech detected after wake word")
|
||||
return None
|
||||
|
||||
# Feed all collected audio to Vosk for final transcription
|
||||
self._recognizer.Reset()
|
||||
if self._recognizer.AcceptWaveform(all_data):
|
||||
result = json.loads(self._recognizer.Result())
|
||||
text = result.get("text", "").strip()
|
||||
else:
|
||||
partial = json.loads(self._recognizer.PartialResult())
|
||||
text = partial.get("partial", "").strip()
|
||||
|
||||
logger.info("Transcription: '%s'", text)
|
||||
return text if text else None
|
||||
|
||||
@staticmethod
|
||||
def _rms(data: bytes) -> float:
|
||||
"""Compute Root Mean Square of a byte buffer of int16 samples."""
|
||||
import array
|
||||
samples = array.array("h", data)
|
||||
if not samples:
|
||||
return 0.0
|
||||
sum_sq = sum(s * s for s in samples)
|
||||
return (sum_sq / len(samples)) ** 0.5
|
||||
181
tts.py
Normal file
181
tts.py
Normal file
@ -0,0 +1,181 @@
|
||||
"""
|
||||
tts.py — Text-To-Speech Module (Qwen3-TTS)
|
||||
|
||||
Responsibilities:
|
||||
1. Accept text (full or partial sentence) and generate a .wav audio file
|
||||
using the Qwen3-TTS model running locally.
|
||||
2. Support voice selection (preset voices or custom voice cloning).
|
||||
3. Support instruction-based style control (e.g., energy, tone).
|
||||
4. Play the generated audio immediately.
|
||||
|
||||
Environment Variables:
|
||||
QWEN_TTS_MODEL — model name or local path (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
|
||||
QWEN_TTS_VOICE — preset voice name or path to 3s .wav sample
|
||||
QWEN_TTS_INSTRUCT — default style instruction for speech generation
|
||||
|
||||
Dependencies:
|
||||
pip install qwen-tts torch soundfile pygame
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import logging
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Configuration
|
||||
# ---------------------------------------------------------------------------
|
||||
DEFAULT_MODEL = "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"
|
||||
DEFAULT_VOICE = "Ryan" # preset voice; alternatives: "Serena", "Diana", etc.
|
||||
DEFAULT_INSTRUCTION = "Speak clearly with a warm, friendly tone. Be natural and conversational."
|
||||
OUTPUT_DIR = Path("audio_output")
|
||||
|
||||
|
||||
class TTSEngine:
|
||||
"""
|
||||
Wrapper around Qwen3-TTS for generating speech from text.
|
||||
|
||||
The engine lazily loads the model on first use to avoid slow startup.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model_name: str = DEFAULT_MODEL,
|
||||
voice: str = DEFAULT_VOICE,
|
||||
instruction: str = DEFAULT_INSTRUCTION,
|
||||
output_dir: str | Path = OUTPUT_DIR,
|
||||
):
|
||||
self.model_name = model_name
|
||||
self.voice = voice
|
||||
self.instruction = instruction
|
||||
self.output_dir = Path(output_dir)
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self._model = None
|
||||
self._processor = None
|
||||
self._lock = asyncio.Lock() # prevent concurrent generation
|
||||
|
||||
# ---- lazy model loading ----
|
||||
def _ensure_loaded(self):
|
||||
"""Load model and processor on first call (lazy init)."""
|
||||
if self._model is not None:
|
||||
return
|
||||
|
||||
logger.info("Loading Qwen3-TTS model '%s' (this may take a moment)...", self.model_name)
|
||||
try:
|
||||
from qwen_tts import QwenTTSProcessor, QwenTTSModel
|
||||
|
||||
self._processor = QwenTTSProcessor()
|
||||
self._model = QwenTTSModel.from_pretrained(self.model_name)
|
||||
logger.info("Qwen3-TTS model loaded successfully")
|
||||
except ImportError:
|
||||
raise ImportError(
|
||||
"qwen-tts is not installed. Install it with:\n"
|
||||
" pip install qwen-tts torch soundfile\n"
|
||||
"Also ensure you have CUDA-capable GPU for low-latency inference."
|
||||
)
|
||||
|
||||
# ---- generation ----
|
||||
async def generate(self, text: str, instruction: str | None = None) -> Path | None:
|
||||
"""
|
||||
Generate speech audio from text and save as .wav.
|
||||
|
||||
Args:
|
||||
text: The text to convert to speech.
|
||||
instruction: Optional style instruction override.
|
||||
|
||||
Returns:
|
||||
Path to the generated .wav file, or None on failure.
|
||||
"""
|
||||
if not text or not text.strip():
|
||||
return None
|
||||
|
||||
async with self._lock:
|
||||
return await asyncio.to_thread(
|
||||
self._generate_sync, text.strip(), instruction or self.instruction
|
||||
)
|
||||
|
||||
def _generate_sync(self, text: str, instruction: str) -> Path | None:
|
||||
"""Synchronous generation (runs in thread pool)."""
|
||||
self._ensure_loaded()
|
||||
|
||||
output_path = self.output_dir / f"echo_{os.urandom(4).hex()}.wav"
|
||||
|
||||
try:
|
||||
# Build voice reference: preset name or custom .wav path
|
||||
voice_ref = self.voice
|
||||
if Path(self.voice).exists():
|
||||
voice_ref = str(Path(self.voice).resolve())
|
||||
|
||||
# Generate audio
|
||||
logger.info("Generating speech: '%s' (voice=%s)", text[:60], self.voice)
|
||||
audio_array = self._model.generate(
|
||||
processor=self._processor,
|
||||
text=text,
|
||||
voice=voice_ref,
|
||||
instruction=instruction,
|
||||
)
|
||||
|
||||
# Save to file
|
||||
import soundfile as sf
|
||||
|
||||
sample_rate = self._processor.sampling_rate
|
||||
sf.write(str(output_path), audio_array, sample_rate)
|
||||
logger.info("Audio saved to %s (%.1fs)", output_path, len(audio_array) / sample_rate)
|
||||
return output_path
|
||||
|
||||
except Exception:
|
||||
logger.exception("TTS generation failed for: '%s'", text[:60])
|
||||
return None
|
||||
|
||||
# ---- playback ----
|
||||
async def speak(self, text: str, instruction: str | None = None) -> bool:
|
||||
"""
|
||||
Generate speech from text and play it immediately.
|
||||
|
||||
Returns:
|
||||
True if playback succeeded, False otherwise.
|
||||
"""
|
||||
wav_path = await self.generate(text, instruction)
|
||||
if not wav_path:
|
||||
return False
|
||||
return await self._play(wav_path)
|
||||
|
||||
async def speak_file(self, wav_path: Path) -> bool:
|
||||
"""Play a previously generated .wav file."""
|
||||
return await self._play(wav_path)
|
||||
|
||||
@staticmethod
|
||||
async def _play(wav_path: Path) -> bool:
|
||||
"""Play a .wav file using pygame.mixer (async-friendly)."""
|
||||
try:
|
||||
import pygame
|
||||
|
||||
pygame.mixer.init(frequency=22050, size=-16, channels=1, buffer=2048)
|
||||
pygame.mixer.music.load(str(wav_path))
|
||||
pygame.mixer.music.play()
|
||||
|
||||
# Wait for playback to finish
|
||||
while pygame.mixer.music.get_busy():
|
||||
await asyncio.sleep(0.05)
|
||||
|
||||
pygame.mixer.music.stop()
|
||||
pygame.mixer.quit()
|
||||
logger.info("Playback finished: %s", wav_path.name)
|
||||
return True
|
||||
except Exception:
|
||||
logger.exception("Playback failed for %s", wav_path)
|
||||
return False
|
||||
|
||||
def set_voice(self, voice: str):
|
||||
"""Switch to a different voice preset or custom sample path."""
|
||||
self.voice = voice
|
||||
logger.info("Voice set to: %s", voice)
|
||||
|
||||
def set_instruction(self, instruction: str):
|
||||
"""Update the default style instruction."""
|
||||
self.instruction = instruction
|
||||
logger.info("TTS instruction updated: %s", instruction)
|
||||
Loading…
Reference in New Issue
Block a user