# moxieTalking/server.py

"""
server.py — Echo Voice Assistant Web Server (FastAPI + WebSocket)

Starts a web server on port 8001 that serves:
  - Web UI (static files from web/)
  - WebSocket endpoint for streaming chat
  - REST API for health, audio, and settings

Usage:
  python server.py
  # Then open http://localhost:8001 in your browser

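Environment Variables (see .env.example):
  OPENROUTER_API_KEY  — required for LLM responses
  OPENROUTER_MODEL    — LLM model ID (default: qwen/qwen-3-235b-a22b)
  QWEN_TTS_MODEL      — TTS model name (default: Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice)
  QWEN_TTS_VOICE      — path to the voice sample (default: voices/echo_voice.wav)
  QWEN_TTS_INSTRUCT   — speaking-style instruction passed to the TTS engine
  SERVER_PORT         — port to run on (default: 8001)
  SERVER_HOST         — host to bind to (default: 0.0.0.0)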

Dependencies:
  pip install fastapi uvicorn python-multipart websockets python-dotenv
"""

import asyncio
import json
import logging
import os
import uuid
from pathlib import Path

from dotenv import load_dotenv
from fastapi import FastAPI, WebSocket, WebSocketDisconnect
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, HTMLResponse
from fastapi.staticfiles import StaticFiles

from brain import Brain
from tts import TTSEngine
from actions import execute as execute_action

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Column-aligned record layout: time │ logger name │ level │ message.
_LOG_FORMAT = "%(asctime)s │ %(name)-18s │ %(levelname)-7s │ %(message)s"
_LOG_TIME_FORMAT = "%H:%M:%S"

logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT, datefmt=_LOG_TIME_FORMAT)

# Module-wide logger used by every handler in this file.
logger = logging.getLogger("echo.server")

# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------
# Directory containing this file; .env and web/ are looked up next to it.
_BASE_DIR = Path(__file__).parent

# Load .env first so the settings read below can come from it.
load_dotenv(_BASE_DIR / ".env")

SERVER_HOST = os.getenv("SERVER_HOST", "0.0.0.0")
SERVER_PORT = int(os.getenv("SERVER_PORT", "8001"))
WEB_DIR = _BASE_DIR / "web"

# ---------------------------------------------------------------------------
# FastAPI app
# ---------------------------------------------------------------------------
app = FastAPI(title="Echo Voice Assistant", version="1.0.0")

# Allowed CORS origins: comma-separated list in CORS_ALLOW_ORIGINS.
# Unset or empty keeps the previous "allow every origin" behaviour.
_cors_origins = [
    origin.strip()
    for origin in os.environ.get("CORS_ALLOW_ORIGINS", "*").split(",")
    if origin.strip()
] or ["*"]

app.add_middleware(
    CORSMiddleware,
    allow_origins=_cors_origins,
    # A wildcard origin must not be combined with credentials (cookies,
    # Authorization headers): that would let any website make authenticated
    # requests. Credentials are only enabled for an explicit origin list.
    allow_credentials="*" not in _cors_origins,
    allow_methods=["*"],
    allow_headers=["*"],
)

# ---------------------------------------------------------------------------
# Initialize engines
# ---------------------------------------------------------------------------
# Response generator; the API key and model ID are taken from the environment.
brain = Brain(
    model=os.getenv("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
    api_key=os.getenv("OPENROUTER_API_KEY"),
)

# Speech synthesiser; model, voice sample and speaking instruction are all
# overridable through QWEN_TTS_* environment variables.
tts = TTSEngine(
    instruction=os.getenv(
        "QWEN_TTS_INSTRUCT",
        "Speak clearly with a warm, friendly tone. Be natural and conversational.",
    ),
    voice_sample=os.getenv("QWEN_TTS_VOICE", "voices/echo_voice.wav"),
    model_name=os.getenv("QWEN_TTS_MODEL", "Qwen/Qwen3-TTS-12Hz-1.7B-CustomVoice"),
)

# ---------------------------------------------------------------------------
# Per-session state
# ---------------------------------------------------------------------------
# Live session state, keyed by the short per-connection ID.
sessions: dict[str, dict] = {}


def get_session(ws_id: str) -> dict:
    """Return the state dict for *ws_id*, creating it on first use.

    A new session starts as ``{"id": ws_id, "history": []}``; later calls
    with the same ID return the very same dict object.
    """
    try:
        return sessions[ws_id]
    except KeyError:
        state = sessions[ws_id] = {"id": ws_id, "history": []}
        return state


# ---------------------------------------------------------------------------
# Routes — Web UI
# ---------------------------------------------------------------------------
@app.get("/")
async def serve_index():
    """Serve web/index.html, or a placeholder page when it is missing."""
    index = WEB_DIR / "index.html"
    if index.is_file():
        return FileResponse(index)
    return HTMLResponse("<h1>Echo Web UI not found — place files in web/ directory</h1>")


def _resolve_inside(root: Path, relative: str) -> "Path | None":
    """Return the regular file at ``root / relative``, or None.

    None is returned when the file does not exist, is not a regular file,
    or resolves to a location outside *root* (``..`` segments, an absolute
    component such as ``/etc/passwd``, or a symlink pointing elsewhere).
    """
    try:
        base = root.resolve()
        candidate = (base / relative).resolve()
    except (OSError, RuntimeError, ValueError):
        # Embedded NUL byte, symlink loop, etc. — treat as "not found".
        return None
    if candidate.is_relative_to(base) and candidate.is_file():
        return candidate
    return None


# ---------------------------------------------------------------------------
# Routes — API
# ---------------------------------------------------------------------------
# NOTE: these must be registered BEFORE the catch-all static route below.
# Routes are matched in registration order, so an earlier "/{path:path}"
# would answer every /api/... request itself.
@app.get("/api/health")
async def health():
    """Report server status, voice-sample presence and the configured model.

    "voice_sample" only reflects whether the configured file exists on disk.
    """
    voice_ok = Path(os.environ.get("QWEN_TTS_VOICE", "voices/echo_voice.wav")).exists()
    return {
        "status": "ok",
        "voice_sample": "loaded" if voice_ok else "missing",
        "model": os.environ.get("OPENROUTER_MODEL", "qwen/qwen-3-235b-a22b"),
    }


@app.get("/api/audio/{filename}")
async def get_audio(filename: str):
    """Serve a generated audio file from audio_output/ (404 if absent)."""
    audio_path = _resolve_inside(Path("audio_output"), filename)
    if audio_path is not None:
        return FileResponse(audio_path, media_type="audio/wav")
    return HTMLResponse("Not found", status_code=404)


# ---------------------------------------------------------------------------
# Routes — static files (catch-all; keep this registered last)
# ---------------------------------------------------------------------------
@app.get("/{path:path}")
async def serve_static(path: str):
    """Serve a file from web/; anything missing or outside web/ is a 404."""
    file_path = _resolve_inside(WEB_DIR, path)
    if file_path is not None:
        return FileResponse(file_path)
    return HTMLResponse("Not found", status_code=404)


# ---------------------------------------------------------------------------
# WebSocket — Streaming Chat
# ---------------------------------------------------------------------------
async def _handle_chat(ws: WebSocket, session: dict, message: str) -> None:
    """Run one chat turn: stream the reply, run any command, send audio.

    Messages sent to the client, in order: zero or more "token", then on the
    brain's "done" event: "done", optionally "action", optionally "audio",
    and finally "ready". Any failure other than a client disconnect is
    reported as "error" followed by "ready".
    """
    session["history"].append({"role": "user", "content": message})

    token_parts = []
    pending_command = None

    try:
        async for event in brain.think(message):
            kind = event["type"]

            if kind == "token":
                token_parts.append(event["text"])
                await ws.send_json({"type": "token", "text": event["text"]})

            elif kind == "command":
                # Held back until the reply is complete.
                pending_command = event["command"]

            elif kind == "done":
                spoken = event["text"]

                await ws.send_json({
                    "type": "done",
                    "text": spoken,
                    "full_text": "".join(token_parts),
                })

                session["history"].append({"role": "assistant", "content": spoken})
                # Keep last 20 messages
                if len(session["history"]) > 20:
                    session["history"] = session["history"][-20:]

                if pending_command:
                    action_name = pending_command.get("action", "")
                    action_params = pending_command.get("params", {})
                    logger.info("Executing action: %s %s", action_name, action_params)
                    # NOTE(review): called synchronously; if actions can be
                    # slow this stalls the event loop — confirm.
                    action_result = execute_action(action_name, action_params)
                    await ws.send_json({
                        "type": "action",
                        "action": action_name,
                        "result": action_result,
                    })

                # TTS is best-effort: a synthesis failure must not fail the
                # turn. Only the synthesis is guarded; the socket send stays
                # outside so a disconnect is not mislabelled as a TTS error.
                audio_url = None
                try:
                    wav_path = await tts.generate(spoken)
                    if wav_path:
                        audio_url = f"/api/audio/{wav_path.name}"
                except Exception as exc:
                    logger.warning("TTS generation skipped: %s", exc)
                if audio_url:
                    await ws.send_json({"type": "audio", "url": audio_url})

                await ws.send_json({"type": "ready"})

    except WebSocketDisconnect:
        # The client is gone: nothing can be sent, let the caller clean up.
        raise
    except Exception as exc:
        logger.exception("Error processing chat")
        await ws.send_json({"type": "error", "text": f"Error: {exc}"})
        await ws.send_json({"type": "ready"})


@app.websocket("/ws/chat")
async def websocket_chat(ws: WebSocket):
    """Serve one chat client over a WebSocket until it disconnects.

    Each incoming frame is a JSON object ``{"type": ..., "payload": {...}}``.
    "chat" (the default type) runs a chat turn on ``payload["message"]``;
    "clear" empties the session history and replies with "cleared"; other
    types are ignored. Malformed frames get an "error" reply and the
    connection stays open. The session is discarded when the socket closes.
    """
    await ws.accept()
    ws_id = str(uuid.uuid4())[:8]
    session = get_session(ws_id)
    logger.info("Client connected: session=%s", ws_id)

    try:
        while True:
            raw = await ws.receive_text()

            # Bad client input must not tear down the connection.
            try:
                data = json.loads(raw)
            except json.JSONDecodeError:
                await ws.send_json({"type": "error", "text": "Invalid JSON"})
                continue
            if not isinstance(data, dict):
                await ws.send_json({
                    "type": "error",
                    "text": "Message must be a JSON object",
                })
                continue

            msg_type = data.get("type", "chat")
            payload = data.get("payload", {})

            if msg_type == "chat":
                message = payload.get("message", "") if isinstance(payload, dict) else ""
                message = message.strip() if isinstance(message, str) else ""
                if not message:
                    await ws.send_json({"type": "error", "text": "Empty message"})
                    continue
                await _handle_chat(ws, session, message)

            elif msg_type == "clear":
                session["history"] = []
                await ws.send_json({"type": "cleared"})

    except WebSocketDisconnect:
        logger.info("Client disconnected: session=%s", ws_id)
    except Exception:
        logger.exception("WebSocket error for session=%s", ws_id)
    finally:
        sessions.pop(ws_id, None)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    # Imported here so importing this module does not require uvicorn.
    import uvicorn

    # Startup banner (uvicorn's own output is limited to warnings below).
    banner_rule = "=" * 50
    logger.info(banner_rule)
    logger.info("  ECHO VOICE ASSISTANT — Web Server")
    logger.info("  http://%s:%d", SERVER_HOST, SERVER_PORT)
    logger.info(banner_rule)

    uvicorn.run(app, host=SERVER_HOST, port=SERVER_PORT, log_level="warning")