Switch from ZAI SDK to OpenRouter with openrouter/free model

- Replace z-ai-web-dev-sdk with openai SDK
- Add OPENROUTER_API_KEY and OPENROUTER_BASE_URL config
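- Initialize an AsyncOpenAI client configured with the OpenRouter API key and base URL
- Update generate_response and stream_chat_completion to await the async client, and drop the GLM-specific `thinking` parameter and reasoning_content handling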
- Update .env.example with OpenRouter settings
- Load environment variables from .env at startup via python-dotenv (added to the dependency list)
This commit is contained in:
Z User 2026-03-29 04:35:54 +00:00
parent 10e61dd2f1
commit b23964b35a
3 changed files with 59 additions and 67 deletions

View File

@ -1,19 +1,21 @@
# DocRAG Configuration # DocRAG Configuration
# Copy this file to .env and fill in your values # Copy this file to .env and fill in your values
# Server Configuration # Server settings
HOST=0.0.0.0 HOST=0.0.0.0
PORT=8000 PORT=8000
DEBUG=false DEBUG=false
# Model Configuration # Model settings
MODEL_NAME=DocRAG-GLM-4.7 MODEL_NAME=DocRAG
UPSTREAM_MODEL=glm-4.7 UPSTREAM_MODEL=openrouter/free
# API Keys # OpenRouter API settings (required for LLM responses)
ZAI_API_KEY=your-zai-api-key-here # Get your API key from https://openrouter.ai/keys
OPENROUTER_API_KEY=your_openrouter_api_key_here
OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
# RAG Configuration # RAG settings
EMBEDDING_MODEL=text-embedding-3-small EMBEDDING_MODEL=text-embedding-3-small
VECTOR_STORE_PATH=./data/vectors VECTOR_STORE_PATH=./data/vectors
DOCUMENTS_PATH=./data/documents DOCUMENTS_PATH=./data/documents
@ -21,6 +23,6 @@ CHUNK_SIZE=1000
CHUNK_OVERLAP=200 CHUNK_OVERLAP=200
TOP_K_RESULTS=5 TOP_K_RESULTS=5
# Tool Configuration # Tool settings
ENABLE_TOOLS=true ENABLE_TOOLS=true
MAX_TOOL_ITERATIONS=3 MAX_TOOL_ITERATIONS=3

102
main.py
View File

@ -26,6 +26,10 @@ import uuid
from contextlib import asynccontextmanager from contextlib import asynccontextmanager
from typing import Any, AsyncIterator, Optional from typing import Any, AsyncIterator, Optional
# Load environment variables from .env file
from dotenv import load_dotenv
load_dotenv()
# Configure logging # Configure logging
logging.basicConfig( logging.basicConfig(
level=logging.INFO, level=logging.INFO,
@ -47,12 +51,8 @@ from rag.document_processor import DocumentProcessor
# Import tools # Import tools
from tools import ToolManager, get_tool_manager from tools import ToolManager, get_tool_manager
# Import SDK for GLM-4.7-Flash # Import OpenAI client for OpenRouter
try: from openai import AsyncOpenAI
from zai import ZaiClient as ZAI
except ImportError:
ZAI = None
log.warning("z-ai-web-dev-sdk not installed. Install with: pip install z-ai-web-dev-sdk")
# ============================================================================= # =============================================================================
@ -68,11 +68,12 @@ class Config:
DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true" DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
# Model settings # Model settings
MODEL_NAME: str = os.getenv("MODEL_NAME", "DocRAG-GLM-4.7") MODEL_NAME: str = os.getenv("MODEL_NAME", "DocRAG")
UPSTREAM_MODEL: str = os.getenv("UPSTREAM_MODEL", "glm-4.7") UPSTREAM_MODEL: str = os.getenv("UPSTREAM_MODEL", "openrouter/free")
# API Key for upstream LLM # OpenRouter API settings
ZAI_API_KEY: str = os.getenv("ZAI_API_KEY", "") OPENROUTER_API_KEY: str = os.getenv("OPENROUTER_API_KEY", "")
OPENROUTER_BASE_URL: str = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
# RAG settings # RAG settings
EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small") EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
@ -105,7 +106,7 @@ class ChatMessage(BaseModel):
class ChatCompletionRequest(BaseModel): class ChatCompletionRequest(BaseModel):
"""OpenAI chat completion request format.""" """OpenAI chat completion request format."""
model: str = "DocRAG-GLM-4.7" model: str = "DocRAG"
messages: list[ChatMessage] messages: list[ChatMessage]
temperature: Optional[float] = 0.7 temperature: Optional[float] = 0.7
top_p: Optional[float] = 1.0 top_p: Optional[float] = 1.0
@ -166,7 +167,7 @@ class AppState:
"""Global application state.""" """Global application state."""
rag_system: Optional[RAGSystem] = None rag_system: Optional[RAGSystem] = None
tool_manager: Optional[ToolManager] = None tool_manager: Optional[ToolManager] = None
zai_client: Any = None llm_client: Optional[AsyncOpenAI] = None
startup_time: float = time.time() startup_time: float = time.time()
@ -206,18 +207,21 @@ async def lifespan(app: FastAPI):
log.warning(f"Tool manager initialization failed: {e}") log.warning(f"Tool manager initialization failed: {e}")
state.tool_manager = None state.tool_manager = None
# Initialize ZAI client for upstream LLM # Initialize OpenRouter client for upstream LLM
try: try:
if config.ZAI_API_KEY and ZAI is not None: if config.OPENROUTER_API_KEY:
log.info("Initializing ZAI client...") log.info("Initializing OpenRouter client...")
state.zai_client = ZAI(api_key=config.ZAI_API_KEY) state.llm_client = AsyncOpenAI(
log.info("ZAI client initialized successfully") api_key=config.OPENROUTER_API_KEY,
base_url=config.OPENROUTER_BASE_URL,
)
log.info(f"OpenRouter client initialized successfully (model: {config.UPSTREAM_MODEL})")
else: else:
log.warning("No ZAI_API_KEY provided or SDK not installed - using mock responses") log.warning("No OPENROUTER_API_KEY provided - using mock responses")
state.zai_client = None state.llm_client = None
except Exception as e: except Exception as e:
log.error(f"Failed to initialize ZAI client: {e}") log.error(f"Failed to initialize OpenRouter client: {e}")
state.zai_client = None state.llm_client = None
log.info(f"DocRAG server started on {config.HOST}:{config.PORT}") log.info(f"DocRAG server started on {config.HOST}:{config.PORT}")
@ -236,7 +240,7 @@ async def lifespan(app: FastAPI):
app = FastAPI( app = FastAPI(
title="DocRAG API", title="DocRAG API",
description="OpenAI-compatible RAG server powered by GLM-4.7-Flash", description="OpenAI-compatible RAG server powered by OpenRouter",
version="1.0.0", version="1.0.0",
lifespan=lifespan, lifespan=lifespan,
) )
@ -262,7 +266,7 @@ async def list_models():
return ModelList( return ModelList(
data=[ data=[
ModelInfo(id=config.MODEL_NAME, owned_by="docrag"), ModelInfo(id=config.MODEL_NAME, owned_by="docrag"),
ModelInfo(id="DocRAG-GLM-4.7", owned_by="docrag"), ModelInfo(id="DocRAG", owned_by="docrag"),
] ]
) )
@ -271,7 +275,7 @@ async def list_models():
@app.get("/models/{model_id}") @app.get("/models/{model_id}")
async def get_model(model_id: str): async def get_model(model_id: str):
"""Get model information (OpenAI-compatible).""" """Get model information (OpenAI-compatible)."""
if model_id not in [config.MODEL_NAME, "DocRAG-GLM-4.7"]: if model_id not in [config.MODEL_NAME, "DocRAG"]:
raise HTTPException(status_code=404, detail="Model not found") raise HTTPException(status_code=404, detail="Model not found")
return ModelInfo(id=model_id, owned_by="docrag") return ModelInfo(id=model_id, owned_by="docrag")
@ -496,26 +500,18 @@ async def stream_chat_completion(
created = int(time.time()) created = int(time.time())
try: try:
if state.zai_client: if state.llm_client:
# Use actual GLM-4.7-Flash # Use OpenRouter with streaming
response = state.zai_client.chat.completions.create( stream = await state.llm_client.chat.completions.create(
model=config.UPSTREAM_MODEL, model=config.UPSTREAM_MODEL,
messages=[{"role": m.role, "content": m.content} for m in enhanced_messages if m.content], messages=[{"role": m.role, "content": m.content} for m in enhanced_messages if m.content],
temperature=request.temperature or 0.7, temperature=request.temperature or 0.7,
max_tokens=request.max_tokens or 4096, max_tokens=request.max_tokens or 4096,
stream=True, stream=True,
thinking={"type": "enabled"},
) )
for chunk in response: async for chunk in stream:
# Handle reasoning content (thinking) if chunk.choices and chunk.choices[0].delta.content:
if hasattr(chunk.choices[0].delta, 'reasoning_content') and chunk.choices[0].delta.reasoning_content:
# Don't expose thinking to user - this is internal RAG processing
log.debug(f"Thinking: {chunk.choices[0].delta.reasoning_content[:100]}...")
continue
# Stream actual content
if hasattr(chunk.choices[0].delta, 'content') and chunk.choices[0].delta.content:
content = chunk.choices[0].delta.content content = chunk.choices[0].delta.content
yield f"data: {json.dumps({ yield f"data: {json.dumps({
'id': request_id, 'id': request_id,
@ -551,7 +547,7 @@ async def stream_chat_completion(
mock_response += f"Processed {download_info.get('pages')} pages into {download_info.get('chunks')} chunks.\n\n" mock_response += f"Processed {download_info.get('pages')} pages into {download_info.get('chunks')} chunks.\n\n"
if context: if context:
mock_response += f"Based on my knowledge base, here's what I found:\n\n{context[:1000]}...\n\n" mock_response += f"Based on my knowledge base, here's what I found:\n\n{context[:1000]}...\n\n"
mock_response += "\n\n[Demo mode - configure ZAI_API_KEY for full LLM responses]" mock_response += "\n\n[Demo mode - configure OPENROUTER_API_KEY for full LLM responses]"
for char in mock_response: for char in mock_response:
yield f"data: {json.dumps({ yield f"data: {json.dumps({
@ -629,31 +625,25 @@ async def generate_response(
temperature: float = 0.7, temperature: float = 0.7,
max_tokens: int = 4096, max_tokens: int = 4096,
) -> str: ) -> str:
"""Generate response using upstream LLM.""" """Generate response using upstream LLM via OpenRouter."""
if state.zai_client: if state.llm_client:
try: try:
response = state.zai_client.chat.completions.create( response = await state.llm_client.chat.completions.create(
model=config.UPSTREAM_MODEL, model=config.UPSTREAM_MODEL,
messages=[{"role": m.role, "content": m.content} for m in messages if m.content], messages=[{"role": m.role, "content": m.content} for m in messages if m.content],
temperature=temperature, temperature=temperature,
max_tokens=max_tokens, max_tokens=max_tokens,
stream=False,
thinking={"type": "enabled"},
) )
# Extract content from response # Extract content from response
content = "" if response.choices:
for chunk in response: message_content = response.choices[0].message.content
if hasattr(chunk.choices[0], 'message') and chunk.choices[0].message: return message_content or "I apologize, but I couldn't generate a response."
content = chunk.choices[0].message.content or ""
break return "I apologize, but I couldn't generate a response."
if hasattr(chunk.choices[0].delta, 'content') and chunk.choices[0].delta.content:
content += chunk.choices[0].delta.content
return content or "I apologize, but I couldn't generate a response."
except Exception as e: except Exception as e:
log.error(f"Upstream LLM call failed: {e}") log.error(f"OpenRouter LLM call failed: {e}")
return f"I encountered an error: {str(e)}" return f"I encountered an error: {str(e)}"
else: else:
@ -663,7 +653,7 @@ async def generate_response(
if msg.role == "user" and msg.content: if msg.role == "user" and msg.content:
user_msg = msg.content user_msg = msg.content
break break
return f"Demo mode response. Your question: {user_msg[:100]}... Configure ZAI_API_KEY for full functionality." return f"Demo mode response. Your question: {user_msg[:100]}... Configure OPENROUTER_API_KEY for full functionality."
# ============================================================================= # =============================================================================
@ -880,7 +870,7 @@ async def health_check():
"uptime": time.time() - state.startup_time, "uptime": time.time() - state.startup_time,
"rag_enabled": state.rag_system is not None, "rag_enabled": state.rag_system is not None,
"tools_enabled": state.tool_manager is not None, "tools_enabled": state.tool_manager is not None,
"llm_connected": state.zai_client is not None, "llm_connected": state.llm_client is not None,
} }
@ -890,7 +880,7 @@ async def root():
return { return {
"name": "DocRAG API", "name": "DocRAG API",
"version": "1.0.0", "version": "1.0.0",
"description": "OpenAI-compatible RAG server powered by GLM-4.7-Flash. Auto-downloads and analyzes websites when users ask about them.", "description": "OpenAI-compatible RAG server powered by OpenRouter. Auto-downloads and analyzes websites when users ask about them.",
"endpoints": { "endpoints": {
"chat": "/v1/chat/completions", "chat": "/v1/chat/completions",
"models": "/v1/models", "models": "/v1/models",

View File

@ -3,6 +3,7 @@ fastapi~=0.115.0
uvicorn[standard]~=0.32.0 uvicorn[standard]~=0.32.0
pydantic~=2.10.0 pydantic~=2.10.0
python-multipart~=0.0.20 python-multipart~=0.0.20
python-dotenv~=1.0.0
# HTTP and async # HTTP and async
aiohttp~=3.11.0 aiohttp~=3.11.0
@ -18,9 +19,8 @@ urllib3~=2.5.0
PyMuPDF~=1.25.0 PyMuPDF~=1.25.0
python-docx~=1.1.0 python-docx~=1.1.0
# Optional: For using z-ai-web-dev-sdk with GLM-4.7-Flash # LLM API client (for OpenRouter)
# Uncomment the following line if you have access to the SDK openai~=1.0.0
# z-ai-web-dev-sdk>=1.0.0
# Vector store alternatives (uncomment as needed) # Vector store alternatives (uncomment as needed)
# chromadb~=0.5.0 # chromadb~=0.5.0