Switch from ZAI SDK to OpenRouter with openrouter/free model
- Replace z-ai-web-dev-sdk with openai SDK
- Add OPENROUTER_API_KEY and OPENROUTER_BASE_URL config
- Update AsyncOpenAI client for OpenRouter
- Update generate_response and stream_chat_completion
- Update .env.example with OpenRouter settings
This commit is contained in:
parent
10e61dd2f1
commit
b23964b35a
18
.env.example
18
.env.example
@ -1,19 +1,21 @@
|
|||||||
# DocRAG Configuration
|
# DocRAG Configuration
|
||||||
# Copy this file to .env and fill in your values
|
# Copy this file to .env and fill in your values
|
||||||
|
|
||||||
# Server Configuration
|
# Server settings
|
||||||
HOST=0.0.0.0
|
HOST=0.0.0.0
|
||||||
PORT=8000
|
PORT=8000
|
||||||
DEBUG=false
|
DEBUG=false
|
||||||
|
|
||||||
# Model Configuration
|
# Model settings
|
||||||
MODEL_NAME=DocRAG-GLM-4.7
|
MODEL_NAME=DocRAG
|
||||||
UPSTREAM_MODEL=glm-4.7
|
UPSTREAM_MODEL=openrouter/free
|
||||||
|
|
||||||
# API Keys
|
# OpenRouter API settings (required for LLM responses)
|
||||||
ZAI_API_KEY=your-zai-api-key-here
|
# Get your API key from https://openrouter.ai/keys
|
||||||
|
OPENROUTER_API_KEY=your_openrouter_api_key_here
|
||||||
|
OPENROUTER_BASE_URL=https://openrouter.ai/api/v1
|
||||||
|
|
||||||
# RAG Configuration
|
# RAG settings
|
||||||
EMBEDDING_MODEL=text-embedding-3-small
|
EMBEDDING_MODEL=text-embedding-3-small
|
||||||
VECTOR_STORE_PATH=./data/vectors
|
VECTOR_STORE_PATH=./data/vectors
|
||||||
DOCUMENTS_PATH=./data/documents
|
DOCUMENTS_PATH=./data/documents
|
||||||
@ -21,6 +23,6 @@ CHUNK_SIZE=1000
|
|||||||
CHUNK_OVERLAP=200
|
CHUNK_OVERLAP=200
|
||||||
TOP_K_RESULTS=5
|
TOP_K_RESULTS=5
|
||||||
|
|
||||||
# Tool Configuration
|
# Tool settings
|
||||||
ENABLE_TOOLS=true
|
ENABLE_TOOLS=true
|
||||||
MAX_TOOL_ITERATIONS=3
|
MAX_TOOL_ITERATIONS=3
|
||||||
|
|||||||
102
main.py
102
main.py
@ -26,6 +26,10 @@ import uuid
|
|||||||
from contextlib import asynccontextmanager
|
from contextlib import asynccontextmanager
|
||||||
from typing import Any, AsyncIterator, Optional
|
from typing import Any, AsyncIterator, Optional
|
||||||
|
|
||||||
|
# Load environment variables from .env file
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
# Configure logging
|
# Configure logging
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
@ -47,12 +51,8 @@ from rag.document_processor import DocumentProcessor
|
|||||||
# Import tools
|
# Import tools
|
||||||
from tools import ToolManager, get_tool_manager
|
from tools import ToolManager, get_tool_manager
|
||||||
|
|
||||||
# Import SDK for GLM-4.7-Flash
|
# Import OpenAI client for OpenRouter
|
||||||
try:
|
from openai import AsyncOpenAI
|
||||||
from zai import ZaiClient as ZAI
|
|
||||||
except ImportError:
|
|
||||||
ZAI = None
|
|
||||||
log.warning("z-ai-web-dev-sdk not installed. Install with: pip install z-ai-web-dev-sdk")
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@ -68,11 +68,12 @@ class Config:
|
|||||||
DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
|
DEBUG: bool = os.getenv("DEBUG", "false").lower() == "true"
|
||||||
|
|
||||||
# Model settings
|
# Model settings
|
||||||
MODEL_NAME: str = os.getenv("MODEL_NAME", "DocRAG-GLM-4.7")
|
MODEL_NAME: str = os.getenv("MODEL_NAME", "DocRAG")
|
||||||
UPSTREAM_MODEL: str = os.getenv("UPSTREAM_MODEL", "glm-4.7")
|
UPSTREAM_MODEL: str = os.getenv("UPSTREAM_MODEL", "openrouter/free")
|
||||||
|
|
||||||
# API Key for upstream LLM
|
# OpenRouter API settings
|
||||||
ZAI_API_KEY: str = os.getenv("ZAI_API_KEY", "")
|
OPENROUTER_API_KEY: str = os.getenv("OPENROUTER_API_KEY", "")
|
||||||
|
OPENROUTER_BASE_URL: str = os.getenv("OPENROUTER_BASE_URL", "https://openrouter.ai/api/v1")
|
||||||
|
|
||||||
# RAG settings
|
# RAG settings
|
||||||
EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
EMBEDDING_MODEL: str = os.getenv("EMBEDDING_MODEL", "text-embedding-3-small")
|
||||||
@ -105,7 +106,7 @@ class ChatMessage(BaseModel):
|
|||||||
|
|
||||||
class ChatCompletionRequest(BaseModel):
|
class ChatCompletionRequest(BaseModel):
|
||||||
"""OpenAI chat completion request format."""
|
"""OpenAI chat completion request format."""
|
||||||
model: str = "DocRAG-GLM-4.7"
|
model: str = "DocRAG"
|
||||||
messages: list[ChatMessage]
|
messages: list[ChatMessage]
|
||||||
temperature: Optional[float] = 0.7
|
temperature: Optional[float] = 0.7
|
||||||
top_p: Optional[float] = 1.0
|
top_p: Optional[float] = 1.0
|
||||||
@ -166,7 +167,7 @@ class AppState:
|
|||||||
"""Global application state."""
|
"""Global application state."""
|
||||||
rag_system: Optional[RAGSystem] = None
|
rag_system: Optional[RAGSystem] = None
|
||||||
tool_manager: Optional[ToolManager] = None
|
tool_manager: Optional[ToolManager] = None
|
||||||
zai_client: Any = None
|
llm_client: Optional[AsyncOpenAI] = None
|
||||||
startup_time: float = time.time()
|
startup_time: float = time.time()
|
||||||
|
|
||||||
|
|
||||||
@ -206,18 +207,21 @@ async def lifespan(app: FastAPI):
|
|||||||
log.warning(f"Tool manager initialization failed: {e}")
|
log.warning(f"Tool manager initialization failed: {e}")
|
||||||
state.tool_manager = None
|
state.tool_manager = None
|
||||||
|
|
||||||
# Initialize ZAI client for upstream LLM
|
# Initialize OpenRouter client for upstream LLM
|
||||||
try:
|
try:
|
||||||
if config.ZAI_API_KEY and ZAI is not None:
|
if config.OPENROUTER_API_KEY:
|
||||||
log.info("Initializing ZAI client...")
|
log.info("Initializing OpenRouter client...")
|
||||||
state.zai_client = ZAI(api_key=config.ZAI_API_KEY)
|
state.llm_client = AsyncOpenAI(
|
||||||
log.info("ZAI client initialized successfully")
|
api_key=config.OPENROUTER_API_KEY,
|
||||||
|
base_url=config.OPENROUTER_BASE_URL,
|
||||||
|
)
|
||||||
|
log.info(f"OpenRouter client initialized successfully (model: {config.UPSTREAM_MODEL})")
|
||||||
else:
|
else:
|
||||||
log.warning("No ZAI_API_KEY provided or SDK not installed - using mock responses")
|
log.warning("No OPENROUTER_API_KEY provided - using mock responses")
|
||||||
state.zai_client = None
|
state.llm_client = None
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Failed to initialize ZAI client: {e}")
|
log.error(f"Failed to initialize OpenRouter client: {e}")
|
||||||
state.zai_client = None
|
state.llm_client = None
|
||||||
|
|
||||||
log.info(f"DocRAG server started on {config.HOST}:{config.PORT}")
|
log.info(f"DocRAG server started on {config.HOST}:{config.PORT}")
|
||||||
|
|
||||||
@ -236,7 +240,7 @@ async def lifespan(app: FastAPI):
|
|||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
title="DocRAG API",
|
title="DocRAG API",
|
||||||
description="OpenAI-compatible RAG server powered by GLM-4.7-Flash",
|
description="OpenAI-compatible RAG server powered by OpenRouter",
|
||||||
version="1.0.0",
|
version="1.0.0",
|
||||||
lifespan=lifespan,
|
lifespan=lifespan,
|
||||||
)
|
)
|
||||||
@ -262,7 +266,7 @@ async def list_models():
|
|||||||
return ModelList(
|
return ModelList(
|
||||||
data=[
|
data=[
|
||||||
ModelInfo(id=config.MODEL_NAME, owned_by="docrag"),
|
ModelInfo(id=config.MODEL_NAME, owned_by="docrag"),
|
||||||
ModelInfo(id="DocRAG-GLM-4.7", owned_by="docrag"),
|
ModelInfo(id="DocRAG", owned_by="docrag"),
|
||||||
]
|
]
|
||||||
)
|
)
|
||||||
|
|
||||||
@ -271,7 +275,7 @@ async def list_models():
|
|||||||
@app.get("/models/{model_id}")
|
@app.get("/models/{model_id}")
|
||||||
async def get_model(model_id: str):
|
async def get_model(model_id: str):
|
||||||
"""Get model information (OpenAI-compatible)."""
|
"""Get model information (OpenAI-compatible)."""
|
||||||
if model_id not in [config.MODEL_NAME, "DocRAG-GLM-4.7"]:
|
if model_id not in [config.MODEL_NAME, "DocRAG"]:
|
||||||
raise HTTPException(status_code=404, detail="Model not found")
|
raise HTTPException(status_code=404, detail="Model not found")
|
||||||
return ModelInfo(id=model_id, owned_by="docrag")
|
return ModelInfo(id=model_id, owned_by="docrag")
|
||||||
|
|
||||||
@ -496,26 +500,18 @@ async def stream_chat_completion(
|
|||||||
created = int(time.time())
|
created = int(time.time())
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if state.zai_client:
|
if state.llm_client:
|
||||||
# Use actual GLM-4.7-Flash
|
# Use OpenRouter with streaming
|
||||||
response = state.zai_client.chat.completions.create(
|
stream = await state.llm_client.chat.completions.create(
|
||||||
model=config.UPSTREAM_MODEL,
|
model=config.UPSTREAM_MODEL,
|
||||||
messages=[{"role": m.role, "content": m.content} for m in enhanced_messages if m.content],
|
messages=[{"role": m.role, "content": m.content} for m in enhanced_messages if m.content],
|
||||||
temperature=request.temperature or 0.7,
|
temperature=request.temperature or 0.7,
|
||||||
max_tokens=request.max_tokens or 4096,
|
max_tokens=request.max_tokens or 4096,
|
||||||
stream=True,
|
stream=True,
|
||||||
thinking={"type": "enabled"},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
for chunk in response:
|
async for chunk in stream:
|
||||||
# Handle reasoning content (thinking)
|
if chunk.choices and chunk.choices[0].delta.content:
|
||||||
if hasattr(chunk.choices[0].delta, 'reasoning_content') and chunk.choices[0].delta.reasoning_content:
|
|
||||||
# Don't expose thinking to user - this is internal RAG processing
|
|
||||||
log.debug(f"Thinking: {chunk.choices[0].delta.reasoning_content[:100]}...")
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Stream actual content
|
|
||||||
if hasattr(chunk.choices[0].delta, 'content') and chunk.choices[0].delta.content:
|
|
||||||
content = chunk.choices[0].delta.content
|
content = chunk.choices[0].delta.content
|
||||||
yield f"data: {json.dumps({
|
yield f"data: {json.dumps({
|
||||||
'id': request_id,
|
'id': request_id,
|
||||||
@ -551,7 +547,7 @@ async def stream_chat_completion(
|
|||||||
mock_response += f"Processed {download_info.get('pages')} pages into {download_info.get('chunks')} chunks.\n\n"
|
mock_response += f"Processed {download_info.get('pages')} pages into {download_info.get('chunks')} chunks.\n\n"
|
||||||
if context:
|
if context:
|
||||||
mock_response += f"Based on my knowledge base, here's what I found:\n\n{context[:1000]}...\n\n"
|
mock_response += f"Based on my knowledge base, here's what I found:\n\n{context[:1000]}...\n\n"
|
||||||
mock_response += "\n\n[Demo mode - configure ZAI_API_KEY for full LLM responses]"
|
mock_response += "\n\n[Demo mode - configure OPENROUTER_API_KEY for full LLM responses]"
|
||||||
|
|
||||||
for char in mock_response:
|
for char in mock_response:
|
||||||
yield f"data: {json.dumps({
|
yield f"data: {json.dumps({
|
||||||
@ -629,31 +625,25 @@ async def generate_response(
|
|||||||
temperature: float = 0.7,
|
temperature: float = 0.7,
|
||||||
max_tokens: int = 4096,
|
max_tokens: int = 4096,
|
||||||
) -> str:
|
) -> str:
|
||||||
"""Generate response using upstream LLM."""
|
"""Generate response using upstream LLM via OpenRouter."""
|
||||||
if state.zai_client:
|
if state.llm_client:
|
||||||
try:
|
try:
|
||||||
response = state.zai_client.chat.completions.create(
|
response = await state.llm_client.chat.completions.create(
|
||||||
model=config.UPSTREAM_MODEL,
|
model=config.UPSTREAM_MODEL,
|
||||||
messages=[{"role": m.role, "content": m.content} for m in messages if m.content],
|
messages=[{"role": m.role, "content": m.content} for m in messages if m.content],
|
||||||
temperature=temperature,
|
temperature=temperature,
|
||||||
max_tokens=max_tokens,
|
max_tokens=max_tokens,
|
||||||
stream=False,
|
|
||||||
thinking={"type": "enabled"},
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract content from response
|
# Extract content from response
|
||||||
content = ""
|
if response.choices:
|
||||||
for chunk in response:
|
message_content = response.choices[0].message.content
|
||||||
if hasattr(chunk.choices[0], 'message') and chunk.choices[0].message:
|
return message_content or "I apologize, but I couldn't generate a response."
|
||||||
content = chunk.choices[0].message.content or ""
|
|
||||||
break
|
return "I apologize, but I couldn't generate a response."
|
||||||
if hasattr(chunk.choices[0].delta, 'content') and chunk.choices[0].delta.content:
|
|
||||||
content += chunk.choices[0].delta.content
|
|
||||||
|
|
||||||
return content or "I apologize, but I couldn't generate a response."
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
log.error(f"Upstream LLM call failed: {e}")
|
log.error(f"OpenRouter LLM call failed: {e}")
|
||||||
return f"I encountered an error: {str(e)}"
|
return f"I encountered an error: {str(e)}"
|
||||||
|
|
||||||
else:
|
else:
|
||||||
@ -663,7 +653,7 @@ async def generate_response(
|
|||||||
if msg.role == "user" and msg.content:
|
if msg.role == "user" and msg.content:
|
||||||
user_msg = msg.content
|
user_msg = msg.content
|
||||||
break
|
break
|
||||||
return f"Demo mode response. Your question: {user_msg[:100]}... Configure ZAI_API_KEY for full functionality."
|
return f"Demo mode response. Your question: {user_msg[:100]}... Configure OPENROUTER_API_KEY for full functionality."
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@ -880,7 +870,7 @@ async def health_check():
|
|||||||
"uptime": time.time() - state.startup_time,
|
"uptime": time.time() - state.startup_time,
|
||||||
"rag_enabled": state.rag_system is not None,
|
"rag_enabled": state.rag_system is not None,
|
||||||
"tools_enabled": state.tool_manager is not None,
|
"tools_enabled": state.tool_manager is not None,
|
||||||
"llm_connected": state.zai_client is not None,
|
"llm_connected": state.llm_client is not None,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
@ -890,7 +880,7 @@ async def root():
|
|||||||
return {
|
return {
|
||||||
"name": "DocRAG API",
|
"name": "DocRAG API",
|
||||||
"version": "1.0.0",
|
"version": "1.0.0",
|
||||||
"description": "OpenAI-compatible RAG server powered by GLM-4.7-Flash. Auto-downloads and analyzes websites when users ask about them.",
|
"description": "OpenAI-compatible RAG server powered by OpenRouter. Auto-downloads and analyzes websites when users ask about them.",
|
||||||
"endpoints": {
|
"endpoints": {
|
||||||
"chat": "/v1/chat/completions",
|
"chat": "/v1/chat/completions",
|
||||||
"models": "/v1/models",
|
"models": "/v1/models",
|
||||||
|
|||||||
@ -3,6 +3,7 @@ fastapi~=0.115.0
|
|||||||
uvicorn[standard]~=0.32.0
|
uvicorn[standard]~=0.32.0
|
||||||
pydantic~=2.10.0
|
pydantic~=2.10.0
|
||||||
python-multipart~=0.0.20
|
python-multipart~=0.0.20
|
||||||
|
python-dotenv~=1.0.0
|
||||||
|
|
||||||
# HTTP and async
|
# HTTP and async
|
||||||
aiohttp~=3.11.0
|
aiohttp~=3.11.0
|
||||||
@ -18,9 +19,8 @@ urllib3~=2.5.0
|
|||||||
PyMuPDF~=1.25.0
|
PyMuPDF~=1.25.0
|
||||||
python-docx~=1.1.0
|
python-docx~=1.1.0
|
||||||
|
|
||||||
# Optional: For using z-ai-web-dev-sdk with GLM-4.7-Flash
|
# LLM API client (for OpenRouter)
|
||||||
# Uncomment the following line if you have access to the SDK
|
openai~=1.0.0
|
||||||
# z-ai-web-dev-sdk>=1.0.0
|
|
||||||
|
|
||||||
# Vector store alternatives (uncomment as needed)
|
# Vector store alternatives (uncomment as needed)
|
||||||
# chromadb~=0.5.0
|
# chromadb~=0.5.0
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user