Implement tool calling loop for LLM

- Pass all registered tools to the LLM during chat completion
- Handle tool_calls from the LLM response
- Execute tools and feed results back to the LLM
- Loop until the LLM returns a final response
- Updated system prompt to encourage tool use
- Updated streaming to handle tool calls
- Increased default MAX_TOOL_ITERATIONS from 3 to 5
2026-03-29 16:07:56 +00:00 · 2026-03-29 16:07:56 +00:00 · b811162f78
commit b811162f78
parent 4394e7d6f9
19 changed files with 153 additions and 51 deletions
--- a/.gitignore
+++ b/.gitignore
--- a/README.md
+++ b/README.md
--- a/main.py
+++ b/main.py
@@ -91,7 +91,7 @@ class Config:

    # Tool settings
    ENABLE_TOOLS: bool = os.getenv("ENABLE_TOOLS", "true").lower() == "true"
-    MAX_TOOL_ITERATIONS: int = int(os.getenv("MAX_TOOL_ITERATIONS", "3"))
+    MAX_TOOL_ITERATIONS: int = int(os.getenv("MAX_TOOL_ITERATIONS", "5"))


 config = Config()
@@ -526,29 +526,51 @@ async def stream_chat_completion(

    try:
        if state.llm_client:
-            # Use OpenRouter with streaming
-            stream = await state.llm_client.chat.completions.create(
-                model=config.UPSTREAM_MODEL,
-                messages=[{"role": m.role, "content": m.content} for m in enhanced_messages if m.content],
-                temperature=request.temperature or 0.7,
-                max_tokens=request.max_tokens or 4096,
-                stream=True,
-            )
+            # For streaming with tools, we need to handle tool calls first
+            # Then stream the final response
+            if state.tool_manager and config.ENABLE_TOOLS:
+                # Use non-streaming for tool calls, then stream the result
+                response_content = await generate_response(
+                    enhanced_messages,
+                    temperature=request.temperature or 0.7,
+                    max_tokens=request.max_tokens or 4096,
+                )
+                # Stream the final response as a single chunk
+                yield f"data: {json.dumps({
+                    'id': request_id,
+                    'object': 'chat.completion.chunk',
+                    'created': created,
+                    'model': config.MODEL_NAME,
+                    'choices': [{
+                        'index': 0,
+                        'delta': {'content': response_content},
+                        'finish_reason': None
+                    }]
+                })}\n\n"
+            else:
+                # No tools - use regular streaming
+                stream = await state.llm_client.chat.completions.create(
+                    model=config.UPSTREAM_MODEL,
+                    messages=[{"role": m.role, "content": m.content} for m in enhanced_messages if m.content],
+                    temperature=request.temperature or 0.7,
+                    max_tokens=request.max_tokens or 4096,
+                    stream=True,
+                )

-            async for chunk in stream:
-                if chunk.choices and chunk.choices[0].delta.content:
-                    content = chunk.choices[0].delta.content
-                    yield f"data: {json.dumps({
-                        'id': request_id,
-                        'object': 'chat.completion.chunk',
-                        'created': created,
-                        'model': config.MODEL_NAME,
-                        'choices': [{
-                            'index': 0,
-                            'delta': {'content': content},
-                            'finish_reason': None
-                        }]
-                    })}\n\n"
+                async for chunk in stream:
+                    if chunk.choices and chunk.choices[0].delta.content:
+                        content = chunk.choices[0].delta.content
+                        yield f"data: {json.dumps({
+                            'id': request_id,
+                            'object': 'chat.completion.chunk',
+                            'created': created,
+                            'model': config.MODEL_NAME,
+                            'choices': [{
+                                'index': 0,
+                                'delta': {'content': content},
+                                'finish_reason': None
+                            }]
+                        })}\n\n"

            # Send final chunk
            yield f"data: {json.dumps({
@@ -615,13 +637,20 @@ def build_enhanced_messages(
    """Build enhanced messages with RAG context."""
    enhanced = []

-    # Add system message with RAG context
+    # Add system message with RAG context and tool instructions
    system_content = (
-        "You are a helpful AI assistant with the ability to access and analyze websites on-demand. "
-        "When a user asks about a website, you can download and analyze its content directly. "
-        "Use the provided context from the knowledge base to give accurate and helpful responses. "
-        "If context from a website is provided, use it to answer the user's question directly with specific information. "
-        "Be helpful, detailed, and provide the specific information the user is asking for (headlines, summaries, etc.)."
+        "You are a helpful AI assistant with access to real-time data through various tools. "
+        "You MUST use these tools to get current information when the user asks about:\n"
+        "- Stocks, crypto, or financial data → use finance_get_stock_info, finance_get_crypto_price, etc.\n"
+        "- Weather → use weather_get_current, weather_get_forecast\n"
+        "- News → use news_search_hackernews, news_get_reddit, news_aggregate\n"
+        "- Medical/health topics → use medical_search_pubmed, medical_search_fda\n"
+        "- Scientific papers → use science_search_arxiv, science_search_semantic_scholar\n"
+        "- General web search → use web_search, web_search_and_fetch\n"
+        "- Wikipedia → use wikipedia_search, wikipedia_get_article\n\n"
+        "IMPORTANT: Always use tools to get CURRENT data. Do not say you cannot access real-time data. "
+        "When asked about stock prices, crypto prices, weather, or news, you MUST call the appropriate tool. "
+        "Be concise and factual. Report the exact data returned by tools."
    )

    if download_info and download_info.get("downloaded"):
@@ -650,28 +679,8 @@ async def generate_response(
    temperature: float = 0.7,
    max_tokens: int = 4096,
 ) -> str:
-    """Generate response using upstream LLM via OpenRouter."""
-    if state.llm_client:
-        try:
-            response = await state.llm_client.chat.completions.create(
-                model=config.UPSTREAM_MODEL,
-                messages=[{"role": m.role, "content": m.content} for m in messages if m.content],
-                temperature=temperature,
-                max_tokens=max_tokens,
-            )
-
-            # Extract content from response
-            if response.choices:
-                message_content = response.choices[0].message.content
-                return message_content or "I apologize, but I couldn't generate a response."
-            
-            return "I apologize, but I couldn't generate a response."
-
-        except Exception as e:
-            log.error(f"OpenRouter LLM call failed: {e}")
-            return f"I encountered an error: {str(e)}"
-
-    else:
+    """Generate response using upstream LLM via OpenRouter with tool calling support."""
+    if not state.llm_client:
        # Mock response for testing
        user_msg = ""
        for msg in reversed(messages):
@@ -680,6 +689,99 @@ async def generate_response(
                break
        return f"Demo mode response. Your question: {user_msg[:100]}... Configure OPENROUTER_API_KEY for full functionality."

+    try:
+        # Convert messages to dict format
+        messages_dict = []
+        for m in messages:
+            if m.content:
+                messages_dict.append({"role": m.role, "content": m.content})
+
+        # Get available tools
+        tools = None
+        if state.tool_manager and config.ENABLE_TOOLS:
+            tools = state.tool_manager.get_all_schemas()
+            log.info(f"Passing {len(tools)} tools to LLM")
+
+        # Tool calling loop
+        max_iterations = config.MAX_TOOL_ITERATIONS
+        iteration = 0
+
+        while iteration < max_iterations:
+            iteration += 1
+            log.info(f"LLM call iteration {iteration}")
+
+            # Call LLM with tools
+            response = await state.llm_client.chat.completions.create(
+                model=config.UPSTREAM_MODEL,
+                messages=messages_dict,
+                temperature=temperature,
+                max_tokens=max_tokens,
+                tools=tools,
+                tool_choice="auto" if tools else None,
+            )
+
+            if not response.choices:
+                return "I apologize, but I couldn't generate a response."
+
+            message = response.choices[0].message
+
+            # Check if LLM wants to call tools
+            if message.tool_calls:
+                log.info(f"LLM requested {len(message.tool_calls)} tool calls")
+
+                # Add assistant message with tool calls to history
+                messages_dict.append({
+                    "role": "assistant",
+                    "content": message.content,
+                    "tool_calls": [
+                        {
+                            "id": tc.id,
+                            "type": "function",
+                            "function": {
+                                "name": tc.function.name,
+                                "arguments": tc.function.arguments,
+                            }
+                        }
+                        for tc in message.tool_calls
+                    ]
+                })
+
+                # Execute each tool call
+                for tool_call in message.tool_calls:
+                    tool_name = tool_call.function.name
+                    tool_args = tool_call.function.arguments
+
+                    log.info(f"Executing tool: {tool_name}")
+
+                    # Execute the tool
+                    if state.tool_manager:
+                        result = state.tool_manager.execute_tool_from_json(tool_name, tool_args)
+                    else:
+                        result = {"success": False, "error": "Tool manager not available"}
+
+                    # Add tool result to messages
+                    messages_dict.append({
+                        "role": "tool",
+                        "tool_call_id": tool_call.id,
+                        "name": tool_name,
+                        "content": json.dumps(result),
+                    })
+
+                    log.info(f"Tool {tool_name} result: success={result.get('success', False)}")
+
+                # Continue loop to get final response
+                continue
+
+            # No tool calls - return the final response
+            return message.content or "I apologize, but I couldn't generate a response."
+
+        # Max iterations reached
+        return "I reached the maximum number of tool calls. Please try a more specific question."
+
+    except Exception as e:
+        log.error(f"OpenRouter LLM call failed: {e}")
+        return f"I encountered an error: {str(e)}"
+

 # =============================================================================
 # Document Management Endpoints
--- a/rag/__init__.py
+++ b/rag/__init__.py
--- a/rag/document_processor.py
+++ b/rag/document_processor.py
--- a/rag/retriever.py
+++ b/rag/retriever.py
--- a/rag/vector_store.py
+++ b/rag/vector_store.py
--- a/requirements.txt
+++ b/requirements.txt
--- a/tools.md
+++ b/tools.md
--- a/tools/__init__.py
+++ b/tools/__init__.py
--- a/tools/finance_tool.py
+++ b/tools/finance_tool.py
--- a/tools/medical_tool.py
+++ b/tools/medical_tool.py
--- a/tools/news_tool.py
+++ b/tools/news_tool.py
--- a/tools/science_tool.py
+++ b/tools/science_tool.py
--- a/tools/weather_tool.py
+++ b/tools/weather_tool.py
--- a/tools/web_tool.py
+++ b/tools/web_tool.py
--- a/tools/wikipedia_tool.py
+++ b/tools/wikipedia_tool.py
--- a/website_downloader.py
+++ b/website_downloader.py
--- a/website_downloader_tool.py
+++ b/website_downloader_tool.py