docrag/tools/web_tool.py
Z User b811162f78 Implement tool calling loop for LLM
- Pass all registered tools to LLM during chat completion
- Handle tool_calls from LLM response
- Execute tools and feed results back to LLM
- Loop until LLM returns final response
- Updated system prompt to encourage tool use
- Updated streaming to handle tool calls
- Increased MAX_TOOL_ITERATIONS to 5
2026-03-29 16:07:56 +00:00

428 lines
11 KiB
Python
Executable File

"""
Web Search Tool - General web search capabilities
Free sources used:
- DuckDuckGo Instant Answer API (completely free)
- DuckDuckGo HTML search (free, no API key)
- Wikipedia API (as fallback)
All completely free, no API keys required.
"""
from __future__ import annotations
import logging
from datetime import datetime
from typing import Optional
from urllib.parse import quote_plus, unquote_plus
import requests
# Module-level logger; the tool functions in this file use it to report failures.
log = logging.getLogger(__name__)
# Free search endpoints
# JSON endpoint queried by web_instant_answer().
DUCKDUCKGO_API = "https://api.duckduckgo.com"
# HTML results page fetched and scraped by web_search().
DUCKDUCKGO_HTML = "https://html.duckduckgo.com/html"
def web_search(
    query: str,
    max_results: int = 10,
) -> dict:
    """
    Run a DuckDuckGo HTML search and return the parsed hits.

    Args:
        query: Search query
        max_results: Upper bound passed to the result parser (default: 10)

    Returns:
        On success, a dictionary with ``success`` (True), ``source``,
        ``query``, ``results`` and ``count``. If anything raises, a
        dictionary with ``success`` (False), ``error`` (the exception text)
        and ``source``.
    """
    # Browser-like User-Agent sent with the search request.
    browser_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }
    try:
        # DuckDuckGo HTML search (free, no API key)
        page = requests.get(
            DUCKDUCKGO_HTML,
            params={"q": query},
            headers=browser_headers,
            timeout=15,
        )
        page.raise_for_status()

        # Turn the returned markup into a list of result entries.
        hits = _parse_ddg_html(page.text, max_results)
        return {
            "success": True,
            "source": "duckduckgo",
            "query": query,
            "results": hits,
            "count": len(hits),
        }
    except Exception as exc:
        # Failures are logged and reported in the return value, not raised.
        log.error(f"Web search failed: {exc}")
        return {
            "success": False,
            "error": str(exc),
            "source": "duckduckgo",
        }
def _parse_ddg_html(html: str, max_results: int) -> list:
    """
    Parse DuckDuckGo HTML results.

    Walks the ``.result`` elements of the page and, for each one that
    contains a ``.result__a`` link, records its title, target URL and
    snippet. The limit is applied to the entries actually collected, so
    ``.result`` nodes without a link do not use up the quota.

    Args:
        html: Markup of a DuckDuckGo HTML results page
        max_results: Maximum number of entries to return; values <= 0
            yield an empty list

    Returns:
        List of dictionaries with ``title``, ``url`` and ``snippet`` keys
    """
    from bs4 import BeautifulSoup

    if max_results <= 0:
        return []

    soup = BeautifulSoup(html, "html.parser")
    results = []
    for result in soup.select(".result"):
        if len(results) >= max_results:
            break
        try:
            link_elem = result.select_one(".result__a")
            if link_elem is None:
                # No link in this node: nothing to report, and it must not
                # count against max_results.
                continue
            snippet_elem = result.select_one(".result__snippet")

            url = link_elem.get("href", "")
            # Extract actual URL from redirect: the target is carried,
            # URL-encoded, in the "uddg" query parameter.
            if "uddg=" in url:
                url = url.split("uddg=")[-1].split("&")[0]
                url = unquote_plus(url)

            results.append({
                "title": link_elem.get_text(strip=True),
                "url": url,
                "snippet": snippet_elem.get_text(strip=True) if snippet_elem else "",
            })
        except Exception as exc:
            # Best effort: one malformed entry should not discard the rest,
            # but leave a trace instead of swallowing it silently.
            log.debug(f"Skipping unparseable search result: {exc}")
            continue
    return results
def web_instant_answer(
    query: str,
) -> dict:
    """
    Get instant answer from DuckDuckGo.

    Args:
        query: Query for instant answer

    Returns:
        On success, a dictionary with ``success`` (True), ``source`` and
        ``query``, plus whichever of these the response provides:
        ``abstract`` / ``abstract_source`` / ``abstract_url`` / ``image``,
        ``definition`` / ``definition_source``, ``answer``,
        ``related_topics`` (at most five ``text``/``url`` pairs) and
        ``infobox``. If anything raises, a dictionary with ``success``
        (False), ``error`` (the exception text) and ``source``.
    """
    try:
        params = {
            "q": query,
            "format": "json",
            "no_html": 1,
            "skip_disambig": 0,
        }
        response = requests.get(DUCKDUCKGO_API, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        result = {
            "success": True,
            "source": "duckduckgo",
            "query": query,
        }

        # Abstract (main answer)
        if data.get("Abstract"):
            result["abstract"] = data.get("Abstract")
            result["abstract_source"] = data.get("AbstractSource")
            result["abstract_url"] = data.get("AbstractURL")
            result["image"] = data.get("Image")

        # Definition
        if data.get("Definition"):
            result["definition"] = data.get("Definition")
            result["definition_source"] = data.get("DefinitionSource")

        # Answer
        if data.get("Answer"):
            result["answer"] = data.get("Answer")

        # Related topics: filter first, then cap at five, so entries without
        # "Text" do not use up the quota. "or []" tolerates a null value.
        related = [
            {
                "text": topic.get("Text"),
                "url": topic.get("FirstURL"),
            }
            for topic in (data.get("RelatedTopics") or [])
            if isinstance(topic, dict) and topic.get("Text")
        ][:5]
        if related:
            result["related_topics"] = related

        # Infobox
        if data.get("Infobox"):
            result["infobox"] = data.get("Infobox")

        return result
    except Exception as e:
        # Failures are logged and reported in the return value, not raised.
        log.error(f"Instant answer failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "duckduckgo",
        }
def web_get_page_content(
    url: str,
    max_length: int = 5000,
) -> dict:
    """
    Fetch and extract text content from a web page.

    The page is parsed as HTML; ``script``, ``style``, ``nav``, ``header``
    and ``footer`` elements are removed, and the remaining text is
    collapsed to one non-empty line per text fragment.

    Args:
        url: URL to fetch
        max_length: Maximum content length (default: 5000 chars). Longer
            text is cut at this length and "..." is appended. Negative
            values are treated as 0.

    Returns:
        On success, a dictionary with ``success`` (True), ``source``,
        ``url``, ``title``, ``content`` and ``content_length`` (length of
        ``content`` as returned, including any appended "..."). If anything
        raises, a dictionary with ``success`` (False), ``error`` (the
        exception text), ``source`` and ``url``.
    """
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }
        # NOTE(review): url is handed to requests exactly as supplied; no
        # host or address filtering is done here. If callers can pass
        # untrusted URLs, consider restricting the reachable targets.
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # When the Content-Type header carries no charset, requests decodes
        # text responses as ISO-8859-1, which garbles UTF-8 pages. In that
        # case use the encoding detected from the body instead.
        content_type = response.headers.get("Content-Type", "")
        if "charset" not in content_type.lower():
            response.encoding = response.apparent_encoding

        # Parse and extract text
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(response.text, "html.parser")

        # Remove script/style and page-chrome elements
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        # Get title
        title = ""
        if soup.title:
            title = soup.title.get_text(strip=True)

        # Get main content
        text = soup.get_text(separator="\n", strip=True)

        # Clean up whitespace
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        text = "\n".join(lines)

        # Truncate if needed; clamp so a negative limit cannot slice from
        # the end of the text.
        limit = max(max_length, 0)
        if len(text) > limit:
            text = text[:limit] + "..."

        return {
            "success": True,
            "source": "web",
            "url": url,
            "title": title,
            "content": text,
            "content_length": len(text),
        }
    except Exception as e:
        # Failures are logged and reported in the return value, not raised.
        log.error(f"Page content fetch failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "web",
            "url": url,
        }
def web_search_and_fetch(
    query: str,
    max_results: int = 3,
    max_content_length: int = 3000,
) -> dict:
    """
    Search the web, then download the text of each hit.

    Args:
        query: Search query
        max_results: Number of results to fetch (default: 3)
        max_content_length: Max content per page (default: 3000)

    Returns:
        If the search step fails, its failure dictionary is returned
        unchanged. Otherwise a dictionary with ``success`` (True),
        ``source``, ``query``, ``results`` and ``count``. Each returned
        result has a ``fetched_content`` key, which is an empty string when
        the page fetch failed. Results without a URL are left out. If
        anything raises, a dictionary with ``success`` (False), ``error``
        and ``source``.
    """
    try:
        # Step 1: run the search and bail out early if it failed.
        found = web_search(query, max_results)
        if not found.get("success"):
            return found

        # Step 2: attach page text to every hit that has a URL.
        pages = []
        for hit in found.get("results", []):
            if not hit.get("url"):
                continue
            fetched = web_get_page_content(hit["url"], max_content_length)
            if fetched.get("success"):
                hit["fetched_content"] = fetched.get("content", "")
            else:
                hit["fetched_content"] = ""
            pages.append(hit)

        return {
            "success": True,
            "source": "duckduckgo",
            "query": query,
            "results": pages,
            "count": len(pages),
        }
    except Exception as exc:
        # Failures are logged and reported in the return value, not raised.
        log.error(f"Search and fetch failed: {exc}")
        return {
            "success": False,
            "error": str(exc),
            "source": "duckduckgo",
        }
def web_get_headers(url: str) -> dict:
    """
    Issue a HEAD request and report the response headers.

    Redirects are followed. The status code is reported as-is; it is not
    turned into an error.

    Args:
        url: URL to check

    Returns:
        On success, a dictionary with ``success`` (True), ``source``,
        ``url``, ``status_code``, ``headers`` (as a plain dict) and
        ``final_url``. If anything raises, a dictionary with ``success``
        (False), ``error`` (the exception text), ``source`` and ``url``.
    """
    try:
        reply = requests.head(
            url,
            headers={
                "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            },
            timeout=10,
            allow_redirects=True,
        )
        return {
            "success": True,
            "source": "web",
            "url": url,
            "status_code": reply.status_code,
            "headers": dict(reply.headers),
            "final_url": reply.url,
        }
    except Exception as exc:
        # Failures are logged and reported in the return value, not raised.
        log.error(f"Header fetch failed: {exc}")
        return {
            "success": False,
            "error": str(exc),
            "source": "web",
            "url": url,
        }
# Tool schemas for OpenAI function calling
# Schema for web_search(): "query" is required, "max_results" is optional.
WEB_SEARCH_SCHEMA = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": (
            "Search the web using DuckDuckGo. "
            "Returns search results with titles, URLs, and snippets. "
            "Free, no API key required."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                },
                "max_results": {
                    "type": "integer",
                    "description": "Maximum number of results (default: 10)",
                    "default": 10,
                },
            },
            "required": ["query"],
        },
    },
}
# Schema for web_instant_answer(): a single required "query" string.
WEB_INSTANT_ANSWER_SCHEMA = {
    "type": "function",
    "function": {
        "name": "web_instant_answer",
        "description": (
            "Get instant answer from DuckDuckGo for facts, definitions, and summaries. "
            "Good for quick facts."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Query for instant answer",
                },
            },
            "required": ["query"],
        },
    },
}
# Schema for web_get_page_content(): "url" is required, "max_length" is optional.
WEB_GET_PAGE_CONTENT_SCHEMA = {
    "type": "function",
    "function": {
        "name": "web_get_page_content",
        "description": (
            "Fetch and extract text content from a web page URL. "
            "Use after web_search to get full content."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "URL to fetch",
                },
                "max_length": {
                    "type": "integer",
                    "description": "Maximum content length in characters (default: 5000)",
                    "default": 5000,
                },
            },
            "required": ["url"],
        },
    },
}
# Schema for web_search_and_fetch(): "query" is required; "max_results" and
# "max_content_length" are optional.
WEB_SEARCH_AND_FETCH_SCHEMA = {
    "type": "function",
    "function": {
        "name": "web_search_and_fetch",
        "description": (
            "Search web and automatically fetch content from top results. "
            "Best for comprehensive research."
        ),
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                },
                "max_results": {
                    "type": "integer",
                    "description": "Number of results to fetch (default: 3)",
                    "default": 3,
                },
                "max_content_length": {
                    "type": "integer",
                    "description": "Max content per page (default: 3000)",
                    "default": 3000,
                },
            },
            "required": ["query"],
        },
    },
}