# Change notes:
# - Pass all registered tools to LLM during chat completion
# - Handle tool_calls from LLM response
# - Execute tools and feed results back to LLM
# - Loop until LLM returns final response
# - Updated system prompt to encourage tool use
# - Updated streaming to handle tool calls
# - Increased MAX_TOOL_ITERATIONS to 5
#
# File info: 428 lines, 11 KiB, Python, executable file
"""
|
|
Web Search Tool - General web search capabilities
|
|
|
|
Free sources used:
|
|
- DuckDuckGo Instant Answer API (completely free)
|
|
- DuckDuckGo HTML search (free, no API key)
|
|
- Wikipedia API (as fallback)
|
|
|
|
All completely free, no API keys required.
|
|
"""
|
|
|
|
from __future__ import annotations

import logging
from datetime import datetime
from typing import Optional
from urllib.parse import parse_qs, quote_plus, unquote_plus, urlsplit

import requests

# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# Free search endpoints
DUCKDUCKGO_API = "https://api.duckduckgo.com"  # requested by web_instant_answer
DUCKDUCKGO_HTML = "https://html.duckduckgo.com/html"  # requested by web_search


def web_search(
    query: str,
    max_results: int = 10,
) -> dict:
    """
    Search the web using DuckDuckGo.

    Requests the DuckDuckGo HTML endpoint with the query and hands the
    returned page to ``_parse_ddg_html``. Exceptions are not propagated:
    any failure is logged and reported through the returned dictionary.

    Args:
        query: Search query
        max_results: Maximum number of results (default: 10)

    Returns:
        Dictionary with search results. On success it contains ``success``
        (True), ``source``, ``query``, ``results`` and ``count``. On failure
        it contains ``success`` (False), ``error`` and ``source``.
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    try:
        # Use DuckDuckGo HTML search (free, no API key)
        response = requests.get(
            DUCKDUCKGO_HTML,
            params={"q": query},
            headers=headers,
            timeout=15,
        )
        response.raise_for_status()

        # Parse HTML results
        results = _parse_ddg_html(response.text, max_results)
    except Exception as e:
        # Deliberately broad: callers receive an error dictionary instead of
        # an exception, whatever went wrong (network, HTTP status, parsing).
        log.error("Web search failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "duckduckgo",
        }

    return {
        "success": True,
        "source": "duckduckgo",
        "query": query,
        "results": results,
        "count": len(results),
    }


def _parse_ddg_html(html: str, max_results: int) -> list:
|
|
"""Parse DuckDuckGo HTML results."""
|
|
from bs4 import BeautifulSoup
|
|
|
|
soup = BeautifulSoup(html, "html.parser")
|
|
results = []
|
|
|
|
# Find result links
|
|
for result in soup.select(".result")[:max_results]:
|
|
try:
|
|
link_elem = result.select_one(".result__a")
|
|
snippet_elem = result.select_one(".result__snippet")
|
|
|
|
if link_elem:
|
|
url = link_elem.get("href", "")
|
|
|
|
# Extract actual URL from redirect
|
|
if "uddg=" in url:
|
|
url = url.split("uddg=")[-1].split("&")[0]
|
|
url = unquote_plus(url)
|
|
|
|
results.append({
|
|
"title": link_elem.get_text(strip=True),
|
|
"url": url,
|
|
"snippet": snippet_elem.get_text(strip=True) if snippet_elem else "",
|
|
})
|
|
except Exception:
|
|
continue
|
|
|
|
return results
|
|
|
|
|
|
def web_instant_answer(
    query: str,
) -> dict:
    """
    Get instant answer from DuckDuckGo.

    Requests ``DUCKDUCKGO_API`` with ``format=json`` and copies the populated
    sections of the JSON response into the result. Exceptions are not
    propagated: any failure is logged and reported through the returned
    dictionary.

    Args:
        query: Query for instant answer

    Returns:
        Dictionary with instant answer. Always contains ``success`` and
        ``source``. On success it also contains ``query`` plus whichever of
        these the response provided: ``abstract`` (with ``abstract_source``,
        ``abstract_url``, ``image``), ``definition`` (with
        ``definition_source``), ``answer``, ``related_topics`` (up to five
        ``text``/``url`` pairs) and ``infobox``. On failure it contains
        ``error``.
    """
    try:
        params = {
            "q": query,
            "format": "json",
            "no_html": 1,
            "skip_disambig": 0,
        }

        response = requests.get(DUCKDUCKGO_API, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        result = {
            "success": True,
            "source": "duckduckgo",
            "query": query,
        }

        # Abstract (main answer)
        if data.get("Abstract"):
            result["abstract"] = data.get("Abstract")
            result["abstract_source"] = data.get("AbstractSource")
            result["abstract_url"] = data.get("AbstractURL")
            result["image"] = data.get("Image")

        # Definition
        if data.get("Definition"):
            result["definition"] = data.get("Definition")
            result["definition_source"] = data.get("DefinitionSource")

        # Answer
        if data.get("Answer"):
            result["answer"] = data.get("Answer")

        # Related topics: keep the first five usable entries. Entries that are
        # not dicts or have no "Text" are skipped and do not count toward the
        # limit. ``or []`` covers a null "RelatedTopics" value, which would
        # otherwise raise and discard the whole answer.
        related = []
        for topic in data.get("RelatedTopics") or []:
            if len(related) >= 5:
                break
            if isinstance(topic, dict) and topic.get("Text"):
                related.append({
                    "text": topic.get("Text"),
                    "url": topic.get("FirstURL"),
                })
        if related:
            result["related_topics"] = related

        # Infobox
        if data.get("Infobox"):
            result["infobox"] = data.get("Infobox")

        return result

    except Exception as e:
        # Deliberately broad: callers receive an error dictionary instead of
        # an exception.
        log.error("Instant answer failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "duckduckgo",
        }


def web_get_page_content(
    url: str,
    max_length: int = 5000,
) -> dict:
    """
    Fetch and extract text content from a web page.

    The page is downloaded, ``script``/``style``/``nav``/``header``/``footer``
    elements are removed, and the remaining text is returned one non-empty
    line per row. Exceptions are not propagated: any failure is logged and
    reported through the returned dictionary.

    Args:
        url: URL to fetch. Must start with ``http://`` or ``https://``.
        max_length: Maximum content length (default: 5000 chars). Longer
            text is cut at this length and ``"..."`` is appended. Negative
            values are treated as 0.

    Returns:
        Dictionary with page content. On success it contains ``success``
        (True), ``source``, ``url``, ``title``, ``content`` and
        ``content_length`` (length of ``content``, including any appended
        ``"..."``). On failure it contains ``success`` (False), ``error``,
        ``source`` and ``url``.
    """
    # Reject empty or non-http(s) URLs before any network I/O so the caller
    # gets a clear message.
    # NOTE(review): nothing here blocks internal/private addresses. If URLs
    # can come from untrusted input, consider an allow/deny policy.
    if not isinstance(url, str) or not url.lstrip().lower().startswith(("http://", "https://")):
        return {
            "success": False,
            "error": f"Unsupported or missing URL scheme: {url!r}",
            "source": "web",
            "url": url,
        }

    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    try:
        response = requests.get(url, headers=headers, timeout=15)
        response.raise_for_status()

        # Parse and extract text
        from bs4 import BeautifulSoup

        # Pass raw bytes so the parser can detect the encoding from the
        # document itself. ``response.text`` falls back to ISO-8859-1 for
        # text/* responses whose header has no charset, which garbles UTF-8
        # pages. A charset declared in the header is still passed on.
        content_type = response.headers.get("Content-Type", "")
        declared_encoding = response.encoding if "charset" in content_type.lower() else None
        soup = BeautifulSoup(
            response.content,
            "html.parser",
            from_encoding=declared_encoding,
        )

        # Remove script and style elements
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        # Get title
        title = soup.title.get_text(strip=True) if soup.title else ""

        # Get main content
        text = soup.get_text(separator="\n", strip=True)

        # Clean up whitespace
        lines = [line.strip() for line in text.splitlines() if line.strip()]
        text = "\n".join(lines)

        # Truncate if needed. A negative limit used as a slice bound would
        # cut from the end of the text, so clamp it to zero.
        limit = max(max_length, 0)
        if len(text) > limit:
            text = text[:limit] + "..."

        return {
            "success": True,
            "source": "web",
            "url": url,
            "title": title,
            "content": text,
            "content_length": len(text),
        }

    except Exception as e:
        # Deliberately broad: callers receive an error dictionary instead of
        # an exception.
        log.error("Page content fetch failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "web",
            "url": url,
        }


def web_search_and_fetch(
    query: str,
    max_results: int = 3,
    max_content_length: int = 3000,
) -> dict:
    """
    Search web and fetch content from top results.

    Runs ``web_search`` and then calls ``web_get_page_content`` once per
    result, one after another. Exceptions are not propagated: any failure is
    logged and reported through the returned dictionary.

    Args:
        query: Search query
        max_results: Number of results to fetch (default: 3)
        max_content_length: Max content per page (default: 3000)

    Returns:
        Dictionary with search results and fetched content. If the search
        itself fails, the failure dictionary from ``web_search`` is returned
        unchanged. Otherwise it contains ``success`` (True), ``source``,
        ``query``, ``results`` and ``count``. Every entry in ``results``
        carries a ``fetched_content`` string, which is empty when the entry
        has no URL or its page could not be fetched.
    """
    try:
        # First, search
        search_result = web_search(query, max_results)

        if not search_result.get("success"):
            return search_result

        # Fetch content from each result
        enriched_results = []
        for result in search_result.get("results", []):
            fetched_content = ""
            if result.get("url"):
                content = web_get_page_content(result["url"], max_content_length)
                if content.get("success"):
                    fetched_content = content.get("content", "")
            # Build a new dict so every entry has the same keys and the
            # search results are not modified in place.
            enriched_results.append({**result, "fetched_content": fetched_content})

        return {
            "success": True,
            "source": "duckduckgo",
            "query": query,
            "results": enriched_results,
            "count": len(enriched_results),
        }

    except Exception as e:
        # Deliberately broad: callers receive an error dictionary instead of
        # an exception.
        log.error("Search and fetch failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "duckduckgo",
        }


def web_get_headers(
    url: str,
) -> dict:
    """
    Get HTTP headers for a URL.

    Sends a HEAD request and follows redirects. The status code is reported
    as received; an HTTP error status does not by itself produce a failure
    result. Exceptions are not propagated: any failure is logged and
    reported through the returned dictionary.

    Args:
        url: URL to check. Must start with ``http://`` or ``https://``.

    Returns:
        Dictionary with HTTP headers. On success it contains ``success``
        (True), ``source``, ``url``, ``status_code``, ``headers`` and
        ``final_url`` (the URL of the final response). On failure it contains
        ``success`` (False), ``error``, ``source`` and ``url``.
    """
    # Reject empty or non-http(s) URLs before any network I/O so the caller
    # gets a clear message.
    if not isinstance(url, str) or not url.lstrip().lower().startswith(("http://", "https://")):
        return {
            "success": False,
            "error": f"Unsupported or missing URL scheme: {url!r}",
            "source": "web",
            "url": url,
        }

    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    }

    try:
        response = requests.head(url, headers=headers, timeout=10, allow_redirects=True)
    except Exception as e:
        # Deliberately broad: callers receive an error dictionary instead of
        # an exception.
        log.error("Header fetch failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "web",
            "url": url,
        }

    return {
        "success": True,
        "source": "web",
        "url": url,
        "status_code": response.status_code,
        "headers": dict(response.headers),
        "final_url": response.url,
    }


# Tool schemas for OpenAI function calling

# Schema for web_search: "query" is required, "max_results" is optional.
WEB_SEARCH_SCHEMA = {
    "type": "function",
    "function": {
        "name": "web_search",
        "description": "Search the web using DuckDuckGo. Returns search results with titles, URLs, and snippets. Free, no API key required.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                },
                "max_results": {
                    "type": "integer",
                    "description": "Maximum number of results (default: 10)",
                    "default": 10,
                },
            },
            "required": ["query"],
        },
    },
}

# Schema for web_instant_answer: a single required "query" string.
WEB_INSTANT_ANSWER_SCHEMA = {
    "type": "function",
    "function": {
        "name": "web_instant_answer",
        "description": "Get instant answer from DuckDuckGo for facts, definitions, and summaries. Good for quick facts.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Query for instant answer",
                },
            },
            "required": ["query"],
        },
    },
}

# Schema for web_get_page_content: "url" is required, "max_length" is optional.
WEB_GET_PAGE_CONTENT_SCHEMA = {
    "type": "function",
    "function": {
        "name": "web_get_page_content",
        "description": "Fetch and extract text content from a web page URL. Use after web_search to get full content.",
        "parameters": {
            "type": "object",
            "properties": {
                "url": {
                    "type": "string",
                    "description": "URL to fetch",
                },
                "max_length": {
                    "type": "integer",
                    "description": "Maximum content length in characters (default: 5000)",
                    "default": 5000,
                },
            },
            "required": ["url"],
        },
    },
}

# Schema for web_search_and_fetch: "query" is required; "max_results" and
# "max_content_length" are optional.
WEB_SEARCH_AND_FETCH_SCHEMA = {
    "type": "function",
    "function": {
        "name": "web_search_and_fetch",
        "description": "Search web and automatically fetch content from top results. Best for comprehensive research.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                },
                "max_results": {
                    "type": "integer",
                    "description": "Number of results to fetch (default: 3)",
                    "default": 3,
                },
                "max_content_length": {
                    "type": "integer",
                    "description": "Max content per page (default: 3000)",
                    "default": 3000,
                },
            },
            "required": ["query"],
        },
    },
}