docrag/tools/news_tool.py
Z User 4394e7d6f9 Add comprehensive set of free data tools for RAG
Tools added:
- Wikipedia: search, get article, get full article
- News: Hacker News, Reddit, aggregated news search
- Finance: stocks (yfinance), crypto (CoinGecko), exchange rates
- Medical: PubMed, FDA, disease data, health topics
- Weather: current, forecast, air quality (Open-Meteo)
- Science: arXiv, Semantic Scholar, DOAJ
- Web: DuckDuckGo search, instant answers, page content

All tools use completely free APIs with no authentication required.
2026-03-29 06:27:32 +00:00

435 lines
13 KiB
Python

"""
News Tool - Fetch news from free sources
Free sources used:
- GNews API (free tier: 100 requests/day)
- Currents API (free tier: 200 requests/day)
- Hacker News (completely free)
- Reddit (free JSON feeds)
No API key required for Hacker News and Reddit.
"""
from __future__ import annotations

import logging
import re
from datetime import datetime, timedelta, timezone
from typing import Optional

import requests
# Module-level logger, named after this module.
log = logging.getLogger(__name__)
# Free news APIs (no key required for some)
# NOTE(review): GNEWS_API and CURRENTS_API are not referenced by any function
# visible in this file -- confirm whether they are used elsewhere before removing.
GNEWS_API = "https://gnews.io/api/v4"
CURRENTS_API = "https://api.currentsapi.services/v1"
# Base URL for the topstories.json and item/<id>.json requests.
HACKER_NEWS_API = "https://hacker-news.firebaseio.com/v0"
# Base URL for the /r/<subreddit>/... JSON feed requests.
REDDIT_API = "https://www.reddit.com"
def news_search_hackernews(
    query: str,
    limit: int = 10,
) -> dict:
    """
    Search Hacker News stories through the Algolia search endpoint.

    This function never raises: every failure is logged and reported in the
    returned dictionary.

    Args:
        query: Search query
        limit: Maximum number of results (default: 10). A value <= 0 yields
            an empty, successful result without any network request.

    Returns:
        On success, a dictionary with "success" (True), "source", "query",
        "results" and "count". Each result has "title", "url", "points",
        "author", "created_at", "comments" and "hn_link".
        On failure, a dictionary with "success" (False), "error" and "source".
    """
    try:
        if limit <= 0:
            # "At most zero results" needs no round trip, and a negative page
            # size must never be forwarded to the API.
            return {
                "success": True,
                "source": "hacker_news",
                "query": query,
                "results": [],
                "count": 0,
            }

        # Use Hacker News Algolia API for search (free, no key)
        search_url = "https://hn.algolia.com/api/v1/search"
        params = {
            "query": query,
            "hitsPerPage": limit,
            "tags": "story",
        }
        response = requests.get(search_url, params=params, timeout=10)
        response.raise_for_status()
        data = response.json()

        # `or` instead of a .get() default: a key that is present with a JSON
        # null value would otherwise come through as None rather than ""/0.
        results = [
            {
                "title": hit.get("title") or "",
                "url": hit.get("url") or "",
                "points": hit.get("points") or 0,
                "author": hit.get("author") or "",
                "created_at": hit.get("created_at") or "",
                "comments": hit.get("num_comments") or 0,
                "hn_link": f"https://news.ycombinator.com/item?id={hit.get('objectID', '')}",
            }
            for hit in data.get("hits") or []
        ]
        return {
            "success": True,
            "source": "hacker_news",
            "query": query,
            "results": results,
            "count": len(results),
        }
    except Exception as e:
        # Boundary handler: tool callers expect an error dict, not an exception.
        log.error("Hacker News search failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "hacker_news",
        }
def news_get_top_stories(
    limit: int = 15,
) -> dict:
    """
    Get top stories from Hacker News.

    Fetches the list of top story IDs, then each story individually. A story
    whose own request fails is logged and skipped so that one bad item does
    not discard the rest. This function never raises.

    Args:
        limit: Maximum number of stories (default: 15). A value <= 0 yields
            an empty, successful result without any network request.

    Returns:
        On success, a dictionary with "success" (True), "source", "results"
        and "count". Each result has "title", "url", "points", "author",
        "time" (ISO-8601, timezone-aware UTC), "comments" and "hn_link".
        On failure, a dictionary with "success" (False), "error" and "source".
    """
    try:
        if limit <= 0:
            # A negative limit would otherwise slice `[:limit]` and fetch
            # almost the entire ID list, one request per story.
            return {
                "success": True,
                "source": "hacker_news",
                "results": [],
                "count": 0,
            }

        results = []
        # One session for the ID list and every item: the connection is
        # reused instead of being re-established for each story.
        with requests.Session() as session:
            # Get top story IDs
            response = session.get(f"{HACKER_NEWS_API}/topstories.json", timeout=10)
            response.raise_for_status()
            story_ids = response.json()[:limit]

            for story_id in story_ids:
                try:
                    story_response = session.get(
                        f"{HACKER_NEWS_API}/item/{story_id}.json",
                        timeout=10,
                    )
                    story_response.raise_for_status()
                    story = story_response.json()
                    if not story:
                        # The endpoint answered with an empty/null body.
                        continue
                    # Render the Unix `time` as aware UTC so the output does
                    # not depend on the host machine's local timezone.
                    posted_at = datetime.fromtimestamp(
                        story.get("time") or 0, tz=timezone.utc
                    )
                    results.append({
                        "title": story.get("title", ""),
                        "url": story.get("url", ""),
                        "points": story.get("score", 0),
                        "author": story.get("by", ""),
                        "time": posted_at.isoformat(),
                        "comments": story.get("descendants", 0),
                        "hn_link": f"https://news.ycombinator.com/item?id={story_id}",
                    })
                except Exception as item_error:
                    # Deliberately best-effort per item, but leave a trace.
                    log.warning(
                        "Skipping Hacker News item %s: %s", story_id, item_error
                    )
                    continue

        return {
            "success": True,
            "source": "hacker_news",
            "results": results,
            "count": len(results),
        }
    except Exception as e:
        # Boundary handler: tool callers expect an error dict, not an exception.
        log.error("Hacker News top stories failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "hacker_news",
        }
# One or more names made of letters, digits and underscores, joined by "+".
# Anything else (slashes, dots, "?", "#", whitespace, ...) is rejected so the
# value cannot change the path or query of the request URL it is placed in.
_SUBREDDIT_RE = re.compile(r"[A-Za-z0-9_]+(?:\+[A-Za-z0-9_]+)*")


def _clean_subreddit(subreddit: str) -> Optional[str]:
    """Return the bare subreddit name, or None when it is unsafe for a URL path."""
    if not isinstance(subreddit, str):
        return None
    name = subreddit.strip().strip("/")
    # Tolerate the common "r/name" and "/r/name" spellings.
    if name[:2].lower() == "r/":
        name = name[2:]
    return name if _SUBREDDIT_RE.fullmatch(name) else None


def _reddit_error(message: str) -> dict:
    """Build the failure dictionary shared by the Reddit tools."""
    return {
        "success": False,
        "error": message,
        "source": "reddit",
    }


def _reddit_get(path: str, params: dict) -> dict:
    """GET a Reddit JSON endpoint and return the decoded body (raises on HTTP errors)."""
    headers = {"User-Agent": "DocRAG/1.0"}
    response = requests.get(
        f"{REDDIT_API}{path}", headers=headers, params=params, timeout=10
    )
    response.raise_for_status()
    return response.json()


def _reddit_posts(listing: dict) -> list:
    """Flatten a Reddit listing payload into the result dictionaries returned to callers."""
    posts = []
    for child in listing.get("data", {}).get("children", []):
        post = child.get("data", {})
        # Render `created_utc` as aware UTC so the output does not depend on
        # the host machine's local timezone.
        created = datetime.fromtimestamp(
            post.get("created_utc") or 0, tz=timezone.utc
        )
        posts.append({
            "title": post.get("title", ""),
            "url": post.get("url", ""),
            "author": post.get("author", ""),
            "score": post.get("score", 0),
            "comments": post.get("num_comments", 0),
            "subreddit": post.get("subreddit", ""),
            "created": created.isoformat(),
            "permalink": f"https://reddit.com{post.get('permalink', '')}",
            # Self-text is truncated to keep the payload small.
            "selftext": (post.get("selftext") or "")[:500],
        })
    return posts


def news_get_reddit(
    subreddit: str = "worldnews",
    limit: int = 15,
    timeframe: str = "day",
) -> dict:
    """
    Get top posts from a Reddit subreddit.

    This function never raises: every failure is reported in the returned
    dictionary.

    Args:
        subreddit: Subreddit name (default: worldnews). A leading "r/" or
            "/r/" is accepted and removed; names containing characters other
            than letters, digits, "_" and "+" are rejected before any request.
        limit: Maximum number of posts (default: 15). A value <= 0 yields an
            empty, successful result without any network request.
        timeframe: Time period (hour, day, week, month, year, all)

    Returns:
        On success, a dictionary with "success" (True), "source",
        "subreddit" (the cleaned name), "timeframe", "results" and "count".
        Each result has "title", "url", "author", "score", "comments",
        "subreddit", "created" (ISO-8601, timezone-aware UTC), "permalink"
        and "selftext" (at most 500 characters).
        On failure, a dictionary with "success" (False), "error" and "source".
    """
    name = _clean_subreddit(subreddit)
    if name is None:
        return _reddit_error(f"Invalid subreddit name: {subreddit!r}")
    try:
        if limit <= 0:
            posts = []
        else:
            # Reddit provides free JSON feeds
            listing = _reddit_get(
                f"/r/{name}/top.json",
                {"limit": limit, "t": timeframe},
            )
            posts = _reddit_posts(listing)
        return {
            "success": True,
            "source": "reddit",
            "subreddit": name,
            "timeframe": timeframe,
            "results": posts,
            "count": len(posts),
        }
    except Exception as e:
        # Boundary handler: tool callers expect an error dict, not an exception.
        log.error("Reddit fetch failed: %s", e)
        return _reddit_error(str(e))


def news_search_reddit(
    query: str,
    subreddit: str = "all",
    limit: int = 15,
) -> dict:
    """
    Search Reddit for posts matching a query.

    This function never raises: every failure is reported in the returned
    dictionary.

    Args:
        query: Search query
        subreddit: Subreddit to search (default: all). Cleaned and validated
            the same way as in news_get_reddit. The search is restricted to
            the subreddit unless it is "all".
        limit: Maximum number of results (default: 15). A value <= 0 yields
            an empty, successful result without any network request.

    Returns:
        On success, a dictionary with "success" (True), "source", "query",
        "subreddit" (the cleaned name), "results" and "count"; results have
        the same fields as those of news_get_reddit.
        On failure, a dictionary with "success" (False), "error" and "source".
    """
    name = _clean_subreddit(subreddit)
    if name is None:
        return _reddit_error(f"Invalid subreddit name: {subreddit!r}")
    try:
        if limit <= 0:
            posts = []
        else:
            listing = _reddit_get(
                f"/r/{name}/search.json",
                {
                    "q": query,
                    "limit": limit,
                    "sort": "relevance",
                    "restrict_sr": "true" if name.lower() != "all" else "false",
                },
            )
            posts = _reddit_posts(listing)
        return {
            "success": True,
            "source": "reddit",
            "query": query,
            "subreddit": name,
            "results": posts,
            "count": len(posts),
        }
    except Exception as e:
        # Boundary handler: tool callers expect an error dict, not an exception.
        log.error("Reddit search failed: %s", e)
        return _reddit_error(str(e))
def news_aggregate(
    query: str,
    limit: int = 10,
) -> dict:
    """
    Aggregate news from multiple free sources.

    Runs the Hacker News search and the Reddit search (across all
    subreddits) for the same query and concatenates their results, tagging
    each entry with the source it came from.

    Args:
        query: Search query
        limit: Maximum results per source (default: 10)

    Returns:
        Dictionary with "success", "query", "results", "count",
        "sources_checked" and "errors" (a list of per-source messages, or
        None when every source succeeded). "success" is False only when
        every source failed; in that case an "error" key holds the combined
        message, matching the failure shape of the single-source tools.
    """
    # (label used in error messages, value of the "source" tag, search call)
    searches = (
        ("Hacker News", "hacker_news", lambda: news_search_hackernews(query, limit)),
        ("Reddit", "reddit", lambda: news_search_reddit(query, "all", limit)),
    )

    results = []
    errors = []
    for label, source, run_search in searches:
        outcome = run_search()
        if outcome.get("success"):
            results.extend(
                {**entry, "source": source} for entry in outcome.get("results", [])
            )
        else:
            errors.append(f"{label}: {outcome.get('error')}")

    # Partial failure still counts as success (the caller gets usable
    # results plus the error list); total failure must not be reported as
    # a successful search that merely found nothing.
    all_failed = len(errors) == len(searches)
    aggregated = {
        "success": not all_failed,
        "query": query,
        "results": results,
        "count": len(results),
        "sources_checked": [source for _, source, _ in searches],
        "errors": errors if errors else None,
    }
    if all_failed:
        aggregated["error"] = "; ".join(errors)
    return aggregated
# Tool schemas for OpenAI function calling
def _param(json_type: str, description: str, **extra) -> dict:
    """Describe one tool parameter; `extra` carries optional keys such as default/enum."""
    return {"type": json_type, "description": description, **extra}


def _function_schema(
    name: str,
    description: str,
    properties: dict,
    required: list,
) -> dict:
    """Wrap a tool definition in the {"type": "function", "function": {...}} envelope."""
    return {
        "type": "function",
        "function": {
            "name": name,
            "description": description,
            "parameters": {
                "type": "object",
                "properties": properties,
                "required": required,
            },
        },
    }


# Search Hacker News stories by query.
NEWS_SEARCH_HACKERNEWS_SCHEMA = _function_schema(
    name="news_search_hackernews",
    description="Search Hacker News for tech news and discussions. Best for technology, startups, programming topics.",
    properties={
        "query": _param("string", "Search query"),
        "limit": _param(
            "integer", "Maximum number of results (default: 10)", default=10
        ),
    },
    required=["query"],
)

# Current Hacker News front-page stories; takes no required arguments.
NEWS_GET_TOP_STORIES_SCHEMA = _function_schema(
    name="news_get_top_stories",
    description="Get current top stories from Hacker News. Use for general tech news browsing.",
    properties={
        "limit": _param(
            "integer", "Maximum number of stories (default: 15)", default=15
        ),
    },
    required=[],
)

# Top posts of one subreddit over a time window; takes no required arguments.
NEWS_GET_REDDIT_SCHEMA = _function_schema(
    name="news_get_reddit",
    description="Get top posts from a Reddit subreddit. Great for news, discussions, and community content.",
    properties={
        "subreddit": _param(
            "string",
            "Subreddit name (e.g., worldnews, technology, science)",
            default="worldnews",
        ),
        "limit": _param(
            "integer", "Maximum number of posts (default: 15)", default=15
        ),
        "timeframe": _param(
            "string",
            "Time period (hour, day, week, month, year, all)",
            default="day",
            enum=["hour", "day", "week", "month", "year", "all"],
        ),
    },
    required=[],
)

# Reddit search, optionally limited to one subreddit.
NEWS_SEARCH_REDDIT_SCHEMA = _function_schema(
    name="news_search_reddit",
    description="Search Reddit for posts matching a query across all subreddits.",
    properties={
        "query": _param("string", "Search query"),
        "subreddit": _param(
            "string", "Subreddit to search (default: all)", default="all"
        ),
        "limit": _param(
            "integer", "Maximum number of results (default: 15)", default=15
        ),
    },
    required=["query"],
)

# Combined Hacker News + Reddit search.
NEWS_AGGREGATE_SCHEMA = _function_schema(
    name="news_aggregate",
    description="Search for news from multiple sources (Hacker News, Reddit) in one call. Best for comprehensive news coverage.",
    properties={
        "query": _param("string", "Search query"),
        "limit": _param(
            "integer", "Maximum results per source (default: 10)", default=10
        ),
    },
    required=["query"],
)