docrag/tools/science_tool.py
Z User 4394e7d6f9 Add comprehensive set of free data tools for RAG
Tools added:
- Wikipedia: search, get article, get full article
- News: Hacker News, Reddit, aggregated news search
- Finance: stocks (yfinance), crypto (CoinGecko), exchange rates
- Medical: PubMed, FDA, disease data, health topics
- Weather: current, forecast, air quality (Open-Meteo)
- Science: arXiv, Semantic Scholar, DOAJ
- Web: DuckDuckGo search, instant answers, page content

All tools use completely free APIs with no authentication required.
2026-03-29 06:27:32 +00:00

465 lines
14 KiB
Python

"""
Scientific/Academic Tool - Search scientific papers and research
Free sources used:
- arXiv API (completely free, no key required)
- Semantic Scholar API (free tier)
- DOAJ (Directory of Open Access Journals - free)
- CORE API (free access to research papers; listed here, but no function visible in this module queries it)
All APIs are free for basic use.
"""
from __future__ import annotations

import logging
import xml.etree.ElementTree as ET
from datetime import datetime
from typing import Optional
from urllib.parse import quote

import requests
# Module-level logger; the search functions in this module report request failures through it.
log = logging.getLogger(__name__)
# Free academic APIs
# arXiv query endpoint; its response is parsed as an Atom XML feed. Plain HTTP as written.
ARXIV_API = "http://export.arxiv.org/api/query"
# Root of the Semantic Scholar Graph API; "/paper/search" and "/paper/{id}" are appended to it.
SEMANTIC_SCHOLAR_API = "https://api.semanticscholar.org/graph/v1"
# Root of the DOAJ API; "/search/articles/{query}" is appended to it.
# NOTE(review): DOAJ's public docs appear to serve the API under https://doaj.org/api -- confirm this host is correct.
DOAJ_API = "https://api.doaj.org"
def science_search_arxiv(
    query: str,
    max_results: int = 10,
    category: Optional[str] = None,
) -> dict:
    """
    Search arXiv for scientific preprints.

    Args:
        query: Search query
        max_results: Maximum number of results (default: 10)
        category: arXiv category filter (e.g., cs.AI, physics, math.CO)

    Returns:
        Dictionary with arXiv search results. On success it holds
        "success" (True), "source", "query", "category", "results" and
        "count"; every result carries "title", "abstract" (capped at 1000
        characters), "authors", "published", "updated", "link", "pdf_link"
        and "categories". Any failure (network, HTTP status, XML parsing)
        is logged and reported as a dictionary with "success" (False),
        "error" and "source" instead of being raised.
    """
    try:
        # Build search query
        search_query = query
        if category:
            # NOTE(review): the query is not parenthesised, so how the category
            # filter combines with a multi-word query depends on arXiv's
            # operator precedence -- confirm against the arXiv API manual.
            search_query = f"cat:{category} AND {query}"
        params = {
            "search_query": search_query,
            "start": 0,
            "max_results": max_results,
            "sortBy": "relevance",
            "sortOrder": "descending",
        }
        response = requests.get(ARXIV_API, params=params, timeout=30)
        response.raise_for_status()

        # Parse the Atom feed returned by arXiv
        root = ET.fromstring(response.content)
        ns = {"atom": "http://www.w3.org/2005/Atom"}

        def text_of(element, tag):
            # findtext() returns the default for a missing tag and "" for a tag
            # without text, so one empty element can no longer raise
            # AttributeError and discard the whole result set.
            return element.findtext(tag, "", ns)

        results = []
        for entry in root.findall("atom:entry", ns):
            link = text_of(entry, "atom:id")
            author_names = (
                text_of(author, "atom:name").strip()
                for author in entry.findall("atom:author", ns)
            )
            terms = (cat.get("term") for cat in entry.findall("atom:category", ns))
            results.append({
                "title": text_of(entry, "atom:title").strip(),
                "abstract": text_of(entry, "atom:summary").strip()[:1000],
                # Authors without a usable name are skipped rather than listed as None.
                "authors": [name for name in author_names if name],
                "published": text_of(entry, "atom:published"),
                "updated": text_of(entry, "atom:updated"),
                "link": link,
                # The PDF address is derived from the abstract page address.
                "pdf_link": link.replace("/abs/", "/pdf/"),
                "categories": [term for term in terms if term],
            })

        return {
            "success": True,
            "source": "arxiv",
            "query": query,
            "category": category,
            "results": results,
            "count": len(results),
        }
    except Exception as e:
        # Tool boundary: callers expect an error dictionary, never an exception.
        log.error("arXiv search failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "arxiv",
        }
def science_search_semantic_scholar(
    query: str,
    limit: int = 10,
    year: Optional[str] = None,
) -> dict:
    """
    Search Semantic Scholar for academic papers.

    Args:
        query: Search query
        limit: Maximum number of results (default: 10)
        year: Year filter (e.g., "2020-", "2018-2022")

    Returns:
        Dictionary with Semantic Scholar results. A successful call yields
        "success", "source", "query", "year_filter", "results", "count" and
        "total"; a failed call is logged and yields "success" (False),
        "error" and "source".
    """
    requested_fields = "title,abstract,authors,year,venue,citationCount,openAccessPdf,url"
    try:
        request_params = {
            "query": query,
            "limit": limit,
            "fields": requested_fields,
        }
        # The year filter is only sent when the caller supplied one.
        if year:
            request_params["year"] = year

        reply = requests.get(
            f"{SEMANTIC_SCHOLAR_API}/paper/search",
            params=request_params,
            timeout=15,
        )
        reply.raise_for_status()
        payload = reply.json()

        papers = []
        for item in payload.get("data", []):
            abstract_text = item.get("abstract")
            open_access = item.get("openAccessPdf")
            papers.append({
                "paper_id": item.get("paperId"),
                "title": item.get("title", ""),
                # Abstracts are capped at 1000 characters; a missing one becomes "".
                "abstract": abstract_text[:1000] if abstract_text else "",
                "authors": [person.get("name", "") for person in item.get("authors", [])],
                "year": item.get("year"),
                "venue": item.get("venue", ""),
                "citations": item.get("citationCount", 0),
                "url": item.get("url"),
                "pdf_url": open_access.get("url") if open_access else None,
            })

        return {
            "success": True,
            "source": "semantic_scholar",
            "query": query,
            "year_filter": year,
            "results": papers,
            "count": len(papers),
            # Falls back to the number of returned papers when no total is reported.
            "total": payload.get("total", len(papers)),
        }
    except Exception as e:
        log.error(f"Semantic Scholar search failed: {e}")
        return {
            "success": False,
            "error": str(e),
            "source": "semantic_scholar",
        }
def science_get_paper_details(
    paper_id: str,
) -> dict:
    """
    Get detailed information about a paper from Semantic Scholar.

    Args:
        paper_id: Semantic Scholar paper ID or DOI

    Returns:
        Dictionary with paper details: "success", "source", "paper_id",
        "title", "abstract", "authors", "year", "venue", "journal",
        "citations", "references", "url", "pdf_url" and "tldr". A blank
        identifier or any request/parsing failure is logged and reported as
        a dictionary with "success" (False), "error" and "source".
    """
    try:
        cleaned_id = "" if paper_id is None else str(paper_id).strip()
        if not cleaned_id:
            # Avoid a pointless round trip to the bare "/paper/" endpoint.
            raise ValueError("paper_id must be a non-empty string")

        # The identifier is a URL path segment: "?" or "#" (both legal inside
        # DOIs and "URL:" identifiers) would otherwise cut the path short.
        # ":" and "/" stay literal for prefixed ids, and "%" stays literal so
        # an already percent-encoded id is not encoded twice.
        url = f"{SEMANTIC_SCHOLAR_API}/paper/{quote(cleaned_id, safe=':/%')}"
        params = {
            "fields": "title,abstract,authors,year,venue,citationCount,referenceCount,openAccessPdf,url,journal,publicationVenue,tldr",
        }
        response = requests.get(url, params=params, timeout=15)
        response.raise_for_status()
        paper = response.json()

        # Fields may be present with a null value, in which case dict.get()
        # defaults do not apply; "or" covers both missing and null.
        authors = [a.get("name", "") for a in paper.get("authors") or []]
        open_access = paper.get("openAccessPdf") or {}
        tldr = paper.get("tldr") or {}
        journal = paper.get("journal") or {}

        return {
            "success": True,
            "source": "semantic_scholar",
            "paper_id": paper.get("paperId"),
            "title": paper.get("title") or "",
            "abstract": paper.get("abstract") or "",
            "authors": authors,
            "year": paper.get("year"),
            "venue": paper.get("venue") or "",
            "journal": journal.get("name"),
            "citations": paper.get("citationCount") or 0,
            "references": paper.get("referenceCount") or 0,
            "url": paper.get("url"),
            "pdf_url": open_access.get("url"),
            "tldr": tldr.get("text"),
        }
    except Exception as e:
        # Tool boundary: callers expect an error dictionary, never an exception.
        log.error("Paper details fetch failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "semantic_scholar",
        }
def science_search_doaj(
    query: str,
    limit: int = 10,
) -> dict:
    """
    Search DOAJ (Directory of Open Access Journals).

    Args:
        query: Search query
        limit: Maximum number of results (default: 10)

    Returns:
        Dictionary with DOAJ results: "success", "source", "query",
        "results", "count" and "total". Each result carries "title",
        "abstract" (capped at 1000 characters), "authors", "year",
        "journal", "doi", "link" and "keywords". A blank query or any
        request/parsing failure is logged and reported as a dictionary with
        "success" (False), "error" and "source".
    """
    try:
        cleaned_query = "" if query is None else str(query).strip()
        if not cleaned_query:
            raise ValueError("query must be a non-empty string")

        # The query travels as a URL path segment, so characters such as "?",
        # "#" and spaces must be percent-encoded or the path is cut short.
        # "/" is left as-is, matching how the request was sent before.
        url = f"{DOAJ_API}/search/articles/{quote(cleaned_query, safe='/')}"
        params = {
            "pageSize": limit,
            "page": 1,
        }
        headers = {"Accept": "application/json"}
        response = requests.get(url, params=params, headers=headers, timeout=15)
        response.raise_for_status()
        data = response.json()

        results = []
        for article in data.get("results") or []:
            bibjson = article.get("bibjson") or {}
            abstract = bibjson.get("abstract")
            identifiers = bibjson.get("identifier") or []
            links = bibjson.get("link") or []
            # The identifier list can mix several identifier types (e.g. ISSNs),
            # so select the entry typed "doi" rather than whichever comes first.
            doi = next(
                (
                    ident.get("id")
                    for ident in identifiers
                    if str(ident.get("type", "")).lower() == "doi"
                ),
                None,
            )
            results.append({
                "title": bibjson.get("title") or "",
                "abstract": abstract[:1000] if abstract else "",
                "authors": [a.get("name", "") for a in bibjson.get("author") or []],
                "year": bibjson.get("year"),
                "journal": (bibjson.get("journal") or {}).get("title", ""),
                "doi": doi,
                "link": links[0].get("url") if links else None,
                "keywords": bibjson.get("keywords") or [],
            })

        return {
            "success": True,
            "source": "doaj",
            "query": query,
            "results": results,
            "count": len(results),
            # Falls back to the number of returned articles when no total is reported.
            "total": data.get("total", len(results)),
        }
    except Exception as e:
        # Tool boundary: callers expect an error dictionary, never an exception.
        log.error("DOAJ search failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "doaj",
        }
def science_aggregate_search(
    query: str,
    limit: int = 5,
) -> dict:
    """
    Search multiple academic sources at once.

    Queries arXiv and Semantic Scholar one after the other and merges their
    results, tagging every result with the "source" it came from.

    Args:
        query: Search query
        limit: Maximum results per source (default: 5)

    Returns:
        Dictionary with aggregated results from multiple sources: "success",
        "query", "results", "count", "sources_checked" and "errors" (a list
        of per-source failure messages, or None when nothing failed).
        "success" is True when at least one source answered; when every
        source failed it is False and an "error" key summarises the failures.
    """
    # (source key, label used in error messages, search function)
    sources = [
        ("arxiv", "arXiv", science_search_arxiv),
        ("semantic_scholar", "Semantic Scholar", science_search_semantic_scholar),
    ]

    results = []
    errors = []
    for source_key, label, search in sources:
        outcome = search(query, limit)
        if outcome.get("success"):
            results.extend({**r, "source": source_key} for r in outcome.get("results", []))
        else:
            errors.append(f"{label}: {outcome.get('error')}")

    # Partial failure still counts as success; total failure must not, or
    # callers cannot tell "no papers found" apart from "nothing could be reached".
    any_source_answered = len(errors) < len(sources)
    aggregated = {
        "success": any_source_answered,
        "query": query,
        "results": results,
        "count": len(results),
        "sources_checked": [source_key for source_key, _, _ in sources],
        "errors": errors if errors else None,
    }
    if not any_source_answered:
        aggregated["error"] = "; ".join(errors)
    return aggregated
# Tool schemas for OpenAI function calling
# Function-calling schema for science_search_arxiv; only "query" is required.
SCIENCE_SEARCH_ARXIV_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_search_arxiv",
        "description": "Search arXiv for scientific preprints. Best for physics, math, computer science, and AI research.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query"},
                "max_results": {
                    "type": "integer",
                    "description": "Maximum number of results (default: 10)",
                    "default": 10,
                },
                "category": {
                    "type": "string",
                    "description": "arXiv category filter (e.g., cs.AI, cs.LG, physics, math.CO)",
                },
            },
            "required": ["query"],
        },
    },
}
# Function-calling schema for science_search_semantic_scholar; only "query" is required.
SCIENCE_SEARCH_SEMANTIC_SCHOLAR_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_search_semantic_scholar",
        "description": "Search Semantic Scholar for academic papers across all fields. Includes citation counts and open access PDFs.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query"},
                "limit": {
                    "type": "integer",
                    "description": "Maximum number of results (default: 10)",
                    "default": 10,
                },
                "year": {
                    "type": "string",
                    "description": "Year filter (e.g., '2020-', '2018-2022')",
                },
            },
            "required": ["query"],
        },
    },
}
# Function-calling schema for science_get_paper_details; "paper_id" is its single, required argument.
SCIENCE_GET_PAPER_DETAILS_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_get_paper_details",
        "description": "Get detailed information about a specific paper including TLDR summary. Use paper ID from search results.",
        "parameters": {
            "type": "object",
            "properties": {
                "paper_id": {"type": "string", "description": "Semantic Scholar paper ID or DOI"},
            },
            "required": ["paper_id"],
        },
    },
}
# Function-calling schema for science_search_doaj; only "query" is required.
SCIENCE_SEARCH_DOAJ_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_search_doaj",
        "description": "Search DOAJ for open access journal articles. Best for peer-reviewed open access research.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query"},
                "limit": {
                    "type": "integer",
                    "description": "Maximum number of results (default: 10)",
                    "default": 10,
                },
            },
            "required": ["query"],
        },
    },
}
# Function-calling schema for science_aggregate_search; only "query" is required.
SCIENCE_AGGREGATE_SEARCH_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_aggregate_search",
        "description": "Search multiple academic sources (arXiv, Semantic Scholar) at once for comprehensive coverage.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {"type": "string", "description": "Search query"},
                "limit": {
                    "type": "integer",
                    "description": "Maximum results per source (default: 5)",
                    "default": 5,
                },
            },
            "required": ["query"],
        },
    },
}