"""
Scientific/Academic Tool - Search scientific papers and research

Free sources used:
- arXiv API (completely free, no key required)
- Semantic Scholar API (free tier)
- DOAJ (Directory of Open Access Journals - free)
- CORE API (free access to research papers)

All APIs are free for basic use.
"""

from __future__ import annotations

import logging
from datetime import datetime
from typing import Optional
from urllib.parse import quote
import xml.etree.ElementTree as ET

import requests

log = logging.getLogger(__name__)

# Free academic APIs
ARXIV_API = "http://export.arxiv.org/api/query"
SEMANTIC_SCHOLAR_API = "https://api.semanticscholar.org/graph/v1"
DOAJ_API = "https://api.doaj.org"

# Namespace used by the Atom feed returned by the arXiv API.
_ATOM_NS = {"atom": "http://www.w3.org/2005/Atom"}

# Abstracts in search results are cut to this many characters to keep the
# tool output compact.
_MAX_ABSTRACT_CHARS = 1000


def _element_text(element: Optional[ET.Element]) -> str:
    """Return the stripped text of an XML element, or "" if missing/empty."""
    if element is None or element.text is None:
        return ""
    return element.text.strip()


def _truncate(text: Optional[str], limit: int = _MAX_ABSTRACT_CHARS) -> str:
    """Return at most ``limit`` characters of ``text`` ("" for None/empty)."""
    return text[:limit] if text else ""


def _person_names(people: Optional[list]) -> list:
    """Extract the "name" field from a list of author dicts.

    Tolerates the list itself being ``None`` (JSON ``null``) and individual
    names being missing or ``None``.
    """
    return [person.get("name") or "" for person in (people or [])]


def _doaj_doi(identifiers: Optional[list]) -> Optional[str]:
    """Return the DOI from a DOAJ ``bibjson.identifier`` list, if any.

    The list mixes identifier types (DOI, ISSNs, ...), so the entry is
    selected by its "type" rather than by position.
    """
    for identifier in identifiers or []:
        if (identifier.get("type") or "").lower() == "doi":
            return identifier.get("id")
    return None


def science_search_arxiv(
    query: str,
    max_results: int = 10,
    category: Optional[str] = None,
) -> dict:
    """
    Search arXiv for scientific preprints.

    Args:
        query: Search query
        max_results: Maximum number of results (default: 10)
        category: arXiv category filter (e.g., cs.AI, physics, math.CO)

    Returns:
        Dictionary with arXiv search results. On failure the dictionary has
        "success": False and an "error" message; no exception is raised.
    """
    try:
        # Build search query
        search_query = f"cat:{category} AND {query}" if category else query

        params = {
            "search_query": search_query,
            "start": 0,
            "max_results": max_results,
            "sortBy": "relevance",
            "sortOrder": "descending",
        }

        response = requests.get(ARXIV_API, params=params, timeout=30)
        response.raise_for_status()

        # Parse XML (Atom) response
        root = ET.fromstring(response.content)

        results = []
        for entry in root.findall("atom:entry", _ATOM_NS):
            # The Atom <id> of an entry is its abstract page URL.
            link = _element_text(entry.find("atom:id", _ATOM_NS))

            author_names = (
                _element_text(author.find("atom:name", _ATOM_NS))
                for author in entry.findall("atom:author", _ATOM_NS)
            )
            authors = [name for name in author_names if name]

            category_terms = (
                cat.get("term")
                for cat in entry.findall("atom:category", _ATOM_NS)
            )
            categories = [term for term in category_terms if term]

            results.append({
                "title": _element_text(entry.find("atom:title", _ATOM_NS)),
                "abstract": _truncate(
                    _element_text(entry.find("atom:summary", _ATOM_NS))
                ),
                "authors": authors,
                "published": _element_text(
                    entry.find("atom:published", _ATOM_NS)
                ),
                "updated": _element_text(entry.find("atom:updated", _ATOM_NS)),
                "link": link,
                "pdf_link": link.replace("/abs/", "/pdf/"),
                "categories": categories,
            })

        return {
            "success": True,
            "source": "arxiv",
            "query": query,
            "category": category,
            "results": results,
            "count": len(results),
        }

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        log.error("arXiv search failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "arxiv",
        }


def science_search_semantic_scholar(
    query: str,
    limit: int = 10,
    year: Optional[str] = None,
) -> dict:
    """
    Search Semantic Scholar for academic papers.

    Args:
        query: Search query
        limit: Maximum number of results (default: 10)
        year: Year filter (e.g., "2020-", "2018-2022")

    Returns:
        Dictionary with Semantic Scholar results. On failure the dictionary
        has "success": False and an "error" message; no exception is raised.
    """
    try:
        url = f"{SEMANTIC_SCHOLAR_API}/paper/search"
        params = {
            "query": query,
            "limit": limit,
            "fields": "title,abstract,authors,year,venue,citationCount,openAccessPdf,url",
        }

        if year:
            params["year"] = year

        response = requests.get(url, params=params, timeout=15)
        response.raise_for_status()
        data = response.json()

        results = []
        # Fields may be present but null in the JSON, so fall back with `or`
        # instead of relying on dict.get defaults.
        for paper in data.get("data") or []:
            results.append({
                "paper_id": paper.get("paperId"),
                "title": paper.get("title") or "",
                "abstract": _truncate(paper.get("abstract")),
                "authors": _person_names(paper.get("authors")),
                "year": paper.get("year"),
                "venue": paper.get("venue") or "",
                "citations": paper.get("citationCount") or 0,
                "url": paper.get("url"),
                "pdf_url": (paper.get("openAccessPdf") or {}).get("url"),
            })

        return {
            "success": True,
            "source": "semantic_scholar",
            "query": query,
            "year_filter": year,
            "results": results,
            "count": len(results),
            "total": data.get("total", len(results)),
        }

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        log.error("Semantic Scholar search failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "semantic_scholar",
        }


def science_get_paper_details(
    paper_id: str,
) -> dict:
    """
    Get detailed information about a paper from Semantic Scholar.

    Args:
        paper_id: Semantic Scholar paper ID or DOI

    Returns:
        Dictionary with paper details. On failure (including an empty
        ``paper_id``) the dictionary has "success": False and an "error"
        message; no exception is raised.
    """
    try:
        cleaned_id = (paper_id or "").strip()
        if not cleaned_id:
            return {
                "success": False,
                "error": "paper_id must be a non-empty string",
                "source": "semantic_scholar",
            }

        # Percent-encode the identifier so characters such as "?" or "#"
        # cannot alter the request; "/" and ":" stay literal because DOIs
        # and prefixed identifiers contain them.
        url = f"{SEMANTIC_SCHOLAR_API}/paper/{quote(cleaned_id, safe='/:')}"
        params = {
            "fields": "title,abstract,authors,year,venue,citationCount,referenceCount,openAccessPdf,url,journal,publicationVenue,tldr",
        }

        response = requests.get(url, params=params, timeout=15)
        response.raise_for_status()
        paper = response.json()

        return {
            "success": True,
            "source": "semantic_scholar",
            "paper_id": paper.get("paperId"),
            "title": paper.get("title") or "",
            "abstract": paper.get("abstract") or "",
            "authors": _person_names(paper.get("authors")),
            "year": paper.get("year"),
            "venue": paper.get("venue") or "",
            "journal": (paper.get("journal") or {}).get("name"),
            "citations": paper.get("citationCount") or 0,
            "references": paper.get("referenceCount") or 0,
            "url": paper.get("url"),
            "pdf_url": (paper.get("openAccessPdf") or {}).get("url"),
            "tldr": (paper.get("tldr") or {}).get("text"),
        }

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        log.error("Paper details fetch failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "semantic_scholar",
        }


def science_search_doaj(
    query: str,
    limit: int = 10,
) -> dict:
    """
    Search DOAJ (Directory of Open Access Journals).

    Args:
        query: Search query
        limit: Maximum number of results (default: 10)

    Returns:
        Dictionary with DOAJ results. On failure the dictionary has
        "success": False and an "error" message; no exception is raised.
    """
    try:
        # The query is part of the URL path, so it must be fully
        # percent-encoded (including "/") to reach the API as one segment.
        url = f"{DOAJ_API}/search/articles/{quote(query, safe='')}"
        params = {
            "pageSize": limit,
            "page": 1,
        }
        headers = {"Accept": "application/json"}

        response = requests.get(url, params=params, headers=headers, timeout=15)
        response.raise_for_status()
        data = response.json()

        results = []
        for article in data.get("results") or []:
            bibjson = article.get("bibjson") or {}
            links = bibjson.get("link") or []

            results.append({
                "title": bibjson.get("title") or "",
                "abstract": _truncate(bibjson.get("abstract")),
                "authors": _person_names(bibjson.get("author")),
                "year": bibjson.get("year"),
                "journal": (bibjson.get("journal") or {}).get("title", ""),
                "doi": _doaj_doi(bibjson.get("identifier")),
                "link": links[0].get("url") if links else None,
                "keywords": bibjson.get("keywords") or [],
            })

        return {
            "success": True,
            "source": "doaj",
            "query": query,
            "results": results,
            "count": len(results),
            "total": data.get("total", len(results)),
        }

    except Exception as e:
        # Tool boundary: report the failure to the caller instead of raising.
        log.error("DOAJ search failed: %s", e)
        return {
            "success": False,
            "error": str(e),
            "source": "doaj",
        }


def science_aggregate_search(
    query: str,
    limit: int = 5,
) -> dict:
    """
    Search multiple academic sources at once.

    Queries arXiv and Semantic Scholar and concatenates their results, each
    tagged with the "source" it came from.

    Args:
        query: Search query
        limit: Maximum results per source (default: 5)

    Returns:
        Dictionary with aggregated results from multiple sources. "success"
        is True when at least one source answered; per-source failures are
        listed in "errors" (None when there were none). When every source
        failed, "success" is False and "error" summarizes the failures.
    """
    # (result tag, label used in error messages, search function)
    sources = (
        ("arxiv", "arXiv", science_search_arxiv),
        ("semantic_scholar", "Semantic Scholar", science_search_semantic_scholar),
    )

    results = []
    errors = []
    succeeded = 0

    for tag, label, search in sources:
        outcome = search(query, limit)
        if outcome.get("success"):
            succeeded += 1
            results.extend(
                {**item, "source": tag} for item in outcome.get("results", [])
            )
        else:
            errors.append(f"{label}: {outcome.get('error')}")

    aggregated = {
        "success": succeeded > 0,
        "query": query,
        "results": results,
        "count": len(results),
        "sources_checked": [tag for tag, _, _ in sources],
        "errors": errors if errors else None,
    }
    if not succeeded:
        # Same "error" key the single-source tools use on failure.
        aggregated["error"] = "; ".join(errors)
    return aggregated


# Tool schemas for OpenAI function calling
SCIENCE_SEARCH_ARXIV_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_search_arxiv",
        "description": "Search arXiv for scientific preprints. Best for physics, math, computer science, and AI research.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                },
                "max_results": {
                    "type": "integer",
                    "description": "Maximum number of results (default: 10)",
                    "default": 10,
                },
                "category": {
                    "type": "string",
                    "description": "arXiv category filter (e.g., cs.AI, cs.LG, physics, math.CO)",
                },
            },
            "required": ["query"],
        },
    },
}

SCIENCE_SEARCH_SEMANTIC_SCHOLAR_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_search_semantic_scholar",
        "description": "Search Semantic Scholar for academic papers across all fields. Includes citation counts and open access PDFs.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                },
                "limit": {
                    "type": "integer",
                    "description": "Maximum number of results (default: 10)",
                    "default": 10,
                },
                "year": {
                    "type": "string",
                    "description": "Year filter (e.g., '2020-', '2018-2022')",
                },
            },
            "required": ["query"],
        },
    },
}

SCIENCE_GET_PAPER_DETAILS_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_get_paper_details",
        "description": "Get detailed information about a specific paper including TLDR summary. Use paper ID from search results.",
        "parameters": {
            "type": "object",
            "properties": {
                "paper_id": {
                    "type": "string",
                    "description": "Semantic Scholar paper ID or DOI",
                },
            },
            "required": ["paper_id"],
        },
    },
}

SCIENCE_SEARCH_DOAJ_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_search_doaj",
        "description": "Search DOAJ for open access journal articles. Best for peer-reviewed open access research.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                },
                "limit": {
                    "type": "integer",
                    "description": "Maximum number of results (default: 10)",
                    "default": 10,
                },
            },
            "required": ["query"],
        },
    },
}

SCIENCE_AGGREGATE_SEARCH_SCHEMA = {
    "type": "function",
    "function": {
        "name": "science_aggregate_search",
        "description": "Search multiple academic sources (arXiv, Semantic Scholar) at once for comprehensive coverage.",
        "parameters": {
            "type": "object",
            "properties": {
                "query": {
                    "type": "string",
                    "description": "Search query",
                },
                "limit": {
                    "type": "integer",
                    "description": "Maximum results per source (default: 5)",
                    "default": 5,
                },
            },
            "required": ["query"],
        },
    },
}